{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.515025776436565, "eval_steps": 500, "global_step": 15000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 251.78126049041748, "epoch": 0.00033530323986755525, "grad_norm": 0.2821206415206205, "kl": 0.0, "learning_rate": 2.4691358024691357e-10, "loss": -0.0, "reward": 1.7428572177886963, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7517857365310192, "rewards/format_reward_func": 0.9910714328289032, "step": 2 }, { "completion_length": 251.26340675354004, "epoch": 0.0006706064797351105, "grad_norm": 0.23751592598417418, "kl": 2.596992999315262e-05, "learning_rate": 4.938271604938271e-10, "loss": 0.0, "reward": 1.7803571969270706, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7848214544355869, "rewards/format_reward_func": 0.9955357164144516, "step": 4 }, { "completion_length": 255.4955472946167, "epoch": 0.0010059097196026656, "grad_norm": 0.34231130475220095, "kl": 0.00018531084060668945, "learning_rate": 7.407407407407407e-10, "loss": 0.0, "reward": 1.7053572311997414, "reward_std": 0.10354063473641872, "rewards/equation_reward_func": 0.7187500223517418, "rewards/format_reward_func": 0.9866071492433548, "step": 6 }, { "completion_length": 242.8571538925171, "epoch": 0.001341212959470221, "grad_norm": 0.282011994670917, "kl": 0.0002923011779785156, "learning_rate": 9.876543209876543e-10, "loss": 0.0, "reward": 1.7816964909434319, "reward_std": 0.08649431029334664, "rewards/equation_reward_func": 0.793750025331974, "rewards/format_reward_func": 0.9879464358091354, "step": 8 }, { "completion_length": 246.70090103149414, "epoch": 0.0016765161993377762, "grad_norm": 0.4075528008052861, "kl": 0.0003095269203186035, "learning_rate": 1.2345679012345679e-09, "loss": 0.0, "reward": 1.7267858013510704, "reward_std": 0.11364216078072786, "rewards/equation_reward_func": 0.7401785962283611, "rewards/format_reward_func": 0.9866071492433548, "step": 10 }, { "completion_length": 243.15179634094238, "epoch": 0.002011819439205331, "grad_norm": 0.2539315791340793, "kl": 0.00033289194107055664, "learning_rate": 1.4814814814814814e-09, "loss": 0.0, "reward": 1.775000050663948, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 12 }, { "completion_length": 246.633939743042, "epoch": 0.0023471226790728866, "grad_norm": 0.24597106986052078, "kl": 0.0003146529197692871, "learning_rate": 1.728395061728395e-09, "loss": 0.0, "reward": 1.687500074505806, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.691964328289032, "rewards/format_reward_func": 0.9955357164144516, "step": 14 }, { "completion_length": 248.13840293884277, "epoch": 0.002682425918940442, "grad_norm": 0.2283280840772994, "kl": 0.0003129243850708008, "learning_rate": 1.9753086419753086e-09, "loss": 0.0, "reward": 1.7732143476605415, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7776786126196384, "rewards/format_reward_func": 0.9955357164144516, "step": 16 }, { "completion_length": 243.49108219146729, "epoch": 0.003017729158807997, "grad_norm": 0.25527874503701053, "kl": 0.0002974867820739746, "learning_rate": 2.222222222222222e-09, "loss": 0.0, "reward": 1.7375000715255737, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7419643178582191, "rewards/format_reward_func": 0.9955357164144516, "step": 18 }, { "completion_length": 257.25000858306885, "epoch": 0.0033530323986755524, "grad_norm": 0.3755577292202996, "kl": 0.0003231167793273926, "learning_rate": 2.4691358024691357e-09, "loss": 0.0, "reward": 1.751785784959793, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500275671482, "rewards/format_reward_func": 0.9955357164144516, "step": 20 }, { "completion_length": 248.70090293884277, "epoch": 0.0036883356385431073, "grad_norm": 0.25981511886182895, "kl": 0.00031244754791259766, "learning_rate": 2.7160493827160493e-09, "loss": 0.0, "reward": 1.7410715147852898, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7544643171131611, "rewards/format_reward_func": 0.9866071492433548, "step": 22 }, { "completion_length": 254.64733219146729, "epoch": 0.004023638878410662, "grad_norm": 0.15723937143886035, "kl": 0.0003066062927246094, "learning_rate": 2.962962962962963e-09, "loss": 0.0, "reward": 1.769642911851406, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7741071823984385, "rewards/format_reward_func": 0.9955357164144516, "step": 24 }, { "completion_length": 240.6919765472412, "epoch": 0.004358942118278218, "grad_norm": 0.45759362392893915, "kl": 0.000310361385345459, "learning_rate": 3.209876543209876e-09, "loss": 0.0, "reward": 1.7950893640518188, "reward_std": 0.0776554741896689, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 0.9986607171595097, "step": 26 }, { "completion_length": 250.5446548461914, "epoch": 0.004694245358145773, "grad_norm": 0.15906385599381753, "kl": 0.0002815127372741699, "learning_rate": 3.45679012345679e-09, "loss": 0.0, "reward": 1.7589286491274834, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7633928805589676, "rewards/format_reward_func": 0.9955357164144516, "step": 28 }, { "completion_length": 245.8571548461914, "epoch": 0.005029548598013328, "grad_norm": 0.20213909633387014, "kl": 0.0003229975700378418, "learning_rate": 3.7037037037037036e-09, "loss": 0.0, "reward": 1.7946429252624512, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7991071566939354, "rewards/format_reward_func": 0.9955357164144516, "step": 30 }, { "completion_length": 239.7812614440918, "epoch": 0.005364851837880884, "grad_norm": 0.2975602670727807, "kl": 0.0003528594970703125, "learning_rate": 3.950617283950617e-09, "loss": 0.0, "reward": 1.7732143327593803, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7776786014437675, "rewards/format_reward_func": 0.9955357164144516, "step": 32 }, { "completion_length": 244.4196548461914, "epoch": 0.005700155077748439, "grad_norm": 0.19631508669566625, "kl": 0.00030624866485595703, "learning_rate": 4.197530864197531e-09, "loss": 0.0, "reward": 1.7767857685685158, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7812500335276127, "rewards/format_reward_func": 0.9955357164144516, "step": 34 }, { "completion_length": 248.04018878936768, "epoch": 0.006035458317615994, "grad_norm": 0.1693595352246246, "kl": 0.0003243684768676758, "learning_rate": 4.444444444444444e-09, "loss": 0.0, "reward": 1.6482143849134445, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.6616071630269289, "rewards/format_reward_func": 0.9866071492433548, "step": 36 }, { "completion_length": 246.6294765472412, "epoch": 0.006370761557483549, "grad_norm": 0.2152008240652874, "kl": 0.0003387331962585449, "learning_rate": 4.6913580246913574e-09, "loss": 0.0, "reward": 1.7517857775092125, "reward_std": 0.06818529684096575, "rewards/equation_reward_func": 0.765178594738245, "rewards/format_reward_func": 0.9866071492433548, "step": 38 }, { "completion_length": 242.79465579986572, "epoch": 0.006706064797351105, "grad_norm": 0.2081818106050967, "kl": 0.00032889842987060547, "learning_rate": 4.938271604938271e-09, "loss": 0.0, "reward": 1.7482143640518188, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7526786122471094, "rewards/format_reward_func": 0.9955357164144516, "step": 40 }, { "completion_length": 247.04911708831787, "epoch": 0.00704136803721866, "grad_norm": 0.197765113235337, "kl": 0.00030040740966796875, "learning_rate": 5.1851851851851846e-09, "loss": 0.0, "reward": 1.8107143640518188, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.810714315623045, "rewards/format_reward_func": 1.0, "step": 42 }, { "completion_length": 233.49108219146729, "epoch": 0.007376671277086215, "grad_norm": 0.22273231251306044, "kl": 0.00031495094299316406, "learning_rate": 5.4320987654320985e-09, "loss": 0.0, "reward": 1.8017857447266579, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.8062500357627869, "rewards/format_reward_func": 0.9955357164144516, "step": 44 }, { "completion_length": 237.5714406967163, "epoch": 0.00771197451695377, "grad_norm": 0.18181933051003635, "kl": 0.00033354759216308594, "learning_rate": 5.679012345679012e-09, "loss": 0.0, "reward": 1.7785715013742447, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 46 }, { "completion_length": 246.5937614440918, "epoch": 0.008047277756821325, "grad_norm": 0.1606775878103379, "kl": 0.00032961368560791016, "learning_rate": 5.925925925925926e-09, "loss": 0.0, "reward": 1.7714286372065544, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714286018162966, "rewards/format_reward_func": 1.0, "step": 48 }, { "completion_length": 239.64733219146729, "epoch": 0.008382580996688881, "grad_norm": 0.3020505688952993, "kl": 0.00031131505966186523, "learning_rate": 6.172839506172839e-09, "loss": 0.0, "reward": 1.7571429312229156, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 50 }, { "completion_length": 241.3169755935669, "epoch": 0.008717884236556436, "grad_norm": 0.28625665696322333, "kl": 0.0003256797790527344, "learning_rate": 6.419753086419752e-09, "loss": 0.0, "reward": 1.7821429148316383, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 52 }, { "completion_length": 248.96429443359375, "epoch": 0.009053187476423991, "grad_norm": 0.26129573486345886, "kl": 0.0003027915954589844, "learning_rate": 6.666666666666667e-09, "loss": 0.0, "reward": 1.7571429088711739, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.766071455553174, "rewards/format_reward_func": 0.9910714328289032, "step": 54 }, { "completion_length": 250.23215293884277, "epoch": 0.009388490716291546, "grad_norm": 0.27548165311054307, "kl": 0.00035876035690307617, "learning_rate": 6.91358024691358e-09, "loss": 0.0, "reward": 1.7053572162985802, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7098214589059353, "rewards/format_reward_func": 0.9955357164144516, "step": 56 }, { "completion_length": 244.34376049041748, "epoch": 0.009723793956159101, "grad_norm": 0.22151518083461771, "kl": 0.00031697750091552734, "learning_rate": 7.160493827160494e-09, "loss": 0.0, "reward": 1.8053571954369545, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8098214492201805, "rewards/format_reward_func": 0.9955357164144516, "step": 58 }, { "completion_length": 241.06697273254395, "epoch": 0.010059097196026656, "grad_norm": 0.22549107156528939, "kl": 0.000319063663482666, "learning_rate": 7.407407407407407e-09, "loss": 0.0, "reward": 1.7375000640749931, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419643178582191, "rewards/format_reward_func": 0.9955357164144516, "step": 60 }, { "completion_length": 243.7544755935669, "epoch": 0.010394400435894211, "grad_norm": 0.17997447995320268, "kl": 0.00031387805938720703, "learning_rate": 7.654320987654321e-09, "loss": 0.0, "reward": 1.741071492433548, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.74553575925529, "rewards/format_reward_func": 0.9955357164144516, "step": 62 }, { "completion_length": 239.93750953674316, "epoch": 0.010729703675761768, "grad_norm": 0.29247137737836243, "kl": 0.00034987926483154297, "learning_rate": 7.901234567901234e-09, "loss": 0.0, "reward": 1.7285714894533157, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7375000305473804, "rewards/format_reward_func": 0.9910714328289032, "step": 64 }, { "completion_length": 246.06697273254395, "epoch": 0.011065006915629323, "grad_norm": 0.2353839170492452, "kl": 0.00032019615173339844, "learning_rate": 8.148148148148147e-09, "loss": 0.0, "reward": 1.7714286297559738, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714286129921675, "rewards/format_reward_func": 1.0, "step": 66 }, { "completion_length": 239.946439743042, "epoch": 0.011400310155496878, "grad_norm": 0.23810286576070455, "kl": 0.0003294944763183594, "learning_rate": 8.395061728395062e-09, "loss": 0.0, "reward": 1.7625000774860382, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7669643200933933, "rewards/format_reward_func": 0.9955357164144516, "step": 68 }, { "completion_length": 244.83929538726807, "epoch": 0.011735613395364433, "grad_norm": 0.4474750559486092, "kl": 0.0003364682197570801, "learning_rate": 8.641975308641974e-09, "loss": 0.0, "reward": 1.7714286148548126, "reward_std": 0.11111677531152964, "rewards/equation_reward_func": 0.7714286185801029, "rewards/format_reward_func": 1.0, "step": 70 }, { "completion_length": 242.31697463989258, "epoch": 0.012070916635231988, "grad_norm": 0.23940161865679357, "kl": 0.000335693359375, "learning_rate": 8.888888888888889e-09, "loss": 0.0, "reward": 1.6892857998609543, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.6892857477068901, "rewards/format_reward_func": 1.0, "step": 72 }, { "completion_length": 250.2500123977661, "epoch": 0.012406219875099543, "grad_norm": 0.3547914135677079, "kl": 0.00031691789627075195, "learning_rate": 9.135802469135803e-09, "loss": 0.0, "reward": 1.748214341700077, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7526785861700773, "rewards/format_reward_func": 0.9955357164144516, "step": 74 }, { "completion_length": 247.17858505249023, "epoch": 0.012741523114967098, "grad_norm": 0.37705598365442405, "kl": 0.0003069639205932617, "learning_rate": 9.382716049382715e-09, "loss": 0.0, "reward": 1.7142857983708382, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7142857499420643, "rewards/format_reward_func": 1.0, "step": 76 }, { "completion_length": 240.8080472946167, "epoch": 0.013076826354834654, "grad_norm": 0.2500330365327647, "kl": 0.00031298398971557617, "learning_rate": 9.62962962962963e-09, "loss": 0.0, "reward": 1.7339286282658577, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7383928969502449, "rewards/format_reward_func": 0.9955357164144516, "step": 78 }, { "completion_length": 252.24554824829102, "epoch": 0.01341212959470221, "grad_norm": 0.24171287363674568, "kl": 0.000303804874420166, "learning_rate": 9.876543209876543e-09, "loss": 0.0, "reward": 1.7433036342263222, "reward_std": 0.044825518038123846, "rewards/equation_reward_func": 0.7580357380211353, "rewards/format_reward_func": 0.9852678664028645, "step": 80 }, { "completion_length": 247.12054824829102, "epoch": 0.013747432834569764, "grad_norm": 0.3022600089442828, "kl": 0.00033527612686157227, "learning_rate": 1.0123456790123458e-08, "loss": 0.0, "reward": 1.6678572446107864, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.6678571682423353, "rewards/format_reward_func": 1.0, "step": 82 }, { "completion_length": 245.23662281036377, "epoch": 0.01408273607443732, "grad_norm": 0.27039366876832605, "kl": 0.00029647350311279297, "learning_rate": 1.0370370370370369e-08, "loss": 0.0, "reward": 1.80892863124609, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.813392885029316, "rewards/format_reward_func": 0.9955357164144516, "step": 84 }, { "completion_length": 250.4330472946167, "epoch": 0.014418039314304874, "grad_norm": 0.7187583509006711, "kl": 0.0003364086151123047, "learning_rate": 1.0617283950617284e-08, "loss": 0.0, "reward": 1.7321429327130318, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7321428917348385, "rewards/format_reward_func": 1.0, "step": 86 }, { "completion_length": 247.8169765472412, "epoch": 0.01475334255417243, "grad_norm": 0.23848320000262993, "kl": 0.0003071427345275879, "learning_rate": 1.0864197530864197e-08, "loss": 0.0, "reward": 1.751785770058632, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7562500238418579, "rewards/format_reward_func": 0.9955357164144516, "step": 88 }, { "completion_length": 240.4776906967163, "epoch": 0.015088645794039984, "grad_norm": 0.27460619568878064, "kl": 0.0003440380096435547, "learning_rate": 1.111111111111111e-08, "loss": 0.0, "reward": 1.798214353621006, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.8026785980910063, "rewards/format_reward_func": 0.9955357164144516, "step": 90 }, { "completion_length": 251.821439743042, "epoch": 0.01542394903390754, "grad_norm": 0.29052291960498944, "kl": 0.00032770633697509766, "learning_rate": 1.1358024691358023e-08, "loss": 0.0, "reward": 1.7571429386734962, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7660714574158192, "rewards/format_reward_func": 0.9910714328289032, "step": 92 }, { "completion_length": 245.17411613464355, "epoch": 0.015759252273775094, "grad_norm": 0.1360821787656637, "kl": 0.00029844045639038086, "learning_rate": 1.1604938271604938e-08, "loss": 0.0, "reward": 1.7446429282426834, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7491071783006191, "rewards/format_reward_func": 0.9955357164144516, "step": 94 }, { "completion_length": 256.1517963409424, "epoch": 0.01609455551364265, "grad_norm": 0.27993944292911604, "kl": 0.0003134012222290039, "learning_rate": 1.1851851851851851e-08, "loss": 0.0, "reward": 1.7660714983940125, "reward_std": 0.09848987311124802, "rewards/equation_reward_func": 0.7794643267989159, "rewards/format_reward_func": 0.9866071492433548, "step": 96 }, { "completion_length": 242.4553689956665, "epoch": 0.016429858753510204, "grad_norm": 0.2106037899562362, "kl": 0.00028455257415771484, "learning_rate": 1.2098765432098765e-08, "loss": 0.0, "reward": 1.760714367032051, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 98 }, { "completion_length": 239.50447368621826, "epoch": 0.016765161993377763, "grad_norm": 0.3103782359638075, "kl": 0.0003159642219543457, "learning_rate": 1.2345679012345678e-08, "loss": 0.0, "reward": 1.764285795390606, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857693135738, "rewards/format_reward_func": 1.0, "step": 100 }, { "completion_length": 254.9285831451416, "epoch": 0.017100465233245318, "grad_norm": 0.17884850735394575, "kl": 0.00032651424407958984, "learning_rate": 1.2592592592592592e-08, "loss": 0.0, "reward": 1.8000000640749931, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8000000417232513, "rewards/format_reward_func": 1.0, "step": 102 }, { "completion_length": 248.58483219146729, "epoch": 0.017435768473112873, "grad_norm": 0.29105440062209287, "kl": 0.00033777952194213867, "learning_rate": 1.2839506172839504e-08, "loss": 0.0, "reward": 1.7629464864730835, "reward_std": 0.03219861118122935, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 0.9986607171595097, "step": 104 }, { "completion_length": 251.37500953674316, "epoch": 0.017771071712980428, "grad_norm": 0.24668718286887112, "kl": 0.0003491640090942383, "learning_rate": 1.3086419753086419e-08, "loss": 0.0, "reward": 1.7071429342031479, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7071428932249546, "rewards/format_reward_func": 1.0, "step": 106 }, { "completion_length": 241.62501049041748, "epoch": 0.018106374952847983, "grad_norm": 0.2734650656617372, "kl": 0.0003235936164855957, "learning_rate": 1.3333333333333334e-08, "loss": 0.0, "reward": 1.7803572043776512, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7848214544355869, "rewards/format_reward_func": 0.9955357164144516, "step": 108 }, { "completion_length": 252.74108219146729, "epoch": 0.018441678192715538, "grad_norm": 0.2705035956067135, "kl": 0.00032901763916015625, "learning_rate": 1.3580246913580247e-08, "loss": 0.0, "reward": 1.7375000789761543, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.741964315995574, "rewards/format_reward_func": 0.9955357164144516, "step": 110 }, { "completion_length": 240.09822463989258, "epoch": 0.018776981432583092, "grad_norm": 0.14575037715148337, "kl": 0.0003132820129394531, "learning_rate": 1.382716049382716e-08, "loss": 0.0, "reward": 1.8000000640749931, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000342726707, "rewards/format_reward_func": 1.0, "step": 112 }, { "completion_length": 251.90626049041748, "epoch": 0.019112284672450647, "grad_norm": 0.28566608159303575, "kl": 0.0003253817558288574, "learning_rate": 1.4074074074074073e-08, "loss": 0.0, "reward": 1.7392857745289803, "reward_std": 0.08586296532303095, "rewards/equation_reward_func": 0.7482143174856901, "rewards/format_reward_func": 0.9910714328289032, "step": 114 }, { "completion_length": 245.4419755935669, "epoch": 0.019447587912318202, "grad_norm": 0.22007001347735153, "kl": 0.0003275871276855469, "learning_rate": 1.4320987654320988e-08, "loss": 0.0, "reward": 1.7660714983940125, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7705357447266579, "rewards/format_reward_func": 0.9955357164144516, "step": 116 }, { "completion_length": 246.47769165039062, "epoch": 0.019782891152185757, "grad_norm": 0.24532766247165583, "kl": 0.00031566619873046875, "learning_rate": 1.4567901234567901e-08, "loss": 0.0, "reward": 1.7267857939004898, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7312500327825546, "rewards/format_reward_func": 0.9955357164144516, "step": 118 }, { "completion_length": 246.93304634094238, "epoch": 0.020118194392053312, "grad_norm": 0.21397852222502559, "kl": 0.00031769275665283203, "learning_rate": 1.4814814814814814e-08, "loss": 0.0, "reward": 1.7232143506407738, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7276785969734192, "rewards/format_reward_func": 0.9955357164144516, "step": 120 }, { "completion_length": 245.8035831451416, "epoch": 0.020453497631920867, "grad_norm": 0.44949071446508276, "kl": 0.0003509521484375, "learning_rate": 1.5061728395061727e-08, "loss": 0.0, "reward": 1.7589286342263222, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7633928768336773, "rewards/format_reward_func": 0.9955357164144516, "step": 122 }, { "completion_length": 250.5848331451416, "epoch": 0.020788800871788422, "grad_norm": 0.00010308908116451039, "kl": 0.00032585859298706055, "learning_rate": 1.5308641975308642e-08, "loss": 0.0, "reward": 1.732142947614193, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7321428954601288, "rewards/format_reward_func": 1.0, "step": 124 }, { "completion_length": 242.9062623977661, "epoch": 0.021124104111655977, "grad_norm": 0.2326498304971465, "kl": 0.0003529787063598633, "learning_rate": 1.5555555555555554e-08, "loss": 0.0, "reward": 1.7750000655651093, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.775000037625432, "rewards/format_reward_func": 1.0, "step": 126 }, { "completion_length": 248.71876049041748, "epoch": 0.021459407351523536, "grad_norm": 0.2724319389491217, "kl": 0.0003319978713989258, "learning_rate": 1.580246913580247e-08, "loss": 0.0, "reward": 1.767857201397419, "reward_std": 0.08586296439170837, "rewards/equation_reward_func": 0.7767857573926449, "rewards/format_reward_func": 0.9910714328289032, "step": 128 }, { "completion_length": 244.5982265472412, "epoch": 0.02179471059139109, "grad_norm": 0.28980555293971616, "kl": 0.00033468008041381836, "learning_rate": 1.6049382716049383e-08, "loss": 0.0, "reward": 1.796428620815277, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 130 }, { "completion_length": 246.5491189956665, "epoch": 0.022130013831258646, "grad_norm": 0.22034484681712568, "kl": 0.00032633543014526367, "learning_rate": 1.6296296296296295e-08, "loss": 0.0, "reward": 1.7267857939004898, "reward_std": 0.0732360603287816, "rewards/equation_reward_func": 0.7401786036789417, "rewards/format_reward_func": 0.9866071492433548, "step": 132 }, { "completion_length": 251.6919765472412, "epoch": 0.0224653170711262, "grad_norm": 0.21857858965530919, "kl": 0.00031572580337524414, "learning_rate": 1.654320987654321e-08, "loss": 0.0, "reward": 1.7660714909434319, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7705357261002064, "rewards/format_reward_func": 0.9955357164144516, "step": 134 }, { "completion_length": 244.83483505249023, "epoch": 0.022800620310993756, "grad_norm": 0.2697978233876309, "kl": 0.00033473968505859375, "learning_rate": 1.6790123456790124e-08, "loss": 0.0, "reward": 1.7500000447034836, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 136 }, { "completion_length": 256.58483505249023, "epoch": 0.02313592355086131, "grad_norm": 0.300841254760372, "kl": 0.0002970099449157715, "learning_rate": 1.7037037037037036e-08, "loss": 0.0, "reward": 1.7285715192556381, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7285714671015739, "rewards/format_reward_func": 1.0, "step": 138 }, { "completion_length": 246.47768783569336, "epoch": 0.023471226790728866, "grad_norm": 0.2083502710840338, "kl": 0.0003154277801513672, "learning_rate": 1.7283950617283947e-08, "loss": 0.0, "reward": 1.733928643167019, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.7383928969502449, "rewards/format_reward_func": 0.9955357164144516, "step": 140 }, { "completion_length": 253.37054538726807, "epoch": 0.02380653003059642, "grad_norm": 0.19329695372740538, "kl": 0.00031769275665283203, "learning_rate": 1.7530864197530862e-08, "loss": 0.0, "reward": 1.7571429386734962, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 142 }, { "completion_length": 244.33483123779297, "epoch": 0.024141833270463976, "grad_norm": 0.2657022566055933, "kl": 0.00033587217330932617, "learning_rate": 1.7777777777777777e-08, "loss": 0.0, "reward": 1.7642857655882835, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.773214302957058, "rewards/format_reward_func": 0.9910714328289032, "step": 144 }, { "completion_length": 241.88840293884277, "epoch": 0.02447713651033153, "grad_norm": 0.24003225151699342, "kl": 0.00033402442932128906, "learning_rate": 1.8024691358024692e-08, "loss": 0.0, "reward": 1.7392857894301414, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7482143193483353, "rewards/format_reward_func": 0.9910714328289032, "step": 146 }, { "completion_length": 254.4598331451416, "epoch": 0.024812439750199086, "grad_norm": 0.2815512180517347, "kl": 0.0003273487091064453, "learning_rate": 1.8271604938271607e-08, "loss": 0.0, "reward": 1.7017857804894447, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7062500528991222, "rewards/format_reward_func": 0.9955357164144516, "step": 148 }, { "completion_length": 235.10268878936768, "epoch": 0.02514774299006664, "grad_norm": 0.32352647991566374, "kl": 0.00031512975692749023, "learning_rate": 1.8518518518518518e-08, "loss": 0.0, "reward": 1.7821429073810577, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428962051868, "rewards/format_reward_func": 1.0, "step": 150 }, { "completion_length": 243.23215198516846, "epoch": 0.025483046229934195, "grad_norm": 0.1719160101460022, "kl": 0.0003261566162109375, "learning_rate": 1.876543209876543e-08, "loss": 0.0, "reward": 1.7571429312229156, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.757142897695303, "rewards/format_reward_func": 1.0, "step": 152 }, { "completion_length": 247.1071548461914, "epoch": 0.02581834946980175, "grad_norm": 0.2568745935002292, "kl": 0.00030517578125, "learning_rate": 1.9012345679012344e-08, "loss": 0.0, "reward": 1.7089286521077156, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7133929040282965, "rewards/format_reward_func": 0.9955357164144516, "step": 154 }, { "completion_length": 242.92411994934082, "epoch": 0.02615365270966931, "grad_norm": 0.32403116735365, "kl": 0.00030052661895751953, "learning_rate": 1.925925925925926e-08, "loss": 0.0, "reward": 1.783928632736206, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 156 }, { "completion_length": 245.2053680419922, "epoch": 0.026488955949536864, "grad_norm": 0.2945305207519835, "kl": 0.00034099817276000977, "learning_rate": 1.950617283950617e-08, "loss": 0.0, "reward": 1.760714367032051, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 158 }, { "completion_length": 250.0312623977661, "epoch": 0.02682425918940442, "grad_norm": 0.2981654874681236, "kl": 0.00033992528915405273, "learning_rate": 1.9753086419753086e-08, "loss": 0.0, "reward": 1.7785714864730835, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7875000201165676, "rewards/format_reward_func": 0.9910714328289032, "step": 160 }, { "completion_length": 252.8616180419922, "epoch": 0.027159562429271974, "grad_norm": 0.20431294977682624, "kl": 0.0002976059913635254, "learning_rate": 2e-08, "loss": 0.0, "reward": 1.7964286357164383, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 162 }, { "completion_length": 246.81697368621826, "epoch": 0.02749486566913953, "grad_norm": 0.1897492145271756, "kl": 0.00032776594161987305, "learning_rate": 2.0246913580246915e-08, "loss": 0.0, "reward": 1.7375000715255737, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7419643253087997, "rewards/format_reward_func": 0.9955357164144516, "step": 164 }, { "completion_length": 242.5178689956665, "epoch": 0.027830168909007084, "grad_norm": 0.1981890035443643, "kl": 0.00030219554901123047, "learning_rate": 2.0493827160493823e-08, "loss": 0.0, "reward": 1.7089286670088768, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.713392898440361, "rewards/format_reward_func": 0.9955357164144516, "step": 166 }, { "completion_length": 242.95536994934082, "epoch": 0.02816547214887464, "grad_norm": 0.27498948462332695, "kl": 0.00031065940856933594, "learning_rate": 2.0740740740740738e-08, "loss": 0.0, "reward": 1.7267858237028122, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.731250025331974, "rewards/format_reward_func": 0.9955357164144516, "step": 168 }, { "completion_length": 244.3169765472412, "epoch": 0.028500775388742194, "grad_norm": 0.18119311717593303, "kl": 0.0003190040588378906, "learning_rate": 2.0987654320987653e-08, "loss": 0.0, "reward": 1.7535715103149414, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 170 }, { "completion_length": 250.15179824829102, "epoch": 0.02883607862860975, "grad_norm": 0.1600883159721621, "kl": 0.0003666877746582031, "learning_rate": 2.1234567901234568e-08, "loss": 0.0, "reward": 1.775000087916851, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 172 }, { "completion_length": 252.1607255935669, "epoch": 0.029171381868477304, "grad_norm": 0.20143375779388584, "kl": 0.00031435489654541016, "learning_rate": 2.148148148148148e-08, "loss": 0.0, "reward": 1.7308036610484123, "reward_std": 0.0473508988507092, "rewards/equation_reward_func": 0.7321428842842579, "rewards/format_reward_func": 0.9986607171595097, "step": 174 }, { "completion_length": 245.46429538726807, "epoch": 0.02950668510834486, "grad_norm": 0.38570735794849914, "kl": 0.0003286004066467285, "learning_rate": 2.1728395061728394e-08, "loss": 0.0, "reward": 1.7125001028180122, "reward_std": 0.10354063380509615, "rewards/equation_reward_func": 0.7169643118977547, "rewards/format_reward_func": 0.9955357164144516, "step": 176 }, { "completion_length": 247.22769165039062, "epoch": 0.029841988348212414, "grad_norm": 0.3972032973187878, "kl": 0.0003095269203186035, "learning_rate": 2.197530864197531e-08, "loss": 0.0, "reward": 1.748214341700077, "reward_std": 0.10354063287377357, "rewards/equation_reward_func": 0.7526786141097546, "rewards/format_reward_func": 0.9955357164144516, "step": 178 }, { "completion_length": 243.81697463989258, "epoch": 0.03017729158807997, "grad_norm": 0.3034668175618403, "kl": 0.00033104419708251953, "learning_rate": 2.222222222222222e-08, "loss": 0.0, "reward": 1.7535714954137802, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 180 }, { "completion_length": 244.0089406967163, "epoch": 0.030512594827947524, "grad_norm": 0.20606761170561028, "kl": 0.00030857324600219727, "learning_rate": 2.2469135802469135e-08, "loss": 0.0, "reward": 1.7875000536441803, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7919643055647612, "rewards/format_reward_func": 0.9955357164144516, "step": 182 }, { "completion_length": 250.7098331451416, "epoch": 0.03084789806781508, "grad_norm": 0.20840818777387127, "kl": 0.00032150745391845703, "learning_rate": 2.2716049382716047e-08, "loss": 0.0, "reward": 1.7678572088479996, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7678571604192257, "rewards/format_reward_func": 1.0, "step": 184 }, { "completion_length": 238.39733219146729, "epoch": 0.031183201307682637, "grad_norm": 0.30613951822839514, "kl": 0.00033026933670043945, "learning_rate": 2.296296296296296e-08, "loss": 0.0, "reward": 1.8000000566244125, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8000000156462193, "rewards/format_reward_func": 1.0, "step": 186 }, { "completion_length": 242.68751335144043, "epoch": 0.03151850454755019, "grad_norm": 0.24889609063442605, "kl": 0.0003234744071960449, "learning_rate": 2.3209876543209876e-08, "loss": 0.0, "reward": 1.7571429163217545, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428958326578, "rewards/format_reward_func": 1.0, "step": 188 }, { "completion_length": 236.5714406967163, "epoch": 0.03185380778741775, "grad_norm": 0.25782909812310634, "kl": 0.0003020763397216797, "learning_rate": 2.345679012345679e-08, "loss": 0.0, "reward": 1.7535714954137802, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714693367481, "rewards/format_reward_func": 1.0, "step": 190 }, { "completion_length": 242.67412090301514, "epoch": 0.0321891110272853, "grad_norm": 0.16179825374337622, "kl": 0.0003166794776916504, "learning_rate": 2.3703703703703703e-08, "loss": 0.0, "reward": 1.8178571984171867, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.8267857432365417, "rewards/format_reward_func": 0.9910714328289032, "step": 192 }, { "completion_length": 246.08929634094238, "epoch": 0.03252441426715286, "grad_norm": 0.20594663668147256, "kl": 0.00032722949981689453, "learning_rate": 2.3950617283950614e-08, "loss": 0.0, "reward": 1.8089286237955093, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8133928775787354, "rewards/format_reward_func": 0.9955357164144516, "step": 194 }, { "completion_length": 249.73215770721436, "epoch": 0.03285971750702041, "grad_norm": 0.2088766158551277, "kl": 0.00032842159271240234, "learning_rate": 2.419753086419753e-08, "loss": 0.0, "reward": 1.7089286595582962, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7133928947150707, "rewards/format_reward_func": 0.9955357164144516, "step": 196 }, { "completion_length": 246.8571548461914, "epoch": 0.03319502074688797, "grad_norm": 0.19387291215229924, "kl": 0.00029861927032470703, "learning_rate": 2.4444444444444444e-08, "loss": 0.0, "reward": 1.7946429252624512, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7991071604192257, "rewards/format_reward_func": 0.9955357164144516, "step": 198 }, { "completion_length": 246.4910831451416, "epoch": 0.033530323986755525, "grad_norm": 0.14395851317716862, "kl": 0.00030988454818725586, "learning_rate": 2.4691358024691355e-08, "loss": 0.0, "reward": 1.7553572207689285, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214540630579, "rewards/format_reward_func": 0.9955357164144516, "step": 200 }, { "completion_length": 234.1294765472412, "epoch": 0.03386562722662308, "grad_norm": 0.24436453124931118, "kl": 0.00032639503479003906, "learning_rate": 2.493827160493827e-08, "loss": 0.0, "reward": 1.7517857775092125, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.75625004991889, "rewards/format_reward_func": 0.9955357164144516, "step": 202 }, { "completion_length": 248.93304634094238, "epoch": 0.034200930466490635, "grad_norm": 0.2248675849511274, "kl": 0.0003234744071960449, "learning_rate": 2.5185185185185185e-08, "loss": 0.0, "reward": 1.7285715267062187, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7285714596509933, "rewards/format_reward_func": 1.0, "step": 204 }, { "completion_length": 246.19197368621826, "epoch": 0.03453623370635819, "grad_norm": 0.22934369353586878, "kl": 0.0003287792205810547, "learning_rate": 2.54320987654321e-08, "loss": 0.0, "reward": 1.7500000894069672, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 206 }, { "completion_length": 245.40179634094238, "epoch": 0.034871536946225745, "grad_norm": 0.29961009398828276, "kl": 0.00034368038177490234, "learning_rate": 2.5679012345679008e-08, "loss": 0.0, "reward": 1.7616072222590446, "reward_std": 0.06944798538461328, "rewards/equation_reward_func": 0.7723214626312256, "rewards/format_reward_func": 0.9892857223749161, "step": 208 }, { "completion_length": 248.41518878936768, "epoch": 0.0352068401860933, "grad_norm": 0.15792865989980773, "kl": 0.00034034252166748047, "learning_rate": 2.5925925925925923e-08, "loss": 0.0, "reward": 1.769642896950245, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7741071861237288, "rewards/format_reward_func": 0.9955357164144516, "step": 210 }, { "completion_length": 242.93304538726807, "epoch": 0.035542143425960855, "grad_norm": 0.3301838137697797, "kl": 0.0003197193145751953, "learning_rate": 2.6172839506172838e-08, "loss": 0.0, "reward": 1.750446505844593, "reward_std": 0.07007933082059026, "rewards/equation_reward_func": 0.7562500145286322, "rewards/format_reward_func": 0.9941964335739613, "step": 212 }, { "completion_length": 248.7276906967163, "epoch": 0.03587744666582841, "grad_norm": 0.35616311274054235, "kl": 0.0003361701965332031, "learning_rate": 2.6419753086419752e-08, "loss": 0.0, "reward": 1.7678572162985802, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 214 }, { "completion_length": 248.99108505249023, "epoch": 0.036212749905695965, "grad_norm": 0.23756977130728407, "kl": 0.00034689903259277344, "learning_rate": 2.6666666666666667e-08, "loss": 0.0, "reward": 1.7017857804894447, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7062500342726707, "rewards/format_reward_func": 0.9955357164144516, "step": 216 }, { "completion_length": 248.22768688201904, "epoch": 0.03654805314556352, "grad_norm": 0.2756674602915548, "kl": 0.00029724836349487305, "learning_rate": 2.691358024691358e-08, "loss": 0.0, "reward": 1.705357238650322, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7098214626312256, "rewards/format_reward_func": 0.9955357164144516, "step": 218 }, { "completion_length": 233.95983219146729, "epoch": 0.036883356385431075, "grad_norm": 0.2544126349698565, "kl": 0.0003007054328918457, "learning_rate": 2.7160493827160494e-08, "loss": 0.0, "reward": 1.7625000700354576, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7669643107801676, "rewards/format_reward_func": 0.9955357164144516, "step": 220 }, { "completion_length": 250.9553680419922, "epoch": 0.03721865962529863, "grad_norm": 0.213788279693055, "kl": 0.0003082156181335449, "learning_rate": 2.740740740740741e-08, "loss": 0.0, "reward": 1.7589286267757416, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7633928842842579, "rewards/format_reward_func": 0.9955357164144516, "step": 222 }, { "completion_length": 249.2500123977661, "epoch": 0.037553962865166185, "grad_norm": 0.2610069019462948, "kl": 0.0003325939178466797, "learning_rate": 2.765432098765432e-08, "loss": 0.0, "reward": 1.72857154160738, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.728571455925703, "rewards/format_reward_func": 1.0, "step": 224 }, { "completion_length": 247.03572368621826, "epoch": 0.037889266105033736, "grad_norm": 0.2206000139730647, "kl": 0.0003177523612976074, "learning_rate": 2.790123456790123e-08, "loss": 0.0, "reward": 1.773214340209961, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7776786051690578, "rewards/format_reward_func": 0.9955357164144516, "step": 226 }, { "completion_length": 243.68304634094238, "epoch": 0.038224569344901295, "grad_norm": 0.34760430842090373, "kl": 0.0003180503845214844, "learning_rate": 2.8148148148148146e-08, "loss": 0.0, "reward": 1.7267857789993286, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.731250025331974, "rewards/format_reward_func": 0.9955357164144516, "step": 228 }, { "completion_length": 259.589298248291, "epoch": 0.03855987258476885, "grad_norm": 0.20944117193770692, "kl": 0.000335693359375, "learning_rate": 2.839506172839506e-08, "loss": 0.0, "reward": 1.6821429431438446, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.6910714581608772, "rewards/format_reward_func": 0.9910714328289032, "step": 230 }, { "completion_length": 241.665189743042, "epoch": 0.038895175824636405, "grad_norm": 0.2512126660543398, "kl": 0.00031304359436035156, "learning_rate": 2.8641975308641976e-08, "loss": 0.0, "reward": 1.7285714894533157, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7285714671015739, "rewards/format_reward_func": 1.0, "step": 232 }, { "completion_length": 234.51786708831787, "epoch": 0.03923047906450396, "grad_norm": 0.18261909607261712, "kl": 0.000345766544342041, "learning_rate": 2.8888888888888887e-08, "loss": 0.0, "reward": 1.73392865806818, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7375000305473804, "rewards/format_reward_func": 0.9964285790920258, "step": 234 }, { "completion_length": 252.02679634094238, "epoch": 0.039565782304371515, "grad_norm": 0.20775712922379, "kl": 0.0003249645233154297, "learning_rate": 2.9135802469135802e-08, "loss": 0.0, "reward": 1.7589286416769028, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7633928880095482, "rewards/format_reward_func": 0.9955357164144516, "step": 236 }, { "completion_length": 242.97768783569336, "epoch": 0.03990108554423907, "grad_norm": 0.1833960305143577, "kl": 0.00034111738204956055, "learning_rate": 2.9382716049382714e-08, "loss": 0.0, "reward": 1.7446429207921028, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7491071820259094, "rewards/format_reward_func": 0.9955357164144516, "step": 238 }, { "completion_length": 244.35269165039062, "epoch": 0.040236388784106625, "grad_norm": 0.08606613377561309, "kl": 0.0003235340118408203, "learning_rate": 2.962962962962963e-08, "loss": 0.0, "reward": 1.750000074505806, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000186264515, "rewards/format_reward_func": 1.0, "step": 240 }, { "completion_length": 242.70090675354004, "epoch": 0.04057169202397418, "grad_norm": 0.22739026239901686, "kl": 0.00033462047576904297, "learning_rate": 2.987654320987654e-08, "loss": 0.0, "reward": 1.7982143387198448, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.802678607404232, "rewards/format_reward_func": 0.9955357164144516, "step": 242 }, { "completion_length": 240.8169755935669, "epoch": 0.040906995263841735, "grad_norm": 0.23189861598153808, "kl": 0.00033158063888549805, "learning_rate": 3.0123456790123455e-08, "loss": 0.0, "reward": 1.7357143685221672, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7446428909897804, "rewards/format_reward_func": 0.9910714328289032, "step": 244 }, { "completion_length": 245.93305015563965, "epoch": 0.04124229850370929, "grad_norm": 0.3452260535215257, "kl": 0.0003554821014404297, "learning_rate": 3.037037037037037e-08, "loss": 0.0, "reward": 1.7750000730156898, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 246 }, { "completion_length": 254.03572750091553, "epoch": 0.041577601743576845, "grad_norm": 0.3311219707962642, "kl": 0.0003204345703125, "learning_rate": 3.0617283950617284e-08, "loss": 0.0, "reward": 1.6928572431206703, "reward_std": 0.10101525112986565, "rewards/equation_reward_func": 0.6928571779280901, "rewards/format_reward_func": 1.0, "step": 248 }, { "completion_length": 249.513409614563, "epoch": 0.0419129049834444, "grad_norm": 0.37071766869235806, "kl": 0.0003209114074707031, "learning_rate": 3.086419753086419e-08, "loss": 0.0, "reward": 1.7535714954137802, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 250 }, { "completion_length": 244.571439743042, "epoch": 0.042248208223311955, "grad_norm": 0.2739936128591675, "kl": 0.0003472566604614258, "learning_rate": 3.111111111111111e-08, "loss": 0.0, "reward": 1.7607143446803093, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143130153418, "rewards/format_reward_func": 1.0, "step": 252 }, { "completion_length": 232.39733028411865, "epoch": 0.04258351146317951, "grad_norm": 0.17136269147118577, "kl": 0.00031685829162597656, "learning_rate": 3.135802469135802e-08, "loss": 0.0, "reward": 1.7750000655651093, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 254 }, { "completion_length": 242.82144165039062, "epoch": 0.04291881470304707, "grad_norm": 0.24005201682447744, "kl": 0.0003197789192199707, "learning_rate": 3.160493827160494e-08, "loss": 0.0, "reward": 1.735714353621006, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7357143275439739, "rewards/format_reward_func": 1.0, "step": 256 }, { "completion_length": 247.8571538925171, "epoch": 0.04325411794291462, "grad_norm": 0.25000441328741113, "kl": 0.0003287792205810547, "learning_rate": 3.185185185185185e-08, "loss": 0.0, "reward": 1.7410714998841286, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7455357387661934, "rewards/format_reward_func": 0.9955357164144516, "step": 258 }, { "completion_length": 246.6428680419922, "epoch": 0.04358942118278218, "grad_norm": 0.18778229429765905, "kl": 0.0002881288528442383, "learning_rate": 3.2098765432098767e-08, "loss": 0.0, "reward": 1.7500000894069672, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7589285969734192, "rewards/format_reward_func": 0.9910714328289032, "step": 260 }, { "completion_length": 252.77679634094238, "epoch": 0.04392472442264973, "grad_norm": 0.2507058768188766, "kl": 0.0003418922424316406, "learning_rate": 3.234567901234568e-08, "loss": 0.0, "reward": 1.6964286491274834, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.6964286118745804, "rewards/format_reward_func": 1.0, "step": 262 }, { "completion_length": 250.5223331451416, "epoch": 0.04426002766251729, "grad_norm": 0.2145908398443362, "kl": 0.00032597780227661133, "learning_rate": 3.259259259259259e-08, "loss": 0.0, "reward": 1.7250000685453415, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7339286059141159, "rewards/format_reward_func": 0.9910714328289032, "step": 264 }, { "completion_length": 244.2946538925171, "epoch": 0.04459533090238484, "grad_norm": 0.1814048605402375, "kl": 0.0003126859664916992, "learning_rate": 3.2839506172839504e-08, "loss": 0.0, "reward": 1.7357143461704254, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7446428835391998, "rewards/format_reward_func": 0.9910714328289032, "step": 266 }, { "completion_length": 240.95090579986572, "epoch": 0.0449306341422524, "grad_norm": 0.573185463890021, "kl": 0.00038886070251464844, "learning_rate": 3.308641975308642e-08, "loss": 0.0, "reward": 1.7875000461935997, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7919642999768257, "rewards/format_reward_func": 0.9955357164144516, "step": 268 }, { "completion_length": 251.35269165039062, "epoch": 0.04526593738211995, "grad_norm": 0.345458955989029, "kl": 0.0003148317337036133, "learning_rate": 3.3333333333333334e-08, "loss": 0.0, "reward": 1.7607143595814705, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 270 }, { "completion_length": 240.8303680419922, "epoch": 0.04560124062198751, "grad_norm": 0.21387981859355704, "kl": 0.00033462047576904297, "learning_rate": 3.358024691358025e-08, "loss": 0.0, "reward": 1.7678572237491608, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 272 }, { "completion_length": 243.8482265472412, "epoch": 0.04593654386185506, "grad_norm": 0.1372445448617604, "kl": 0.00030040740966796875, "learning_rate": 3.382716049382716e-08, "loss": 0.0, "reward": 1.8017857745289803, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.806250024586916, "rewards/format_reward_func": 0.9955357164144516, "step": 274 }, { "completion_length": 242.38393878936768, "epoch": 0.04627184710172262, "grad_norm": 0.2538804221997863, "kl": 0.0003288388252258301, "learning_rate": 3.407407407407407e-08, "loss": 0.0, "reward": 1.7482143491506577, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7526785973459482, "rewards/format_reward_func": 0.9955357164144516, "step": 276 }, { "completion_length": 243.4553680419922, "epoch": 0.04660715034159017, "grad_norm": 0.17161419413289314, "kl": 0.0003286600112915039, "learning_rate": 3.4320987654320987e-08, "loss": 0.0, "reward": 1.739285796880722, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857521772385, "rewards/format_reward_func": 1.0, "step": 278 }, { "completion_length": 253.1071548461914, "epoch": 0.04694245358145773, "grad_norm": 0.1583888212171527, "kl": 0.0003293752670288086, "learning_rate": 3.4567901234567895e-08, "loss": 0.0, "reward": 1.7857143580913544, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143152505159, "rewards/format_reward_func": 1.0, "step": 280 }, { "completion_length": 247.33483600616455, "epoch": 0.04727775682132528, "grad_norm": 0.2364266681687748, "kl": 0.0003266334533691406, "learning_rate": 3.481481481481481e-08, "loss": 0.0, "reward": 1.725000075995922, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7250000461935997, "rewards/format_reward_func": 1.0, "step": 282 }, { "completion_length": 247.9196548461914, "epoch": 0.04761306006119284, "grad_norm": 0.15755245023632508, "kl": 0.0003324151039123535, "learning_rate": 3.5061728395061724e-08, "loss": 0.0, "reward": 1.7964286282658577, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 284 }, { "completion_length": 244.0982265472412, "epoch": 0.0479483633010604, "grad_norm": 0.248909793697624, "kl": 0.0002993345260620117, "learning_rate": 3.530864197530864e-08, "loss": 0.0, "reward": 1.7017857879400253, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7062500305473804, "rewards/format_reward_func": 0.9955357164144516, "step": 286 }, { "completion_length": 245.4866180419922, "epoch": 0.04828366654092795, "grad_norm": 0.2979174520331835, "kl": 0.0003222227096557617, "learning_rate": 3.5555555555555554e-08, "loss": 0.0, "reward": 1.7160715088248253, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7205357514321804, "rewards/format_reward_func": 0.9955357164144516, "step": 288 }, { "completion_length": 245.1696548461914, "epoch": 0.04861896978079551, "grad_norm": 0.2121228295030136, "kl": 0.0003127455711364746, "learning_rate": 3.580246913580247e-08, "loss": 0.0, "reward": 1.7750000730156898, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 290 }, { "completion_length": 254.25001621246338, "epoch": 0.04895427302066306, "grad_norm": 0.2591013783701026, "kl": 0.0003070831298828125, "learning_rate": 3.6049382716049384e-08, "loss": 0.0, "reward": 1.7107143625617027, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7107143253087997, "rewards/format_reward_func": 1.0, "step": 292 }, { "completion_length": 248.9285831451416, "epoch": 0.04928957626053062, "grad_norm": 0.25970339981154605, "kl": 0.00030744075775146484, "learning_rate": 3.62962962962963e-08, "loss": 0.0, "reward": 1.7142858132719994, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7142857424914837, "rewards/format_reward_func": 1.0, "step": 294 }, { "completion_length": 245.6607255935669, "epoch": 0.04962487950039817, "grad_norm": 0.1693241175450565, "kl": 0.0003269314765930176, "learning_rate": 3.6543209876543213e-08, "loss": 0.0, "reward": 1.7928572073578835, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 1.0, "step": 296 }, { "completion_length": 244.47768878936768, "epoch": 0.04996018274026573, "grad_norm": 0.28498148563751646, "kl": 0.00032448768615722656, "learning_rate": 3.679012345679012e-08, "loss": 0.0, "reward": 1.7232143804430962, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7276786062866449, "rewards/format_reward_func": 0.9955357164144516, "step": 298 }, { "completion_length": 249.23662090301514, "epoch": 0.05029548598013328, "grad_norm": 0.17194495222367387, "kl": 0.00031006336212158203, "learning_rate": 3.7037037037037036e-08, "loss": 0.0, "reward": 1.7250000908970833, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.725000036880374, "rewards/format_reward_func": 1.0, "step": 300 }, { "completion_length": 241.9509038925171, "epoch": 0.05063078922000084, "grad_norm": 0.09369687516121022, "kl": 0.0003318190574645996, "learning_rate": 3.7283950617283945e-08, "loss": 0.0, "reward": 1.7589286640286446, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7633928805589676, "rewards/format_reward_func": 0.9955357164144516, "step": 302 }, { "completion_length": 245.50893878936768, "epoch": 0.05096609245986839, "grad_norm": 0.20374576311730827, "kl": 0.0003292560577392578, "learning_rate": 3.753086419753086e-08, "loss": 0.0, "reward": 1.762500062584877, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643182307482, "rewards/format_reward_func": 0.9955357164144516, "step": 304 }, { "completion_length": 246.89286708831787, "epoch": 0.05130139569973595, "grad_norm": 0.33707103491103274, "kl": 0.00032824277877807617, "learning_rate": 3.7777777777777774e-08, "loss": 0.0, "reward": 1.7375000640749931, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7419643141329288, "rewards/format_reward_func": 0.9955357164144516, "step": 306 }, { "completion_length": 251.9910831451416, "epoch": 0.0516366989396035, "grad_norm": 0.16043020844799977, "kl": 0.0003122687339782715, "learning_rate": 3.802469135802469e-08, "loss": 0.0, "reward": 1.7214286476373672, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7214286103844643, "rewards/format_reward_func": 1.0, "step": 308 }, { "completion_length": 252.6696548461914, "epoch": 0.05197200217947106, "grad_norm": 0.32476087022862743, "kl": 0.00031131505966186523, "learning_rate": 3.8271604938271604e-08, "loss": 0.0, "reward": 1.7142858058214188, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7232143059372902, "rewards/format_reward_func": 0.9910714328289032, "step": 310 }, { "completion_length": 250.33037090301514, "epoch": 0.05230730541933862, "grad_norm": 0.20662703392624346, "kl": 0.00035762786865234375, "learning_rate": 3.851851851851852e-08, "loss": 0.0, "reward": 1.7232143431901932, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.727678619325161, "rewards/format_reward_func": 0.9955357164144516, "step": 312 }, { "completion_length": 245.37947368621826, "epoch": 0.05264260865920617, "grad_norm": 0.3116052051179476, "kl": 0.00034052133560180664, "learning_rate": 3.876543209876543e-08, "loss": 0.0, "reward": 1.7267857939004898, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7312500216066837, "rewards/format_reward_func": 0.9955357164144516, "step": 314 }, { "completion_length": 254.11608219146729, "epoch": 0.05297791189907373, "grad_norm": 0.23588516688376734, "kl": 0.00032001733779907227, "learning_rate": 3.901234567901234e-08, "loss": 0.0, "reward": 1.7589286491274834, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.772321455180645, "rewards/format_reward_func": 0.9866071492433548, "step": 316 }, { "completion_length": 250.17858409881592, "epoch": 0.05331321513894128, "grad_norm": 0.21352114622407678, "kl": 0.0003173947334289551, "learning_rate": 3.9259259259259256e-08, "loss": 0.0, "reward": 1.708928644657135, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7133928779512644, "rewards/format_reward_func": 0.9955357164144516, "step": 318 }, { "completion_length": 241.33036708831787, "epoch": 0.05364851837880884, "grad_norm": 0.19064104806150775, "kl": 0.000352323055267334, "learning_rate": 3.950617283950617e-08, "loss": 0.0, "reward": 1.782142922282219, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 320 }, { "completion_length": 247.5714406967163, "epoch": 0.05398382161867639, "grad_norm": 0.09471434497682275, "kl": 0.00035202503204345703, "learning_rate": 3.9753086419753086e-08, "loss": 0.0, "reward": 1.7357143759727478, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357142977416515, "rewards/format_reward_func": 1.0, "step": 322 }, { "completion_length": 247.8392972946167, "epoch": 0.05431912485854395, "grad_norm": 0.16439367857374707, "kl": 0.0003255605697631836, "learning_rate": 4e-08, "loss": 0.0, "reward": 1.7250000685453415, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7339286040514708, "rewards/format_reward_func": 0.9910714328289032, "step": 324 }, { "completion_length": 244.4821538925171, "epoch": 0.0546544280984115, "grad_norm": 0.28376400955422515, "kl": 0.0003072023391723633, "learning_rate": 4.0246913580246916e-08, "loss": 0.0, "reward": 1.7357143610715866, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7446429003030062, "rewards/format_reward_func": 0.9910714328289032, "step": 326 }, { "completion_length": 241.10715293884277, "epoch": 0.05498973133827906, "grad_norm": 0.21232563662149967, "kl": 0.00036221742630004883, "learning_rate": 4.049382716049383e-08, "loss": 0.0, "reward": 1.76071435213089, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 328 }, { "completion_length": 244.8259048461914, "epoch": 0.05532503457814661, "grad_norm": 0.31284424481049833, "kl": 0.00032901763916015625, "learning_rate": 4.0740740740740745e-08, "loss": 0.0, "reward": 1.7642857655882835, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 330 }, { "completion_length": 233.58036708831787, "epoch": 0.05566033781801417, "grad_norm": 0.14740794798794307, "kl": 0.00032466650009155273, "learning_rate": 4.098765432098765e-08, "loss": 0.0, "reward": 1.7500000670552254, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000484287739, "rewards/format_reward_func": 1.0, "step": 332 }, { "completion_length": 247.05358219146729, "epoch": 0.05599564105788172, "grad_norm": 0.21326826487289274, "kl": 0.0003064870834350586, "learning_rate": 4.123456790123456e-08, "loss": 0.0, "reward": 1.7660714760422707, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7705357521772385, "rewards/format_reward_func": 0.9955357164144516, "step": 334 }, { "completion_length": 252.5134038925171, "epoch": 0.05633094429774928, "grad_norm": 0.2820975150743322, "kl": 0.0003514289855957031, "learning_rate": 4.1481481481481476e-08, "loss": 0.0, "reward": 1.737500086426735, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7419643178582191, "rewards/format_reward_func": 0.9955357164144516, "step": 336 }, { "completion_length": 245.87054538726807, "epoch": 0.05666624753761683, "grad_norm": 0.28892632014326425, "kl": 0.00032830238342285156, "learning_rate": 4.172839506172839e-08, "loss": 0.0, "reward": 1.7392857894301414, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.739285733550787, "rewards/format_reward_func": 1.0, "step": 338 }, { "completion_length": 257.6428699493408, "epoch": 0.05700155077748439, "grad_norm": 0.20801041134465126, "kl": 0.0003235936164855957, "learning_rate": 4.1975308641975306e-08, "loss": 0.0, "reward": 1.7035714983940125, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.712500024586916, "rewards/format_reward_func": 0.9910714328289032, "step": 340 }, { "completion_length": 239.0446538925171, "epoch": 0.057336854017351946, "grad_norm": 0.20466302680456666, "kl": 0.00033277273178100586, "learning_rate": 4.222222222222222e-08, "loss": 0.0, "reward": 1.7785714864730835, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7875000350177288, "rewards/format_reward_func": 0.9910714328289032, "step": 342 }, { "completion_length": 247.04911994934082, "epoch": 0.0576721572572195, "grad_norm": 0.2544168425529169, "kl": 0.0003313422203063965, "learning_rate": 4.2469135802469136e-08, "loss": 0.0, "reward": 1.7500000670552254, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 344 }, { "completion_length": 247.80805110931396, "epoch": 0.058007460497087056, "grad_norm": 0.28697432321700045, "kl": 0.0003412961959838867, "learning_rate": 4.271604938271605e-08, "loss": 0.0, "reward": 1.7464286461472511, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7464286126196384, "rewards/format_reward_func": 1.0, "step": 346 }, { "completion_length": 241.80358409881592, "epoch": 0.05834276373695461, "grad_norm": 0.2989078442481178, "kl": 0.0003070235252380371, "learning_rate": 4.296296296296296e-08, "loss": 0.0, "reward": 1.771428644657135, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714285887777805, "rewards/format_reward_func": 1.0, "step": 348 }, { "completion_length": 241.81697463989258, "epoch": 0.058678066976822166, "grad_norm": 0.18283142606513708, "kl": 0.0003039836883544922, "learning_rate": 4.3209876543209874e-08, "loss": 0.0, "reward": 1.7857143431901932, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 350 }, { "completion_length": 245.33929634094238, "epoch": 0.05901337021668972, "grad_norm": 0.2445142243174735, "kl": 0.0003504753112792969, "learning_rate": 4.345679012345679e-08, "loss": 0.0, "reward": 1.771428644657135, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 352 }, { "completion_length": 246.90179920196533, "epoch": 0.059348673456557276, "grad_norm": 0.16569502991379484, "kl": 0.00032913684844970703, "learning_rate": 4.37037037037037e-08, "loss": 0.0, "reward": 1.7571429312229156, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571429088711739, "rewards/format_reward_func": 1.0, "step": 354 }, { "completion_length": 240.02679634094238, "epoch": 0.05968397669642483, "grad_norm": 0.2585568390691014, "kl": 0.0003421306610107422, "learning_rate": 4.395061728395062e-08, "loss": 0.0, "reward": 1.758928656578064, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7633928880095482, "rewards/format_reward_func": 0.9955357164144516, "step": 356 }, { "completion_length": 245.41072463989258, "epoch": 0.060019279936292386, "grad_norm": 0.13192074596224, "kl": 0.00033915042877197266, "learning_rate": 4.419753086419753e-08, "loss": 0.0, "reward": 1.7678571939468384, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 358 }, { "completion_length": 235.59375953674316, "epoch": 0.06035458317615994, "grad_norm": 0.1860175287251393, "kl": 0.00028246641159057617, "learning_rate": 4.444444444444444e-08, "loss": 0.0, "reward": 1.7875000536441803, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7919643186032772, "rewards/format_reward_func": 0.9955357164144516, "step": 360 }, { "completion_length": 250.3169755935669, "epoch": 0.060689886416027496, "grad_norm": 0.296173081498714, "kl": 0.0003261566162109375, "learning_rate": 4.4691358024691356e-08, "loss": 0.0, "reward": 1.7500000521540642, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 362 }, { "completion_length": 245.74108028411865, "epoch": 0.06102518965589505, "grad_norm": 0.17763701007983315, "kl": 0.0003126859664916992, "learning_rate": 4.493827160493827e-08, "loss": 0.0, "reward": 1.7714286372065544, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7803571783006191, "rewards/format_reward_func": 0.9910714328289032, "step": 364 }, { "completion_length": 249.30358219146729, "epoch": 0.061360492895762606, "grad_norm": 0.2121971068500702, "kl": 0.00033473968505859375, "learning_rate": 4.518518518518518e-08, "loss": 0.0, "reward": 1.751785770058632, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500387430191, "rewards/format_reward_func": 0.9955357164144516, "step": 366 }, { "completion_length": 243.9285831451416, "epoch": 0.06169579613563016, "grad_norm": 0.15114489649405735, "kl": 0.00029456615447998047, "learning_rate": 4.5432098765432094e-08, "loss": 0.0, "reward": 1.7892857640981674, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 368 }, { "completion_length": 248.87500953674316, "epoch": 0.062031099375497716, "grad_norm": 0.2841277990992117, "kl": 0.00031626224517822266, "learning_rate": 4.567901234567901e-08, "loss": 0.0, "reward": 1.741071492433548, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7544643115252256, "rewards/format_reward_func": 0.9866071492433548, "step": 370 }, { "completion_length": 244.41965293884277, "epoch": 0.062366402615365274, "grad_norm": 0.4126531872874137, "kl": 0.0003432035446166992, "learning_rate": 4.592592592592592e-08, "loss": 0.0, "reward": 1.7303572222590446, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7348214574158192, "rewards/format_reward_func": 0.9955357164144516, "step": 372 }, { "completion_length": 249.9821548461914, "epoch": 0.06270170585523283, "grad_norm": 0.25716336513716315, "kl": 0.0003498196601867676, "learning_rate": 4.617283950617284e-08, "loss": 0.0, "reward": 1.7821429446339607, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.782142873853445, "rewards/format_reward_func": 1.0, "step": 374 }, { "completion_length": 258.8259086608887, "epoch": 0.06303700909510038, "grad_norm": 0.209967257582214, "kl": 0.00030928850173950195, "learning_rate": 4.641975308641975e-08, "loss": 0.0, "reward": 1.696428656578064, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7053571715950966, "rewards/format_reward_func": 0.9910714328289032, "step": 376 }, { "completion_length": 243.48661613464355, "epoch": 0.06337231233496794, "grad_norm": 0.357820262167908, "kl": 0.00035440921783447266, "learning_rate": 4.666666666666667e-08, "loss": 0.0, "reward": 1.7553572058677673, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.759821455925703, "rewards/format_reward_func": 0.9955357164144516, "step": 378 }, { "completion_length": 250.60269165039062, "epoch": 0.0637076155748355, "grad_norm": 0.2806225729005327, "kl": 0.0003170967102050781, "learning_rate": 4.691358024691358e-08, "loss": 0.0, "reward": 1.760714367032051, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143092900515, "rewards/format_reward_func": 1.0, "step": 380 }, { "completion_length": 243.09376049041748, "epoch": 0.06404291881470305, "grad_norm": 0.18647090907402403, "kl": 0.0003414154052734375, "learning_rate": 4.716049382716049e-08, "loss": 0.0, "reward": 1.725000061094761, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7339285928755999, "rewards/format_reward_func": 0.9910714328289032, "step": 382 }, { "completion_length": 246.5089406967163, "epoch": 0.0643782220545706, "grad_norm": 0.6344869457901994, "kl": 0.0003355741500854492, "learning_rate": 4.7407407407407405e-08, "loss": 0.0, "reward": 1.725000075995922, "reward_std": 0.08586296439170837, "rewards/equation_reward_func": 0.733928594738245, "rewards/format_reward_func": 0.9910714328289032, "step": 384 }, { "completion_length": 242.91519260406494, "epoch": 0.06471352529443816, "grad_norm": 0.3172293127272545, "kl": 0.0003371238708496094, "learning_rate": 4.765432098765432e-08, "loss": 0.0, "reward": 1.762500062584877, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643200933933, "rewards/format_reward_func": 0.9955357164144516, "step": 386 }, { "completion_length": 238.28126335144043, "epoch": 0.06504882853430571, "grad_norm": 0.2713314165205857, "kl": 0.0003418922424316406, "learning_rate": 4.790123456790123e-08, "loss": 0.0, "reward": 1.7482143566012383, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7526786141097546, "rewards/format_reward_func": 0.9955357164144516, "step": 388 }, { "completion_length": 243.88393783569336, "epoch": 0.06538413177417327, "grad_norm": 0.15007989523396975, "kl": 0.0003464221954345703, "learning_rate": 4.814814814814814e-08, "loss": 0.0, "reward": 1.8035714998841286, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8035714589059353, "rewards/format_reward_func": 1.0, "step": 390 }, { "completion_length": 247.31697845458984, "epoch": 0.06571943501404082, "grad_norm": 0.4454137762015419, "kl": 0.0003800392150878906, "learning_rate": 4.839506172839506e-08, "loss": 0.0, "reward": 1.7357143759727478, "reward_std": 0.09091372787952423, "rewards/equation_reward_func": 0.7446428947150707, "rewards/format_reward_func": 0.9910714328289032, "step": 392 }, { "completion_length": 252.01340579986572, "epoch": 0.06605473825390838, "grad_norm": 0.5044913641727443, "kl": 0.00032645463943481445, "learning_rate": 4.864197530864197e-08, "loss": 0.0, "reward": 1.698214367032051, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7026786152273417, "rewards/format_reward_func": 0.9955357164144516, "step": 394 }, { "completion_length": 241.75001430511475, "epoch": 0.06639004149377593, "grad_norm": 0.19490645814902496, "kl": 0.0003345012664794922, "learning_rate": 4.888888888888889e-08, "loss": 0.0, "reward": 1.733928643167019, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7383928932249546, "rewards/format_reward_func": 0.9955357164144516, "step": 396 }, { "completion_length": 248.08036994934082, "epoch": 0.06672534473364349, "grad_norm": 0.2114488097393688, "kl": 0.0003476142883300781, "learning_rate": 4.91358024691358e-08, "loss": 0.0, "reward": 1.775000087916851, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 398 }, { "completion_length": 249.24554920196533, "epoch": 0.06706064797351105, "grad_norm": 0.3736813422927126, "kl": 0.0003236532211303711, "learning_rate": 4.938271604938271e-08, "loss": 0.0, "reward": 1.7535715028643608, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 400 }, { "completion_length": 250.5491189956665, "epoch": 0.0673959512133786, "grad_norm": 0.09379494996574526, "kl": 0.0003262758255004883, "learning_rate": 4.9629629629629626e-08, "loss": 0.0, "reward": 1.8142857626080513, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8232143111526966, "rewards/format_reward_func": 0.9910714328289032, "step": 402 }, { "completion_length": 242.2366180419922, "epoch": 0.06773125445324615, "grad_norm": 0.21504913509015106, "kl": 0.0003097057342529297, "learning_rate": 4.987654320987654e-08, "loss": 0.0, "reward": 1.744642935693264, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7491071745753288, "rewards/format_reward_func": 0.9955357164144516, "step": 404 }, { "completion_length": 249.0446548461914, "epoch": 0.0680665576931137, "grad_norm": 0.25395455275495804, "kl": 0.0003694295883178711, "learning_rate": 5.0123456790123455e-08, "loss": 0.0, "reward": 1.7357143610715866, "reward_std": 0.0909137288108468, "rewards/equation_reward_func": 0.7535714469850063, "rewards/format_reward_func": 0.9821428656578064, "step": 406 }, { "completion_length": 238.26340579986572, "epoch": 0.06840186093298127, "grad_norm": 0.33754900952521866, "kl": 0.0003122687339782715, "learning_rate": 5.037037037037037e-08, "loss": 0.0, "reward": 1.7535715103149414, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7535714600235224, "rewards/format_reward_func": 1.0, "step": 408 }, { "completion_length": 249.81697845458984, "epoch": 0.06873716417284882, "grad_norm": 0.1367116643336248, "kl": 0.0003184080123901367, "learning_rate": 5.0617283950617285e-08, "loss": 0.0, "reward": 1.728571504354477, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7285714522004128, "rewards/format_reward_func": 1.0, "step": 410 }, { "completion_length": 246.19197463989258, "epoch": 0.06907246741271637, "grad_norm": 0.23975839196424972, "kl": 0.00031936168670654297, "learning_rate": 5.08641975308642e-08, "loss": 0.0, "reward": 1.7946428880095482, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.799107177183032, "rewards/format_reward_func": 0.9955357164144516, "step": 412 }, { "completion_length": 245.99108505249023, "epoch": 0.06940777065258392, "grad_norm": 0.35602425619406325, "kl": 0.00033867359161376953, "learning_rate": 5.1111111111111114e-08, "loss": 0.0, "reward": 1.8196429014205933, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8241071589291096, "rewards/format_reward_func": 0.9955357164144516, "step": 414 }, { "completion_length": 250.1562614440918, "epoch": 0.06974307389245149, "grad_norm": 0.24837916834782592, "kl": 0.0003298521041870117, "learning_rate": 5.1358024691358016e-08, "loss": 0.0, "reward": 1.735714353621006, "reward_std": 0.09091372787952423, "rewards/equation_reward_func": 0.7446428872644901, "rewards/format_reward_func": 0.9910714328289032, "step": 416 }, { "completion_length": 251.49107837677002, "epoch": 0.07007837713231904, "grad_norm": 0.35602402754236945, "kl": 0.0003180503845214844, "learning_rate": 5.160493827160493e-08, "loss": 0.0, "reward": 1.751785784959793, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7562500275671482, "rewards/format_reward_func": 0.9955357164144516, "step": 418 }, { "completion_length": 238.81251049041748, "epoch": 0.0704136803721866, "grad_norm": 0.3111388382821028, "kl": 0.0003452301025390625, "learning_rate": 5.1851851851851846e-08, "loss": 0.0, "reward": 1.7392857894301414, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7392857633531094, "rewards/format_reward_func": 1.0, "step": 420 }, { "completion_length": 241.9241180419922, "epoch": 0.07074898361205414, "grad_norm": 0.22274741286610353, "kl": 0.0003383755683898926, "learning_rate": 5.209876543209876e-08, "loss": 0.0, "reward": 1.7571429386734962, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428753435612, "rewards/format_reward_func": 1.0, "step": 422 }, { "completion_length": 244.17411708831787, "epoch": 0.07108428685192171, "grad_norm": 0.19095279674450452, "kl": 0.00030684471130371094, "learning_rate": 5.2345679012345675e-08, "loss": 0.0, "reward": 1.703571505844593, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7125000283122063, "rewards/format_reward_func": 0.9910714328289032, "step": 424 }, { "completion_length": 245.1562623977661, "epoch": 0.07141959009178926, "grad_norm": 0.21128512978496952, "kl": 0.0003224611282348633, "learning_rate": 5.259259259259259e-08, "loss": 0.0, "reward": 1.8107143491506577, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.810714315623045, "rewards/format_reward_func": 1.0, "step": 426 }, { "completion_length": 232.4821548461914, "epoch": 0.07175489333165681, "grad_norm": 0.2414527596562914, "kl": 0.0003307461738586426, "learning_rate": 5.2839506172839505e-08, "loss": 0.0, "reward": 1.7964286357164383, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964286096394062, "rewards/format_reward_func": 1.0, "step": 428 }, { "completion_length": 253.75447750091553, "epoch": 0.07209019657152438, "grad_norm": 0.2552537262189747, "kl": 0.00033414363861083984, "learning_rate": 5.308641975308642e-08, "loss": 0.0, "reward": 1.707142949104309, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7160714641213417, "rewards/format_reward_func": 0.9910714328289032, "step": 430 }, { "completion_length": 251.34822368621826, "epoch": 0.07242549981139193, "grad_norm": 0.1560573542361731, "kl": 0.0003288388252258301, "learning_rate": 5.3333333333333334e-08, "loss": 0.0, "reward": 1.742857202887535, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 432 }, { "completion_length": 247.5044755935669, "epoch": 0.07276080305125948, "grad_norm": 0.30241987654645447, "kl": 0.0003216266632080078, "learning_rate": 5.358024691358024e-08, "loss": 0.0, "reward": 1.7660715207457542, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7705357484519482, "rewards/format_reward_func": 0.9955357164144516, "step": 434 }, { "completion_length": 246.5312614440918, "epoch": 0.07309610629112703, "grad_norm": 0.33054557128364087, "kl": 0.0003314018249511719, "learning_rate": 5.382716049382716e-08, "loss": 0.0, "reward": 1.7250000536441803, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7339286021888256, "rewards/format_reward_func": 0.9910714328289032, "step": 436 }, { "completion_length": 243.68304920196533, "epoch": 0.0734314095309946, "grad_norm": 0.19851593896021988, "kl": 0.00032967329025268555, "learning_rate": 5.407407407407407e-08, "loss": 0.0, "reward": 1.7107143625617027, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7107143215835094, "rewards/format_reward_func": 1.0, "step": 438 }, { "completion_length": 249.64287185668945, "epoch": 0.07376671277086215, "grad_norm": 0.1713552603952388, "kl": 0.0003451108932495117, "learning_rate": 5.432098765432099e-08, "loss": 0.0, "reward": 1.7750000357627869, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7839285992085934, "rewards/format_reward_func": 0.9910714328289032, "step": 440 }, { "completion_length": 240.55358409881592, "epoch": 0.0741020160107297, "grad_norm": 0.21027297014803956, "kl": 0.000316619873046875, "learning_rate": 5.45679012345679e-08, "loss": 0.0, "reward": 1.7821429297327995, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428962051868, "rewards/format_reward_func": 1.0, "step": 442 }, { "completion_length": 245.67411708831787, "epoch": 0.07443731925059725, "grad_norm": 0.33966947103680717, "kl": 0.00033915042877197266, "learning_rate": 5.481481481481482e-08, "loss": 0.0, "reward": 1.7660714983940125, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7705357447266579, "rewards/format_reward_func": 0.9955357164144516, "step": 444 }, { "completion_length": 254.05358028411865, "epoch": 0.07477262249046482, "grad_norm": 0.4185328389589341, "kl": 0.00033539533615112305, "learning_rate": 5.5061728395061725e-08, "loss": 0.0, "reward": 1.6982143595814705, "reward_std": 0.09848987031728029, "rewards/equation_reward_func": 0.7116071805357933, "rewards/format_reward_func": 0.9866071492433548, "step": 446 }, { "completion_length": 235.8437614440918, "epoch": 0.07510792573033237, "grad_norm": 0.31327344899543313, "kl": 0.00031751394271850586, "learning_rate": 5.530864197530864e-08, "loss": 0.0, "reward": 1.785714365541935, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7857143245637417, "rewards/format_reward_func": 1.0, "step": 448 }, { "completion_length": 244.25894260406494, "epoch": 0.07544322897019992, "grad_norm": 0.168519024379435, "kl": 0.00033473968505859375, "learning_rate": 5.555555555555555e-08, "loss": 0.0, "reward": 1.7428572103381157, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 450 }, { "completion_length": 241.50447463989258, "epoch": 0.07577853221006747, "grad_norm": 0.2579852036649518, "kl": 0.0003389120101928711, "learning_rate": 5.580246913580246e-08, "loss": 0.0, "reward": 1.7750000730156898, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7750000394880772, "rewards/format_reward_func": 1.0, "step": 452 }, { "completion_length": 246.60715103149414, "epoch": 0.07611383544993504, "grad_norm": 0.2946907618719622, "kl": 0.0003083944320678711, "learning_rate": 5.604938271604938e-08, "loss": 0.0, "reward": 1.7642857804894447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 454 }, { "completion_length": 243.24554634094238, "epoch": 0.07644913868980259, "grad_norm": 0.4449621858359876, "kl": 0.0003170967102050781, "learning_rate": 5.629629629629629e-08, "loss": 0.0, "reward": 1.7464286386966705, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.746428607031703, "rewards/format_reward_func": 1.0, "step": 456 }, { "completion_length": 244.9017972946167, "epoch": 0.07678444192967014, "grad_norm": 0.27342841301104115, "kl": 0.00032132863998413086, "learning_rate": 5.654320987654321e-08, "loss": 0.0, "reward": 1.778571479022503, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 458 }, { "completion_length": 246.16518783569336, "epoch": 0.0771197451695377, "grad_norm": 0.19558656358425155, "kl": 0.0003364682197570801, "learning_rate": 5.679012345679012e-08, "loss": 0.0, "reward": 1.7714286297559738, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7803571783006191, "rewards/format_reward_func": 0.9910714328289032, "step": 460 }, { "completion_length": 241.52233123779297, "epoch": 0.07745504840940526, "grad_norm": 0.169838691432525, "kl": 0.00031113624572753906, "learning_rate": 5.703703703703704e-08, "loss": 0.0, "reward": 1.7803572416305542, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7848214581608772, "rewards/format_reward_func": 0.9955357164144516, "step": 462 }, { "completion_length": 253.54019165039062, "epoch": 0.07779035164927281, "grad_norm": 0.19518278420827403, "kl": 0.00032633543014526367, "learning_rate": 5.728395061728395e-08, "loss": 0.0, "reward": 1.7000001072883606, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7089285887777805, "rewards/format_reward_func": 0.9910714328289032, "step": 464 }, { "completion_length": 236.49554634094238, "epoch": 0.07812565488914036, "grad_norm": 0.21490568594797874, "kl": 0.00033402442932128906, "learning_rate": 5.7530864197530866e-08, "loss": 0.0, "reward": 1.8160714954137802, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.8205357305705547, "rewards/format_reward_func": 0.9955357164144516, "step": 466 }, { "completion_length": 244.2857255935669, "epoch": 0.07846095812900793, "grad_norm": 0.11754674459427326, "kl": 0.00033402442932128906, "learning_rate": 5.7777777777777775e-08, "loss": 0.0, "reward": 1.8017857447266579, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.8062500320374966, "rewards/format_reward_func": 0.9955357164144516, "step": 468 }, { "completion_length": 244.0848331451416, "epoch": 0.07879626136887548, "grad_norm": 0.23226873868439502, "kl": 0.00031507015228271484, "learning_rate": 5.802469135802469e-08, "loss": 0.0, "reward": 1.70000009983778, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7089286036789417, "rewards/format_reward_func": 0.9910714328289032, "step": 470 }, { "completion_length": 252.51340770721436, "epoch": 0.07913156460874303, "grad_norm": 0.2389300517769942, "kl": 0.00030809640884399414, "learning_rate": 5.8271604938271604e-08, "loss": 0.0, "reward": 1.7339286506175995, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7562500387430191, "rewards/format_reward_func": 0.9776785783469677, "step": 472 }, { "completion_length": 258.8705463409424, "epoch": 0.07946686784861058, "grad_norm": 0.21870083869491905, "kl": 0.00032639503479003906, "learning_rate": 5.851851851851851e-08, "loss": 0.0, "reward": 1.6946429386734962, "reward_std": 0.09848987124860287, "rewards/equation_reward_func": 0.7080357503145933, "rewards/format_reward_func": 0.9866071492433548, "step": 474 }, { "completion_length": 240.48661994934082, "epoch": 0.07980217108847815, "grad_norm": 0.22507093576887516, "kl": 0.00033414363861083984, "learning_rate": 5.876543209876543e-08, "loss": 0.0, "reward": 1.7571429088711739, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428772062063, "rewards/format_reward_func": 1.0, "step": 476 }, { "completion_length": 249.78572750091553, "epoch": 0.0801374743283457, "grad_norm": 0.2081506142291193, "kl": 0.000325620174407959, "learning_rate": 5.901234567901234e-08, "loss": 0.0, "reward": 1.7232143580913544, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7276785969734192, "rewards/format_reward_func": 0.9955357164144516, "step": 478 }, { "completion_length": 248.21875762939453, "epoch": 0.08047277756821325, "grad_norm": 0.29807364642733086, "kl": 0.0003132820129394531, "learning_rate": 5.925925925925926e-08, "loss": 0.0, "reward": 1.6535715386271477, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.6535714566707611, "rewards/format_reward_func": 1.0, "step": 480 }, { "completion_length": 243.1919765472412, "epoch": 0.08080808080808081, "grad_norm": 0.20914536774706063, "kl": 0.000324249267578125, "learning_rate": 5.950617283950617e-08, "loss": 0.0, "reward": 1.7928571924567223, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 482 }, { "completion_length": 247.24554443359375, "epoch": 0.08114338404794837, "grad_norm": 0.235749099937035, "kl": 0.00035053491592407227, "learning_rate": 5.975308641975308e-08, "loss": 0.0, "reward": 1.7571429312229156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 484 }, { "completion_length": 234.6830472946167, "epoch": 0.08147868728781592, "grad_norm": 0.28977720166304766, "kl": 0.0003306269645690918, "learning_rate": 6e-08, "loss": 0.0, "reward": 1.7428572252392769, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 486 }, { "completion_length": 249.5044755935669, "epoch": 0.08181399052768347, "grad_norm": 0.26207066498924353, "kl": 0.00032585859298706055, "learning_rate": 6.024691358024691e-08, "loss": 0.0, "reward": 1.716071479022503, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.729464327916503, "rewards/format_reward_func": 0.9866071492433548, "step": 488 }, { "completion_length": 247.4196548461914, "epoch": 0.08214929376755103, "grad_norm": 0.18458472700771195, "kl": 0.00031828880310058594, "learning_rate": 6.049382716049382e-08, "loss": 0.0, "reward": 1.776785783469677, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7812500298023224, "rewards/format_reward_func": 0.9955357164144516, "step": 490 }, { "completion_length": 235.79018783569336, "epoch": 0.08248459700741859, "grad_norm": 0.2414137740023195, "kl": 0.00032210350036621094, "learning_rate": 6.074074074074074e-08, "loss": 0.0, "reward": 1.7892857566475868, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7892857510596514, "rewards/format_reward_func": 1.0, "step": 492 }, { "completion_length": 244.3482255935669, "epoch": 0.08281990024728614, "grad_norm": 0.223221207261264, "kl": 0.00033855438232421875, "learning_rate": 6.098765432098765e-08, "loss": 0.0, "reward": 1.721428669989109, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7214285992085934, "rewards/format_reward_func": 1.0, "step": 494 }, { "completion_length": 250.5089406967163, "epoch": 0.08315520348715369, "grad_norm": 0.16481525246585177, "kl": 0.0003097057342529297, "learning_rate": 6.123456790123457e-08, "loss": 0.0, "reward": 1.7410714849829674, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7544643208384514, "rewards/format_reward_func": 0.9866071492433548, "step": 496 }, { "completion_length": 243.70090198516846, "epoch": 0.08349050672702125, "grad_norm": 0.23407503260150778, "kl": 0.000316619873046875, "learning_rate": 6.148148148148148e-08, "loss": 0.0, "reward": 1.7178572192788124, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7178571745753288, "rewards/format_reward_func": 1.0, "step": 498 }, { "completion_length": 251.65625953674316, "epoch": 0.0838258099668888, "grad_norm": 0.21363418666430034, "kl": 0.00032788515090942383, "learning_rate": 6.172839506172839e-08, "loss": 0.0, "reward": 1.7857143506407738, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143245637417, "rewards/format_reward_func": 1.0, "step": 500 }, { "completion_length": 234.915189743042, "epoch": 0.08416111320675636, "grad_norm": 0.2728153197023007, "kl": 0.00033909082412719727, "learning_rate": 6.19753086419753e-08, "loss": 0.0, "reward": 1.7392858266830444, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7392857484519482, "rewards/format_reward_func": 1.0, "step": 502 }, { "completion_length": 245.33483409881592, "epoch": 0.08449641644662391, "grad_norm": 0.26575297012927046, "kl": 0.00031566619873046875, "learning_rate": 6.222222222222221e-08, "loss": 0.0, "reward": 1.7839286252856255, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7883928809314966, "rewards/format_reward_func": 0.9955357164144516, "step": 504 }, { "completion_length": 248.352689743042, "epoch": 0.08483171968649147, "grad_norm": 0.1380297498926032, "kl": 0.0003465414047241211, "learning_rate": 6.246913580246913e-08, "loss": 0.0, "reward": 1.785714365541935, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143077999353, "rewards/format_reward_func": 1.0, "step": 506 }, { "completion_length": 241.34375858306885, "epoch": 0.08516702292635903, "grad_norm": 0.2441522470456551, "kl": 0.00032442808151245117, "learning_rate": 6.271604938271604e-08, "loss": 0.0, "reward": 1.785714365541935, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7857143059372902, "rewards/format_reward_func": 1.0, "step": 508 }, { "completion_length": 246.17412090301514, "epoch": 0.08550232616622658, "grad_norm": 0.21561174473883762, "kl": 0.0003211498260498047, "learning_rate": 6.296296296296296e-08, "loss": 0.0, "reward": 1.7250000685453415, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7250000294297934, "rewards/format_reward_func": 1.0, "step": 510 }, { "completion_length": 248.7991180419922, "epoch": 0.08583762940609414, "grad_norm": 0.1975704900557329, "kl": 0.00030416250228881836, "learning_rate": 6.320987654320987e-08, "loss": 0.0, "reward": 1.79464291036129, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7991071753203869, "rewards/format_reward_func": 0.9955357164144516, "step": 512 }, { "completion_length": 244.66518878936768, "epoch": 0.0861729326459617, "grad_norm": 0.31873917088865067, "kl": 0.0003369450569152832, "learning_rate": 6.345679012345679e-08, "loss": 0.0, "reward": 1.6946429535746574, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.6991071738302708, "rewards/format_reward_func": 0.9955357164144516, "step": 514 }, { "completion_length": 249.6785831451416, "epoch": 0.08650823588582925, "grad_norm": 0.1803545586351522, "kl": 0.00034105777740478516, "learning_rate": 6.37037037037037e-08, "loss": 0.0, "reward": 1.796428620815277, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964285872876644, "rewards/format_reward_func": 1.0, "step": 516 }, { "completion_length": 238.2366180419922, "epoch": 0.0868435391256968, "grad_norm": 0.19465113178922888, "kl": 0.0003395676612854004, "learning_rate": 6.39506172839506e-08, "loss": 0.0, "reward": 1.803571492433548, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 518 }, { "completion_length": 243.0982255935669, "epoch": 0.08717884236556436, "grad_norm": 0.19674061927229855, "kl": 0.00034350156784057617, "learning_rate": 6.419753086419753e-08, "loss": 0.0, "reward": 1.7714286223053932, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714286055415869, "rewards/format_reward_func": 1.0, "step": 520 }, { "completion_length": 248.9866180419922, "epoch": 0.08751414560543191, "grad_norm": 0.282739041232047, "kl": 0.00033104419708251953, "learning_rate": 6.444444444444443e-08, "loss": 0.0, "reward": 1.7089286670088768, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7133928947150707, "rewards/format_reward_func": 0.9955357164144516, "step": 522 }, { "completion_length": 237.4553689956665, "epoch": 0.08784944884529947, "grad_norm": 0.2913300631224223, "kl": 0.00030612945556640625, "learning_rate": 6.469135802469136e-08, "loss": 0.0, "reward": 1.7995536252856255, "reward_std": 0.06124049657955766, "rewards/equation_reward_func": 0.8026785850524902, "rewards/format_reward_func": 0.9968750029802322, "step": 524 }, { "completion_length": 248.6517972946167, "epoch": 0.08818475208516702, "grad_norm": 0.2988093886621688, "kl": 0.0003228187561035156, "learning_rate": 6.493827160493826e-08, "loss": 0.0, "reward": 1.7535714954137802, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 526 }, { "completion_length": 247.02233505249023, "epoch": 0.08852005532503458, "grad_norm": 0.31123073954807806, "kl": 0.00034630298614501953, "learning_rate": 6.518518518518518e-08, "loss": 0.0, "reward": 1.7321429252624512, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7321428917348385, "rewards/format_reward_func": 1.0, "step": 528 }, { "completion_length": 240.74108123779297, "epoch": 0.08885535856490213, "grad_norm": 0.2844834020779769, "kl": 0.0003533363342285156, "learning_rate": 6.54320987654321e-08, "loss": 0.0, "reward": 1.8178571909666061, "reward_std": 0.0858629634603858, "rewards/equation_reward_func": 0.817857164889574, "rewards/format_reward_func": 1.0, "step": 530 }, { "completion_length": 247.12947845458984, "epoch": 0.08919066180476969, "grad_norm": 0.3683831687012005, "kl": 0.00037026405334472656, "learning_rate": 6.567901234567901e-08, "loss": 0.0, "reward": 1.7214286252856255, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7392857503145933, "rewards/format_reward_func": 0.9821428656578064, "step": 532 }, { "completion_length": 239.91965293884277, "epoch": 0.08952596504463724, "grad_norm": 0.23647114042655057, "kl": 0.00031244754791259766, "learning_rate": 6.592592592592592e-08, "loss": 0.0, "reward": 1.737500049173832, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419643066823483, "rewards/format_reward_func": 0.9955357164144516, "step": 534 }, { "completion_length": 239.70537090301514, "epoch": 0.0898612682845048, "grad_norm": 0.2744914568098237, "kl": 0.00034332275390625, "learning_rate": 6.617283950617284e-08, "loss": 0.0, "reward": 1.7750000730156898, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 536 }, { "completion_length": 235.54465579986572, "epoch": 0.09019657152437235, "grad_norm": 0.24470360924408466, "kl": 0.00032085180282592773, "learning_rate": 6.641975308641975e-08, "loss": 0.0, "reward": 1.7714286521077156, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7714285925030708, "rewards/format_reward_func": 1.0, "step": 538 }, { "completion_length": 252.58929634094238, "epoch": 0.0905318747642399, "grad_norm": 0.19399993162010581, "kl": 0.0003203749656677246, "learning_rate": 6.666666666666667e-08, "loss": 0.0, "reward": 1.7142857536673546, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7142857536673546, "rewards/format_reward_func": 1.0, "step": 540 }, { "completion_length": 242.52679443359375, "epoch": 0.09086717800410747, "grad_norm": 0.1805765854243748, "kl": 0.0003228187561035156, "learning_rate": 6.691358024691358e-08, "loss": 0.0, "reward": 1.803571492433548, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.803571455180645, "rewards/format_reward_func": 1.0, "step": 542 }, { "completion_length": 250.33930015563965, "epoch": 0.09120248124397502, "grad_norm": 0.2510144798661156, "kl": 0.00032061338424682617, "learning_rate": 6.71604938271605e-08, "loss": 0.0, "reward": 1.753571517765522, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7625000290572643, "rewards/format_reward_func": 0.9910714328289032, "step": 544 }, { "completion_length": 248.883939743042, "epoch": 0.09153778448384257, "grad_norm": 0.2655297568235366, "kl": 0.0003451108932495117, "learning_rate": 6.74074074074074e-08, "loss": 0.0, "reward": 1.751785784959793, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500312924385, "rewards/format_reward_func": 0.9955357164144516, "step": 546 }, { "completion_length": 249.2009038925171, "epoch": 0.09187308772371013, "grad_norm": 0.26878506330376745, "kl": 0.0003330707550048828, "learning_rate": 6.765432098765431e-08, "loss": 0.0, "reward": 1.7482143640518188, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7526786029338837, "rewards/format_reward_func": 0.9955357164144516, "step": 548 }, { "completion_length": 237.3348331451416, "epoch": 0.09220839096357769, "grad_norm": 0.2253938894358309, "kl": 0.00033724308013916016, "learning_rate": 6.790123456790123e-08, "loss": 0.0, "reward": 1.757142923772335, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571429051458836, "rewards/format_reward_func": 1.0, "step": 550 }, { "completion_length": 251.540189743042, "epoch": 0.09254369420344524, "grad_norm": 0.22765833106871078, "kl": 0.00031238794326782227, "learning_rate": 6.814814814814814e-08, "loss": 0.0, "reward": 1.762500062584877, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7669643145054579, "rewards/format_reward_func": 0.9955357164144516, "step": 552 }, { "completion_length": 243.71876430511475, "epoch": 0.0928789974433128, "grad_norm": 0.2972380601698253, "kl": 0.00034296512603759766, "learning_rate": 6.839506172839506e-08, "loss": 0.0, "reward": 1.7625000700354576, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7669643238186836, "rewards/format_reward_func": 0.9955357164144516, "step": 554 }, { "completion_length": 241.85269260406494, "epoch": 0.09321430068318035, "grad_norm": 0.18903812105876086, "kl": 0.0003712177276611328, "learning_rate": 6.864197530864197e-08, "loss": 0.0, "reward": 1.7821429148316383, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428999304771, "rewards/format_reward_func": 1.0, "step": 556 }, { "completion_length": 240.98661708831787, "epoch": 0.09354960392304791, "grad_norm": 0.2808854496545026, "kl": 0.00033342838287353516, "learning_rate": 6.888888888888889e-08, "loss": 0.0, "reward": 1.7535715028643608, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714488476515, "rewards/format_reward_func": 1.0, "step": 558 }, { "completion_length": 247.7678680419922, "epoch": 0.09388490716291546, "grad_norm": 0.2433063193601212, "kl": 0.00035250186920166016, "learning_rate": 6.913580246913579e-08, "loss": 0.0, "reward": 1.7250000834465027, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7250000312924385, "rewards/format_reward_func": 1.0, "step": 560 }, { "completion_length": 239.64733219146729, "epoch": 0.09422021040278301, "grad_norm": 0.2328401577713564, "kl": 0.00031560659408569336, "learning_rate": 6.938271604938272e-08, "loss": 0.0, "reward": 1.793750062584877, "reward_std": 0.03914341004565358, "rewards/equation_reward_func": 0.795535746961832, "rewards/format_reward_func": 0.9982142895460129, "step": 562 }, { "completion_length": 243.7321548461914, "epoch": 0.09455551364265057, "grad_norm": 0.27133592508850835, "kl": 0.00033986568450927734, "learning_rate": 6.962962962962962e-08, "loss": 0.0, "reward": 1.8071429282426834, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8071428686380386, "rewards/format_reward_func": 1.0, "step": 564 }, { "completion_length": 255.19197368621826, "epoch": 0.09489081688251813, "grad_norm": 0.2894428896849047, "kl": 0.0003210902214050293, "learning_rate": 6.987654320987655e-08, "loss": 0.0, "reward": 1.7375000789761543, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.74196432903409, "rewards/format_reward_func": 0.9955357164144516, "step": 566 }, { "completion_length": 256.6250114440918, "epoch": 0.09522612012238568, "grad_norm": 0.24724991177580125, "kl": 0.00033986568450927734, "learning_rate": 7.012345679012345e-08, "loss": 0.0, "reward": 1.721428632736206, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7214285992085934, "rewards/format_reward_func": 1.0, "step": 568 }, { "completion_length": 246.13840103149414, "epoch": 0.09556142336225323, "grad_norm": 0.27228424645528715, "kl": 0.0003192424774169922, "learning_rate": 7.037037037037038e-08, "loss": 0.0, "reward": 1.778571479022503, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 570 }, { "completion_length": 240.89286422729492, "epoch": 0.0958967266021208, "grad_norm": 0.17697003476147488, "kl": 0.0003370046615600586, "learning_rate": 7.061728395061728e-08, "loss": 0.0, "reward": 1.8071429133415222, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8071428798139095, "rewards/format_reward_func": 1.0, "step": 572 }, { "completion_length": 245.89286613464355, "epoch": 0.09623202984198835, "grad_norm": 0.23976461206515592, "kl": 0.0003515481948852539, "learning_rate": 7.08641975308642e-08, "loss": 0.0, "reward": 1.7607143744826317, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.760714303702116, "rewards/format_reward_func": 1.0, "step": 574 }, { "completion_length": 257.33483505249023, "epoch": 0.0965673330818559, "grad_norm": 0.24968865023102368, "kl": 0.00032526254653930664, "learning_rate": 7.111111111111111e-08, "loss": 0.0, "reward": 1.7696429342031479, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7741071581840515, "rewards/format_reward_func": 0.9955357164144516, "step": 576 }, { "completion_length": 236.85715579986572, "epoch": 0.09690263632172345, "grad_norm": 0.19104595177042635, "kl": 0.00033676624298095703, "learning_rate": 7.135802469135801e-08, "loss": 0.0, "reward": 1.8535714894533157, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8535714372992516, "rewards/format_reward_func": 1.0, "step": 578 }, { "completion_length": 250.72322750091553, "epoch": 0.09723793956159102, "grad_norm": 0.26928102942838894, "kl": 0.00033277273178100586, "learning_rate": 7.160493827160494e-08, "loss": 0.0, "reward": 1.7857143506407738, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 580 }, { "completion_length": 250.25001049041748, "epoch": 0.09757324280145857, "grad_norm": 0.2822786859772726, "kl": 0.0003406405448913574, "learning_rate": 7.185185185185184e-08, "loss": 0.0, "reward": 1.7553572058677673, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7598214596509933, "rewards/format_reward_func": 0.9955357164144516, "step": 582 }, { "completion_length": 231.52679824829102, "epoch": 0.09790854604132612, "grad_norm": 0.3094974337963821, "kl": 0.00034546852111816406, "learning_rate": 7.209876543209877e-08, "loss": 0.0, "reward": 1.7392857745289803, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7482143137603998, "rewards/format_reward_func": 0.9910714328289032, "step": 584 }, { "completion_length": 245.5134048461914, "epoch": 0.09824384928119367, "grad_norm": 0.29343542630310954, "kl": 0.0003381967544555664, "learning_rate": 7.234567901234567e-08, "loss": 0.0, "reward": 1.7714286372065544, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 586 }, { "completion_length": 246.6384038925171, "epoch": 0.09857915252106124, "grad_norm": 0.18806855527665864, "kl": 0.00032448768615722656, "learning_rate": 7.25925925925926e-08, "loss": 0.0, "reward": 1.7785715013742447, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 588 }, { "completion_length": 251.34822940826416, "epoch": 0.09891445576092879, "grad_norm": 0.2674941278667219, "kl": 0.00032722949981689453, "learning_rate": 7.28395061728395e-08, "loss": 0.0, "reward": 1.667857214808464, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.6678571924567223, "rewards/format_reward_func": 1.0, "step": 590 }, { "completion_length": 239.42858123779297, "epoch": 0.09924975900079634, "grad_norm": 0.17254025141382712, "kl": 0.0003591179847717285, "learning_rate": 7.308641975308643e-08, "loss": 0.0, "reward": 1.7892857789993286, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857491970062, "rewards/format_reward_func": 1.0, "step": 592 }, { "completion_length": 247.8303689956665, "epoch": 0.0995850622406639, "grad_norm": 0.3699325611286679, "kl": 0.0003343820571899414, "learning_rate": 7.333333333333333e-08, "loss": 0.0, "reward": 1.7500000819563866, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 594 }, { "completion_length": 256.76340103149414, "epoch": 0.09992036548053146, "grad_norm": 0.3237781294586503, "kl": 0.00031179189682006836, "learning_rate": 7.358024691358024e-08, "loss": 0.0, "reward": 1.7428572252392769, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7428571823984385, "rewards/format_reward_func": 1.0, "step": 596 }, { "completion_length": 247.66965579986572, "epoch": 0.10025566872039901, "grad_norm": 0.3258600776488885, "kl": 0.0003464221954345703, "learning_rate": 7.382716049382716e-08, "loss": 0.0, "reward": 1.6928571984171867, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7017857357859612, "rewards/format_reward_func": 0.9910714328289032, "step": 598 }, { "completion_length": 249.6250114440918, "epoch": 0.10059097196026656, "grad_norm": 0.261689626399354, "kl": 0.0003151893615722656, "learning_rate": 7.407407407407407e-08, "loss": 0.0, "reward": 1.7535714954137802, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714488476515, "rewards/format_reward_func": 1.0, "step": 600 }, { "completion_length": 242.17858123779297, "epoch": 0.10092627520013413, "grad_norm": 0.24581423709483807, "kl": 0.00032001733779907227, "learning_rate": 7.432098765432099e-08, "loss": 0.0, "reward": 1.7642857730388641, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 602 }, { "completion_length": 248.79912090301514, "epoch": 0.10126157844000168, "grad_norm": 0.3911909271722524, "kl": 0.0003205537796020508, "learning_rate": 7.456790123456789e-08, "loss": 0.0, "reward": 1.7214286774396896, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7303571663796902, "rewards/format_reward_func": 0.9910714328289032, "step": 604 }, { "completion_length": 240.64286994934082, "epoch": 0.10159688167986923, "grad_norm": 0.1934736455480886, "kl": 0.0003267526626586914, "learning_rate": 7.481481481481482e-08, "loss": 0.0, "reward": 1.7910714820027351, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7955357544124126, "rewards/format_reward_func": 0.9955357164144516, "step": 606 }, { "completion_length": 244.6160831451416, "epoch": 0.10193218491973678, "grad_norm": 0.22264172706402283, "kl": 0.00035625696182250977, "learning_rate": 7.506172839506172e-08, "loss": 0.0, "reward": 1.796428620815277, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 608 }, { "completion_length": 238.7232255935669, "epoch": 0.10226748815960435, "grad_norm": 0.1806141118501612, "kl": 0.00031882524490356445, "learning_rate": 7.530864197530865e-08, "loss": 0.0, "reward": 1.7785714864730835, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7875000238418579, "rewards/format_reward_func": 0.9910714328289032, "step": 610 }, { "completion_length": 239.7678689956665, "epoch": 0.1026027913994719, "grad_norm": 0.17213793905290295, "kl": 0.00036013126373291016, "learning_rate": 7.555555555555555e-08, "loss": 0.0, "reward": 1.6964286267757416, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.6964286155998707, "rewards/format_reward_func": 1.0, "step": 612 }, { "completion_length": 257.57590770721436, "epoch": 0.10293809463933945, "grad_norm": 0.3303583332562279, "kl": 0.0003082752227783203, "learning_rate": 7.580246913580246e-08, "loss": 0.0, "reward": 1.723214365541935, "reward_std": 0.09848987031728029, "rewards/equation_reward_func": 0.7276786044239998, "rewards/format_reward_func": 0.9955357164144516, "step": 614 }, { "completion_length": 238.5312614440918, "epoch": 0.103273397879207, "grad_norm": 0.30225920527212524, "kl": 0.000339508056640625, "learning_rate": 7.604938271604938e-08, "loss": 0.0, "reward": 1.7571429386734962, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428716182709, "rewards/format_reward_func": 1.0, "step": 616 }, { "completion_length": 237.79911994934082, "epoch": 0.10360870111907457, "grad_norm": 0.1956188765585153, "kl": 0.00033271312713623047, "learning_rate": 7.629629629629629e-08, "loss": 0.0, "reward": 1.7821429148316383, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 618 }, { "completion_length": 243.6651906967163, "epoch": 0.10394400435894212, "grad_norm": 0.21769077201626216, "kl": 0.0003572702407836914, "learning_rate": 7.654320987654321e-08, "loss": 0.0, "reward": 1.76071435213089, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143074274063, "rewards/format_reward_func": 1.0, "step": 620 }, { "completion_length": 252.90626335144043, "epoch": 0.10427930759880967, "grad_norm": 0.2545100315186332, "kl": 0.00031685829162597656, "learning_rate": 7.679012345679012e-08, "loss": 0.0, "reward": 1.7535715028643608, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7625000402331352, "rewards/format_reward_func": 0.9910714328289032, "step": 622 }, { "completion_length": 247.33929634094238, "epoch": 0.10461461083867724, "grad_norm": 0.3575182728529308, "kl": 0.0003485679626464844, "learning_rate": 7.703703703703704e-08, "loss": 0.0, "reward": 1.7625000774860382, "reward_std": 0.09343910776078701, "rewards/equation_reward_func": 0.766964316368103, "rewards/format_reward_func": 0.9955357164144516, "step": 624 }, { "completion_length": 256.81251525878906, "epoch": 0.10494991407854479, "grad_norm": 0.2494622615449293, "kl": 0.00032192468643188477, "learning_rate": 7.728395061728395e-08, "loss": 0.0, "reward": 1.7964286357164383, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 626 }, { "completion_length": 245.0357265472412, "epoch": 0.10528521731841234, "grad_norm": 0.22256146612232233, "kl": 0.00034034252166748047, "learning_rate": 7.753086419753085e-08, "loss": 0.0, "reward": 1.6964286863803864, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.6964286006987095, "rewards/format_reward_func": 1.0, "step": 628 }, { "completion_length": 245.7634048461914, "epoch": 0.10562052055827989, "grad_norm": 0.2509199441030853, "kl": 0.00033855438232421875, "learning_rate": 7.777777777777778e-08, "loss": 0.0, "reward": 1.7732143476605415, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7776785977184772, "rewards/format_reward_func": 0.9955357164144516, "step": 630 }, { "completion_length": 252.70090675354004, "epoch": 0.10595582379814746, "grad_norm": 0.2160039639235301, "kl": 0.0003249645233154297, "learning_rate": 7.802469135802468e-08, "loss": 0.0, "reward": 1.7446429207921028, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7491071820259094, "rewards/format_reward_func": 0.9955357164144516, "step": 632 }, { "completion_length": 244.58929538726807, "epoch": 0.10629112703801501, "grad_norm": 0.2032292563867832, "kl": 0.00032579898834228516, "learning_rate": 7.82716049382716e-08, "loss": 0.0, "reward": 1.7160714864730835, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7205357402563095, "rewards/format_reward_func": 0.9955357164144516, "step": 634 }, { "completion_length": 248.6026906967163, "epoch": 0.10662643027788256, "grad_norm": 0.27863712660799594, "kl": 0.00033843517303466797, "learning_rate": 7.851851851851851e-08, "loss": 0.0, "reward": 1.717857226729393, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.717857176437974, "rewards/format_reward_func": 1.0, "step": 636 }, { "completion_length": 245.8750114440918, "epoch": 0.10696173351775011, "grad_norm": 0.26748447053290475, "kl": 0.0003311634063720703, "learning_rate": 7.876543209876543e-08, "loss": 0.0, "reward": 1.8107143342494965, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8107143044471741, "rewards/format_reward_func": 1.0, "step": 638 }, { "completion_length": 237.75000953674316, "epoch": 0.10729703675761768, "grad_norm": 0.378121476129134, "kl": 0.00033354759216308594, "learning_rate": 7.901234567901234e-08, "loss": 0.0, "reward": 1.7196429446339607, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.724107176065445, "rewards/format_reward_func": 0.9955357164144516, "step": 640 }, { "completion_length": 254.9241189956665, "epoch": 0.10763233999748523, "grad_norm": 0.31986577676480604, "kl": 0.0003407001495361328, "learning_rate": 7.925925925925926e-08, "loss": 0.0, "reward": 1.7750000581145287, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7750000469386578, "rewards/format_reward_func": 1.0, "step": 642 }, { "completion_length": 251.55804824829102, "epoch": 0.10796764323735278, "grad_norm": 0.17222404932069518, "kl": 0.0003371238708496094, "learning_rate": 7.950617283950617e-08, "loss": 0.0, "reward": 1.7750000730156898, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7750000171363354, "rewards/format_reward_func": 1.0, "step": 644 }, { "completion_length": 238.65179920196533, "epoch": 0.10830294647722033, "grad_norm": 0.30833933519304063, "kl": 0.00034743547439575195, "learning_rate": 7.975308641975307e-08, "loss": 0.0, "reward": 1.7625000774860382, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7669643051922321, "rewards/format_reward_func": 0.9955357164144516, "step": 646 }, { "completion_length": 255.2500123977661, "epoch": 0.1086382497170879, "grad_norm": 0.2769674980707656, "kl": 0.00034427642822265625, "learning_rate": 8e-08, "loss": 0.0, "reward": 1.7053572237491608, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7098214589059353, "rewards/format_reward_func": 0.9955357164144516, "step": 648 }, { "completion_length": 240.78572750091553, "epoch": 0.10897355295695545, "grad_norm": 0.21938149745780547, "kl": 0.00034177303314208984, "learning_rate": 8.02469135802469e-08, "loss": 0.0, "reward": 1.7892857939004898, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 1.0, "step": 650 }, { "completion_length": 247.77233219146729, "epoch": 0.109308856196823, "grad_norm": 0.3524412693319839, "kl": 0.00033420324325561523, "learning_rate": 8.049382716049383e-08, "loss": 0.0, "reward": 1.7750000581145287, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7750000394880772, "rewards/format_reward_func": 1.0, "step": 652 }, { "completion_length": 239.45536994934082, "epoch": 0.10964415943669056, "grad_norm": 0.21691827702998728, "kl": 0.00033783912658691406, "learning_rate": 8.074074074074073e-08, "loss": 0.0, "reward": 1.762500062584877, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7669642977416515, "rewards/format_reward_func": 0.9955357164144516, "step": 654 }, { "completion_length": 237.54018878936768, "epoch": 0.10997946267655812, "grad_norm": 0.36721379041357677, "kl": 0.00033032894134521484, "learning_rate": 8.098765432098766e-08, "loss": 0.0, "reward": 1.7375000566244125, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7419643104076385, "rewards/format_reward_func": 0.9955357164144516, "step": 656 }, { "completion_length": 247.8303689956665, "epoch": 0.11031476591642567, "grad_norm": 0.22179464679431105, "kl": 0.0003440380096435547, "learning_rate": 8.123456790123456e-08, "loss": 0.0, "reward": 1.7375000715255737, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7419643178582191, "rewards/format_reward_func": 0.9955357164144516, "step": 658 }, { "completion_length": 245.22322463989258, "epoch": 0.11065006915629322, "grad_norm": 0.28631903396643277, "kl": 0.0003451108932495117, "learning_rate": 8.148148148148149e-08, "loss": 0.0, "reward": 1.7678572088479996, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.776785746216774, "rewards/format_reward_func": 0.9910714328289032, "step": 660 }, { "completion_length": 239.6026906967163, "epoch": 0.11098537239616078, "grad_norm": 0.23815337312005594, "kl": 0.0003368854522705078, "learning_rate": 8.172839506172839e-08, "loss": 0.0, "reward": 1.7571429312229156, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7571428827941418, "rewards/format_reward_func": 1.0, "step": 662 }, { "completion_length": 240.05804538726807, "epoch": 0.11132067563602834, "grad_norm": 0.00011184530104431835, "kl": 0.00032639503479003906, "learning_rate": 8.19753086419753e-08, "loss": 0.0, "reward": 1.7285715118050575, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7285714633762836, "rewards/format_reward_func": 1.0, "step": 664 }, { "completion_length": 238.4196548461914, "epoch": 0.11165597887589589, "grad_norm": 0.16343083171368733, "kl": 0.0003247857093811035, "learning_rate": 8.222222222222222e-08, "loss": 0.0, "reward": 1.7642857804894447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857581377029, "rewards/format_reward_func": 1.0, "step": 666 }, { "completion_length": 234.1919755935669, "epoch": 0.11199128211576344, "grad_norm": 0.2718751293232413, "kl": 0.0003414750099182129, "learning_rate": 8.246913580246912e-08, "loss": 0.0, "reward": 1.7535714954137802, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7625000402331352, "rewards/format_reward_func": 0.9910714328289032, "step": 668 }, { "completion_length": 247.2634048461914, "epoch": 0.112326585355631, "grad_norm": 0.44564106362192857, "kl": 0.0003719329833984375, "learning_rate": 8.271604938271605e-08, "loss": 0.0, "reward": 1.7732143625617027, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7776786051690578, "rewards/format_reward_func": 0.9955357164144516, "step": 670 }, { "completion_length": 233.86608123779297, "epoch": 0.11266188859549855, "grad_norm": 0.24836091057957915, "kl": 0.00035303831100463867, "learning_rate": 8.296296296296295e-08, "loss": 0.0, "reward": 1.7678571939468384, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571753203869, "rewards/format_reward_func": 1.0, "step": 672 }, { "completion_length": 250.02679634094238, "epoch": 0.1129971918353661, "grad_norm": 0.33629052891667766, "kl": 0.0003447532653808594, "learning_rate": 8.320987654320988e-08, "loss": 0.0, "reward": 1.7428571805357933, "reward_std": 0.10101525206118822, "rewards/equation_reward_func": 0.7517857626080513, "rewards/format_reward_func": 0.9910714328289032, "step": 674 }, { "completion_length": 244.0669755935669, "epoch": 0.11333249507523366, "grad_norm": 0.22850472179793171, "kl": 0.00035583972930908203, "learning_rate": 8.345679012345678e-08, "loss": 0.0, "reward": 1.7214286476373672, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7214286103844643, "rewards/format_reward_func": 1.0, "step": 676 }, { "completion_length": 242.03572750091553, "epoch": 0.11366779831510122, "grad_norm": 0.23854294689657238, "kl": 0.00033485889434814453, "learning_rate": 8.37037037037037e-08, "loss": 0.0, "reward": 1.776785783469677, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.781250037252903, "rewards/format_reward_func": 0.9955357164144516, "step": 678 }, { "completion_length": 245.96429443359375, "epoch": 0.11400310155496877, "grad_norm": 0.16755888824586532, "kl": 0.0003756284713745117, "learning_rate": 8.395061728395061e-08, "loss": 0.0, "reward": 1.7571429163217545, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428883820772, "rewards/format_reward_func": 1.0, "step": 680 }, { "completion_length": 233.83929347991943, "epoch": 0.11433840479483633, "grad_norm": 0.24174291075874793, "kl": 0.0003483295440673828, "learning_rate": 8.419753086419753e-08, "loss": 0.0, "reward": 1.7196429297327995, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7241071742027998, "rewards/format_reward_func": 0.9955357164144516, "step": 682 }, { "completion_length": 248.6160831451416, "epoch": 0.11467370803470389, "grad_norm": 0.22786130194654666, "kl": 0.00032138824462890625, "learning_rate": 8.444444444444444e-08, "loss": 0.0, "reward": 1.7500000670552254, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000223517418, "rewards/format_reward_func": 1.0, "step": 684 }, { "completion_length": 237.14733123779297, "epoch": 0.11500901127457144, "grad_norm": 0.20303655961874897, "kl": 0.00035130977630615234, "learning_rate": 8.469135802469136e-08, "loss": 0.0, "reward": 1.7642857804894447, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857581377029, "rewards/format_reward_func": 1.0, "step": 686 }, { "completion_length": 243.2767972946167, "epoch": 0.115344314514439, "grad_norm": 0.19912653343272865, "kl": 0.0003395676612854004, "learning_rate": 8.493827160493827e-08, "loss": 0.0, "reward": 1.7232143729925156, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7366071753203869, "rewards/format_reward_func": 0.9866071492433548, "step": 688 }, { "completion_length": 249.47322463989258, "epoch": 0.11567961775430655, "grad_norm": 0.2278184337257273, "kl": 0.0003470182418823242, "learning_rate": 8.518518518518517e-08, "loss": 0.0, "reward": 1.7312500551342964, "reward_std": 0.06692260666750371, "rewards/equation_reward_func": 0.7473214492201805, "rewards/format_reward_func": 0.9839285835623741, "step": 690 }, { "completion_length": 238.8169755935669, "epoch": 0.11601492099417411, "grad_norm": 0.17911647827573768, "kl": 0.0003763437271118164, "learning_rate": 8.54320987654321e-08, "loss": 0.0, "reward": 1.7714286223053932, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285850524902, "rewards/format_reward_func": 1.0, "step": 692 }, { "completion_length": 255.2366180419922, "epoch": 0.11635022423404166, "grad_norm": 0.2756423672346394, "kl": 0.0003293752670288086, "learning_rate": 8.5679012345679e-08, "loss": 0.0, "reward": 1.7410714998841286, "reward_std": 0.07323605939745903, "rewards/equation_reward_func": 0.7544643245637417, "rewards/format_reward_func": 0.9866071492433548, "step": 694 }, { "completion_length": 249.30804634094238, "epoch": 0.11668552747390921, "grad_norm": 0.2111679882859518, "kl": 0.00035190582275390625, "learning_rate": 8.592592592592592e-08, "loss": 0.0, "reward": 1.7196429073810577, "reward_std": 0.07323605939745903, "rewards/equation_reward_func": 0.7330357506871223, "rewards/format_reward_func": 0.9866071492433548, "step": 696 }, { "completion_length": 243.665189743042, "epoch": 0.11702083071377677, "grad_norm": 0.155534871523258, "kl": 0.00037992000579833984, "learning_rate": 8.617283950617283e-08, "loss": 0.0, "reward": 1.7196429446339607, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7241071686148643, "rewards/format_reward_func": 0.9955357164144516, "step": 698 }, { "completion_length": 251.8348331451416, "epoch": 0.11735613395364433, "grad_norm": 0.26079539156736914, "kl": 0.0003502368927001953, "learning_rate": 8.641975308641975e-08, "loss": 0.0, "reward": 1.712500087916851, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7258928939700127, "rewards/format_reward_func": 0.9866071492433548, "step": 700 }, { "completion_length": 242.1071538925171, "epoch": 0.11769143719351188, "grad_norm": 0.2277414630854397, "kl": 0.0003610849380493164, "learning_rate": 8.666666666666666e-08, "loss": 0.0, "reward": 1.8321429193019867, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.832142885774374, "rewards/format_reward_func": 1.0, "step": 702 }, { "completion_length": 253.36608409881592, "epoch": 0.11802674043337943, "grad_norm": 0.30555546254853067, "kl": 0.0003439188003540039, "learning_rate": 8.691358024691358e-08, "loss": 0.0, "reward": 1.7464286386966705, "reward_std": 0.0858629634603858, "rewards/equation_reward_func": 0.746428607031703, "rewards/format_reward_func": 1.0, "step": 704 }, { "completion_length": 239.258939743042, "epoch": 0.11836204367324699, "grad_norm": 0.2848334485064057, "kl": 0.0003402233123779297, "learning_rate": 8.716049382716049e-08, "loss": 0.0, "reward": 1.771428644657135, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 706 }, { "completion_length": 239.32590293884277, "epoch": 0.11869734691311455, "grad_norm": 0.21435664986931335, "kl": 0.00037920475006103516, "learning_rate": 8.74074074074074e-08, "loss": 0.0, "reward": 1.7696429193019867, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7741071879863739, "rewards/format_reward_func": 0.9955357164144516, "step": 708 }, { "completion_length": 243.0401906967163, "epoch": 0.1190326501529821, "grad_norm": 0.20084110899875415, "kl": 0.0003509521484375, "learning_rate": 8.765432098765432e-08, "loss": 0.0, "reward": 1.7660714909434319, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7705357521772385, "rewards/format_reward_func": 0.9955357164144516, "step": 710 }, { "completion_length": 251.1339406967163, "epoch": 0.11936795339284965, "grad_norm": 0.6091294566896917, "kl": 0.00038111209869384766, "learning_rate": 8.790123456790124e-08, "loss": 0.0, "reward": 1.7428571954369545, "reward_std": 0.09091372694820166, "rewards/equation_reward_func": 0.7517857626080513, "rewards/format_reward_func": 0.9910714328289032, "step": 712 }, { "completion_length": 246.12947463989258, "epoch": 0.11970325663271722, "grad_norm": 0.24858780124660707, "kl": 0.0003471970558166504, "learning_rate": 8.814814814814814e-08, "loss": 0.0, "reward": 1.7714286595582962, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 714 }, { "completion_length": 260.6696557998657, "epoch": 0.12003855987258477, "grad_norm": 0.3235796854238328, "kl": 0.00035631656646728516, "learning_rate": 8.839506172839507e-08, "loss": 0.0, "reward": 1.680357240140438, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7026786040514708, "rewards/format_reward_func": 0.977678582072258, "step": 716 }, { "completion_length": 244.77233219146729, "epoch": 0.12037386311245232, "grad_norm": 0.18663753240167508, "kl": 0.0003312826156616211, "learning_rate": 8.864197530864197e-08, "loss": 0.0, "reward": 1.8017857745289803, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8062500208616257, "rewards/format_reward_func": 0.9955357164144516, "step": 718 }, { "completion_length": 244.47768688201904, "epoch": 0.12070916635231987, "grad_norm": 0.2831141797933221, "kl": 0.000345766544342041, "learning_rate": 8.888888888888888e-08, "loss": 0.0, "reward": 1.7343750670552254, "reward_std": 0.06250318652018905, "rewards/equation_reward_func": 0.7357143349945545, "rewards/format_reward_func": 0.9986607171595097, "step": 720 }, { "completion_length": 233.08483028411865, "epoch": 0.12104446959218744, "grad_norm": 0.2972022109726578, "kl": 0.0003477931022644043, "learning_rate": 8.91358024691358e-08, "loss": 0.0, "reward": 1.7500000596046448, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 722 }, { "completion_length": 238.62054634094238, "epoch": 0.12137977283205499, "grad_norm": 0.39101243176433587, "kl": 0.00035965442657470703, "learning_rate": 8.938271604938271e-08, "loss": 0.0, "reward": 1.7321429327130318, "reward_std": 0.0858629634603858, "rewards/equation_reward_func": 0.7321428917348385, "rewards/format_reward_func": 1.0, "step": 724 }, { "completion_length": 246.7053680419922, "epoch": 0.12171507607192254, "grad_norm": 0.19475704922237347, "kl": 0.0003685951232910156, "learning_rate": 8.962962962962963e-08, "loss": 0.0, "reward": 1.7625000402331352, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7669643126428127, "rewards/format_reward_func": 0.9955357164144516, "step": 726 }, { "completion_length": 235.30357837677002, "epoch": 0.1220503793117901, "grad_norm": 0.22615944528910542, "kl": 0.0003758668899536133, "learning_rate": 8.987654320987654e-08, "loss": 0.0, "reward": 1.735714353621006, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7357143312692642, "rewards/format_reward_func": 1.0, "step": 728 }, { "completion_length": 243.78572750091553, "epoch": 0.12238568255165766, "grad_norm": 0.20370458125847893, "kl": 0.00036966800689697266, "learning_rate": 9.012345679012346e-08, "loss": 0.0, "reward": 1.791071504354477, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7955357432365417, "rewards/format_reward_func": 0.9955357164144516, "step": 730 }, { "completion_length": 241.9732255935669, "epoch": 0.12272098579152521, "grad_norm": 0.2427979605331861, "kl": 0.0003876686096191406, "learning_rate": 9.037037037037036e-08, "loss": 0.0, "reward": 1.8285714760422707, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8285714536905289, "rewards/format_reward_func": 1.0, "step": 732 }, { "completion_length": 242.00447273254395, "epoch": 0.12305628903139276, "grad_norm": 0.2729256204257734, "kl": 0.00039398670196533203, "learning_rate": 9.061728395061729e-08, "loss": 0.0, "reward": 1.8035714849829674, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8035714589059353, "rewards/format_reward_func": 1.0, "step": 734 }, { "completion_length": 238.852689743042, "epoch": 0.12339159227126031, "grad_norm": 0.12436679177862824, "kl": 0.0003916025161743164, "learning_rate": 9.086419753086419e-08, "loss": 0.0, "reward": 1.7357143461704254, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7357143126428127, "rewards/format_reward_func": 1.0, "step": 736 }, { "completion_length": 238.6830472946167, "epoch": 0.12372689551112788, "grad_norm": 0.22044509282884264, "kl": 0.00040662288665771484, "learning_rate": 9.111111111111112e-08, "loss": 0.0, "reward": 1.7535715103149414, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714730620384, "rewards/format_reward_func": 1.0, "step": 738 }, { "completion_length": 234.1473331451416, "epoch": 0.12406219875099543, "grad_norm": 0.10637798693708506, "kl": 0.00035452842712402344, "learning_rate": 9.135802469135802e-08, "loss": 0.0, "reward": 1.764285758137703, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857544124126, "rewards/format_reward_func": 1.0, "step": 740 }, { "completion_length": 247.75001049041748, "epoch": 0.12439750199086298, "grad_norm": 0.20902896319414058, "kl": 0.00039124488830566406, "learning_rate": 9.160493827160494e-08, "loss": 0.0, "reward": 1.7535714954137802, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714730620384, "rewards/format_reward_func": 1.0, "step": 742 }, { "completion_length": 247.42858028411865, "epoch": 0.12473280523073055, "grad_norm": 0.17218299822927055, "kl": 0.00037997961044311523, "learning_rate": 9.185185185185185e-08, "loss": 0.0, "reward": 1.7678572088479996, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571604192257, "rewards/format_reward_func": 1.0, "step": 744 }, { "completion_length": 242.24554634094238, "epoch": 0.1250681084705981, "grad_norm": 0.3318051559590746, "kl": 0.0003941059112548828, "learning_rate": 9.209876543209875e-08, "loss": 0.0, "reward": 1.7178572043776512, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7178571745753288, "rewards/format_reward_func": 1.0, "step": 746 }, { "completion_length": 235.47322463989258, "epoch": 0.12540341171046565, "grad_norm": 0.31337923764354797, "kl": 0.0003949403762817383, "learning_rate": 9.234567901234568e-08, "loss": 0.0, "reward": 1.7839286401867867, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 748 }, { "completion_length": 243.5000114440918, "epoch": 0.1257387149503332, "grad_norm": 0.17612462070677215, "kl": 0.0003739595413208008, "learning_rate": 9.259259259259258e-08, "loss": 0.0, "reward": 1.7696429044008255, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7741071805357933, "rewards/format_reward_func": 0.9955357164144516, "step": 750 }, { "completion_length": 251.64733028411865, "epoch": 0.12607401819020075, "grad_norm": 0.23943906846513874, "kl": 0.00038892030715942383, "learning_rate": 9.28395061728395e-08, "loss": 0.0, "reward": 1.74642863124609, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7464286126196384, "rewards/format_reward_func": 1.0, "step": 752 }, { "completion_length": 237.15625858306885, "epoch": 0.1264093214300683, "grad_norm": 0.2950467187688119, "kl": 0.00037169456481933594, "learning_rate": 9.308641975308641e-08, "loss": 0.0, "reward": 1.760714367032051, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.760714303702116, "rewards/format_reward_func": 1.0, "step": 754 }, { "completion_length": 251.42858409881592, "epoch": 0.12674462466993588, "grad_norm": 0.16807803968532695, "kl": 0.00037419795989990234, "learning_rate": 9.333333333333334e-08, "loss": 0.0, "reward": 1.7428572252392769, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7517857477068901, "rewards/format_reward_func": 0.9910714328289032, "step": 756 }, { "completion_length": 243.0178680419922, "epoch": 0.12707992790980344, "grad_norm": 0.30999128943235993, "kl": 0.0003815889358520508, "learning_rate": 9.358024691358024e-08, "loss": 0.0, "reward": 1.8035715073347092, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 758 }, { "completion_length": 243.02233409881592, "epoch": 0.127415231149671, "grad_norm": 0.34858323722828805, "kl": 0.00039207935333251953, "learning_rate": 9.382716049382716e-08, "loss": 0.0, "reward": 1.7142857983708382, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7142857387661934, "rewards/format_reward_func": 1.0, "step": 760 }, { "completion_length": 241.68751049041748, "epoch": 0.12775053438953854, "grad_norm": 0.13222748191498046, "kl": 0.00041425228118896484, "learning_rate": 9.407407407407407e-08, "loss": 0.0, "reward": 1.7678572088479996, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571864962578, "rewards/format_reward_func": 1.0, "step": 762 }, { "completion_length": 244.65626049041748, "epoch": 0.1280858376294061, "grad_norm": 0.30478521229008854, "kl": 0.0004159212112426758, "learning_rate": 9.432098765432098e-08, "loss": 0.0, "reward": 1.7089286670088768, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7133928947150707, "rewards/format_reward_func": 0.9955357164144516, "step": 764 }, { "completion_length": 237.8125114440918, "epoch": 0.12842114086927364, "grad_norm": 0.21084985263179185, "kl": 0.00038814544677734375, "learning_rate": 9.45679012345679e-08, "loss": 0.0, "reward": 1.7928572073578835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.792857164517045, "rewards/format_reward_func": 1.0, "step": 766 }, { "completion_length": 234.95982933044434, "epoch": 0.1287564441091412, "grad_norm": 0.2433138284776026, "kl": 0.00036156177520751953, "learning_rate": 9.481481481481481e-08, "loss": 0.0, "reward": 1.7232143804430962, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7267857454717159, "rewards/format_reward_func": 0.9964285716414452, "step": 768 }, { "completion_length": 242.1875114440918, "epoch": 0.12909174734900877, "grad_norm": 0.15249408191828723, "kl": 0.00039446353912353516, "learning_rate": 9.506172839506173e-08, "loss": 0.0, "reward": 1.7392857745289803, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7392857633531094, "rewards/format_reward_func": 1.0, "step": 770 }, { "completion_length": 241.51786613464355, "epoch": 0.12942705058887632, "grad_norm": 0.19767611997197468, "kl": 0.0004150867462158203, "learning_rate": 9.530864197530864e-08, "loss": 0.0, "reward": 1.7767857611179352, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7812500260770321, "rewards/format_reward_func": 0.9955357164144516, "step": 772 }, { "completion_length": 247.3259048461914, "epoch": 0.12976235382874388, "grad_norm": 0.2504453396389677, "kl": 0.00043642520904541016, "learning_rate": 9.555555555555556e-08, "loss": 0.0, "reward": 1.7517857998609543, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7562500350177288, "rewards/format_reward_func": 0.9955357164144516, "step": 774 }, { "completion_length": 248.77679634094238, "epoch": 0.13009765706861143, "grad_norm": 0.1976896046009228, "kl": 0.0004209280014038086, "learning_rate": 9.580246913580246e-08, "loss": 0.0, "reward": 1.7642857730388641, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7732143141329288, "rewards/format_reward_func": 0.9910714328289032, "step": 776 }, { "completion_length": 238.41965579986572, "epoch": 0.13043296030847898, "grad_norm": 0.3026664034458984, "kl": 0.0004271268844604492, "learning_rate": 9.604938271604938e-08, "loss": 0.0, "reward": 1.73392865806818, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7383928820490837, "rewards/format_reward_func": 0.9955357164144516, "step": 778 }, { "completion_length": 242.62501049041748, "epoch": 0.13076826354834653, "grad_norm": 0.21444822804367636, "kl": 0.00046432018280029297, "learning_rate": 9.629629629629629e-08, "loss": 0.0, "reward": 1.7750000581145287, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 780 }, { "completion_length": 247.27679538726807, "epoch": 0.13110356678821408, "grad_norm": 0.257440107889928, "kl": 0.0003908872604370117, "learning_rate": 9.65432098765432e-08, "loss": 0.0, "reward": 1.783928632736206, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 782 }, { "completion_length": 242.34822463989258, "epoch": 0.13143887002808163, "grad_norm": 0.2855112633919714, "kl": 0.0004401206970214844, "learning_rate": 9.679012345679012e-08, "loss": 0.0, "reward": 1.7821429297327995, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 784 }, { "completion_length": 252.64286613464355, "epoch": 0.1317741732679492, "grad_norm": 0.13781706493838075, "kl": 0.0004125833511352539, "learning_rate": 9.703703703703703e-08, "loss": 0.0, "reward": 1.687500074505806, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.6919643171131611, "rewards/format_reward_func": 0.9955357164144516, "step": 786 }, { "completion_length": 241.74554538726807, "epoch": 0.13210947650781676, "grad_norm": 0.12654413814050075, "kl": 0.00040209293365478516, "learning_rate": 9.728395061728395e-08, "loss": 0.0, "reward": 1.7732143476605415, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7776786051690578, "rewards/format_reward_func": 0.9955357164144516, "step": 788 }, { "completion_length": 243.47322750091553, "epoch": 0.13244477974768432, "grad_norm": 0.2837353459979307, "kl": 0.0004172325134277344, "learning_rate": 9.753086419753086e-08, "loss": 0.0, "reward": 1.728571504354477, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7285714503377676, "rewards/format_reward_func": 1.0, "step": 790 }, { "completion_length": 248.37054634094238, "epoch": 0.13278008298755187, "grad_norm": 0.22323115630506868, "kl": 0.00045418739318847656, "learning_rate": 9.777777777777778e-08, "loss": 0.0, "reward": 1.7357143834233284, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7357143126428127, "rewards/format_reward_func": 1.0, "step": 792 }, { "completion_length": 249.6071548461914, "epoch": 0.13311538622741942, "grad_norm": 0.24346910568279878, "kl": 0.000400543212890625, "learning_rate": 9.802469135802469e-08, "loss": 0.0, "reward": 1.7803572118282318, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7848214544355869, "rewards/format_reward_func": 0.9955357164144516, "step": 794 }, { "completion_length": 255.96875953674316, "epoch": 0.13345068946728697, "grad_norm": 0.3170654293138634, "kl": 0.00042128562927246094, "learning_rate": 9.82716049382716e-08, "loss": 0.0, "reward": 1.7089286372065544, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7133928816765547, "rewards/format_reward_func": 0.9955357164144516, "step": 796 }, { "completion_length": 242.2053680419922, "epoch": 0.13378599270715452, "grad_norm": 0.2949914528878829, "kl": 0.0004379749298095703, "learning_rate": 9.851851851851852e-08, "loss": 0.0, "reward": 1.7357143685221672, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.735714316368103, "rewards/format_reward_func": 1.0, "step": 798 }, { "completion_length": 250.4375123977661, "epoch": 0.1341212959470221, "grad_norm": 0.21137036767136858, "kl": 0.00042176246643066406, "learning_rate": 9.876543209876542e-08, "loss": 0.0, "reward": 1.7678572162985802, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7678571734577417, "rewards/format_reward_func": 1.0, "step": 800 }, { "completion_length": 238.6919755935669, "epoch": 0.13445659918688965, "grad_norm": 0.2814166147627615, "kl": 0.00045418739318847656, "learning_rate": 9.901234567901235e-08, "loss": 0.0, "reward": 1.7857143580913544, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 802 }, { "completion_length": 242.8705472946167, "epoch": 0.1347919024267572, "grad_norm": 0.30188794758016163, "kl": 0.00044226646423339844, "learning_rate": 9.925925925925925e-08, "loss": 0.0, "reward": 1.7366072237491608, "reward_std": 0.05934646027162671, "rewards/equation_reward_func": 0.7383928894996643, "rewards/format_reward_func": 0.9982142895460129, "step": 804 }, { "completion_length": 257.3794775009155, "epoch": 0.13512720566662476, "grad_norm": 0.25469855228198224, "kl": 0.0004140138626098633, "learning_rate": 9.950617283950617e-08, "loss": 0.0, "reward": 1.7446429505944252, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7491071745753288, "rewards/format_reward_func": 0.9955357164144516, "step": 806 }, { "completion_length": 250.01787090301514, "epoch": 0.1354625089064923, "grad_norm": 0.27453581331030075, "kl": 0.0004793405532836914, "learning_rate": 9.975308641975308e-08, "loss": 0.0, "reward": 1.767857201397419, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7678571827709675, "rewards/format_reward_func": 1.0, "step": 808 }, { "completion_length": 244.21876049041748, "epoch": 0.13579781214635986, "grad_norm": 0.28017698150363773, "kl": 0.0004837512969970703, "learning_rate": 1e-07, "loss": 0.0, "reward": 1.7232143878936768, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7276785895228386, "rewards/format_reward_func": 0.9955357164144516, "step": 810 }, { "completion_length": 247.321439743042, "epoch": 0.1361331153862274, "grad_norm": 0.29506744159707904, "kl": 0.000514984130859375, "learning_rate": 1.0024691358024691e-07, "loss": 0.0, "reward": 1.737500086426735, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7419643122702837, "rewards/format_reward_func": 0.9955357164144516, "step": 812 }, { "completion_length": 248.3794765472412, "epoch": 0.13646841862609496, "grad_norm": 0.45052300905816106, "kl": 0.0004858970642089844, "learning_rate": 1.0049382716049381e-07, "loss": 0.0, "reward": 1.807142935693264, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.8071428798139095, "rewards/format_reward_func": 1.0, "step": 814 }, { "completion_length": 247.02679443359375, "epoch": 0.13680372186596254, "grad_norm": 0.23890198941501573, "kl": 0.0005213022232055664, "learning_rate": 1.0074074074074074e-07, "loss": 0.0, "reward": 1.7535715028643608, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 816 }, { "completion_length": 255.1428680419922, "epoch": 0.1371390251058301, "grad_norm": 0.3321438645772538, "kl": 0.000493168830871582, "learning_rate": 1.0098765432098764e-07, "loss": 0.0, "reward": 1.7232143431901932, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7276786118745804, "rewards/format_reward_func": 0.9955357164144516, "step": 818 }, { "completion_length": 244.43304824829102, "epoch": 0.13747432834569764, "grad_norm": 0.20810648283029934, "kl": 0.0005180835723876953, "learning_rate": 1.0123456790123457e-07, "loss": 0.0, "reward": 1.7392857894301414, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.739285746589303, "rewards/format_reward_func": 1.0, "step": 820 }, { "completion_length": 237.0134038925171, "epoch": 0.1378096315855652, "grad_norm": 0.22541616829027122, "kl": 0.0004540681838989258, "learning_rate": 1.0148148148148147e-07, "loss": 0.0, "reward": 1.8535714894533157, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.8625000156462193, "rewards/format_reward_func": 0.9910714328289032, "step": 822 }, { "completion_length": 242.77233219146729, "epoch": 0.13814493482543275, "grad_norm": 0.27520042618499535, "kl": 0.00048482418060302734, "learning_rate": 1.017283950617284e-07, "loss": 0.0, "reward": 1.789285771548748, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 824 }, { "completion_length": 248.8660831451416, "epoch": 0.1384802380653003, "grad_norm": 0.30717772584495134, "kl": 0.0005295276641845703, "learning_rate": 1.019753086419753e-07, "loss": 0.0, "reward": 1.7571429088711739, "reward_std": 0.06060914974659681, "rewards/equation_reward_func": 0.7571428753435612, "rewards/format_reward_func": 1.0, "step": 826 }, { "completion_length": 241.49554824829102, "epoch": 0.13881554130516785, "grad_norm": 0.16709679133115038, "kl": 0.0005042552947998047, "learning_rate": 1.0222222222222223e-07, "loss": 0.0, "reward": 1.7964286282658577, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 828 }, { "completion_length": 258.34376335144043, "epoch": 0.13915084454503543, "grad_norm": 0.19247229448577885, "kl": 0.0004944801330566406, "learning_rate": 1.0246913580246913e-07, "loss": 0.0, "reward": 1.755357213318348, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7598214484751225, "rewards/format_reward_func": 0.9955357164144516, "step": 830 }, { "completion_length": 236.6651906967163, "epoch": 0.13948614778490298, "grad_norm": 0.2414068343995031, "kl": 0.00048828125, "learning_rate": 1.0271604938271603e-07, "loss": 0.0, "reward": 1.7785714864730835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 832 }, { "completion_length": 245.133939743042, "epoch": 0.13982145102477053, "grad_norm": 0.328663766531583, "kl": 0.0005055665969848633, "learning_rate": 1.0296296296296296e-07, "loss": 0.0, "reward": 1.689285784959793, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.6892857402563095, "rewards/format_reward_func": 1.0, "step": 834 }, { "completion_length": 243.54018878936768, "epoch": 0.14015675426463808, "grad_norm": 0.4397604670140151, "kl": 0.00045621395111083984, "learning_rate": 1.0320987654320986e-07, "loss": 0.0, "reward": 1.7482143491506577, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7526785954833031, "rewards/format_reward_func": 0.9955357164144516, "step": 836 }, { "completion_length": 241.74108219146729, "epoch": 0.14049205750450564, "grad_norm": 0.29775824487765673, "kl": 0.00047719478607177734, "learning_rate": 1.0345679012345679e-07, "loss": 0.0, "reward": 1.7517857626080513, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7562500387430191, "rewards/format_reward_func": 0.9955357164144516, "step": 838 }, { "completion_length": 239.21429538726807, "epoch": 0.1408273607443732, "grad_norm": 0.3551956976091781, "kl": 0.0004627704620361328, "learning_rate": 1.0370370370370369e-07, "loss": 0.0, "reward": 1.757589340209961, "reward_std": 0.09028238197788596, "rewards/equation_reward_func": 0.7633928917348385, "rewards/format_reward_func": 0.9941964335739613, "step": 840 }, { "completion_length": 249.44197368621826, "epoch": 0.14116266398424074, "grad_norm": 0.16079556819889082, "kl": 0.0005052089691162109, "learning_rate": 1.0395061728395062e-07, "loss": 0.0, "reward": 1.701785795390606, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7062500417232513, "rewards/format_reward_func": 0.9955357164144516, "step": 842 }, { "completion_length": 249.0178689956665, "epoch": 0.1414979672241083, "grad_norm": 0.43008578708661116, "kl": 0.0005563497543334961, "learning_rate": 1.0419753086419752e-07, "loss": 0.0, "reward": 1.7410714998841286, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7455357611179352, "rewards/format_reward_func": 0.9955357164144516, "step": 844 }, { "completion_length": 240.696439743042, "epoch": 0.14183327046397587, "grad_norm": 0.19942106819097674, "kl": 0.0004343986511230469, "learning_rate": 1.0444444444444445e-07, "loss": 0.0, "reward": 1.7803571745753288, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7848214544355869, "rewards/format_reward_func": 0.9955357164144516, "step": 846 }, { "completion_length": 248.0937614440918, "epoch": 0.14216857370384342, "grad_norm": 0.32938655852109006, "kl": 0.0005087852478027344, "learning_rate": 1.0469135802469135e-07, "loss": 0.0, "reward": 1.7625000700354576, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7669643107801676, "rewards/format_reward_func": 0.9955357164144516, "step": 848 }, { "completion_length": 241.70983123779297, "epoch": 0.14250387694371097, "grad_norm": 0.20099853956894784, "kl": 0.0004711151123046875, "learning_rate": 1.0493827160493827e-07, "loss": 0.0, "reward": 1.7571429014205933, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7571428921073675, "rewards/format_reward_func": 1.0, "step": 850 }, { "completion_length": 251.04911613464355, "epoch": 0.14283918018357852, "grad_norm": 0.26710235078942274, "kl": 0.0005549192428588867, "learning_rate": 1.0518518518518518e-07, "loss": 0.0, "reward": 1.7535715028643608, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 852 }, { "completion_length": 242.21875953674316, "epoch": 0.14317448342344608, "grad_norm": 0.24595597743321643, "kl": 0.000561833381652832, "learning_rate": 1.054320987654321e-07, "loss": 0.0, "reward": 1.7821429073810577, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7821428962051868, "rewards/format_reward_func": 1.0, "step": 854 }, { "completion_length": 239.6160831451416, "epoch": 0.14350978666331363, "grad_norm": 0.24191581359797343, "kl": 0.0005096197128295898, "learning_rate": 1.0567901234567901e-07, "loss": 0.0, "reward": 1.7125000730156898, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.716964315623045, "rewards/format_reward_func": 0.9955357164144516, "step": 856 }, { "completion_length": 242.3705472946167, "epoch": 0.14384508990318118, "grad_norm": 0.3070888032318429, "kl": 0.0004938840866088867, "learning_rate": 1.0592592592592592e-07, "loss": 0.0, "reward": 1.7250000685453415, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7250000275671482, "rewards/format_reward_func": 1.0, "step": 858 }, { "completion_length": 250.14733123779297, "epoch": 0.14418039314304876, "grad_norm": 0.2893268587254202, "kl": 0.0005345344543457031, "learning_rate": 1.0617283950617284e-07, "loss": 0.0, "reward": 1.7589286640286446, "reward_std": 0.09848987031728029, "rewards/equation_reward_func": 0.7633928768336773, "rewards/format_reward_func": 0.9955357164144516, "step": 860 }, { "completion_length": 244.47322463989258, "epoch": 0.1445156963829163, "grad_norm": 0.19801472144759966, "kl": 0.0005357265472412109, "learning_rate": 1.0641975308641974e-07, "loss": 0.0, "reward": 1.7178572490811348, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7178571783006191, "rewards/format_reward_func": 1.0, "step": 862 }, { "completion_length": 241.60715293884277, "epoch": 0.14485099962278386, "grad_norm": 0.2844514690303181, "kl": 0.0005900859832763672, "learning_rate": 1.0666666666666667e-07, "loss": 0.0, "reward": 1.8107143342494965, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.8107143081724644, "rewards/format_reward_func": 1.0, "step": 864 }, { "completion_length": 246.8035831451416, "epoch": 0.1451863028626514, "grad_norm": 0.14321224353680495, "kl": 0.0005359649658203125, "learning_rate": 1.0691358024691357e-07, "loss": 0.0, "reward": 1.6946429386734962, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.6991071663796902, "rewards/format_reward_func": 0.9955357164144516, "step": 866 }, { "completion_length": 252.5446538925171, "epoch": 0.14552160610251896, "grad_norm": 0.2782425391415311, "kl": 0.00047397613525390625, "learning_rate": 1.0716049382716049e-07, "loss": 0.0, "reward": 1.778571479022503, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7875000424683094, "rewards/format_reward_func": 0.9910714328289032, "step": 868 }, { "completion_length": 240.04465293884277, "epoch": 0.14585690934238651, "grad_norm": 0.41490336823387663, "kl": 0.0005705356597900391, "learning_rate": 1.074074074074074e-07, "loss": 0.0, "reward": 1.7500000894069672, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7500000279396772, "rewards/format_reward_func": 1.0, "step": 870 }, { "completion_length": 249.63840675354004, "epoch": 0.14619221258225407, "grad_norm": 0.2734543943620819, "kl": 0.0005452632904052734, "learning_rate": 1.0765432098765431e-07, "loss": 0.0, "reward": 1.7714286372065544, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.771428594365716, "rewards/format_reward_func": 1.0, "step": 872 }, { "completion_length": 239.69197273254395, "epoch": 0.14652751582212162, "grad_norm": 0.2274695812309938, "kl": 0.000532984733581543, "learning_rate": 1.0790123456790123e-07, "loss": 0.0, "reward": 1.7750000581145287, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 874 }, { "completion_length": 238.9330472946167, "epoch": 0.1468628190619892, "grad_norm": 0.16239306753807847, "kl": 0.0005614757537841797, "learning_rate": 1.0814814814814814e-07, "loss": 0.0, "reward": 1.760714367032051, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.769642885774374, "rewards/format_reward_func": 0.9910714328289032, "step": 876 }, { "completion_length": 246.38393783569336, "epoch": 0.14719812230185675, "grad_norm": 0.25783637098150386, "kl": 0.0005737543106079102, "learning_rate": 1.0839506172839506e-07, "loss": 0.0, "reward": 1.7678572088479996, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7767857387661934, "rewards/format_reward_func": 0.9910714328289032, "step": 878 }, { "completion_length": 241.51786708831787, "epoch": 0.1475334255417243, "grad_norm": 0.3677828611432142, "kl": 0.0005735158920288086, "learning_rate": 1.0864197530864197e-07, "loss": 0.0, "reward": 1.7125000730156898, "reward_std": 0.09343910776078701, "rewards/equation_reward_func": 0.7169643249362707, "rewards/format_reward_func": 0.9955357164144516, "step": 880 }, { "completion_length": 242.76340675354004, "epoch": 0.14786872878159185, "grad_norm": 0.20185872070200894, "kl": 0.0005519390106201172, "learning_rate": 1.0888888888888888e-07, "loss": 0.0, "reward": 1.7696429267525673, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7741071693599224, "rewards/format_reward_func": 0.9955357164144516, "step": 882 }, { "completion_length": 239.31697368621826, "epoch": 0.1482040320214594, "grad_norm": 0.24556548501686268, "kl": 0.0005544424057006836, "learning_rate": 1.091358024691358e-07, "loss": 0.0, "reward": 1.7892857640981674, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857510596514, "rewards/format_reward_func": 1.0, "step": 884 }, { "completion_length": 236.79465198516846, "epoch": 0.14853933526132695, "grad_norm": 0.19666918858541813, "kl": 0.0005842447280883789, "learning_rate": 1.093827160493827e-07, "loss": 0.0, "reward": 1.7142857909202576, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7142857201397419, "rewards/format_reward_func": 1.0, "step": 886 }, { "completion_length": 243.69644165039062, "epoch": 0.1488746385011945, "grad_norm": 0.2543890898105721, "kl": 0.0005837678909301758, "learning_rate": 1.0962962962962963e-07, "loss": 0.0, "reward": 1.7946429252624512, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.799107164144516, "rewards/format_reward_func": 0.9955357164144516, "step": 888 }, { "completion_length": 240.16518783569336, "epoch": 0.14920994174106209, "grad_norm": 0.2612662392595029, "kl": 0.000578761100769043, "learning_rate": 1.0987654320987653e-07, "loss": 0.0, "reward": 1.7535714879631996, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.753571443259716, "rewards/format_reward_func": 1.0, "step": 890 }, { "completion_length": 243.67412090301514, "epoch": 0.14954524498092964, "grad_norm": 0.2645548665031763, "kl": 0.0005819797515869141, "learning_rate": 1.1012345679012345e-07, "loss": 0.0, "reward": 1.760714367032051, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7696428671479225, "rewards/format_reward_func": 0.9910714328289032, "step": 892 }, { "completion_length": 240.39733123779297, "epoch": 0.1498805482207972, "grad_norm": 0.202545137842961, "kl": 0.0006400346755981445, "learning_rate": 1.1037037037037036e-07, "loss": 0.0, "reward": 1.7142857760190964, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7142857573926449, "rewards/format_reward_func": 1.0, "step": 894 }, { "completion_length": 244.81250953674316, "epoch": 0.15021585146066474, "grad_norm": 0.26342116615886524, "kl": 0.0005612373352050781, "learning_rate": 1.1061728395061728e-07, "loss": 0.0, "reward": 1.7250000834465027, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7250000387430191, "rewards/format_reward_func": 1.0, "step": 896 }, { "completion_length": 252.0714406967163, "epoch": 0.1505511547005323, "grad_norm": 0.19740268449770748, "kl": 0.0004432201385498047, "learning_rate": 1.108641975308642e-07, "loss": 0.0, "reward": 1.7375000715255737, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.741964302957058, "rewards/format_reward_func": 0.9955357164144516, "step": 898 }, { "completion_length": 244.9062623977661, "epoch": 0.15088645794039984, "grad_norm": 0.21749415281902099, "kl": 0.0005629062652587891, "learning_rate": 1.111111111111111e-07, "loss": 0.0, "reward": 1.755357213318348, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214708268642, "rewards/format_reward_func": 0.9955357164144516, "step": 900 }, { "completion_length": 238.94643783569336, "epoch": 0.1512217611802674, "grad_norm": 0.19636977379194281, "kl": 0.0005797147750854492, "learning_rate": 1.1135802469135802e-07, "loss": 0.0, "reward": 1.7910714745521545, "reward_std": 0.022728432901203632, "rewards/equation_reward_func": 0.7955357357859612, "rewards/format_reward_func": 0.9955357164144516, "step": 902 }, { "completion_length": 237.40179634094238, "epoch": 0.15155706442013495, "grad_norm": 0.0812537734176348, "kl": 0.0005922317504882812, "learning_rate": 1.1160493827160493e-07, "loss": 0.0, "reward": 1.7500000819563866, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000111758709, "rewards/format_reward_func": 1.0, "step": 904 }, { "completion_length": 240.58482837677002, "epoch": 0.15189236766000253, "grad_norm": 0.31431448086070735, "kl": 0.0008060932159423828, "learning_rate": 1.1185185185185185e-07, "loss": 0.0, "reward": 1.7642857730388641, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7732143197208643, "rewards/format_reward_func": 0.9910714328289032, "step": 906 }, { "completion_length": 234.2857265472412, "epoch": 0.15222767089987008, "grad_norm": 0.13392711374206998, "kl": 0.0005654096603393555, "learning_rate": 1.1209876543209876e-07, "loss": 0.0, "reward": 1.7714286297559738, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 908 }, { "completion_length": 244.5267972946167, "epoch": 0.15256297413973763, "grad_norm": 0.27221219675506475, "kl": 0.0005095005035400391, "learning_rate": 1.1234567901234568e-07, "loss": 0.0, "reward": 1.7267857789993286, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7312500383704901, "rewards/format_reward_func": 0.9955357164144516, "step": 910 }, { "completion_length": 241.03126049041748, "epoch": 0.15289827737960518, "grad_norm": 0.31530224582644223, "kl": 0.0005794763565063477, "learning_rate": 1.1259259259259258e-07, "loss": 0.0, "reward": 1.7250000834465027, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7250000238418579, "rewards/format_reward_func": 1.0, "step": 912 }, { "completion_length": 246.7991180419922, "epoch": 0.15323358061947273, "grad_norm": 0.250379166539302, "kl": 0.0006912946701049805, "learning_rate": 1.1283950617283951e-07, "loss": 0.0, "reward": 1.7142858058214188, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7142857499420643, "rewards/format_reward_func": 1.0, "step": 914 }, { "completion_length": 238.97322463989258, "epoch": 0.15356888385934028, "grad_norm": 0.19226153813655147, "kl": 0.0006650686264038086, "learning_rate": 1.1308641975308641e-07, "loss": 0.0, "reward": 1.7267857640981674, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.740178594365716, "rewards/format_reward_func": 0.9866071492433548, "step": 916 }, { "completion_length": 236.57143688201904, "epoch": 0.15390418709920783, "grad_norm": 0.29729296307514946, "kl": 0.00064849853515625, "learning_rate": 1.1333333333333332e-07, "loss": 0.0, "reward": 1.7892857566475868, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7892857491970062, "rewards/format_reward_func": 1.0, "step": 918 }, { "completion_length": 241.696439743042, "epoch": 0.1542394903390754, "grad_norm": 0.23932282850941852, "kl": 0.0005295276641845703, "learning_rate": 1.1358024691358024e-07, "loss": 0.0, "reward": 1.7750000655651093, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 920 }, { "completion_length": 240.95090293884277, "epoch": 0.15457479357894297, "grad_norm": 0.1940264202902731, "kl": 0.0005745887756347656, "learning_rate": 1.1382716049382715e-07, "loss": 0.0, "reward": 1.769642911851406, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7741071619093418, "rewards/format_reward_func": 0.9955357164144516, "step": 922 }, { "completion_length": 243.47322273254395, "epoch": 0.15491009681881052, "grad_norm": 0.21505952196582073, "kl": 0.0005426406860351562, "learning_rate": 1.1407407407407407e-07, "loss": 0.0, "reward": 1.7607143446803093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.760714303702116, "rewards/format_reward_func": 1.0, "step": 924 }, { "completion_length": 239.3616180419922, "epoch": 0.15524540005867807, "grad_norm": 0.2723087004689939, "kl": 0.0005807876586914062, "learning_rate": 1.1432098765432098e-07, "loss": 0.0, "reward": 1.8107143342494965, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8196429014205933, "rewards/format_reward_func": 0.9910714328289032, "step": 926 }, { "completion_length": 248.90179824829102, "epoch": 0.15558070329854562, "grad_norm": 0.4941249855932243, "kl": 0.0005967617034912109, "learning_rate": 1.145679012345679e-07, "loss": 0.0, "reward": 1.7767857536673546, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7812500186264515, "rewards/format_reward_func": 0.9955357164144516, "step": 928 }, { "completion_length": 241.01340293884277, "epoch": 0.15591600653841317, "grad_norm": 0.1552928542536375, "kl": 0.0006102323532104492, "learning_rate": 1.148148148148148e-07, "loss": 0.0, "reward": 1.7500000894069672, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000223517418, "rewards/format_reward_func": 1.0, "step": 930 }, { "completion_length": 238.94197273254395, "epoch": 0.15625130977828072, "grad_norm": 0.418642637769558, "kl": 0.0006206035614013672, "learning_rate": 1.1506172839506173e-07, "loss": 0.0, "reward": 1.7428572177886963, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7428571749478579, "rewards/format_reward_func": 1.0, "step": 932 }, { "completion_length": 243.3437614440918, "epoch": 0.15658661301814827, "grad_norm": 0.223657964390005, "kl": 0.0006108283996582031, "learning_rate": 1.1530864197530863e-07, "loss": 0.0, "reward": 1.723214402794838, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7276786025613546, "rewards/format_reward_func": 0.9955357164144516, "step": 934 }, { "completion_length": 238.477689743042, "epoch": 0.15692191625801585, "grad_norm": 0.19391093772147885, "kl": 0.0006520748138427734, "learning_rate": 1.1555555555555555e-07, "loss": 0.0, "reward": 1.7750000804662704, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 936 }, { "completion_length": 243.4955472946167, "epoch": 0.1572572194978834, "grad_norm": 0.29975605622207685, "kl": 0.000649571418762207, "learning_rate": 1.1580246913580246e-07, "loss": 0.0, "reward": 1.7625000849366188, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7669643014669418, "rewards/format_reward_func": 0.9955357164144516, "step": 938 }, { "completion_length": 240.0625114440918, "epoch": 0.15759252273775096, "grad_norm": 0.1899270930718893, "kl": 0.0006060600280761719, "learning_rate": 1.1604938271604938e-07, "loss": 0.0, "reward": 1.7928571924567223, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571626543999, "rewards/format_reward_func": 1.0, "step": 940 }, { "completion_length": 246.10715293884277, "epoch": 0.1579278259776185, "grad_norm": 0.3378569868261778, "kl": 0.0005865097045898438, "learning_rate": 1.162962962962963e-07, "loss": 0.0, "reward": 1.7303572073578835, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7348214574158192, "rewards/format_reward_func": 0.9955357164144516, "step": 942 }, { "completion_length": 238.98661613464355, "epoch": 0.15826312921748606, "grad_norm": 0.23510469699532882, "kl": 0.0005903244018554688, "learning_rate": 1.1654320987654321e-07, "loss": 0.0, "reward": 1.8107143267989159, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8107143118977547, "rewards/format_reward_func": 1.0, "step": 944 }, { "completion_length": 239.17858219146729, "epoch": 0.1585984324573536, "grad_norm": 0.2531029628465875, "kl": 0.0005866289138793945, "learning_rate": 1.1679012345679012e-07, "loss": 0.0, "reward": 1.7553571909666061, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7598214522004128, "rewards/format_reward_func": 0.9955357164144516, "step": 946 }, { "completion_length": 239.99108219146729, "epoch": 0.15893373569722116, "grad_norm": 0.1832711381666499, "kl": 0.0005750656127929688, "learning_rate": 1.1703703703703702e-07, "loss": 0.0, "reward": 1.8303571790456772, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.834821455180645, "rewards/format_reward_func": 0.9955357164144516, "step": 948 }, { "completion_length": 248.0223331451416, "epoch": 0.15926903893708874, "grad_norm": 0.21616249364705065, "kl": 0.0006320476531982422, "learning_rate": 1.1728395061728394e-07, "loss": 0.0, "reward": 1.7892857939004898, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 1.0, "step": 950 }, { "completion_length": 247.7053689956665, "epoch": 0.1596043421769563, "grad_norm": 0.2808309498061027, "kl": 0.0006732940673828125, "learning_rate": 1.1753086419753085e-07, "loss": 0.0, "reward": 1.732142947614193, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7321428656578064, "rewards/format_reward_func": 1.0, "step": 952 }, { "completion_length": 247.93304634094238, "epoch": 0.15993964541682384, "grad_norm": 0.28244344156282136, "kl": 0.0006171464920043945, "learning_rate": 1.1777777777777777e-07, "loss": 0.0, "reward": 1.7678571939468384, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.776785746216774, "rewards/format_reward_func": 0.9910714328289032, "step": 954 }, { "completion_length": 249.13840293884277, "epoch": 0.1602749486566914, "grad_norm": 0.3471574848884779, "kl": 0.0006940364837646484, "learning_rate": 1.1802469135802468e-07, "loss": 0.0, "reward": 1.7535714879631996, "reward_std": 0.0858629634603858, "rewards/equation_reward_func": 0.7535714730620384, "rewards/format_reward_func": 1.0, "step": 956 }, { "completion_length": 237.1696548461914, "epoch": 0.16061025189655895, "grad_norm": 0.24213326359003345, "kl": 0.0006643533706665039, "learning_rate": 1.182716049382716e-07, "loss": 0.0, "reward": 1.7928571924567223, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 958 }, { "completion_length": 246.29019260406494, "epoch": 0.1609455551364265, "grad_norm": 0.25180774730216865, "kl": 0.0006979703903198242, "learning_rate": 1.1851851851851851e-07, "loss": 0.0, "reward": 1.7196429297327995, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.724107176065445, "rewards/format_reward_func": 0.9955357164144516, "step": 960 }, { "completion_length": 240.30804824829102, "epoch": 0.16128085837629405, "grad_norm": 0.27009721641478057, "kl": 0.0005599260330200195, "learning_rate": 1.1876543209876543e-07, "loss": 0.0, "reward": 1.7857143580913544, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 962 }, { "completion_length": 240.95090675354004, "epoch": 0.16161616161616163, "grad_norm": 0.20274197501049643, "kl": 0.0005586147308349609, "learning_rate": 1.1901234567901234e-07, "loss": 0.0, "reward": 1.7035715207457542, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.712500024586916, "rewards/format_reward_func": 0.9910714328289032, "step": 964 }, { "completion_length": 234.34376049041748, "epoch": 0.16195146485602918, "grad_norm": 0.2154479276321837, "kl": 0.0006890296936035156, "learning_rate": 1.1925925925925924e-07, "loss": 0.0, "reward": 1.7821429148316383, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7910714615136385, "rewards/format_reward_func": 0.9910714328289032, "step": 966 }, { "completion_length": 246.4776906967163, "epoch": 0.16228676809589673, "grad_norm": 0.31223951314737436, "kl": 0.000695347785949707, "learning_rate": 1.1950617283950616e-07, "loss": 0.0, "reward": 1.74821437895298, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.752678606659174, "rewards/format_reward_func": 0.9955357164144516, "step": 968 }, { "completion_length": 247.47322463989258, "epoch": 0.16262207133576428, "grad_norm": 0.26589705178869344, "kl": 0.0006476640701293945, "learning_rate": 1.1975308641975307e-07, "loss": 0.0, "reward": 1.7379465252161026, "reward_std": 0.06755394977517426, "rewards/equation_reward_func": 0.7392857410013676, "rewards/format_reward_func": 0.9986607171595097, "step": 970 }, { "completion_length": 243.9241180419922, "epoch": 0.16295737457563184, "grad_norm": 0.2610188794854556, "kl": 0.0007169246673583984, "learning_rate": 1.2e-07, "loss": 0.0, "reward": 1.733928695321083, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7383928894996643, "rewards/format_reward_func": 0.9955357164144516, "step": 972 }, { "completion_length": 244.821439743042, "epoch": 0.1632926778154994, "grad_norm": 0.3007014229187292, "kl": 0.0007772445678710938, "learning_rate": 1.202469135802469e-07, "loss": 0.0, "reward": 1.7875000685453415, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7919643111526966, "rewards/format_reward_func": 0.9955357164144516, "step": 974 }, { "completion_length": 244.1964406967163, "epoch": 0.16362798105536694, "grad_norm": 0.31254137964797346, "kl": 0.0007396936416625977, "learning_rate": 1.2049382716049382e-07, "loss": 0.0, "reward": 1.764285795390606, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7732143178582191, "rewards/format_reward_func": 0.9910714328289032, "step": 976 }, { "completion_length": 245.65180015563965, "epoch": 0.1639632842952345, "grad_norm": 0.12124514568359325, "kl": 0.0006595849990844727, "learning_rate": 1.2074074074074073e-07, "loss": 0.0, "reward": 1.7089286372065544, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.713392898440361, "rewards/format_reward_func": 0.9955357164144516, "step": 978 }, { "completion_length": 231.5982255935669, "epoch": 0.16429858753510207, "grad_norm": 0.30993064357758865, "kl": 0.0006895065307617188, "learning_rate": 1.2098765432098765e-07, "loss": 0.0, "reward": 1.757142923772335, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7651786059141159, "rewards/format_reward_func": 0.9919642880558968, "step": 980 }, { "completion_length": 240.02679538726807, "epoch": 0.16463389077496962, "grad_norm": 0.2581272471044061, "kl": 0.0006649494171142578, "learning_rate": 1.2123456790123456e-07, "loss": 0.0, "reward": 1.7482143566012383, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7526785992085934, "rewards/format_reward_func": 0.9955357164144516, "step": 982 }, { "completion_length": 241.2678689956665, "epoch": 0.16496919401483717, "grad_norm": 0.36361643382301556, "kl": 0.00080108642578125, "learning_rate": 1.2148148148148148e-07, "loss": 0.0, "reward": 1.7607143446803093, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 984 }, { "completion_length": 250.83483505249023, "epoch": 0.16530449725470472, "grad_norm": 0.21378059263062124, "kl": 0.0007206201553344727, "learning_rate": 1.217283950617284e-07, "loss": 0.0, "reward": 1.7017857730388641, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7062500435858965, "rewards/format_reward_func": 0.9955357164144516, "step": 986 }, { "completion_length": 238.31697463989258, "epoch": 0.16563980049457228, "grad_norm": 0.32740835817943337, "kl": 0.0007141828536987305, "learning_rate": 1.219753086419753e-07, "loss": 0.0, "reward": 1.6821429505944252, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.6910714693367481, "rewards/format_reward_func": 0.9910714328289032, "step": 988 }, { "completion_length": 238.20983219146729, "epoch": 0.16597510373443983, "grad_norm": 0.19363083263718955, "kl": 0.0007278919219970703, "learning_rate": 1.2222222222222222e-07, "loss": 0.0, "reward": 1.7000000700354576, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7000000439584255, "rewards/format_reward_func": 1.0, "step": 990 }, { "completion_length": 234.18751049041748, "epoch": 0.16631040697430738, "grad_norm": 0.23389482243909598, "kl": 0.0006586313247680664, "learning_rate": 1.2246913580246914e-07, "loss": 0.0, "reward": 1.7593750730156898, "reward_std": 0.047350899782031775, "rewards/equation_reward_func": 0.7696428969502449, "rewards/format_reward_func": 0.9897321499884129, "step": 992 }, { "completion_length": 239.7009038925171, "epoch": 0.16664571021417496, "grad_norm": 0.2894204366418183, "kl": 0.0007368326187133789, "learning_rate": 1.2271604938271605e-07, "loss": 0.0, "reward": 1.7821429297327995, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7910714708268642, "rewards/format_reward_func": 0.9910714328289032, "step": 994 }, { "completion_length": 232.94643878936768, "epoch": 0.1669810134540425, "grad_norm": 0.24697120320275165, "kl": 0.0007069110870361328, "learning_rate": 1.2296296296296297e-07, "loss": 0.0, "reward": 1.7750000655651093, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 996 }, { "completion_length": 240.09376335144043, "epoch": 0.16731631669391006, "grad_norm": 0.26157424089241804, "kl": 0.0006704330444335938, "learning_rate": 1.2320987654320988e-07, "loss": 0.0, "reward": 1.764285795390606, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.764285733923316, "rewards/format_reward_func": 1.0, "step": 998 }, { "completion_length": 253.48215198516846, "epoch": 0.1676516199337776, "grad_norm": 0.2163902996279677, "kl": 0.0006573200225830078, "learning_rate": 1.2345679012345677e-07, "loss": 0.0, "reward": 1.7928572073578835, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571775555611, "rewards/format_reward_func": 1.0, "step": 1000 }, { "completion_length": 248.79018878936768, "epoch": 0.16798692317364516, "grad_norm": 0.12645581691936147, "kl": 0.0006426572799682617, "learning_rate": 1.237037037037037e-07, "loss": 0.0, "reward": 1.7178572341799736, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.71785718947649, "rewards/format_reward_func": 1.0, "step": 1002 }, { "completion_length": 229.32143878936768, "epoch": 0.16832222641351272, "grad_norm": 0.15066270893991937, "kl": 0.0006672143936157227, "learning_rate": 1.239506172839506e-07, "loss": 0.0, "reward": 1.7750000730156898, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 1004 }, { "completion_length": 243.65179538726807, "epoch": 0.16865752965338027, "grad_norm": 0.19863441041666846, "kl": 0.0008147954940795898, "learning_rate": 1.2419753086419751e-07, "loss": 0.0, "reward": 1.7517857924103737, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7562500275671482, "rewards/format_reward_func": 0.9955357164144516, "step": 1006 }, { "completion_length": 236.758939743042, "epoch": 0.16899283289324782, "grad_norm": 0.323289276257295, "kl": 0.0006589889526367188, "learning_rate": 1.2444444444444443e-07, "loss": 0.0, "reward": 1.816071480512619, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.8205357156693935, "rewards/format_reward_func": 0.9955357164144516, "step": 1008 }, { "completion_length": 239.04465293884277, "epoch": 0.1693281361331154, "grad_norm": 0.21195653521421964, "kl": 0.0006917715072631836, "learning_rate": 1.2469135802469134e-07, "loss": 0.0, "reward": 1.7285715118050575, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7464285958558321, "rewards/format_reward_func": 0.9821428619325161, "step": 1010 }, { "completion_length": 234.57143878936768, "epoch": 0.16966343937298295, "grad_norm": 0.14904220568452134, "kl": 0.0006827116012573242, "learning_rate": 1.2493827160493826e-07, "loss": 0.0, "reward": 1.7477679178118706, "reward_std": 0.053664354141801596, "rewards/equation_reward_func": 0.7553571909666061, "rewards/format_reward_func": 0.9924107193946838, "step": 1012 }, { "completion_length": 239.071439743042, "epoch": 0.1699987426128505, "grad_norm": 0.32596573692825204, "kl": 0.0007929801940917969, "learning_rate": 1.2518518518518517e-07, "loss": 0.0, "reward": 1.7964286357164383, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7964286059141159, "rewards/format_reward_func": 1.0, "step": 1014 }, { "completion_length": 241.7053689956665, "epoch": 0.17033404585271805, "grad_norm": 0.22868795871007735, "kl": 0.0008907318115234375, "learning_rate": 1.254320987654321e-07, "loss": 0.0, "reward": 1.7107143700122833, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7196428813040257, "rewards/format_reward_func": 0.9910714328289032, "step": 1016 }, { "completion_length": 242.83037185668945, "epoch": 0.1706693490925856, "grad_norm": 0.2450458836005228, "kl": 0.0008082389831542969, "learning_rate": 1.25679012345679e-07, "loss": 0.0, "reward": 1.764285795390606, "reward_std": 0.09091372787952423, "rewards/equation_reward_func": 0.7732143215835094, "rewards/format_reward_func": 0.9910714328289032, "step": 1018 }, { "completion_length": 241.7321538925171, "epoch": 0.17100465233245316, "grad_norm": 0.22439507701355918, "kl": 0.0007718801498413086, "learning_rate": 1.2592592592592592e-07, "loss": 0.0, "reward": 1.7607143744826317, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143167406321, "rewards/format_reward_func": 1.0, "step": 1020 }, { "completion_length": 239.7857208251953, "epoch": 0.1713399555723207, "grad_norm": 0.275089302136405, "kl": 0.0007205009460449219, "learning_rate": 1.2617283950617283e-07, "loss": 0.0, "reward": 1.7464286461472511, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 1022 }, { "completion_length": 236.66518783569336, "epoch": 0.17167525881218829, "grad_norm": 0.31408574841301085, "kl": 0.0007870197296142578, "learning_rate": 1.2641975308641975e-07, "loss": 0.0, "reward": 1.6982143744826317, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7026785835623741, "rewards/format_reward_func": 0.9955357164144516, "step": 1024 }, { "completion_length": 244.68751049041748, "epoch": 0.17201056205205584, "grad_norm": 0.23233340761476015, "kl": 0.0008363723754882812, "learning_rate": 1.2666666666666666e-07, "loss": 0.0, "reward": 1.717857226729393, "reward_std": 0.0656599160283804, "rewards/equation_reward_func": 0.7267857417464256, "rewards/format_reward_func": 0.9910714328289032, "step": 1026 }, { "completion_length": 243.33483409881592, "epoch": 0.1723458652919234, "grad_norm": 0.2969390274202134, "kl": 0.000970005989074707, "learning_rate": 1.2691358024691358e-07, "loss": 0.0, "reward": 1.7428572252392769, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 1028 }, { "completion_length": 232.19643783569336, "epoch": 0.17268116853179094, "grad_norm": 0.267704808940672, "kl": 0.0009148120880126953, "learning_rate": 1.271604938271605e-07, "loss": 0.0, "reward": 1.7482143491506577, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526785954833031, "rewards/format_reward_func": 0.9955357164144516, "step": 1030 }, { "completion_length": 242.52679920196533, "epoch": 0.1730164717716585, "grad_norm": 0.41808152142365423, "kl": 0.0009038448333740234, "learning_rate": 1.274074074074074e-07, "loss": 0.0, "reward": 1.7160715162754059, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.720535745844245, "rewards/format_reward_func": 0.9955357164144516, "step": 1032 }, { "completion_length": 237.8348331451416, "epoch": 0.17335177501152604, "grad_norm": 0.2222729619708794, "kl": 0.0007498264312744141, "learning_rate": 1.276543209876543e-07, "loss": 0.0, "reward": 1.716071493923664, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.720535745844245, "rewards/format_reward_func": 0.9955357164144516, "step": 1034 }, { "completion_length": 239.33483219146729, "epoch": 0.1736870782513936, "grad_norm": 0.24150968139086826, "kl": 0.0009173154830932617, "learning_rate": 1.279012345679012e-07, "loss": 0.0, "reward": 1.7357143461704254, "reward_std": 0.06060914974659681, "rewards/equation_reward_func": 0.7446428909897804, "rewards/format_reward_func": 0.9910714328289032, "step": 1036 }, { "completion_length": 234.47322463989258, "epoch": 0.17402238149126115, "grad_norm": 0.24846206951979183, "kl": 0.0008068084716796875, "learning_rate": 1.2814814814814815e-07, "loss": 0.0, "reward": 1.766071505844593, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9955357164144516, "step": 1038 }, { "completion_length": 237.48661708831787, "epoch": 0.17435768473112873, "grad_norm": 0.23149397808743663, "kl": 0.0008314847946166992, "learning_rate": 1.2839506172839507e-07, "loss": 0.0, "reward": 1.7964286357164383, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 1040 }, { "completion_length": 237.63840293884277, "epoch": 0.17469298797099628, "grad_norm": 0.272804539190451, "kl": 0.0009119510650634766, "learning_rate": 1.2864197530864195e-07, "loss": 0.0, "reward": 1.759375087916851, "reward_std": 0.0473508988507092, "rewards/equation_reward_func": 0.760714303702116, "rewards/format_reward_func": 0.9986607171595097, "step": 1042 }, { "completion_length": 232.0491180419922, "epoch": 0.17502829121086383, "grad_norm": 0.26158107193461716, "kl": 0.0008912086486816406, "learning_rate": 1.2888888888888887e-07, "loss": 0.0, "reward": 1.705357238650322, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.709821455180645, "rewards/format_reward_func": 0.9955357164144516, "step": 1044 }, { "completion_length": 233.5178680419922, "epoch": 0.17536359445073138, "grad_norm": 0.16438665975664443, "kl": 0.0008466243743896484, "learning_rate": 1.291358024691358e-07, "loss": 0.0, "reward": 1.799553632736206, "reward_std": 0.061240497045218945, "rewards/equation_reward_func": 0.8026785925030708, "rewards/format_reward_func": 0.9968750029802322, "step": 1046 }, { "completion_length": 232.6875123977661, "epoch": 0.17569889769059893, "grad_norm": 0.1428673378811846, "kl": 0.0009279251098632812, "learning_rate": 1.2938271604938273e-07, "loss": 0.0, "reward": 1.7607143595814705, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607142999768257, "rewards/format_reward_func": 1.0, "step": 1048 }, { "completion_length": 247.0312614440918, "epoch": 0.17603420093046648, "grad_norm": 0.24613124143913667, "kl": 0.0008966922760009766, "learning_rate": 1.2962962962962961e-07, "loss": 0.0, "reward": 1.760714367032051, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 1050 }, { "completion_length": 244.1875123977661, "epoch": 0.17636950417033403, "grad_norm": 0.20124636357271036, "kl": 0.00095367431640625, "learning_rate": 1.2987654320987653e-07, "loss": 0.0, "reward": 1.7178572341799736, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7178571708500385, "rewards/format_reward_func": 1.0, "step": 1052 }, { "completion_length": 243.13840675354004, "epoch": 0.17670480741020161, "grad_norm": 0.2653305938876837, "kl": 0.0008423328399658203, "learning_rate": 1.3012345679012347e-07, "loss": 0.0, "reward": 1.7285714969038963, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7285714633762836, "rewards/format_reward_func": 1.0, "step": 1054 }, { "completion_length": 238.96429824829102, "epoch": 0.17704011065006917, "grad_norm": 0.33954870085400096, "kl": 0.0009109973907470703, "learning_rate": 1.3037037037037036e-07, "loss": 0.0, "reward": 1.725000075995922, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7339286059141159, "rewards/format_reward_func": 0.9910714328289032, "step": 1056 }, { "completion_length": 255.60269451141357, "epoch": 0.17737541388993672, "grad_norm": 0.336539878900831, "kl": 0.0009992122650146484, "learning_rate": 1.3061728395061727e-07, "loss": 0.0, "reward": 1.703571505844593, "reward_std": 0.09091372601687908, "rewards/equation_reward_func": 0.7125000264495611, "rewards/format_reward_func": 0.9910714328289032, "step": 1058 }, { "completion_length": 240.08929634094238, "epoch": 0.17771071712980427, "grad_norm": 0.26274674597932585, "kl": 0.0010219812393188477, "learning_rate": 1.308641975308642e-07, "loss": 0.0, "reward": 1.7392857745289803, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.748214315623045, "rewards/format_reward_func": 0.9910714328289032, "step": 1060 }, { "completion_length": 238.883939743042, "epoch": 0.17804602036967182, "grad_norm": 0.29858341249542397, "kl": 0.000922083854675293, "learning_rate": 1.3111111111111113e-07, "loss": 0.0, "reward": 1.7035715132951736, "reward_std": 0.0858629634603858, "rewards/equation_reward_func": 0.7035714648663998, "rewards/format_reward_func": 1.0, "step": 1062 }, { "completion_length": 238.93751049041748, "epoch": 0.17838132360953937, "grad_norm": 0.26838733869333925, "kl": 0.0009055137634277344, "learning_rate": 1.3135802469135802e-07, "loss": 0.0, "reward": 1.7125000730156898, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7169643323868513, "rewards/format_reward_func": 0.9955357164144516, "step": 1064 }, { "completion_length": 240.852689743042, "epoch": 0.17871662684940692, "grad_norm": 0.3392750138879499, "kl": 0.0009553432464599609, "learning_rate": 1.3160493827160493e-07, "loss": 0.0, "reward": 1.7785715088248253, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7875000238418579, "rewards/format_reward_func": 0.9910714328289032, "step": 1066 }, { "completion_length": 235.77679824829102, "epoch": 0.17905193008927447, "grad_norm": 0.18746947628661784, "kl": 0.001000523567199707, "learning_rate": 1.3185185185185185e-07, "loss": 0.0, "reward": 1.778571479022503, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.778571454808116, "rewards/format_reward_func": 1.0, "step": 1068 }, { "completion_length": 241.3928680419922, "epoch": 0.17938723332914205, "grad_norm": 0.3135635456080588, "kl": 0.0010788440704345703, "learning_rate": 1.3209876543209874e-07, "loss": 0.0, "reward": 1.7571429535746574, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 1070 }, { "completion_length": 239.50001049041748, "epoch": 0.1797225365690096, "grad_norm": 0.15649356816838816, "kl": 0.0009417533874511719, "learning_rate": 1.3234567901234568e-07, "loss": 0.0, "reward": 1.7910714745521545, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7955357357859612, "rewards/format_reward_func": 0.9955357164144516, "step": 1072 }, { "completion_length": 242.91518878936768, "epoch": 0.18005783980887716, "grad_norm": 0.2412609119195414, "kl": 0.0009436607360839844, "learning_rate": 1.325925925925926e-07, "loss": 0.0, "reward": 1.775000050663948, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000264495611, "rewards/format_reward_func": 1.0, "step": 1074 }, { "completion_length": 230.6696538925171, "epoch": 0.1803931430487447, "grad_norm": 0.3883932440560228, "kl": 0.0012047290802001953, "learning_rate": 1.328395061728395e-07, "loss": 0.0, "reward": 1.7714286521077156, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 1076 }, { "completion_length": 243.3125114440918, "epoch": 0.18072844628861226, "grad_norm": 0.2765865793717639, "kl": 0.0011830329895019531, "learning_rate": 1.330864197530864e-07, "loss": 0.0, "reward": 1.7928572446107864, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 1078 }, { "completion_length": 242.84822463989258, "epoch": 0.1810637495284798, "grad_norm": 0.3631849365890873, "kl": 0.0010890960693359375, "learning_rate": 1.3333333333333334e-07, "loss": 0.0, "reward": 1.739285796880722, "reward_std": 0.09596448950469494, "rewards/equation_reward_func": 0.7482143230736256, "rewards/format_reward_func": 0.9910714328289032, "step": 1080 }, { "completion_length": 231.1919765472412, "epoch": 0.18139905276834736, "grad_norm": 0.12764449962903732, "kl": 0.0010178089141845703, "learning_rate": 1.3358024691358025e-07, "loss": 0.0, "reward": 1.757142923772335, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 1082 }, { "completion_length": 223.36161708831787, "epoch": 0.18173435600821494, "grad_norm": 0.223794147157261, "kl": 0.0009124279022216797, "learning_rate": 1.3382716049382717e-07, "loss": 0.0, "reward": 1.7535714954137802, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 1084 }, { "completion_length": 240.0357255935669, "epoch": 0.1820696592480825, "grad_norm": 0.21731252308037757, "kl": 0.0009878873825073242, "learning_rate": 1.3407407407407405e-07, "loss": 0.0, "reward": 1.775000087916851, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000394880772, "rewards/format_reward_func": 1.0, "step": 1086 }, { "completion_length": 238.94197463989258, "epoch": 0.18240496248795005, "grad_norm": 0.3074918487704999, "kl": 0.0010647773742675781, "learning_rate": 1.34320987654321e-07, "loss": 0.0, "reward": 1.7214286401867867, "reward_std": 0.09091372787952423, "rewards/equation_reward_func": 0.7303571701049805, "rewards/format_reward_func": 0.9910714328289032, "step": 1088 }, { "completion_length": 239.14286708831787, "epoch": 0.1827402657278176, "grad_norm": 0.13563050504252466, "kl": 0.001009225845336914, "learning_rate": 1.345679012345679e-07, "loss": 0.0, "reward": 1.7839286252856255, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7973214648663998, "rewards/format_reward_func": 0.9866071492433548, "step": 1090 }, { "completion_length": 244.07590293884277, "epoch": 0.18307556896768515, "grad_norm": 0.35995071786593574, "kl": 0.0010128021240234375, "learning_rate": 1.348148148148148e-07, "loss": 0.0, "reward": 1.764285795390606, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7732143141329288, "rewards/format_reward_func": 0.9910714328289032, "step": 1092 }, { "completion_length": 247.30358600616455, "epoch": 0.1834108722075527, "grad_norm": 0.20962785028909983, "kl": 0.00118255615234375, "learning_rate": 1.3506172839506171e-07, "loss": 0.0, "reward": 1.7321429327130318, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7321428917348385, "rewards/format_reward_func": 1.0, "step": 1094 }, { "completion_length": 238.3571538925171, "epoch": 0.18374617544742025, "grad_norm": 0.27384299158814857, "kl": 0.0010139942169189453, "learning_rate": 1.3530864197530863e-07, "loss": 0.0, "reward": 1.7571429163217545, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428753435612, "rewards/format_reward_func": 1.0, "step": 1096 }, { "completion_length": 239.65179538726807, "epoch": 0.1840814786872878, "grad_norm": 0.28567982737530095, "kl": 0.0012295246124267578, "learning_rate": 1.3555555555555557e-07, "loss": 0.0, "reward": 1.750892922282219, "reward_std": 0.06944798585027456, "rewards/equation_reward_func": 0.7526785992085934, "rewards/format_reward_func": 0.9982142895460129, "step": 1098 }, { "completion_length": 237.58929634094238, "epoch": 0.18441678192715538, "grad_norm": 0.2796157535424371, "kl": 0.001096487045288086, "learning_rate": 1.3580246913580246e-07, "loss": 0.0, "reward": 1.7803571969270706, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7848214581608772, "rewards/format_reward_func": 0.9955357164144516, "step": 1100 }, { "completion_length": 236.9107265472412, "epoch": 0.18475208516702293, "grad_norm": 0.23820606951477952, "kl": 0.0011627674102783203, "learning_rate": 1.3604938271604937e-07, "loss": 0.0, "reward": 1.8071429207921028, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428835391998, "rewards/format_reward_func": 1.0, "step": 1102 }, { "completion_length": 238.8705472946167, "epoch": 0.18508738840689049, "grad_norm": 0.17300731372670122, "kl": 0.0011518001556396484, "learning_rate": 1.362962962962963e-07, "loss": 0.0, "reward": 1.717857226729393, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7178571671247482, "rewards/format_reward_func": 1.0, "step": 1104 }, { "completion_length": 241.66965293884277, "epoch": 0.18542269164675804, "grad_norm": 0.27020448765160554, "kl": 0.0010493993759155273, "learning_rate": 1.365432098765432e-07, "loss": 0.0, "reward": 1.784375049173832, "reward_std": 0.05240166233852506, "rewards/equation_reward_func": 0.7857143320143223, "rewards/format_reward_func": 0.9986607171595097, "step": 1106 }, { "completion_length": 243.57590579986572, "epoch": 0.1857579948866256, "grad_norm": 0.2713591769752031, "kl": 0.0012111663818359375, "learning_rate": 1.3679012345679012e-07, "loss": 0.0, "reward": 1.789285808801651, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.789285734295845, "rewards/format_reward_func": 1.0, "step": 1108 }, { "completion_length": 247.6339406967163, "epoch": 0.18609329812649314, "grad_norm": 0.3714389827370947, "kl": 0.0012345314025878906, "learning_rate": 1.3703703703703703e-07, "loss": 0.0, "reward": 1.7500000596046448, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 1110 }, { "completion_length": 241.4107255935669, "epoch": 0.1864286013663607, "grad_norm": 0.2053685320097415, "kl": 0.0010557174682617188, "learning_rate": 1.3728395061728395e-07, "loss": 0.0, "reward": 1.7500000819563866, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000223517418, "rewards/format_reward_func": 1.0, "step": 1112 }, { "completion_length": 233.26786994934082, "epoch": 0.18676390460622827, "grad_norm": 0.33413966119251975, "kl": 0.001161813735961914, "learning_rate": 1.3753086419753086e-07, "loss": 0.0, "reward": 1.7910714820027351, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7955357581377029, "rewards/format_reward_func": 0.9955357164144516, "step": 1114 }, { "completion_length": 248.7053680419922, "epoch": 0.18709920784609582, "grad_norm": 0.19048515531842317, "kl": 0.0014069080352783203, "learning_rate": 1.3777777777777778e-07, "loss": 0.0, "reward": 1.7214286401867867, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7303571775555611, "rewards/format_reward_func": 0.9910714328289032, "step": 1116 }, { "completion_length": 247.49108219146729, "epoch": 0.18743451108596337, "grad_norm": 0.23644455559250388, "kl": 0.0011017322540283203, "learning_rate": 1.380246913580247e-07, "loss": 0.0, "reward": 1.7392857819795609, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7392857484519482, "rewards/format_reward_func": 1.0, "step": 1118 }, { "completion_length": 241.5982265472412, "epoch": 0.18776981432583092, "grad_norm": 0.20435446836020607, "kl": 0.0010347366333007812, "learning_rate": 1.3827160493827158e-07, "loss": 0.0, "reward": 1.733928643167019, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7383928894996643, "rewards/format_reward_func": 0.9955357164144516, "step": 1120 }, { "completion_length": 244.29018878936768, "epoch": 0.18810511756569848, "grad_norm": 0.2660093786588556, "kl": 0.001314401626586914, "learning_rate": 1.385185185185185e-07, "loss": 0.0, "reward": 1.7428572326898575, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7428571730852127, "rewards/format_reward_func": 1.0, "step": 1122 }, { "completion_length": 232.94643688201904, "epoch": 0.18844042080556603, "grad_norm": 0.2939042070750301, "kl": 0.0010373592376708984, "learning_rate": 1.3876543209876544e-07, "loss": 0.0, "reward": 1.7714286223053932, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7803571708500385, "rewards/format_reward_func": 0.9910714328289032, "step": 1124 }, { "completion_length": 232.59822463989258, "epoch": 0.18877572404543358, "grad_norm": 0.2232515295771991, "kl": 0.0011088848114013672, "learning_rate": 1.3901234567901235e-07, "loss": 0.0, "reward": 1.751785770058632, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7562500461935997, "rewards/format_reward_func": 0.9955357164144516, "step": 1126 }, { "completion_length": 235.14733123779297, "epoch": 0.18911102728530113, "grad_norm": 0.23194326544395175, "kl": 0.0011610984802246094, "learning_rate": 1.3925925925925924e-07, "loss": 0.0, "reward": 1.7428572177886963, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7517857402563095, "rewards/format_reward_func": 0.9910714328289032, "step": 1128 }, { "completion_length": 235.05804634094238, "epoch": 0.1894463305251687, "grad_norm": 0.2536261881719766, "kl": 0.0012238025665283203, "learning_rate": 1.3950617283950615e-07, "loss": 0.0, "reward": 1.7553571984171867, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7598214447498322, "rewards/format_reward_func": 0.9955357164144516, "step": 1130 }, { "completion_length": 224.6384038925171, "epoch": 0.18978163376503626, "grad_norm": 0.10219346905103746, "kl": 0.001035451889038086, "learning_rate": 1.397530864197531e-07, "loss": 0.0, "reward": 1.810714341700077, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107143230736256, "rewards/format_reward_func": 1.0, "step": 1132 }, { "completion_length": 237.4776906967163, "epoch": 0.1901169370049038, "grad_norm": 0.20131262973323824, "kl": 0.0013401508331298828, "learning_rate": 1.4e-07, "loss": 0.0, "reward": 1.7607143446803093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 1134 }, { "completion_length": 242.58929443359375, "epoch": 0.19045224024477136, "grad_norm": 0.3540408384031364, "kl": 0.0012521743774414062, "learning_rate": 1.402469135802469e-07, "loss": 0.0, "reward": 1.7696429193019867, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7741071656346321, "rewards/format_reward_func": 0.9955357164144516, "step": 1136 }, { "completion_length": 244.79911613464355, "epoch": 0.19078754348463892, "grad_norm": 0.34346695563107493, "kl": 0.001157999038696289, "learning_rate": 1.404938271604938e-07, "loss": 0.0, "reward": 1.7196429371833801, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7241071611642838, "rewards/format_reward_func": 0.9955357164144516, "step": 1138 }, { "completion_length": 241.477689743042, "epoch": 0.19112284672450647, "grad_norm": 0.20868917961928712, "kl": 0.0011320114135742188, "learning_rate": 1.4074074074074075e-07, "loss": 0.0, "reward": 1.7571429088711739, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 1140 }, { "completion_length": 244.14733409881592, "epoch": 0.19145814996437402, "grad_norm": 0.32124170225006765, "kl": 0.0014166831970214844, "learning_rate": 1.4098765432098764e-07, "loss": 0.0, "reward": 1.7107143476605415, "reward_std": 0.08586296439170837, "rewards/equation_reward_func": 0.7196428887546062, "rewards/format_reward_func": 0.9910714328289032, "step": 1142 }, { "completion_length": 236.6116180419922, "epoch": 0.1917934532042416, "grad_norm": 0.3877598831982432, "kl": 0.0013773441314697266, "learning_rate": 1.4123456790123456e-07, "loss": 0.0, "reward": 1.7232143878936768, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7276786006987095, "rewards/format_reward_func": 0.9955357164144516, "step": 1144 }, { "completion_length": 241.6562623977661, "epoch": 0.19212875644410915, "grad_norm": 0.2101486308792021, "kl": 0.0011439323425292969, "learning_rate": 1.4148148148148147e-07, "loss": 0.0, "reward": 1.760714367032051, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 1146 }, { "completion_length": 238.7857265472412, "epoch": 0.1924640596839767, "grad_norm": 0.24726688519808554, "kl": 0.0011260509490966797, "learning_rate": 1.417283950617284e-07, "loss": 0.0, "reward": 1.8035714849829674, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.8035714477300644, "rewards/format_reward_func": 1.0, "step": 1148 }, { "completion_length": 237.46429634094238, "epoch": 0.19279936292384425, "grad_norm": 0.24592047025377717, "kl": 0.001191854476928711, "learning_rate": 1.419753086419753e-07, "loss": 0.0, "reward": 1.7107143849134445, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7107143141329288, "rewards/format_reward_func": 1.0, "step": 1150 }, { "completion_length": 237.977689743042, "epoch": 0.1931346661637118, "grad_norm": 0.3134071733947118, "kl": 0.001195669174194336, "learning_rate": 1.4222222222222222e-07, "loss": 0.0, "reward": 1.8178571984171867, "reward_std": 0.08586296439170837, "rewards/equation_reward_func": 0.8267857357859612, "rewards/format_reward_func": 0.9910714328289032, "step": 1152 }, { "completion_length": 251.6651906967163, "epoch": 0.19346996940357936, "grad_norm": 0.25882729527590953, "kl": 0.0014595985412597656, "learning_rate": 1.4246913580246913e-07, "loss": 0.0, "reward": 1.7571429163217545, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7660714462399483, "rewards/format_reward_func": 0.9910714328289032, "step": 1154 }, { "completion_length": 239.1428689956665, "epoch": 0.1938052726434469, "grad_norm": 0.22827723141273715, "kl": 0.0012450218200683594, "learning_rate": 1.4271604938271602e-07, "loss": 0.0, "reward": 1.8142857626080513, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857290804386, "rewards/format_reward_func": 1.0, "step": 1156 }, { "completion_length": 227.54911613464355, "epoch": 0.19414057588331446, "grad_norm": 0.2404619261901875, "kl": 0.0012445449829101562, "learning_rate": 1.4296296296296296e-07, "loss": 0.0, "reward": 1.7464286535978317, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 1158 }, { "completion_length": 239.5759038925171, "epoch": 0.19447587912318204, "grad_norm": 0.22192693701997795, "kl": 0.0011388063430786133, "learning_rate": 1.4320987654320988e-07, "loss": 0.0, "reward": 1.7714286521077156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 1160 }, { "completion_length": 240.5357265472412, "epoch": 0.1948111823630496, "grad_norm": 0.29006089497662224, "kl": 0.0012791156768798828, "learning_rate": 1.434567901234568e-07, "loss": 0.0, "reward": 1.7678571939468384, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571809083223, "rewards/format_reward_func": 1.0, "step": 1162 }, { "completion_length": 232.29465293884277, "epoch": 0.19514648560291714, "grad_norm": 0.33240946056531395, "kl": 0.0012426376342773438, "learning_rate": 1.4370370370370368e-07, "loss": 0.0, "reward": 1.7428572103381157, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 1164 }, { "completion_length": 238.4196538925171, "epoch": 0.1954817888427847, "grad_norm": 0.27403505123708316, "kl": 0.0012884140014648438, "learning_rate": 1.4395061728395062e-07, "loss": 0.0, "reward": 1.7250000834465027, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7250000201165676, "rewards/format_reward_func": 1.0, "step": 1166 }, { "completion_length": 237.09822368621826, "epoch": 0.19581709208265224, "grad_norm": 0.4095224412410846, "kl": 0.0012373924255371094, "learning_rate": 1.4419753086419753e-07, "loss": 0.0, "reward": 1.8107143267989159, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8107143249362707, "rewards/format_reward_func": 1.0, "step": 1168 }, { "completion_length": 241.94197273254395, "epoch": 0.1961523953225198, "grad_norm": 0.4084487835326827, "kl": 0.0016338825225830078, "learning_rate": 1.4444444444444442e-07, "loss": 0.0, "reward": 1.7625000700354576, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7669643051922321, "rewards/format_reward_func": 0.9955357164144516, "step": 1170 }, { "completion_length": 230.22322368621826, "epoch": 0.19648769856238735, "grad_norm": 0.22951686862873494, "kl": 0.0014760494232177734, "learning_rate": 1.4469135802469134e-07, "loss": 0.0, "reward": 1.76071435213089, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143074274063, "rewards/format_reward_func": 1.0, "step": 1172 }, { "completion_length": 234.08483219146729, "epoch": 0.19682300180225493, "grad_norm": 0.4243391288107956, "kl": 0.001817464828491211, "learning_rate": 1.4493827160493828e-07, "loss": 0.0, "reward": 1.7357143759727478, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7446428872644901, "rewards/format_reward_func": 0.9910714328289032, "step": 1174 }, { "completion_length": 238.15625858306885, "epoch": 0.19715830504212248, "grad_norm": 0.40595547340881505, "kl": 0.0014379024505615234, "learning_rate": 1.451851851851852e-07, "loss": 0.0, "reward": 1.6946429386734962, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7080357484519482, "rewards/format_reward_func": 0.9866071492433548, "step": 1176 }, { "completion_length": 236.58483123779297, "epoch": 0.19749360828199003, "grad_norm": 0.20405546121413234, "kl": 0.0012536048889160156, "learning_rate": 1.4543209876543208e-07, "loss": 0.0, "reward": 1.7214286774396896, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7214285992085934, "rewards/format_reward_func": 1.0, "step": 1178 }, { "completion_length": 241.3884048461914, "epoch": 0.19782891152185758, "grad_norm": 0.531077990728977, "kl": 0.001644134521484375, "learning_rate": 1.45679012345679e-07, "loss": 0.0, "reward": 1.7196429371833801, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.724107176065445, "rewards/format_reward_func": 0.9955357164144516, "step": 1180 }, { "completion_length": 236.0535831451416, "epoch": 0.19816421476172513, "grad_norm": 0.16600747936281435, "kl": 0.001375436782836914, "learning_rate": 1.459259259259259e-07, "loss": 0.0, "reward": 1.7071429193019867, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7071428894996643, "rewards/format_reward_func": 1.0, "step": 1182 }, { "completion_length": 239.51786613464355, "epoch": 0.19849951800159268, "grad_norm": 0.18321459355862832, "kl": 0.0015826225280761719, "learning_rate": 1.4617283950617285e-07, "loss": 0.0, "reward": 1.76607146859169, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7705357521772385, "rewards/format_reward_func": 0.9955357164144516, "step": 1184 }, { "completion_length": 234.85268688201904, "epoch": 0.19883482124146024, "grad_norm": 0.2741705293832859, "kl": 0.002036571502685547, "learning_rate": 1.4641975308641974e-07, "loss": 0.0, "reward": 1.760714367032051, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7607143055647612, "rewards/format_reward_func": 1.0, "step": 1186 }, { "completion_length": 242.77679443359375, "epoch": 0.1991701244813278, "grad_norm": 0.2830046064019713, "kl": 0.0015716552734375, "learning_rate": 1.4666666666666666e-07, "loss": 0.0, "reward": 1.6839286535978317, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.6883928831666708, "rewards/format_reward_func": 0.9955357164144516, "step": 1188 }, { "completion_length": 245.2276906967163, "epoch": 0.19950542772119537, "grad_norm": 0.3715662769891078, "kl": 0.0017573833465576172, "learning_rate": 1.4691358024691357e-07, "loss": 0.0, "reward": 1.7375000789761543, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7419643141329288, "rewards/format_reward_func": 0.9955357164144516, "step": 1190 }, { "completion_length": 244.12054538726807, "epoch": 0.19984073096106292, "grad_norm": 0.27907119303296496, "kl": 0.0016562938690185547, "learning_rate": 1.4716049382716049e-07, "loss": 0.0, "reward": 1.8035714849829674, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8035714402794838, "rewards/format_reward_func": 1.0, "step": 1192 }, { "completion_length": 243.1205472946167, "epoch": 0.20017603420093047, "grad_norm": 0.45567982927097334, "kl": 0.0018799304962158203, "learning_rate": 1.474074074074074e-07, "loss": 0.0, "reward": 1.7321429401636124, "reward_std": 0.08586296439170837, "rewards/equation_reward_func": 0.7410714589059353, "rewards/format_reward_func": 0.9910714328289032, "step": 1194 }, { "completion_length": 232.75447368621826, "epoch": 0.20051133744079802, "grad_norm": 0.21608914943333704, "kl": 0.0015969276428222656, "learning_rate": 1.4765432098765432e-07, "loss": 0.0, "reward": 1.8000000417232513, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000193715096, "rewards/format_reward_func": 1.0, "step": 1196 }, { "completion_length": 241.44197368621826, "epoch": 0.20084664068066557, "grad_norm": 0.26748599126218425, "kl": 0.0018237829208374023, "learning_rate": 1.4790123456790123e-07, "loss": 0.0, "reward": 1.814285784959793, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857402563095, "rewards/format_reward_func": 1.0, "step": 1198 }, { "completion_length": 228.45983219146729, "epoch": 0.20118194392053312, "grad_norm": 0.28378913534322847, "kl": 0.0016875267028808594, "learning_rate": 1.4814814814814815e-07, "loss": 0.0, "reward": 1.7535714954137802, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 1200 }, { "completion_length": 241.91519165039062, "epoch": 0.20151724716040068, "grad_norm": 0.2504716674270821, "kl": 0.0017731189727783203, "learning_rate": 1.4839506172839506e-07, "loss": 0.0, "reward": 1.7714286297559738, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7803571745753288, "rewards/format_reward_func": 0.9910714328289032, "step": 1202 }, { "completion_length": 243.80804538726807, "epoch": 0.20185255040026825, "grad_norm": 0.33704098188644993, "kl": 0.002542257308959961, "learning_rate": 1.4864197530864197e-07, "loss": 0.0, "reward": 1.7053572237491608, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7098214663565159, "rewards/format_reward_func": 0.9955357164144516, "step": 1204 }, { "completion_length": 234.44643878936768, "epoch": 0.2021878536401358, "grad_norm": 0.2747468585504003, "kl": 0.0015654563903808594, "learning_rate": 1.4888888888888886e-07, "loss": 0.0, "reward": 1.7964286357164383, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7964285872876644, "rewards/format_reward_func": 1.0, "step": 1206 }, { "completion_length": 237.40625953674316, "epoch": 0.20252315688000336, "grad_norm": 0.24314632357287652, "kl": 0.001814126968383789, "learning_rate": 1.4913580246913578e-07, "loss": 0.0, "reward": 1.7107143551111221, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7107143215835094, "rewards/format_reward_func": 1.0, "step": 1208 }, { "completion_length": 239.18304824829102, "epoch": 0.2028584601198709, "grad_norm": 0.24318172658651124, "kl": 0.002092123031616211, "learning_rate": 1.4938271604938272e-07, "loss": 0.0, "reward": 1.828571505844593, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8285714499652386, "rewards/format_reward_func": 1.0, "step": 1210 }, { "completion_length": 238.27679824829102, "epoch": 0.20319376335973846, "grad_norm": 0.32728322145173633, "kl": 0.003085613250732422, "learning_rate": 1.4962962962962963e-07, "loss": 0.0, "reward": 1.744642935693264, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7491071671247482, "rewards/format_reward_func": 0.9955357164144516, "step": 1212 }, { "completion_length": 240.79465198516846, "epoch": 0.203529066599606, "grad_norm": 0.283836659998682, "kl": 0.0021562576293945312, "learning_rate": 1.4987654320987652e-07, "loss": 0.0, "reward": 1.716071493923664, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7294643130153418, "rewards/format_reward_func": 0.9866071492433548, "step": 1214 }, { "completion_length": 244.0491180419922, "epoch": 0.20386436983947356, "grad_norm": 0.26081031140969885, "kl": 0.0022537708282470703, "learning_rate": 1.5012345679012344e-07, "loss": 0.0, "reward": 1.742857202887535, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.742857189849019, "rewards/format_reward_func": 1.0, "step": 1216 }, { "completion_length": 241.91072845458984, "epoch": 0.20419967307934114, "grad_norm": 0.30485410684658837, "kl": 0.0026428699493408203, "learning_rate": 1.5037037037037038e-07, "loss": 0.0, "reward": 1.7642857730388641, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7732143215835094, "rewards/format_reward_func": 0.9910714328289032, "step": 1218 }, { "completion_length": 241.0669755935669, "epoch": 0.2045349763192087, "grad_norm": 0.2265319246913318, "kl": 0.0030205249786376953, "learning_rate": 1.506172839506173e-07, "loss": 0.0, "reward": 1.8232143446803093, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.8276785835623741, "rewards/format_reward_func": 0.9955357164144516, "step": 1220 }, { "completion_length": 238.30358028411865, "epoch": 0.20487027955907625, "grad_norm": 0.2382537179736265, "kl": 0.0023665428161621094, "learning_rate": 1.5086419753086418e-07, "loss": 0.0, "reward": 1.8017857819795609, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.8062500357627869, "rewards/format_reward_func": 0.9955357164144516, "step": 1222 }, { "completion_length": 245.17411708831787, "epoch": 0.2052055827989438, "grad_norm": 0.3324516681199202, "kl": 0.0020515918731689453, "learning_rate": 1.511111111111111e-07, "loss": 0.0, "reward": 1.7857143357396126, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143245637417, "rewards/format_reward_func": 1.0, "step": 1224 }, { "completion_length": 239.4196548461914, "epoch": 0.20554088603881135, "grad_norm": 0.25251488896253566, "kl": 0.0025861263275146484, "learning_rate": 1.5135802469135804e-07, "loss": 0.0, "reward": 1.7250000983476639, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7250000424683094, "rewards/format_reward_func": 1.0, "step": 1226 }, { "completion_length": 236.84822463989258, "epoch": 0.2058761892786789, "grad_norm": 0.23428687641725407, "kl": 0.0025599002838134766, "learning_rate": 1.5160493827160493e-07, "loss": 0.0, "reward": 1.6964286267757416, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.6964286118745804, "rewards/format_reward_func": 1.0, "step": 1228 }, { "completion_length": 252.34375953674316, "epoch": 0.20621149251854645, "grad_norm": 0.32242002681200926, "kl": 0.0028755664825439453, "learning_rate": 1.5185185185185184e-07, "loss": 0.0, "reward": 1.7125000804662704, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7169643137603998, "rewards/format_reward_func": 0.9955357164144516, "step": 1230 }, { "completion_length": 242.9107265472412, "epoch": 0.206546795758414, "grad_norm": 0.24004483575380084, "kl": 0.002190113067626953, "learning_rate": 1.5209876543209876e-07, "loss": 0.0, "reward": 1.7250000834465027, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7339285984635353, "rewards/format_reward_func": 0.9910714328289032, "step": 1232 }, { "completion_length": 237.33036708831787, "epoch": 0.20688209899828158, "grad_norm": 0.24368559931981323, "kl": 0.0017604827880859375, "learning_rate": 1.523456790123457e-07, "loss": 0.0, "reward": 1.7660715132951736, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9955357164144516, "step": 1234 }, { "completion_length": 231.47322368621826, "epoch": 0.20721740223814913, "grad_norm": 0.5328530490006232, "kl": 0.0025615692138671875, "learning_rate": 1.5259259259259259e-07, "loss": 0.0, "reward": 1.7785715162754059, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 1236 }, { "completion_length": 240.35268878936768, "epoch": 0.20755270547801669, "grad_norm": 0.18652124710992135, "kl": 0.002346515655517578, "learning_rate": 1.528395061728395e-07, "loss": 0.0, "reward": 1.776785783469677, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7812500223517418, "rewards/format_reward_func": 0.9955357164144516, "step": 1238 }, { "completion_length": 236.0446548461914, "epoch": 0.20788800871788424, "grad_norm": 0.30917406808952935, "kl": 0.0021347999572753906, "learning_rate": 1.5308641975308642e-07, "loss": 0.0, "reward": 1.7455357909202576, "reward_std": 0.07702413015067577, "rewards/equation_reward_func": 0.7517857402563095, "rewards/format_reward_func": 0.9937500059604645, "step": 1240 }, { "completion_length": 246.4285831451416, "epoch": 0.2082233119577518, "grad_norm": 0.24049563158557588, "kl": 0.004585981369018555, "learning_rate": 1.533333333333333e-07, "loss": 0.0, "reward": 1.7035715207457542, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7035714723169804, "rewards/format_reward_func": 1.0, "step": 1242 }, { "completion_length": 252.42858123779297, "epoch": 0.20855861519761934, "grad_norm": 0.28232304155666915, "kl": 0.004839181900024414, "learning_rate": 1.5358024691358024e-07, "loss": 0.0, "reward": 1.8035715073347092, "reward_std": 0.09596449043601751, "rewards/equation_reward_func": 0.8125000149011612, "rewards/format_reward_func": 0.9910714328289032, "step": 1244 }, { "completion_length": 237.1562614440918, "epoch": 0.2088939184374869, "grad_norm": 0.252566996772104, "kl": 0.0032820701599121094, "learning_rate": 1.5382716049382716e-07, "loss": 0.0, "reward": 1.8035714626312256, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8035714589059353, "rewards/format_reward_func": 1.0, "step": 1246 }, { "completion_length": 224.24554538726807, "epoch": 0.20922922167735447, "grad_norm": 0.4379151122153357, "kl": 0.0019059181213378906, "learning_rate": 1.5407407407407407e-07, "loss": 0.0, "reward": 1.7571429461240768, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 1248 }, { "completion_length": 243.2366189956665, "epoch": 0.20956452491722202, "grad_norm": 0.2435262045125475, "kl": 0.0021789073944091797, "learning_rate": 1.5432098765432096e-07, "loss": 0.0, "reward": 1.7750000730156898, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7839285843074322, "rewards/format_reward_func": 0.9910714328289032, "step": 1250 }, { "completion_length": 240.821439743042, "epoch": 0.20989982815708957, "grad_norm": 0.3508141244807139, "kl": 0.002946138381958008, "learning_rate": 1.545679012345679e-07, "loss": 0.0, "reward": 1.6964286416769028, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.6964286155998707, "rewards/format_reward_func": 1.0, "step": 1252 }, { "completion_length": 247.0669755935669, "epoch": 0.21023513139695713, "grad_norm": 0.24180904017268992, "kl": 0.002747058868408203, "learning_rate": 1.5481481481481482e-07, "loss": 0.0, "reward": 1.7857143431901932, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7946428805589676, "rewards/format_reward_func": 0.9910714328289032, "step": 1254 }, { "completion_length": 229.96875858306885, "epoch": 0.21057043463682468, "grad_norm": 0.2411987768476652, "kl": 0.002772808074951172, "learning_rate": 1.550617283950617e-07, "loss": 0.0, "reward": 1.7464286461472511, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286033064127, "rewards/format_reward_func": 1.0, "step": 1256 }, { "completion_length": 236.8303680419922, "epoch": 0.21090573787669223, "grad_norm": 0.32063347431687766, "kl": 0.002824068069458008, "learning_rate": 1.5530864197530862e-07, "loss": 0.0, "reward": 1.7892857789993286, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 1258 }, { "completion_length": 241.53125953674316, "epoch": 0.21124104111655978, "grad_norm": 0.31144432521616555, "kl": 0.004393339157104492, "learning_rate": 1.5555555555555556e-07, "loss": 0.0, "reward": 1.782142885029316, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821429073810577, "rewards/format_reward_func": 1.0, "step": 1260 }, { "completion_length": 238.9642972946167, "epoch": 0.21157634435642733, "grad_norm": 0.29715450473241195, "kl": 0.004702568054199219, "learning_rate": 1.5580246913580248e-07, "loss": 0.0, "reward": 1.7357143610715866, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7357143312692642, "rewards/format_reward_func": 1.0, "step": 1262 }, { "completion_length": 241.18751049041748, "epoch": 0.2119116475962949, "grad_norm": 0.1951487271047744, "kl": 0.0032672882080078125, "learning_rate": 1.5604938271604937e-07, "loss": 0.0, "reward": 1.7500000596046448, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 1264 }, { "completion_length": 241.1384038925171, "epoch": 0.21224695083616246, "grad_norm": 0.24012888678173097, "kl": 0.0034089088439941406, "learning_rate": 1.5629629629629628e-07, "loss": 0.0, "reward": 1.7678572088479996, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 1266 }, { "completion_length": 232.91965293884277, "epoch": 0.21258225407603001, "grad_norm": 0.25773928795116036, "kl": 0.003378152847290039, "learning_rate": 1.565432098765432e-07, "loss": 0.0, "reward": 1.7964286357164383, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.8053571693599224, "rewards/format_reward_func": 0.9910714328289032, "step": 1268 }, { "completion_length": 243.12054920196533, "epoch": 0.21291755731589757, "grad_norm": 0.17985039137077788, "kl": 0.003942012786865234, "learning_rate": 1.5679012345679014e-07, "loss": 0.0, "reward": 1.7482143491506577, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526786010712385, "rewards/format_reward_func": 0.9955357164144516, "step": 1270 }, { "completion_length": 236.90626049041748, "epoch": 0.21325286055576512, "grad_norm": 0.1548271078866053, "kl": 0.005479335784912109, "learning_rate": 1.5703703703703703e-07, "loss": 0.0, "reward": 1.7303572297096252, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7348214648663998, "rewards/format_reward_func": 0.9955357164144516, "step": 1272 }, { "completion_length": 241.95536708831787, "epoch": 0.21358816379563267, "grad_norm": 0.27982895939363556, "kl": 0.004017353057861328, "learning_rate": 1.5728395061728394e-07, "loss": 0.0, "reward": 1.79642865806818, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 1274 }, { "completion_length": 246.4598331451416, "epoch": 0.21392346703550022, "grad_norm": 0.24055112645480836, "kl": 0.00449061393737793, "learning_rate": 1.5753086419753086e-07, "loss": 0.0, "reward": 1.7892857789993286, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 1.0, "step": 1276 }, { "completion_length": 242.5312623977661, "epoch": 0.2142587702753678, "grad_norm": 0.19247495138177573, "kl": 0.002553701400756836, "learning_rate": 1.5777777777777777e-07, "loss": 0.0, "reward": 1.8089286163449287, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8133928887546062, "rewards/format_reward_func": 0.9955357164144516, "step": 1278 }, { "completion_length": 242.32144260406494, "epoch": 0.21459407351523535, "grad_norm": 0.1627487076419372, "kl": 0.0036182403564453125, "learning_rate": 1.5802469135802468e-07, "loss": 0.0, "reward": 1.7607143372297287, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 1280 }, { "completion_length": 240.27233123779297, "epoch": 0.2149293767551029, "grad_norm": 0.22177047448633702, "kl": 0.005349874496459961, "learning_rate": 1.582716049382716e-07, "loss": 0.0, "reward": 1.7321429252624512, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428880095482, "rewards/format_reward_func": 1.0, "step": 1282 }, { "completion_length": 234.53572463989258, "epoch": 0.21526467999497045, "grad_norm": 0.2958328217477393, "kl": 0.004651069641113281, "learning_rate": 1.5851851851851851e-07, "loss": 0.0, "reward": 1.7214286476373672, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.721428606659174, "rewards/format_reward_func": 1.0, "step": 1284 }, { "completion_length": 241.8080472946167, "epoch": 0.215599983234838, "grad_norm": 0.33299459227609995, "kl": 0.004356861114501953, "learning_rate": 1.5876543209876543e-07, "loss": 0.0, "reward": 1.7285714969038963, "reward_std": 0.09091372694820166, "rewards/equation_reward_func": 0.7375000417232513, "rewards/format_reward_func": 0.9910714328289032, "step": 1286 }, { "completion_length": 235.60268878936768, "epoch": 0.21593528647470556, "grad_norm": 0.35271283163499656, "kl": 0.005709171295166016, "learning_rate": 1.5901234567901234e-07, "loss": 0.0, "reward": 1.7089286595582962, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7133928686380386, "rewards/format_reward_func": 0.9955357164144516, "step": 1288 }, { "completion_length": 244.56251049041748, "epoch": 0.2162705897145731, "grad_norm": 0.1859424366173882, "kl": 0.003217935562133789, "learning_rate": 1.5925925925925926e-07, "loss": 0.0, "reward": 1.7625000476837158, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7669642996042967, "rewards/format_reward_func": 0.9955357164144516, "step": 1290 }, { "completion_length": 240.76786708831787, "epoch": 0.21660589295444066, "grad_norm": 0.24532934625705524, "kl": 0.0031633377075195312, "learning_rate": 1.5950617283950615e-07, "loss": 0.0, "reward": 1.6625000908970833, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.6669643167406321, "rewards/format_reward_func": 0.9955357164144516, "step": 1292 }, { "completion_length": 243.30358505249023, "epoch": 0.21694119619430824, "grad_norm": 0.2586537390088073, "kl": 0.002529144287109375, "learning_rate": 1.5975308641975306e-07, "loss": 0.0, "reward": 1.776785783469677, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7812500260770321, "rewards/format_reward_func": 0.9955357164144516, "step": 1294 }, { "completion_length": 234.05357933044434, "epoch": 0.2172764994341758, "grad_norm": 0.20464408868137368, "kl": 0.006105184555053711, "learning_rate": 1.6e-07, "loss": 0.0, "reward": 1.7696429193019867, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.774107176810503, "rewards/format_reward_func": 0.9955357164144516, "step": 1296 }, { "completion_length": 243.42858123779297, "epoch": 0.21761180267404334, "grad_norm": 0.4311301072230461, "kl": 0.004184722900390625, "learning_rate": 1.6024691358024692e-07, "loss": 0.0, "reward": 1.7160714864730835, "reward_std": 0.11364215798676014, "rewards/equation_reward_func": 0.7294643148779869, "rewards/format_reward_func": 0.9866071492433548, "step": 1298 }, { "completion_length": 233.65626049041748, "epoch": 0.2179471059139109, "grad_norm": 0.5118573409819533, "kl": 0.006073713302612305, "learning_rate": 1.604938271604938e-07, "loss": 0.0, "reward": 1.7464286535978317, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7553571686148643, "rewards/format_reward_func": 0.9910714328289032, "step": 1300 }, { "completion_length": 242.00894165039062, "epoch": 0.21828240915377844, "grad_norm": 0.18901941957665191, "kl": 0.004821300506591797, "learning_rate": 1.6074074074074072e-07, "loss": 0.0, "reward": 1.7214286476373672, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7214285992085934, "rewards/format_reward_func": 1.0, "step": 1302 }, { "completion_length": 243.61608123779297, "epoch": 0.218617712393646, "grad_norm": 0.2868402947325543, "kl": 0.004178762435913086, "learning_rate": 1.6098765432098766e-07, "loss": 0.0, "reward": 1.7732143253087997, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7776785865426064, "rewards/format_reward_func": 0.9955357164144516, "step": 1304 }, { "completion_length": 231.63840293884277, "epoch": 0.21895301563351355, "grad_norm": 0.28362586224310077, "kl": 0.0027713775634765625, "learning_rate": 1.6123456790123455e-07, "loss": 0.0, "reward": 1.7857143357396126, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7857143357396126, "rewards/format_reward_func": 1.0, "step": 1306 }, { "completion_length": 244.44643783569336, "epoch": 0.21928831887338113, "grad_norm": 0.2076709051554259, "kl": 0.0029850006103515625, "learning_rate": 1.6148148148148147e-07, "loss": 0.0, "reward": 1.703571505844593, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7125000357627869, "rewards/format_reward_func": 0.9910714328289032, "step": 1308 }, { "completion_length": 233.55358219146729, "epoch": 0.21962362211324868, "grad_norm": 0.25018701924401665, "kl": 0.0067141056060791016, "learning_rate": 1.6172839506172838e-07, "loss": 0.0, "reward": 1.782142922282219, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 1310 }, { "completion_length": 239.6026906967163, "epoch": 0.21995892535311623, "grad_norm": 0.3511161464206752, "kl": 0.00603485107421875, "learning_rate": 1.6197530864197532e-07, "loss": 0.0, "reward": 1.7714286297559738, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 1312 }, { "completion_length": 240.23215579986572, "epoch": 0.22029422859298378, "grad_norm": 0.28800201598541564, "kl": 0.003075122833251953, "learning_rate": 1.622222222222222e-07, "loss": 0.0, "reward": 1.7714286670088768, "reward_std": 0.09091372787952423, "rewards/equation_reward_func": 0.7803571633994579, "rewards/format_reward_func": 0.9910714328289032, "step": 1314 }, { "completion_length": 234.04019165039062, "epoch": 0.22062953183285133, "grad_norm": 0.2676481126389581, "kl": 0.004632472991943359, "learning_rate": 1.6246913580246912e-07, "loss": 0.0, "reward": 1.7339286282658577, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7383929006755352, "rewards/format_reward_func": 0.9955357164144516, "step": 1316 }, { "completion_length": 244.27679443359375, "epoch": 0.22096483507271888, "grad_norm": 0.3087973565894431, "kl": 0.010606765747070312, "learning_rate": 1.6271604938271604e-07, "loss": 0.0, "reward": 1.6928572058677673, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7017857395112514, "rewards/format_reward_func": 0.9910714328289032, "step": 1318 }, { "completion_length": 241.70983123779297, "epoch": 0.22130013831258644, "grad_norm": 0.25034205125370274, "kl": 0.00440216064453125, "learning_rate": 1.6296296296296298e-07, "loss": 0.0, "reward": 1.7589286267757416, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7723214589059353, "rewards/format_reward_func": 0.9866071492433548, "step": 1320 }, { "completion_length": 234.6785831451416, "epoch": 0.221635441552454, "grad_norm": 0.41038167641507267, "kl": 0.00315093994140625, "learning_rate": 1.6320987654320987e-07, "loss": 0.0, "reward": 1.7392857819795609, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857410013676, "rewards/format_reward_func": 1.0, "step": 1322 }, { "completion_length": 245.37947368621826, "epoch": 0.22197074479232157, "grad_norm": 0.806420438279851, "kl": 0.009297370910644531, "learning_rate": 1.6345679012345678e-07, "loss": 0.0, "reward": 1.7625000551342964, "reward_std": 0.0732360603287816, "rewards/equation_reward_func": 0.7758928723633289, "rewards/format_reward_func": 0.9866071492433548, "step": 1324 }, { "completion_length": 229.21429538726807, "epoch": 0.22230604803218912, "grad_norm": 0.29786161917524756, "kl": 0.004057168960571289, "learning_rate": 1.637037037037037e-07, "loss": 0.0, "reward": 1.7857143580913544, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7857143059372902, "rewards/format_reward_func": 1.0, "step": 1326 }, { "completion_length": 241.70090293884277, "epoch": 0.22264135127205667, "grad_norm": 0.2705780895983189, "kl": 0.0032672882080078125, "learning_rate": 1.639506172839506e-07, "loss": 0.0, "reward": 1.7642857804894447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 1328 }, { "completion_length": 244.21429538726807, "epoch": 0.22297665451192422, "grad_norm": 0.24169684934286853, "kl": 0.003974437713623047, "learning_rate": 1.6419753086419753e-07, "loss": 0.0, "reward": 1.771428644657135, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 1330 }, { "completion_length": 236.27679634094238, "epoch": 0.22331195775179177, "grad_norm": 0.2513951306851172, "kl": 0.003792285919189453, "learning_rate": 1.6444444444444444e-07, "loss": 0.0, "reward": 1.7232143506407738, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7276785969734192, "rewards/format_reward_func": 0.9955357164144516, "step": 1332 }, { "completion_length": 239.6830472946167, "epoch": 0.22364726099165932, "grad_norm": 0.26017730409226486, "kl": 0.006929159164428711, "learning_rate": 1.6469135802469136e-07, "loss": 0.0, "reward": 1.8214286267757416, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8214285969734192, "rewards/format_reward_func": 1.0, "step": 1334 }, { "completion_length": 240.4062614440918, "epoch": 0.22398256423152688, "grad_norm": 0.33342545268150153, "kl": 0.007687568664550781, "learning_rate": 1.6493827160493825e-07, "loss": 0.0, "reward": 1.765178643167019, "reward_std": 0.03914341004565358, "rewards/equation_reward_func": 0.7669643089175224, "rewards/format_reward_func": 0.9982142895460129, "step": 1336 }, { "completion_length": 238.1294755935669, "epoch": 0.22431786747139446, "grad_norm": 0.28918367802683476, "kl": 0.003994941711425781, "learning_rate": 1.651851851851852e-07, "loss": 0.0, "reward": 1.7035715281963348, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7035714536905289, "rewards/format_reward_func": 1.0, "step": 1338 }, { "completion_length": 237.70090103149414, "epoch": 0.224653170711262, "grad_norm": 0.2494069780734928, "kl": 0.011424541473388672, "learning_rate": 1.654320987654321e-07, "loss": 0.0, "reward": 1.741071492433548, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7455357518047094, "rewards/format_reward_func": 0.9955357164144516, "step": 1340 }, { "completion_length": 237.7991180419922, "epoch": 0.22498847395112956, "grad_norm": 0.303123570931485, "kl": 0.003299713134765625, "learning_rate": 1.65679012345679e-07, "loss": 0.0, "reward": 1.6857143640518188, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.6946428939700127, "rewards/format_reward_func": 0.9910714328289032, "step": 1342 }, { "completion_length": 245.04911708831787, "epoch": 0.2253237771909971, "grad_norm": 0.2523171541215057, "kl": 0.005613803863525391, "learning_rate": 1.659259259259259e-07, "loss": 0.0, "reward": 1.7571429088711739, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7660714723169804, "rewards/format_reward_func": 0.9910714328289032, "step": 1344 }, { "completion_length": 243.0937614440918, "epoch": 0.22565908043086466, "grad_norm": 0.14018283087264063, "kl": 0.0050811767578125, "learning_rate": 1.6617283950617285e-07, "loss": 0.0, "reward": 1.7250000685453415, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7250000443309546, "rewards/format_reward_func": 1.0, "step": 1346 }, { "completion_length": 241.71429443359375, "epoch": 0.2259943836707322, "grad_norm": 0.41487774404342265, "kl": 0.005957126617431641, "learning_rate": 1.6641975308641976e-07, "loss": 0.0, "reward": 1.7410714998841286, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7455357573926449, "rewards/format_reward_func": 0.9955357164144516, "step": 1348 }, { "completion_length": 230.00893878936768, "epoch": 0.22632968691059976, "grad_norm": 0.2806775670690932, "kl": 0.00467681884765625, "learning_rate": 1.6666666666666665e-07, "loss": 0.0, "reward": 1.7714286372065544, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7803571708500385, "rewards/format_reward_func": 0.9910714328289032, "step": 1350 }, { "completion_length": 231.69643878936768, "epoch": 0.22666499015046732, "grad_norm": 0.24691130694143, "kl": 0.005158901214599609, "learning_rate": 1.6691358024691357e-07, "loss": 0.0, "reward": 1.7321429401636124, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7321428805589676, "rewards/format_reward_func": 1.0, "step": 1352 }, { "completion_length": 240.7857255935669, "epoch": 0.2270002933903349, "grad_norm": 0.24994952871151663, "kl": 0.005618095397949219, "learning_rate": 1.6716049382716048e-07, "loss": 0.0, "reward": 1.7571429163217545, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428921073675, "rewards/format_reward_func": 1.0, "step": 1354 }, { "completion_length": 248.33929824829102, "epoch": 0.22733559663020245, "grad_norm": 0.09872595517896748, "kl": 0.009372234344482422, "learning_rate": 1.674074074074074e-07, "loss": 0.0, "reward": 1.717857226729393, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7267857305705547, "rewards/format_reward_func": 0.9910714328289032, "step": 1356 }, { "completion_length": 245.62054634094238, "epoch": 0.22767089987007, "grad_norm": 0.2373904924245633, "kl": 0.008755207061767578, "learning_rate": 1.676543209876543e-07, "loss": 0.0, "reward": 1.710714377462864, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7107143122702837, "rewards/format_reward_func": 1.0, "step": 1358 }, { "completion_length": 234.75893878936768, "epoch": 0.22800620310993755, "grad_norm": 0.20854539089599378, "kl": 0.007511138916015625, "learning_rate": 1.6790123456790122e-07, "loss": 0.0, "reward": 1.7500000670552254, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 1360 }, { "completion_length": 234.74108028411865, "epoch": 0.2283415063498051, "grad_norm": 0.28694424946821906, "kl": 0.0039539337158203125, "learning_rate": 1.6814814814814814e-07, "loss": 0.0, "reward": 1.7000000700354576, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7000000551342964, "rewards/format_reward_func": 1.0, "step": 1362 }, { "completion_length": 241.7366189956665, "epoch": 0.22867680958967265, "grad_norm": 0.3141697765365759, "kl": 0.010674476623535156, "learning_rate": 1.6839506172839505e-07, "loss": 0.0, "reward": 1.8071429431438446, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.8071428835391998, "rewards/format_reward_func": 1.0, "step": 1364 }, { "completion_length": 239.72322368621826, "epoch": 0.2290121128295402, "grad_norm": 0.2671441737251063, "kl": 0.005108356475830078, "learning_rate": 1.6864197530864197e-07, "loss": 0.0, "reward": 1.7696429267525673, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7741071693599224, "rewards/format_reward_func": 0.9955357164144516, "step": 1366 }, { "completion_length": 228.58483219146729, "epoch": 0.22934741606940778, "grad_norm": 0.21440951279482529, "kl": 0.003632068634033203, "learning_rate": 1.6888888888888888e-07, "loss": 0.0, "reward": 1.74642863124609, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464286126196384, "rewards/format_reward_func": 1.0, "step": 1368 }, { "completion_length": 235.47322463989258, "epoch": 0.22968271930927533, "grad_norm": 0.20954095826447186, "kl": 0.008241653442382812, "learning_rate": 1.691358024691358e-07, "loss": 0.0, "reward": 1.7107143700122833, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.710714315995574, "rewards/format_reward_func": 1.0, "step": 1370 }, { "completion_length": 237.51786994934082, "epoch": 0.2300180225491429, "grad_norm": 0.17949433295497624, "kl": 0.007944583892822266, "learning_rate": 1.693827160493827e-07, "loss": 0.0, "reward": 1.7250000536441803, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.72500004991889, "rewards/format_reward_func": 1.0, "step": 1372 }, { "completion_length": 239.4687614440918, "epoch": 0.23035332578901044, "grad_norm": 0.3259964424478922, "kl": 0.004845619201660156, "learning_rate": 1.6962962962962963e-07, "loss": 0.0, "reward": 1.7285715192556381, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7285714671015739, "rewards/format_reward_func": 1.0, "step": 1374 }, { "completion_length": 241.49108409881592, "epoch": 0.230688629028878, "grad_norm": 0.27244331118122284, "kl": 0.009942054748535156, "learning_rate": 1.6987654320987654e-07, "loss": 0.0, "reward": 1.7714286595582962, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7714285925030708, "rewards/format_reward_func": 1.0, "step": 1376 }, { "completion_length": 236.321439743042, "epoch": 0.23102393226874554, "grad_norm": 0.07781785904774369, "kl": 0.0042362213134765625, "learning_rate": 1.7012345679012343e-07, "loss": 0.0, "reward": 1.7178572192788124, "reward_std": 0.017677669413387775, "rewards/equation_reward_func": 0.7258928790688515, "rewards/format_reward_func": 0.9919642955064774, "step": 1378 }, { "completion_length": 237.6562623977661, "epoch": 0.2313592355086131, "grad_norm": 0.3306974061678042, "kl": 0.02078533172607422, "learning_rate": 1.7037037037037035e-07, "loss": 0.0, "reward": 1.7660714909434319, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7705357484519482, "rewards/format_reward_func": 0.9955357164144516, "step": 1380 }, { "completion_length": 234.9107265472412, "epoch": 0.23169453874848064, "grad_norm": 0.15438550905304013, "kl": 0.010261058807373047, "learning_rate": 1.706172839506173e-07, "loss": 0.0, "reward": 1.7000000476837158, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7089286036789417, "rewards/format_reward_func": 0.9910714328289032, "step": 1382 }, { "completion_length": 243.08483505249023, "epoch": 0.23202984198834822, "grad_norm": 0.26681977267282586, "kl": 0.017525196075439453, "learning_rate": 1.708641975308642e-07, "loss": 0.0, "reward": 1.7392857894301414, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7392857633531094, "rewards/format_reward_func": 1.0, "step": 1384 }, { "completion_length": 235.85715103149414, "epoch": 0.23236514522821577, "grad_norm": 0.2000816579266356, "kl": 0.01715850830078125, "learning_rate": 1.711111111111111e-07, "loss": 0.0, "reward": 1.7321429327130318, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7321428991854191, "rewards/format_reward_func": 1.0, "step": 1386 }, { "completion_length": 242.9017972946167, "epoch": 0.23270044846808333, "grad_norm": 0.22182251115978738, "kl": 0.006072044372558594, "learning_rate": 1.71358024691358e-07, "loss": 0.0, "reward": 1.7125000655651093, "reward_std": 0.09343911055475473, "rewards/equation_reward_func": 0.7258928790688515, "rewards/format_reward_func": 0.9866071492433548, "step": 1388 }, { "completion_length": 237.6830472946167, "epoch": 0.23303575170795088, "grad_norm": 0.14505985813023767, "kl": 0.006579399108886719, "learning_rate": 1.7160493827160495e-07, "loss": 0.0, "reward": 1.830357201397419, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.8348214477300644, "rewards/format_reward_func": 0.9955357164144516, "step": 1390 }, { "completion_length": 221.95536518096924, "epoch": 0.23337105494781843, "grad_norm": 0.3887991399283552, "kl": 0.009667396545410156, "learning_rate": 1.7185185185185183e-07, "loss": 0.0, "reward": 1.7843750938773155, "reward_std": 0.06250318652018905, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 0.9986607171595097, "step": 1392 }, { "completion_length": 239.72322750091553, "epoch": 0.23370635818768598, "grad_norm": 0.31497940932668206, "kl": 0.00586700439453125, "learning_rate": 1.7209876543209875e-07, "loss": 0.0, "reward": 1.778571493923664, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7785714417695999, "rewards/format_reward_func": 1.0, "step": 1394 }, { "completion_length": 237.57143783569336, "epoch": 0.23404166142755353, "grad_norm": 0.1722442961054919, "kl": 0.01059722900390625, "learning_rate": 1.7234567901234566e-07, "loss": 0.0, "reward": 1.7535715028643608, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 1396 }, { "completion_length": 241.54018878936768, "epoch": 0.2343769646674211, "grad_norm": 0.23289390607293414, "kl": 0.016029834747314453, "learning_rate": 1.725925925925926e-07, "loss": 0.0, "reward": 1.69821435213089, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7116071581840515, "rewards/format_reward_func": 0.9866071492433548, "step": 1398 }, { "completion_length": 240.33036613464355, "epoch": 0.23471226790728866, "grad_norm": 0.3716827959785001, "kl": 0.014810562133789062, "learning_rate": 1.728395061728395e-07, "loss": 0.0, "reward": 1.6892858073115349, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.6982143186032772, "rewards/format_reward_func": 0.9910714328289032, "step": 1400 }, { "completion_length": 235.44197463989258, "epoch": 0.23504757114715621, "grad_norm": 0.15371024300134672, "kl": 0.017034530639648438, "learning_rate": 1.730864197530864e-07, "loss": 0.0, "reward": 1.7178572118282318, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7178571820259094, "rewards/format_reward_func": 1.0, "step": 1402 }, { "completion_length": 228.0089406967163, "epoch": 0.23538287438702377, "grad_norm": 0.280897369010513, "kl": 0.011105060577392578, "learning_rate": 1.7333333333333332e-07, "loss": 0.0, "reward": 1.7964286655187607, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.796428594738245, "rewards/format_reward_func": 1.0, "step": 1404 }, { "completion_length": 242.0759048461914, "epoch": 0.23571817762689132, "grad_norm": 0.4060917647274439, "kl": 0.014769554138183594, "learning_rate": 1.7358024691358027e-07, "loss": 0.0, "reward": 1.6803572252392769, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.684821454808116, "rewards/format_reward_func": 0.9955357164144516, "step": 1406 }, { "completion_length": 234.25000953674316, "epoch": 0.23605348086675887, "grad_norm": 0.29418746031538556, "kl": 0.007956981658935547, "learning_rate": 1.7382716049382715e-07, "loss": 0.0, "reward": 1.6821429431438446, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.6821429021656513, "rewards/format_reward_func": 1.0, "step": 1408 }, { "completion_length": 238.62501049041748, "epoch": 0.23638878410662642, "grad_norm": 0.5412517362596901, "kl": 0.016735076904296875, "learning_rate": 1.7407407407407407e-07, "loss": 0.0, "reward": 1.7750000432133675, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 0.9821428656578064, "step": 1410 }, { "completion_length": 244.31697273254395, "epoch": 0.23672408734649397, "grad_norm": 0.2775285412382821, "kl": 0.013233184814453125, "learning_rate": 1.7432098765432098e-07, "loss": 0.0, "reward": 1.751785784959793, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7562500387430191, "rewards/format_reward_func": 0.9955357164144516, "step": 1412 }, { "completion_length": 230.88393878936768, "epoch": 0.23705939058636155, "grad_norm": 0.30478659922904655, "kl": 0.005757331848144531, "learning_rate": 1.7456790123456787e-07, "loss": 0.0, "reward": 1.8285714760422707, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8285714387893677, "rewards/format_reward_func": 1.0, "step": 1414 }, { "completion_length": 230.1384038925171, "epoch": 0.2373946938262291, "grad_norm": 0.159625589618849, "kl": 0.018886566162109375, "learning_rate": 1.748148148148148e-07, "loss": 0.0, "reward": 1.7750000730156898, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 1.0, "step": 1416 }, { "completion_length": 233.52233409881592, "epoch": 0.23772999706609665, "grad_norm": 0.3840361324210306, "kl": 0.018432140350341797, "learning_rate": 1.7506172839506173e-07, "loss": 0.0, "reward": 1.7607143372297287, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7696428969502449, "rewards/format_reward_func": 0.9910714328289032, "step": 1418 }, { "completion_length": 233.1428689956665, "epoch": 0.2380653003059642, "grad_norm": 0.31642404943545666, "kl": 0.019349098205566406, "learning_rate": 1.7530864197530864e-07, "loss": 0.0, "reward": 1.7107143551111221, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7107143178582191, "rewards/format_reward_func": 1.0, "step": 1420 }, { "completion_length": 245.95983219146729, "epoch": 0.23840060354583176, "grad_norm": 0.1705597345975343, "kl": 0.008008956909179688, "learning_rate": 1.7555555555555553e-07, "loss": 0.0, "reward": 1.76071435213089, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143223285675, "rewards/format_reward_func": 1.0, "step": 1422 }, { "completion_length": 232.80358123779297, "epoch": 0.2387359067856993, "grad_norm": 0.6085086681901897, "kl": 0.05005693435668945, "learning_rate": 1.7580246913580247e-07, "loss": 0.0001, "reward": 1.7196429297327995, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.724107176065445, "rewards/format_reward_func": 0.9955357164144516, "step": 1424 }, { "completion_length": 237.4107255935669, "epoch": 0.23907121002556686, "grad_norm": 0.22994413881569736, "kl": 0.02605152130126953, "learning_rate": 1.7604938271604939e-07, "loss": 0.0, "reward": 1.7214286550879478, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.721428606659174, "rewards/format_reward_func": 1.0, "step": 1426 }, { "completion_length": 241.17411994934082, "epoch": 0.23940651326543444, "grad_norm": 0.4663555822879698, "kl": 0.04881477355957031, "learning_rate": 1.7629629629629627e-07, "loss": 0.0, "reward": 1.6642858311533928, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.6732143200933933, "rewards/format_reward_func": 0.9910714328289032, "step": 1428 }, { "completion_length": 229.7678680419922, "epoch": 0.239741816505302, "grad_norm": 0.08922893400986967, "kl": 0.012132644653320312, "learning_rate": 1.765432098765432e-07, "loss": 0.0, "reward": 1.7000000923871994, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7000000365078449, "rewards/format_reward_func": 1.0, "step": 1430 }, { "completion_length": 235.18304443359375, "epoch": 0.24007711974516954, "grad_norm": 0.2590381600205453, "kl": 0.028981685638427734, "learning_rate": 1.7679012345679013e-07, "loss": 0.0, "reward": 1.7321429401636124, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7410714440047741, "rewards/format_reward_func": 0.9910714328289032, "step": 1432 }, { "completion_length": 235.06697368621826, "epoch": 0.2404124229850371, "grad_norm": 0.21589017259195808, "kl": 0.09281253814697266, "learning_rate": 1.7703703703703705e-07, "loss": 0.0001, "reward": 1.7142857909202576, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7142857536673546, "rewards/format_reward_func": 1.0, "step": 1434 }, { "completion_length": 235.43304634094238, "epoch": 0.24074772622490465, "grad_norm": 0.11790548457588271, "kl": 0.08521080017089844, "learning_rate": 1.7728395061728393e-07, "loss": 0.0001, "reward": 1.7964286357164383, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 1436 }, { "completion_length": 234.1696538925171, "epoch": 0.2410830294647722, "grad_norm": 0.23563333190513872, "kl": 0.02545928955078125, "learning_rate": 1.7753086419753085e-07, "loss": 0.0, "reward": 1.7071429416537285, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7071428894996643, "rewards/format_reward_func": 1.0, "step": 1438 }, { "completion_length": 230.19197463989258, "epoch": 0.24141833270463975, "grad_norm": 0.2772527679123808, "kl": 0.06699180603027344, "learning_rate": 1.7777777777777776e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 1440 }, { "completion_length": 233.6294755935669, "epoch": 0.2417536359445073, "grad_norm": 0.10134875392539826, "kl": 0.036830902099609375, "learning_rate": 1.7802469135802468e-07, "loss": 0.0, "reward": 1.725000061094761, "reward_std": 0.005050762556493282, "rewards/equation_reward_func": 0.7250000275671482, "rewards/format_reward_func": 1.0, "step": 1442 }, { "completion_length": 236.0178680419922, "epoch": 0.24208893918437488, "grad_norm": 0.7041311897794371, "kl": 0.15368270874023438, "learning_rate": 1.782716049382716e-07, "loss": 0.0002, "reward": 1.7839286252856255, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7883928865194321, "rewards/format_reward_func": 0.9955357164144516, "step": 1444 }, { "completion_length": 231.64733219146729, "epoch": 0.24242424242424243, "grad_norm": 0.26936496812526695, "kl": 0.08620834350585938, "learning_rate": 1.785185185185185e-07, "loss": 0.0001, "reward": 1.7125000804662704, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7169643267989159, "rewards/format_reward_func": 0.9955357164144516, "step": 1446 }, { "completion_length": 234.21875858306885, "epoch": 0.24275954566410998, "grad_norm": 0.29822283393676635, "kl": 0.137451171875, "learning_rate": 1.7876543209876542e-07, "loss": 0.0001, "reward": 1.721428632736206, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7214286271482706, "rewards/format_reward_func": 1.0, "step": 1448 }, { "completion_length": 233.08483123779297, "epoch": 0.24309484890397753, "grad_norm": 0.13332326691597654, "kl": 0.16257810592651367, "learning_rate": 1.7901234567901234e-07, "loss": 0.0002, "reward": 1.7089286521077156, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7133928947150707, "rewards/format_reward_func": 0.9955357164144516, "step": 1450 }, { "completion_length": 234.89733219146729, "epoch": 0.24343015214384509, "grad_norm": 0.17016165008545617, "kl": 0.09032154083251953, "learning_rate": 1.7925925925925925e-07, "loss": 0.0001, "reward": 1.762500062584877, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643089175224, "rewards/format_reward_func": 0.9955357164144516, "step": 1452 }, { "completion_length": 237.5848331451416, "epoch": 0.24376545538371264, "grad_norm": 0.21223775500209358, "kl": 0.168792724609375, "learning_rate": 1.7950617283950617e-07, "loss": 0.0002, "reward": 1.7071429565548897, "reward_std": 0.03030457627028227, "rewards/equation_reward_func": 0.716071454808116, "rewards/format_reward_func": 0.9910714328289032, "step": 1454 }, { "completion_length": 236.383939743042, "epoch": 0.2441007586235802, "grad_norm": 0.22677331825081817, "kl": 0.09221458435058594, "learning_rate": 1.7975308641975308e-07, "loss": 0.0001, "reward": 1.734375074505806, "reward_std": 0.05240166210569441, "rewards/equation_reward_func": 0.735714316368103, "rewards/format_reward_func": 0.9986607171595097, "step": 1456 }, { "completion_length": 236.5357265472412, "epoch": 0.24443606186344777, "grad_norm": 0.3695446121649956, "kl": 0.1760234832763672, "learning_rate": 1.8e-07, "loss": 0.0002, "reward": 1.7285715341567993, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7375000342726707, "rewards/format_reward_func": 0.9910714328289032, "step": 1458 }, { "completion_length": 235.0535831451416, "epoch": 0.24477136510331532, "grad_norm": 0.19789398802687613, "kl": 0.11786079406738281, "learning_rate": 1.802469135802469e-07, "loss": 0.0001, "reward": 1.751785784959793, "reward_std": 0.06818529684096575, "rewards/equation_reward_func": 0.7651786003261805, "rewards/format_reward_func": 0.9866071492433548, "step": 1460 }, { "completion_length": 231.3928680419922, "epoch": 0.24510666834318287, "grad_norm": 0.49119529018607455, "kl": 0.14634132385253906, "learning_rate": 1.8049382716049383e-07, "loss": 0.0001, "reward": 1.7946429029107094, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7991071715950966, "rewards/format_reward_func": 0.9955357164144516, "step": 1462 }, { "completion_length": 237.61608409881592, "epoch": 0.24544197158305042, "grad_norm": 0.34020786263175756, "kl": 0.2747945785522461, "learning_rate": 1.8074074074074072e-07, "loss": 0.0003, "reward": 1.778571479022503, "reward_std": 0.1010152529925108, "rewards/equation_reward_func": 0.7875000312924385, "rewards/format_reward_func": 0.9910714328289032, "step": 1464 }, { "completion_length": 238.42858219146729, "epoch": 0.24577727482291797, "grad_norm": 0.2181514729696241, "kl": 0.48163700103759766, "learning_rate": 1.8098765432098763e-07, "loss": 0.0005, "reward": 1.717857226729393, "reward_std": 0.03535533882677555, "rewards/equation_reward_func": 0.7267857454717159, "rewards/format_reward_func": 0.9910714328289032, "step": 1466 }, { "completion_length": 228.74108028411865, "epoch": 0.24611257806278553, "grad_norm": 0.22860592494644247, "kl": 0.4790782928466797, "learning_rate": 1.8123456790123457e-07, "loss": 0.0005, "reward": 1.7303571850061417, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7348214704543352, "rewards/format_reward_func": 0.9955357164144516, "step": 1468 }, { "completion_length": 234.67411994934082, "epoch": 0.24644788130265308, "grad_norm": 0.24797354188614557, "kl": 0.008512496948242188, "learning_rate": 1.8148148148148149e-07, "loss": 0.0, "reward": 1.7625000700354576, "reward_std": 0.09343910869210958, "rewards/equation_reward_func": 0.7669643200933933, "rewards/format_reward_func": 0.9955357164144516, "step": 1470 }, { "completion_length": 243.23215770721436, "epoch": 0.24678318454252063, "grad_norm": 0.20385499798706525, "kl": 0.18136978149414062, "learning_rate": 1.8172839506172837e-07, "loss": 0.0002, "reward": 1.7107143476605415, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7107143308967352, "rewards/format_reward_func": 1.0, "step": 1472 }, { "completion_length": 230.69197463989258, "epoch": 0.2471184877823882, "grad_norm": 0.4924804061728603, "kl": 0.16703414916992188, "learning_rate": 1.819753086419753e-07, "loss": 0.0002, "reward": 1.723214365541935, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7276786044239998, "rewards/format_reward_func": 0.9955357164144516, "step": 1474 }, { "completion_length": 230.3750123977661, "epoch": 0.24745379102225576, "grad_norm": 0.41683170206161424, "kl": 0.630040168762207, "learning_rate": 1.8222222222222223e-07, "loss": 0.0006, "reward": 1.669642947614193, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.6741071697324514, "rewards/format_reward_func": 0.9955357164144516, "step": 1476 }, { "completion_length": 235.13840103149414, "epoch": 0.2477890942621233, "grad_norm": 0.31188651409494367, "kl": 0.8284721374511719, "learning_rate": 1.8246913580246912e-07, "loss": 0.0008, "reward": 1.7071429193019867, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7160714641213417, "rewards/format_reward_func": 0.9910714328289032, "step": 1478 }, { "completion_length": 234.34375858306885, "epoch": 0.24812439750199086, "grad_norm": 0.3133969512879245, "kl": 0.012765884399414062, "learning_rate": 1.8271604938271603e-07, "loss": 0.0, "reward": 1.7303572222590446, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.734821455553174, "rewards/format_reward_func": 0.9955357164144516, "step": 1480 }, { "completion_length": 236.3259038925171, "epoch": 0.2484597007418584, "grad_norm": 0.20803832575885528, "kl": 0.3232555389404297, "learning_rate": 1.8296296296296295e-07, "loss": 0.0003, "reward": 1.7571428939700127, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 1482 }, { "completion_length": 241.29465293884277, "epoch": 0.24879500398172597, "grad_norm": 0.32787663426007396, "kl": 0.4119834899902344, "learning_rate": 1.832098765432099e-07, "loss": 0.0004, "reward": 1.7107143849134445, "reward_std": 0.07576144114136696, "rewards/equation_reward_func": 0.719642885029316, "rewards/format_reward_func": 0.9910714328289032, "step": 1484 }, { "completion_length": 234.43304538726807, "epoch": 0.24913030722159352, "grad_norm": 0.2691108188962023, "kl": 0.06020641326904297, "learning_rate": 1.8345679012345678e-07, "loss": 0.0001, "reward": 1.7767857685685158, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7812500260770321, "rewards/format_reward_func": 0.9955357164144516, "step": 1486 }, { "completion_length": 235.89286994934082, "epoch": 0.2494656104614611, "grad_norm": 0.34431069891908556, "kl": 0.8705596923828125, "learning_rate": 1.837037037037037e-07, "loss": 0.0009, "reward": 1.775000050663948, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000301748514, "rewards/format_reward_func": 1.0, "step": 1488 }, { "completion_length": 227.37500953674316, "epoch": 0.24980091370132865, "grad_norm": 0.6695050569787687, "kl": 0.19437503814697266, "learning_rate": 1.839506172839506e-07, "loss": 0.0002, "reward": 1.7946429252624512, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7991071715950966, "rewards/format_reward_func": 0.9955357164144516, "step": 1490 }, { "completion_length": 228.99108123779297, "epoch": 0.2501362169411962, "grad_norm": 0.30169039119381347, "kl": 0.5019960403442383, "learning_rate": 1.841975308641975e-07, "loss": 0.0005, "reward": 1.744642935693264, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7491071671247482, "rewards/format_reward_func": 0.9955357164144516, "step": 1492 }, { "completion_length": 225.2142972946167, "epoch": 0.2504715201810637, "grad_norm": 0.14293763277183175, "kl": 0.21941280364990234, "learning_rate": 1.8444444444444444e-07, "loss": 0.0002, "reward": 1.7214286550879478, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7214286029338837, "rewards/format_reward_func": 1.0, "step": 1494 }, { "completion_length": 242.78126049041748, "epoch": 0.2508068234209313, "grad_norm": 0.19395245164727634, "kl": 0.25534629821777344, "learning_rate": 1.8469135802469135e-07, "loss": 0.0003, "reward": 1.7732143551111221, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.777678593993187, "rewards/format_reward_func": 0.9955357164144516, "step": 1496 }, { "completion_length": 225.95536708831787, "epoch": 0.2511421266607989, "grad_norm": 0.21436022346581826, "kl": 0.3440971374511719, "learning_rate": 1.8493827160493827e-07, "loss": 0.0003, "reward": 1.8075893446803093, "reward_std": 0.0460882093757391, "rewards/equation_reward_func": 0.813392873853445, "rewards/format_reward_func": 0.994196429848671, "step": 1498 }, { "completion_length": 233.38393878936768, "epoch": 0.2514774299006664, "grad_norm": 0.3264524194551336, "kl": 0.22805309295654297, "learning_rate": 1.8518518518518516e-07, "loss": 0.0002, "reward": 1.6928572431206703, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7017857395112514, "rewards/format_reward_func": 0.9910714328289032, "step": 1500 }, { "completion_length": 229.2991189956665, "epoch": 0.251812733140534, "grad_norm": 0.20478533160273893, "kl": 0.14220809936523438, "learning_rate": 1.854320987654321e-07, "loss": 0.0001, "reward": 1.7910714745521545, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7955357432365417, "rewards/format_reward_func": 0.9955357164144516, "step": 1502 }, { "completion_length": 237.68750762939453, "epoch": 0.2521480363804015, "grad_norm": 0.22934017891844902, "kl": 0.11223888397216797, "learning_rate": 1.85679012345679e-07, "loss": 0.0001, "reward": 1.810714341700077, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.810714315623045, "rewards/format_reward_func": 1.0, "step": 1504 }, { "completion_length": 235.20536708831787, "epoch": 0.2524833396202691, "grad_norm": 0.4432363920330559, "kl": 0.08673858642578125, "learning_rate": 1.8592592592592593e-07, "loss": 0.0001, "reward": 1.7696429342031479, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7741071581840515, "rewards/format_reward_func": 0.9955357164144516, "step": 1506 }, { "completion_length": 230.18304443359375, "epoch": 0.2528186428601366, "grad_norm": 0.21735637485812978, "kl": 0.16140174865722656, "learning_rate": 1.8617283950617281e-07, "loss": 0.0002, "reward": 1.7714286297559738, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 1508 }, { "completion_length": 232.8884038925171, "epoch": 0.2531539461000042, "grad_norm": 0.2751427765130532, "kl": 0.03536796569824219, "learning_rate": 1.8641975308641976e-07, "loss": 0.0, "reward": 1.8071428909897804, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8071428760886192, "rewards/format_reward_func": 1.0, "step": 1510 }, { "completion_length": 241.68304634094238, "epoch": 0.25348924933987177, "grad_norm": 0.3222525911957361, "kl": 0.14169883728027344, "learning_rate": 1.8666666666666667e-07, "loss": 0.0001, "reward": 1.6553572192788124, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.6687500290572643, "rewards/format_reward_func": 0.9866071492433548, "step": 1512 }, { "completion_length": 235.7053689956665, "epoch": 0.2538245525797393, "grad_norm": 0.6563885167540162, "kl": 0.09424400329589844, "learning_rate": 1.8691358024691356e-07, "loss": 0.0001, "reward": 1.7357143685221672, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7357143349945545, "rewards/format_reward_func": 1.0, "step": 1514 }, { "completion_length": 233.75447368621826, "epoch": 0.2541598558196069, "grad_norm": 0.5746414100535419, "kl": 0.06946945190429688, "learning_rate": 1.8716049382716047e-07, "loss": 0.0001, "reward": 1.7678572311997414, "reward_std": 0.07576144114136696, "rewards/equation_reward_func": 0.7767857387661934, "rewards/format_reward_func": 0.9910714328289032, "step": 1516 }, { "completion_length": 223.63393783569336, "epoch": 0.2544951590594744, "grad_norm": 0.1957711124441575, "kl": 0.10572624206542969, "learning_rate": 1.8740740740740742e-07, "loss": 0.0001, "reward": 1.7928571924567223, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.8017857410013676, "rewards/format_reward_func": 0.9910714328289032, "step": 1518 }, { "completion_length": 236.38393783569336, "epoch": 0.254830462299342, "grad_norm": 0.30273470214807835, "kl": 0.15691757202148438, "learning_rate": 1.8765432098765433e-07, "loss": 0.0002, "reward": 1.7214286476373672, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.721428606659174, "rewards/format_reward_func": 1.0, "step": 1520 }, { "completion_length": 226.78572463989258, "epoch": 0.2551657655392095, "grad_norm": 0.3477983467378772, "kl": 0.047286033630371094, "learning_rate": 1.8790123456790122e-07, "loss": 0.0, "reward": 1.7339286357164383, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.73839289881289, "rewards/format_reward_func": 0.9955357164144516, "step": 1522 }, { "completion_length": 225.79465198516846, "epoch": 0.2555010687790771, "grad_norm": 0.21323643527682343, "kl": 0.05094718933105469, "learning_rate": 1.8814814814814813e-07, "loss": 0.0001, "reward": 1.7392858192324638, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857372760773, "rewards/format_reward_func": 1.0, "step": 1524 }, { "completion_length": 237.12947845458984, "epoch": 0.25583637201894466, "grad_norm": 0.2633799298401754, "kl": 0.10648727416992188, "learning_rate": 1.8839506172839505e-07, "loss": 0.0001, "reward": 1.6892857775092125, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.6892857439815998, "rewards/format_reward_func": 1.0, "step": 1526 }, { "completion_length": 233.39286613464355, "epoch": 0.2561716752588122, "grad_norm": 0.3611704280068289, "kl": 0.039473533630371094, "learning_rate": 1.8864197530864196e-07, "loss": 0.0, "reward": 1.7428572177886963, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7428571842610836, "rewards/format_reward_func": 1.0, "step": 1528 }, { "completion_length": 238.5580472946167, "epoch": 0.25650697849867976, "grad_norm": 0.32581177303136893, "kl": 0.12656688690185547, "learning_rate": 1.8888888888888888e-07, "loss": 0.0001, "reward": 1.7446429505944252, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7491071633994579, "rewards/format_reward_func": 0.9955357164144516, "step": 1530 }, { "completion_length": 227.6741180419922, "epoch": 0.2568422817385473, "grad_norm": 0.3034539012000064, "kl": 0.046042442321777344, "learning_rate": 1.891358024691358e-07, "loss": 0.0, "reward": 1.7821428999304771, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 1532 }, { "completion_length": 239.6250114440918, "epoch": 0.25717758497841486, "grad_norm": 0.48368909019447376, "kl": 0.06012153625488281, "learning_rate": 1.893827160493827e-07, "loss": 0.0001, "reward": 1.737500086426735, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7419643141329288, "rewards/format_reward_func": 0.9955357164144516, "step": 1534 }, { "completion_length": 237.06251049041748, "epoch": 0.2575128882182824, "grad_norm": 0.26842429522669164, "kl": 0.06570243835449219, "learning_rate": 1.8962962962962962e-07, "loss": 0.0001, "reward": 1.7267857789993286, "reward_std": 0.10354063380509615, "rewards/equation_reward_func": 0.7401786055415869, "rewards/format_reward_func": 0.9866071492433548, "step": 1536 }, { "completion_length": 228.790189743042, "epoch": 0.25784819145814997, "grad_norm": 0.4018896461427612, "kl": 0.035400390625, "learning_rate": 1.8987654320987654e-07, "loss": 0.0, "reward": 1.737500086426735, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7419643178582191, "rewards/format_reward_func": 0.9955357164144516, "step": 1538 }, { "completion_length": 229.99554634094238, "epoch": 0.25818349469801755, "grad_norm": 0.2506558632879474, "kl": 0.0076465606689453125, "learning_rate": 1.9012345679012345e-07, "loss": 0.0, "reward": 1.805357187986374, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8187500257045031, "rewards/format_reward_func": 0.9866071492433548, "step": 1540 }, { "completion_length": 230.18304347991943, "epoch": 0.25851879793788507, "grad_norm": 0.29934272366305864, "kl": 0.06483650207519531, "learning_rate": 1.9037037037037037e-07, "loss": 0.0001, "reward": 1.7214286625385284, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7303571719676256, "rewards/format_reward_func": 0.9910714328289032, "step": 1542 }, { "completion_length": 224.05358219146729, "epoch": 0.25885410117775265, "grad_norm": 0.3433338629396455, "kl": 0.02986907958984375, "learning_rate": 1.9061728395061728e-07, "loss": 0.0, "reward": 1.7750000730156898, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 1.0, "step": 1544 }, { "completion_length": 239.09375953674316, "epoch": 0.2591894044176202, "grad_norm": 0.31108987070476973, "kl": 0.058963775634765625, "learning_rate": 1.908641975308642e-07, "loss": 0.0001, "reward": 1.7410715073347092, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.745535746216774, "rewards/format_reward_func": 0.9955357164144516, "step": 1546 }, { "completion_length": 228.30358123779297, "epoch": 0.25952470765748775, "grad_norm": 0.2713275566948536, "kl": 0.023523330688476562, "learning_rate": 1.911111111111111e-07, "loss": 0.0, "reward": 1.766071505844593, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7705357521772385, "rewards/format_reward_func": 0.9955357164144516, "step": 1548 }, { "completion_length": 224.51340293884277, "epoch": 0.2598600108973553, "grad_norm": 0.31686450102462, "kl": 0.010550498962402344, "learning_rate": 1.91358024691358e-07, "loss": 0.0, "reward": 1.7750000581145287, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7839285973459482, "rewards/format_reward_func": 0.9910714328289032, "step": 1550 }, { "completion_length": 225.01786708831787, "epoch": 0.26019531413722286, "grad_norm": 0.515107201500274, "kl": 0.0798177719116211, "learning_rate": 1.9160493827160491e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7875000238418579, "rewards/format_reward_func": 0.9910714328289032, "step": 1552 }, { "completion_length": 228.12947463989258, "epoch": 0.2605306173770904, "grad_norm": 0.3542582192758546, "kl": 0.040058135986328125, "learning_rate": 1.9185185185185186e-07, "loss": 0.0, "reward": 1.7946429029107094, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7991071753203869, "rewards/format_reward_func": 0.9955357164144516, "step": 1554 }, { "completion_length": 236.19644165039062, "epoch": 0.26086592061695796, "grad_norm": 0.31969628871868205, "kl": 0.06807518005371094, "learning_rate": 1.9209876543209877e-07, "loss": 0.0001, "reward": 1.7142858132719994, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7142857536673546, "rewards/format_reward_func": 1.0, "step": 1556 }, { "completion_length": 240.19643878936768, "epoch": 0.26120122385682554, "grad_norm": 0.41985963092768086, "kl": 0.11923789978027344, "learning_rate": 1.9234567901234566e-07, "loss": 0.0001, "reward": 1.6767858043313026, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.689285745844245, "rewards/format_reward_func": 0.9875000044703484, "step": 1558 }, { "completion_length": 238.20537185668945, "epoch": 0.26153652709669306, "grad_norm": 0.26763216571838205, "kl": 0.036202430725097656, "learning_rate": 1.9259259259259257e-07, "loss": 0.0, "reward": 1.8017857670783997, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.806250024586916, "rewards/format_reward_func": 0.9955357164144516, "step": 1560 }, { "completion_length": 241.16072368621826, "epoch": 0.26187183033656064, "grad_norm": 0.7795636460003993, "kl": 0.08544540405273438, "learning_rate": 1.9283950617283951e-07, "loss": 0.0001, "reward": 1.7285714894533157, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7464286163449287, "rewards/format_reward_func": 0.9821428619325161, "step": 1562 }, { "completion_length": 226.32590198516846, "epoch": 0.26220713357642816, "grad_norm": 0.26431885626858, "kl": 0.031281471252441406, "learning_rate": 1.930864197530864e-07, "loss": 0.0, "reward": 1.7482143566012383, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7526785954833031, "rewards/format_reward_func": 0.9955357164144516, "step": 1564 }, { "completion_length": 230.01786708831787, "epoch": 0.26254243681629574, "grad_norm": 0.25896091143440436, "kl": 0.19019317626953125, "learning_rate": 1.9333333333333332e-07, "loss": 0.0002, "reward": 1.778571479022503, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7875000275671482, "rewards/format_reward_func": 0.9910714328289032, "step": 1566 }, { "completion_length": 228.42858219146729, "epoch": 0.26287774005616327, "grad_norm": 0.22123026075224217, "kl": 0.11663246154785156, "learning_rate": 1.9358024691358023e-07, "loss": 0.0001, "reward": 1.741071492433548, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7455357499420643, "rewards/format_reward_func": 0.9955357164144516, "step": 1568 }, { "completion_length": 223.83929634094238, "epoch": 0.26321304329603085, "grad_norm": 0.34891901610585063, "kl": 0.0828256607055664, "learning_rate": 1.9382716049382717e-07, "loss": 0.0001, "reward": 1.7392857745289803, "reward_std": 0.07576143741607666, "rewards/equation_reward_func": 0.7482143193483353, "rewards/format_reward_func": 0.9910714328289032, "step": 1570 }, { "completion_length": 234.86607933044434, "epoch": 0.2635483465358984, "grad_norm": 0.7646794550333904, "kl": 0.0537567138671875, "learning_rate": 1.9407407407407406e-07, "loss": 0.0001, "reward": 1.7732143476605415, "reward_std": 0.0883883461356163, "rewards/equation_reward_func": 0.777678582817316, "rewards/format_reward_func": 0.9955357164144516, "step": 1572 }, { "completion_length": 231.30804538726807, "epoch": 0.26388364977576595, "grad_norm": 0.20786625219993726, "kl": 0.06808948516845703, "learning_rate": 1.9432098765432098e-07, "loss": 0.0001, "reward": 1.7410714849829674, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7446428909897804, "rewards/format_reward_func": 0.9964285716414452, "step": 1574 }, { "completion_length": 234.66072463989258, "epoch": 0.26421895301563353, "grad_norm": 0.17919898669534326, "kl": 0.19156932830810547, "learning_rate": 1.945679012345679e-07, "loss": 0.0002, "reward": 1.8160715103149414, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.820535734295845, "rewards/format_reward_func": 0.9955357164144516, "step": 1576 }, { "completion_length": 234.99107933044434, "epoch": 0.26455425625550105, "grad_norm": 0.25375846103409333, "kl": 0.1639862060546875, "learning_rate": 1.9481481481481478e-07, "loss": 0.0002, "reward": 1.7392857819795609, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7392857447266579, "rewards/format_reward_func": 1.0, "step": 1578 }, { "completion_length": 232.89733028411865, "epoch": 0.26488955949536863, "grad_norm": 0.20598495484765386, "kl": 0.32021331787109375, "learning_rate": 1.9506172839506172e-07, "loss": 0.0003, "reward": 1.7625000849366188, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7758928909897804, "rewards/format_reward_func": 0.9866071492433548, "step": 1580 }, { "completion_length": 229.95090293884277, "epoch": 0.26522486273523616, "grad_norm": 0.23689001841514462, "kl": 0.028433799743652344, "learning_rate": 1.9530864197530864e-07, "loss": 0.0, "reward": 1.7750000581145287, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7839285992085934, "rewards/format_reward_func": 0.9910714328289032, "step": 1582 }, { "completion_length": 237.3973331451416, "epoch": 0.26556016597510373, "grad_norm": 0.30083193411151415, "kl": 0.0551910400390625, "learning_rate": 1.9555555555555555e-07, "loss": 0.0001, "reward": 1.7236607745289803, "reward_std": 0.0776554741896689, "rewards/equation_reward_func": 0.7250000350177288, "rewards/format_reward_func": 0.9986607171595097, "step": 1584 }, { "completion_length": 233.04018783569336, "epoch": 0.2658954692149713, "grad_norm": 0.28273465688517146, "kl": 0.12781333923339844, "learning_rate": 1.9580246913580244e-07, "loss": 0.0001, "reward": 1.7196429297327995, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7241071742027998, "rewards/format_reward_func": 0.9955357164144516, "step": 1586 }, { "completion_length": 236.07143878936768, "epoch": 0.26623077245483884, "grad_norm": 0.13755953595441434, "kl": 0.26348114013671875, "learning_rate": 1.9604938271604938e-07, "loss": 0.0003, "reward": 1.7428572252392769, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7428571674972773, "rewards/format_reward_func": 1.0, "step": 1588 }, { "completion_length": 231.6205472946167, "epoch": 0.2665660756947064, "grad_norm": 0.24383190621635564, "kl": 0.07479667663574219, "learning_rate": 1.962962962962963e-07, "loss": 0.0001, "reward": 1.741071492433548, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7455357536673546, "rewards/format_reward_func": 0.9955357164144516, "step": 1590 }, { "completion_length": 239.19197368621826, "epoch": 0.26690137893457394, "grad_norm": 0.2082010962256202, "kl": 0.21643352508544922, "learning_rate": 1.965432098765432e-07, "loss": 0.0002, "reward": 1.7642857655882835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 1592 }, { "completion_length": 220.26786613464355, "epoch": 0.2672366821744415, "grad_norm": 0.33452171117357393, "kl": 0.12536048889160156, "learning_rate": 1.967901234567901e-07, "loss": 0.0001, "reward": 1.7321429327130318, "reward_std": 0.08586296532303095, "rewards/equation_reward_func": 0.7410714514553547, "rewards/format_reward_func": 0.9910714328289032, "step": 1594 }, { "completion_length": 226.27233219146729, "epoch": 0.26757198541430904, "grad_norm": 0.3662231272162297, "kl": 0.12168693542480469, "learning_rate": 1.9703703703703704e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571626543999, "rewards/format_reward_func": 1.0, "step": 1596 }, { "completion_length": 223.64286708831787, "epoch": 0.2679072886541766, "grad_norm": 0.45793697052291993, "kl": 0.15146446228027344, "learning_rate": 1.9728395061728395e-07, "loss": 0.0002, "reward": 1.8000000268220901, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.8089286014437675, "rewards/format_reward_func": 0.9910714328289032, "step": 1598 }, { "completion_length": 227.0178689956665, "epoch": 0.2682425918940442, "grad_norm": 0.5879741708402264, "kl": 0.11877250671386719, "learning_rate": 1.9753086419753084e-07, "loss": 0.0001, "reward": 1.7714286670088768, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 1600 }, { "completion_length": 225.34822463989258, "epoch": 0.2685778951339117, "grad_norm": 0.3172965825133875, "kl": 0.1242218017578125, "learning_rate": 1.9777777777777776e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 1602 }, { "completion_length": 225.34376049041748, "epoch": 0.2689131983737793, "grad_norm": 0.19786567305881758, "kl": 0.041957855224609375, "learning_rate": 1.980246913580247e-07, "loss": 0.0, "reward": 1.7821429297327995, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 1604 }, { "completion_length": 231.17411613464355, "epoch": 0.26924850161364683, "grad_norm": 0.199203557943275, "kl": 0.2041006088256836, "learning_rate": 1.9827160493827161e-07, "loss": 0.0002, "reward": 1.807142935693264, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8071428835391998, "rewards/format_reward_func": 1.0, "step": 1606 }, { "completion_length": 244.96429824829102, "epoch": 0.2695838048535144, "grad_norm": 0.5953925603348911, "kl": 0.39769744873046875, "learning_rate": 1.985185185185185e-07, "loss": 0.0004, "reward": 1.721428632736206, "reward_std": 0.0909137288108468, "rewards/equation_reward_func": 0.7392857391387224, "rewards/format_reward_func": 0.9821428656578064, "step": 1608 }, { "completion_length": 227.01340198516846, "epoch": 0.26991910809338193, "grad_norm": 0.6255436231056462, "kl": 0.01708221435546875, "learning_rate": 1.9876543209876542e-07, "loss": 0.0, "reward": 1.6892858073115349, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.6892857495695353, "rewards/format_reward_func": 1.0, "step": 1610 }, { "completion_length": 227.75001049041748, "epoch": 0.2702544113332495, "grad_norm": 0.23378469972731863, "kl": 0.2529106140136719, "learning_rate": 1.9901234567901233e-07, "loss": 0.0003, "reward": 1.7500000670552254, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 1612 }, { "completion_length": 241.0134048461914, "epoch": 0.27058971457311704, "grad_norm": 0.2597360936259886, "kl": 0.17315101623535156, "learning_rate": 1.9925925925925925e-07, "loss": 0.0002, "reward": 1.717857226729393, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7178571783006191, "rewards/format_reward_func": 1.0, "step": 1614 }, { "completion_length": 231.7946548461914, "epoch": 0.2709250178129846, "grad_norm": 0.46129278988052735, "kl": 0.10466766357421875, "learning_rate": 1.9950617283950616e-07, "loss": 0.0001, "reward": 1.7446428909897804, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7491071820259094, "rewards/format_reward_func": 0.9955357164144516, "step": 1616 }, { "completion_length": 224.696439743042, "epoch": 0.2712603210528522, "grad_norm": 0.4255237430650551, "kl": 0.17496871948242188, "learning_rate": 1.9975308641975308e-07, "loss": 0.0002, "reward": 1.7714286521077156, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 1618 }, { "completion_length": 229.60715198516846, "epoch": 0.2715956242927197, "grad_norm": 0.189806473011921, "kl": 0.1701374053955078, "learning_rate": 2e-07, "loss": 0.0002, "reward": 1.7821429297327995, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428906172514, "rewards/format_reward_func": 1.0, "step": 1620 }, { "completion_length": 223.0089406967163, "epoch": 0.2719309275325873, "grad_norm": 0.2910744729086953, "kl": 0.29755210876464844, "learning_rate": 2.002469135802469e-07, "loss": 0.0003, "reward": 1.6928572282195091, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7017857581377029, "rewards/format_reward_func": 0.9910714328289032, "step": 1622 }, { "completion_length": 221.98215007781982, "epoch": 0.2722662307724548, "grad_norm": 0.22473947784807705, "kl": 0.17630863189697266, "learning_rate": 2.0049382716049382e-07, "loss": 0.0002, "reward": 1.7428571805357933, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7428571842610836, "rewards/format_reward_func": 1.0, "step": 1624 }, { "completion_length": 244.11608219146729, "epoch": 0.2726015340123224, "grad_norm": 0.40284671702968305, "kl": 0.11990642547607422, "learning_rate": 2.0074074074074074e-07, "loss": 0.0001, "reward": 1.7321429178118706, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7410714589059353, "rewards/format_reward_func": 0.9910714328289032, "step": 1626 }, { "completion_length": 227.2187614440918, "epoch": 0.2729368372521899, "grad_norm": 0.26856909090880804, "kl": 0.08802032470703125, "learning_rate": 2.0098765432098762e-07, "loss": 0.0001, "reward": 1.6857143640518188, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.6857143305242062, "rewards/format_reward_func": 1.0, "step": 1628 }, { "completion_length": 220.85715103149414, "epoch": 0.2732721404920575, "grad_norm": 0.20613533704375775, "kl": 0.019829750061035156, "learning_rate": 2.0123456790123457e-07, "loss": 0.0, "reward": 1.7928572073578835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571626543999, "rewards/format_reward_func": 1.0, "step": 1630 }, { "completion_length": 227.7321548461914, "epoch": 0.2736074437319251, "grad_norm": 0.20721345610666672, "kl": 0.10821723937988281, "learning_rate": 2.0148148148148148e-07, "loss": 0.0001, "reward": 1.7071429267525673, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7160714566707611, "rewards/format_reward_func": 0.9910714328289032, "step": 1632 }, { "completion_length": 224.10268783569336, "epoch": 0.2739427469717926, "grad_norm": 0.2862899956793819, "kl": 0.10094642639160156, "learning_rate": 2.017283950617284e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7678571753203869, "rewards/format_reward_func": 1.0, "step": 1634 }, { "completion_length": 221.59375953674316, "epoch": 0.2742780502116602, "grad_norm": 0.3526294409199377, "kl": 0.10612869262695312, "learning_rate": 2.0197530864197528e-07, "loss": 0.0001, "reward": 1.8129464909434319, "reward_std": 0.04230013629421592, "rewards/equation_reward_func": 0.8142857439815998, "rewards/format_reward_func": 0.9986607171595097, "step": 1636 }, { "completion_length": 240.34822368621826, "epoch": 0.2746133534515277, "grad_norm": 0.22284661939811562, "kl": 0.19942092895507812, "learning_rate": 2.022222222222222e-07, "loss": 0.0002, "reward": 1.7035714983940125, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7035714723169804, "rewards/format_reward_func": 1.0, "step": 1638 }, { "completion_length": 228.54465103149414, "epoch": 0.2749486566913953, "grad_norm": 0.23164365281361313, "kl": 0.035366058349609375, "learning_rate": 2.0246913580246914e-07, "loss": 0.0, "reward": 1.700000062584877, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7000000216066837, "rewards/format_reward_func": 1.0, "step": 1640 }, { "completion_length": 224.1071538925171, "epoch": 0.2752839599312628, "grad_norm": 0.3284594210277132, "kl": 0.09695816040039062, "learning_rate": 2.0271604938271605e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 1642 }, { "completion_length": 229.30804634094238, "epoch": 0.2756192631711304, "grad_norm": 0.383168344007737, "kl": 0.11481475830078125, "learning_rate": 2.0296296296296294e-07, "loss": 0.0001, "reward": 1.7125000804662704, "reward_std": 0.07323605939745903, "rewards/equation_reward_func": 0.7258928753435612, "rewards/format_reward_func": 0.9866071492433548, "step": 1644 }, { "completion_length": 228.91518783569336, "epoch": 0.27595456641099797, "grad_norm": 0.4223823876902598, "kl": 0.1330738067626953, "learning_rate": 2.0320987654320986e-07, "loss": 0.0001, "reward": 1.7196429371833801, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.724107189103961, "rewards/format_reward_func": 0.9955357164144516, "step": 1646 }, { "completion_length": 230.92858219146729, "epoch": 0.2762898696508655, "grad_norm": 0.19741134994974258, "kl": 0.24378204345703125, "learning_rate": 2.034567901234568e-07, "loss": 0.0002, "reward": 1.7535714730620384, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 1648 }, { "completion_length": 238.7812623977661, "epoch": 0.2766251728907331, "grad_norm": 0.23204192482524613, "kl": 0.17847061157226562, "learning_rate": 2.0370370370370369e-07, "loss": 0.0002, "reward": 1.7232143431901932, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7276786044239998, "rewards/format_reward_func": 0.9955357164144516, "step": 1650 }, { "completion_length": 227.9196538925171, "epoch": 0.2769604761306006, "grad_norm": 0.43032013537420893, "kl": 0.3296394348144531, "learning_rate": 2.039506172839506e-07, "loss": 0.0003, "reward": 1.7875000536441803, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7919643186032772, "rewards/format_reward_func": 0.9955357164144516, "step": 1652 }, { "completion_length": 237.1696538925171, "epoch": 0.2772957793704682, "grad_norm": 0.2077137808223099, "kl": 0.31763267517089844, "learning_rate": 2.0419753086419752e-07, "loss": 0.0003, "reward": 1.7214286550879478, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7214286103844643, "rewards/format_reward_func": 1.0, "step": 1654 }, { "completion_length": 231.7098331451416, "epoch": 0.2776310826103357, "grad_norm": 0.08385124065366012, "kl": 0.20266056060791016, "learning_rate": 2.0444444444444446e-07, "loss": 0.0002, "reward": 1.725000075995922, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7339286096394062, "rewards/format_reward_func": 0.9910714328289032, "step": 1656 }, { "completion_length": 238.44643878936768, "epoch": 0.2779663858502033, "grad_norm": 0.22718182123890435, "kl": 0.027462005615234375, "learning_rate": 2.0469135802469135e-07, "loss": 0.0, "reward": 1.7660714909434319, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7705357484519482, "rewards/format_reward_func": 0.9955357164144516, "step": 1658 }, { "completion_length": 229.26340198516846, "epoch": 0.27830168909007086, "grad_norm": 0.30742028326270204, "kl": 0.30859375, "learning_rate": 2.0493827160493826e-07, "loss": 0.0003, "reward": 1.7589286267757416, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7633928954601288, "rewards/format_reward_func": 0.9955357164144516, "step": 1660 }, { "completion_length": 229.0580472946167, "epoch": 0.2786369923299384, "grad_norm": 0.44218696016693715, "kl": 0.05316638946533203, "learning_rate": 2.0518518518518518e-07, "loss": 0.0001, "reward": 1.7598214894533157, "reward_std": 0.07702413015067577, "rewards/equation_reward_func": 0.7660714574158192, "rewards/format_reward_func": 0.9937500059604645, "step": 1662 }, { "completion_length": 230.37947463989258, "epoch": 0.27897229556980596, "grad_norm": 0.1850784449678431, "kl": 0.06500816345214844, "learning_rate": 2.0543209876543206e-07, "loss": 0.0001, "reward": 1.7750000730156898, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 1664 }, { "completion_length": 222.53126049041748, "epoch": 0.2793075988096735, "grad_norm": 0.279684266768164, "kl": 0.011688232421875, "learning_rate": 2.05679012345679e-07, "loss": 0.0, "reward": 1.766964353621006, "reward_std": 0.06692260596901178, "rewards/equation_reward_func": 0.7741071730852127, "rewards/format_reward_func": 0.9928571507334709, "step": 1666 }, { "completion_length": 223.3303689956665, "epoch": 0.27964290204954106, "grad_norm": 0.2295089233084164, "kl": 0.034732818603515625, "learning_rate": 2.0592592592592592e-07, "loss": 0.0, "reward": 1.7821429371833801, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 1668 }, { "completion_length": 224.62501049041748, "epoch": 0.2799782052894086, "grad_norm": 0.16455998588464119, "kl": 0.03418731689453125, "learning_rate": 2.0617283950617283e-07, "loss": 0.0, "reward": 1.7607143595814705, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 1670 }, { "completion_length": 228.66965103149414, "epoch": 0.28031350852927617, "grad_norm": 0.16323662515693607, "kl": 0.1743144989013672, "learning_rate": 2.0641975308641972e-07, "loss": 0.0002, "reward": 1.753571480512619, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 1672 }, { "completion_length": 231.9107265472412, "epoch": 0.28064881176914375, "grad_norm": 0.17971824743419645, "kl": 0.07829761505126953, "learning_rate": 2.0666666666666666e-07, "loss": 0.0001, "reward": 1.7553572207689285, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7598214540630579, "rewards/format_reward_func": 0.9955357164144516, "step": 1674 }, { "completion_length": 236.86162090301514, "epoch": 0.28098411500901127, "grad_norm": 0.18232481891962177, "kl": 0.1730976104736328, "learning_rate": 2.0691358024691358e-07, "loss": 0.0002, "reward": 1.755357213318348, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7687500342726707, "rewards/format_reward_func": 0.9866071492433548, "step": 1676 }, { "completion_length": 233.86161613464355, "epoch": 0.28131941824887885, "grad_norm": 0.2875084708059203, "kl": 0.08128738403320312, "learning_rate": 2.0716049382716047e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 1678 }, { "completion_length": 230.69644260406494, "epoch": 0.2816547214887464, "grad_norm": 0.23955868610264447, "kl": 0.12451171875, "learning_rate": 2.0740740740740738e-07, "loss": 0.0001, "reward": 1.7160715088248253, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7205357421189547, "rewards/format_reward_func": 0.9955357164144516, "step": 1680 }, { "completion_length": 225.41965293884277, "epoch": 0.28199002472861395, "grad_norm": 0.24345243540818787, "kl": 0.27367496490478516, "learning_rate": 2.0765432098765432e-07, "loss": 0.0003, "reward": 1.776785783469677, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7812500223517418, "rewards/format_reward_func": 0.9955357164144516, "step": 1682 }, { "completion_length": 223.46875953674316, "epoch": 0.2823253279684815, "grad_norm": 0.25014435964613124, "kl": 0.030881881713867188, "learning_rate": 2.0790123456790124e-07, "loss": 0.0, "reward": 1.7571429163217545, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 1684 }, { "completion_length": 224.60268783569336, "epoch": 0.28266063120834906, "grad_norm": 0.2666289593787631, "kl": 0.03217506408691406, "learning_rate": 2.0814814814814813e-07, "loss": 0.0, "reward": 1.7571429163217545, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571429051458836, "rewards/format_reward_func": 1.0, "step": 1686 }, { "completion_length": 224.8214406967163, "epoch": 0.2829959344482166, "grad_norm": 0.2344435869553199, "kl": 0.014842987060546875, "learning_rate": 2.0839506172839504e-07, "loss": 0.0, "reward": 1.7553572207689285, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7598214633762836, "rewards/format_reward_func": 0.9955357164144516, "step": 1688 }, { "completion_length": 222.98661613464355, "epoch": 0.28333123768808416, "grad_norm": 0.2974750754107423, "kl": 0.10511493682861328, "learning_rate": 2.0864197530864198e-07, "loss": 0.0001, "reward": 1.7678572311997414, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 1.0, "step": 1690 }, { "completion_length": 225.80804538726807, "epoch": 0.28366654092795174, "grad_norm": 0.40492073155295566, "kl": 0.19135475158691406, "learning_rate": 2.088888888888889e-07, "loss": 0.0002, "reward": 1.7625000551342964, "reward_std": 0.0833375845104456, "rewards/equation_reward_func": 0.7758928760886192, "rewards/format_reward_func": 0.9866071492433548, "step": 1692 }, { "completion_length": 225.75000858306885, "epoch": 0.28400184416781926, "grad_norm": 0.21306977700182334, "kl": 0.2654247283935547, "learning_rate": 2.0913580246913579e-07, "loss": 0.0003, "reward": 1.6964286789298058, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.6964285988360643, "rewards/format_reward_func": 1.0, "step": 1694 }, { "completion_length": 224.08036708831787, "epoch": 0.28433714740768684, "grad_norm": 0.25962073008852654, "kl": 0.022909164428710938, "learning_rate": 2.093827160493827e-07, "loss": 0.0, "reward": 1.7464286163449287, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286163449287, "rewards/format_reward_func": 1.0, "step": 1696 }, { "completion_length": 220.73661708831787, "epoch": 0.28467245064755436, "grad_norm": 0.2998414003107778, "kl": 0.34929847717285156, "learning_rate": 2.0962962962962962e-07, "loss": 0.0003, "reward": 1.7428572326898575, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7428571805357933, "rewards/format_reward_func": 1.0, "step": 1698 }, { "completion_length": 222.66965198516846, "epoch": 0.28500775388742194, "grad_norm": 0.21281974812319984, "kl": 0.18515777587890625, "learning_rate": 2.0987654320987653e-07, "loss": 0.0002, "reward": 1.775000050663948, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 1700 }, { "completion_length": 220.08036613464355, "epoch": 0.28534305712728947, "grad_norm": 0.3371442250969937, "kl": 0.10428047180175781, "learning_rate": 2.1012345679012345e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7678571864962578, "rewards/format_reward_func": 1.0, "step": 1702 }, { "completion_length": 222.62054538726807, "epoch": 0.28567836036715705, "grad_norm": 0.5105200926308263, "kl": 0.22168350219726562, "learning_rate": 2.1037037037037036e-07, "loss": 0.0002, "reward": 1.7428572252392769, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7428571842610836, "rewards/format_reward_func": 1.0, "step": 1704 }, { "completion_length": 226.08929538726807, "epoch": 0.2860136636070246, "grad_norm": 0.4370022389501131, "kl": 0.1873798370361328, "learning_rate": 2.1061728395061727e-07, "loss": 0.0002, "reward": 1.773214340209961, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.777678593993187, "rewards/format_reward_func": 0.9955357164144516, "step": 1706 }, { "completion_length": 226.66518878936768, "epoch": 0.28634896684689215, "grad_norm": 0.11375834492782687, "kl": 0.1496267318725586, "learning_rate": 2.108641975308642e-07, "loss": 0.0002, "reward": 1.8160714656114578, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.8205357566475868, "rewards/format_reward_func": 0.9955357164144516, "step": 1708 }, { "completion_length": 230.83483123779297, "epoch": 0.28668427008675973, "grad_norm": 0.46898913636215384, "kl": 0.1600666046142578, "learning_rate": 2.111111111111111e-07, "loss": 0.0002, "reward": 1.7767858058214188, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7812500149011612, "rewards/format_reward_func": 0.9955357164144516, "step": 1710 }, { "completion_length": 219.47768783569336, "epoch": 0.28701957332662725, "grad_norm": 0.0017106252395304078, "kl": 0.03565216064453125, "learning_rate": 2.1135802469135802e-07, "loss": 0.0, "reward": 1.7357143685221672, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7357143107801676, "rewards/format_reward_func": 1.0, "step": 1712 }, { "completion_length": 221.4509048461914, "epoch": 0.28735487656649483, "grad_norm": 0.29632758745427196, "kl": 0.10880851745605469, "learning_rate": 2.116049382716049e-07, "loss": 0.0001, "reward": 1.8000000417232513, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 1714 }, { "completion_length": 236.94197463989258, "epoch": 0.28769017980636236, "grad_norm": 0.2917454687041177, "kl": 0.2630786895751953, "learning_rate": 2.1185185185185185e-07, "loss": 0.0003, "reward": 1.787500061094761, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7919643074274063, "rewards/format_reward_func": 0.9955357164144516, "step": 1716 }, { "completion_length": 222.91965293884277, "epoch": 0.28802548304622994, "grad_norm": 0.3137815271928074, "kl": 0.04746055603027344, "learning_rate": 2.1209876543209876e-07, "loss": 0.0, "reward": 1.7642857804894447, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 1718 }, { "completion_length": 231.45983123779297, "epoch": 0.2883607862860975, "grad_norm": 0.28325392986376813, "kl": 0.1917266845703125, "learning_rate": 2.1234567901234568e-07, "loss": 0.0002, "reward": 1.8071429133415222, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428686380386, "rewards/format_reward_func": 1.0, "step": 1720 }, { "completion_length": 226.11608219146729, "epoch": 0.28869608952596504, "grad_norm": 0.6358652431461438, "kl": 0.370849609375, "learning_rate": 2.1259259259259257e-07, "loss": 0.0004, "reward": 1.7017857879400253, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7062500268220901, "rewards/format_reward_func": 0.9955357164144516, "step": 1722 }, { "completion_length": 232.03572463989258, "epoch": 0.2890313927658326, "grad_norm": 0.1300999799664777, "kl": 0.3420524597167969, "learning_rate": 2.1283950617283948e-07, "loss": 0.0003, "reward": 1.7178572341799736, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7178571783006191, "rewards/format_reward_func": 1.0, "step": 1724 }, { "completion_length": 223.0357255935669, "epoch": 0.28936669600570014, "grad_norm": 0.269776246641907, "kl": 0.7043094635009766, "learning_rate": 2.1308641975308642e-07, "loss": 0.0007, "reward": 1.7642857804894447, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 1726 }, { "completion_length": 229.25447177886963, "epoch": 0.2897019992455677, "grad_norm": 0.26928261439673645, "kl": 0.1688404083251953, "learning_rate": 2.1333333333333334e-07, "loss": 0.0002, "reward": 1.725000075995922, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7250000461935997, "rewards/format_reward_func": 1.0, "step": 1728 }, { "completion_length": 227.66518688201904, "epoch": 0.29003730248543524, "grad_norm": 0.25291738156225757, "kl": 0.019290924072265625, "learning_rate": 2.1358024691358023e-07, "loss": 0.0, "reward": 1.782142922282219, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 1730 }, { "completion_length": 232.74108123779297, "epoch": 0.2903726057253028, "grad_norm": 0.3125680880835837, "kl": 0.11983871459960938, "learning_rate": 2.1382716049382714e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7571428958326578, "rewards/format_reward_func": 1.0, "step": 1732 }, { "completion_length": 234.12501049041748, "epoch": 0.2907079089651704, "grad_norm": 0.16799823386356563, "kl": 0.32921409606933594, "learning_rate": 2.1407407407407408e-07, "loss": 0.0003, "reward": 1.7500000670552254, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 1734 }, { "completion_length": 229.38840293884277, "epoch": 0.2910432122050379, "grad_norm": 0.2370698650557852, "kl": 0.29618072509765625, "learning_rate": 2.1432098765432097e-07, "loss": 0.0003, "reward": 1.767857201397419, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7678571753203869, "rewards/format_reward_func": 1.0, "step": 1736 }, { "completion_length": 233.07590198516846, "epoch": 0.2913785154449055, "grad_norm": 0.31089682021611725, "kl": 0.3679676055908203, "learning_rate": 2.1456790123456789e-07, "loss": 0.0004, "reward": 1.746428668498993, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 1738 }, { "completion_length": 220.51786518096924, "epoch": 0.29171381868477303, "grad_norm": 0.3056727159750165, "kl": 0.5169277191162109, "learning_rate": 2.148148148148148e-07, "loss": 0.0005, "reward": 1.7142857983708382, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7142857443541288, "rewards/format_reward_func": 1.0, "step": 1740 }, { "completion_length": 229.81250953674316, "epoch": 0.2920491219246406, "grad_norm": 0.1519463725257348, "kl": 0.013641357421875, "learning_rate": 2.1506172839506174e-07, "loss": 0.0, "reward": 1.707142911851406, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7071428913623095, "rewards/format_reward_func": 1.0, "step": 1742 }, { "completion_length": 233.602689743042, "epoch": 0.29238442516450813, "grad_norm": 0.3022476612112489, "kl": 0.06960678100585938, "learning_rate": 2.1530864197530863e-07, "loss": 0.0001, "reward": 1.70089291036129, "reward_std": 0.069447988178581, "rewards/equation_reward_func": 0.7205357626080513, "rewards/format_reward_func": 0.9803571552038193, "step": 1744 }, { "completion_length": 225.93304443359375, "epoch": 0.2927197284043757, "grad_norm": 0.16061238412841028, "kl": 0.06715774536132812, "learning_rate": 2.1555555555555554e-07, "loss": 0.0001, "reward": 1.7910714745521545, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7955357432365417, "rewards/format_reward_func": 0.9955357164144516, "step": 1746 }, { "completion_length": 231.93304634094238, "epoch": 0.29305503164424324, "grad_norm": 0.1863992188614141, "kl": 0.30138206481933594, "learning_rate": 2.1580246913580246e-07, "loss": 0.0003, "reward": 1.7571429014205933, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7660714574158192, "rewards/format_reward_func": 0.9910714328289032, "step": 1748 }, { "completion_length": 234.3259038925171, "epoch": 0.2933903348841108, "grad_norm": 0.4137435993933847, "kl": 0.1421070098876953, "learning_rate": 2.1604938271604935e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7928571812808514, "rewards/format_reward_func": 1.0, "step": 1750 }, { "completion_length": 225.38840579986572, "epoch": 0.2937256381239784, "grad_norm": 0.24968596600314152, "kl": 0.10309982299804688, "learning_rate": 2.162962962962963e-07, "loss": 0.0001, "reward": 1.7160715162754059, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7205357551574707, "rewards/format_reward_func": 0.9955357164144516, "step": 1752 }, { "completion_length": 228.83036708831787, "epoch": 0.2940609413638459, "grad_norm": 0.3084047530337667, "kl": 0.11126136779785156, "learning_rate": 2.165432098765432e-07, "loss": 0.0001, "reward": 1.687500074505806, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.6919643208384514, "rewards/format_reward_func": 0.9955357164144516, "step": 1754 }, { "completion_length": 241.81697368621826, "epoch": 0.2943962446037135, "grad_norm": 0.18964625385824094, "kl": 0.3287525177001953, "learning_rate": 2.1679012345679012e-07, "loss": 0.0003, "reward": 1.650000087916851, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.65892861969769, "rewards/format_reward_func": 0.9910714328289032, "step": 1756 }, { "completion_length": 236.86608219146729, "epoch": 0.294731547843581, "grad_norm": 0.3619400638075198, "kl": 0.15472793579101562, "learning_rate": 2.17037037037037e-07, "loss": 0.0002, "reward": 1.773214340209961, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7776785921305418, "rewards/format_reward_func": 0.9955357164144516, "step": 1758 }, { "completion_length": 235.95983123779297, "epoch": 0.2950668510834486, "grad_norm": 0.3397716552384549, "kl": 0.6170005798339844, "learning_rate": 2.1728395061728395e-07, "loss": 0.0006, "reward": 1.7392857894301414, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7392857484519482, "rewards/format_reward_func": 1.0, "step": 1760 }, { "completion_length": 233.0759048461914, "epoch": 0.2954021543233161, "grad_norm": 0.23654675030018038, "kl": 0.013632774353027344, "learning_rate": 2.1753086419753086e-07, "loss": 0.0, "reward": 1.7642857804894447, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 1762 }, { "completion_length": 226.56250953674316, "epoch": 0.2957374575631837, "grad_norm": 0.288814634064109, "kl": 0.12615585327148438, "learning_rate": 2.1777777777777775e-07, "loss": 0.0001, "reward": 1.725000075995922, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7250000163912773, "rewards/format_reward_func": 1.0, "step": 1764 }, { "completion_length": 229.02679824829102, "epoch": 0.2960727608030513, "grad_norm": 0.24413996834108162, "kl": 0.1447772979736328, "learning_rate": 2.1802469135802467e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7607143260538578, "rewards/format_reward_func": 1.0, "step": 1766 }, { "completion_length": 231.7634038925171, "epoch": 0.2964080640429188, "grad_norm": 0.4000458770990593, "kl": 0.15684127807617188, "learning_rate": 2.182716049382716e-07, "loss": 0.0002, "reward": 1.7267858013510704, "reward_std": 0.09343910869210958, "rewards/equation_reward_func": 0.7312500234693289, "rewards/format_reward_func": 0.9955357164144516, "step": 1768 }, { "completion_length": 223.84375953674316, "epoch": 0.2967433672827864, "grad_norm": 0.2284878268099746, "kl": 0.13471412658691406, "learning_rate": 2.1851851851851852e-07, "loss": 0.0001, "reward": 1.767857238650322, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.767857164144516, "rewards/format_reward_func": 1.0, "step": 1770 }, { "completion_length": 215.60715293884277, "epoch": 0.2970786705226539, "grad_norm": 0.19678904064037697, "kl": 0.05077934265136719, "learning_rate": 2.187654320987654e-07, "loss": 0.0001, "reward": 1.742857202887535, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7428571730852127, "rewards/format_reward_func": 1.0, "step": 1772 }, { "completion_length": 227.2946538925171, "epoch": 0.2974139737625215, "grad_norm": 0.33814303171679666, "kl": 0.11888504028320312, "learning_rate": 2.1901234567901233e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 1774 }, { "completion_length": 223.72322463989258, "epoch": 0.297749277002389, "grad_norm": 0.19717208048606372, "kl": 0.24188232421875, "learning_rate": 2.1925925925925927e-07, "loss": 0.0002, "reward": 1.7928572222590446, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 1776 }, { "completion_length": 228.0625123977661, "epoch": 0.2980845802422566, "grad_norm": 0.271144369694773, "kl": 0.2233419418334961, "learning_rate": 2.1950617283950618e-07, "loss": 0.0002, "reward": 1.7839286178350449, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.788392897695303, "rewards/format_reward_func": 0.9955357164144516, "step": 1778 }, { "completion_length": 237.03572368621826, "epoch": 0.29841988348212417, "grad_norm": 0.2724767955336668, "kl": 0.021409988403320312, "learning_rate": 2.1975308641975307e-07, "loss": 0.0, "reward": 1.775000087916851, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 1.0, "step": 1780 }, { "completion_length": 230.90179538726807, "epoch": 0.2987551867219917, "grad_norm": 0.1573001174468948, "kl": 0.029628753662109375, "learning_rate": 2.1999999999999998e-07, "loss": 0.0, "reward": 1.7125000953674316, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7250000350177288, "rewards/format_reward_func": 0.987500011920929, "step": 1782 }, { "completion_length": 230.42411994934082, "epoch": 0.2990904899618593, "grad_norm": 0.29308188881949576, "kl": 0.06266975402832031, "learning_rate": 2.202469135802469e-07, "loss": 0.0001, "reward": 1.7500000521540642, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 1784 }, { "completion_length": 225.75447368621826, "epoch": 0.2994257932017268, "grad_norm": 0.37598447598224566, "kl": 0.06233024597167969, "learning_rate": 2.2049382716049381e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714693367481, "rewards/format_reward_func": 1.0, "step": 1786 }, { "completion_length": 239.06697368621826, "epoch": 0.2997610964415944, "grad_norm": 0.2410658800415634, "kl": 0.08719062805175781, "learning_rate": 2.2074074074074073e-07, "loss": 0.0001, "reward": 1.7142857909202576, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7232143227010965, "rewards/format_reward_func": 0.9910714328289032, "step": 1788 }, { "completion_length": 237.25001430511475, "epoch": 0.3000963996814619, "grad_norm": 0.27428234718509725, "kl": 0.0132598876953125, "learning_rate": 2.2098765432098764e-07, "loss": 0.0, "reward": 1.760714367032051, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7607143074274063, "rewards/format_reward_func": 1.0, "step": 1790 }, { "completion_length": 230.80804538726807, "epoch": 0.3004317029213295, "grad_norm": 0.20216357930418644, "kl": 0.03976631164550781, "learning_rate": 2.2123456790123456e-07, "loss": 0.0, "reward": 1.7232143729925156, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7276785839349031, "rewards/format_reward_func": 0.9955357164144516, "step": 1792 }, { "completion_length": 230.75893783569336, "epoch": 0.30076700616119706, "grad_norm": 0.3042072979752627, "kl": 0.09014701843261719, "learning_rate": 2.2148148148148147e-07, "loss": 0.0001, "reward": 1.7464286461472511, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 1794 }, { "completion_length": 227.02679443359375, "epoch": 0.3011023094010646, "grad_norm": 0.27795542549745866, "kl": 0.04588603973388672, "learning_rate": 2.217283950617284e-07, "loss": 0.0, "reward": 1.7678571939468384, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7678571827709675, "rewards/format_reward_func": 1.0, "step": 1796 }, { "completion_length": 236.84822750091553, "epoch": 0.30143761264093216, "grad_norm": 0.4820397237602499, "kl": 0.4211463928222656, "learning_rate": 2.219753086419753e-07, "loss": 0.0004, "reward": 1.7625000402331352, "reward_std": 0.09343910962343216, "rewards/equation_reward_func": 0.7758928947150707, "rewards/format_reward_func": 0.9866071492433548, "step": 1798 }, { "completion_length": 226.65179538726807, "epoch": 0.3017729158807997, "grad_norm": 0.2887272445792341, "kl": 0.016399383544921875, "learning_rate": 2.222222222222222e-07, "loss": 0.0, "reward": 1.7678571939468384, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571660071611, "rewards/format_reward_func": 1.0, "step": 1800 }, { "completion_length": 235.01340293884277, "epoch": 0.30210821912066727, "grad_norm": 0.6643608729826055, "kl": 0.1521320343017578, "learning_rate": 2.2246913580246913e-07, "loss": 0.0002, "reward": 1.7410714998841286, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.745535746216774, "rewards/format_reward_func": 0.9955357164144516, "step": 1802 }, { "completion_length": 225.6384038925171, "epoch": 0.3024435223605348, "grad_norm": 0.26640352513056065, "kl": 0.03233146667480469, "learning_rate": 2.2271604938271605e-07, "loss": 0.0, "reward": 1.7500000596046448, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 1804 }, { "completion_length": 228.18750953674316, "epoch": 0.30277882560040237, "grad_norm": 0.14731187386849665, "kl": 0.027124404907226562, "learning_rate": 2.2296296296296296e-07, "loss": 0.0, "reward": 1.7089286521077156, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7133928928524256, "rewards/format_reward_func": 0.9955357164144516, "step": 1806 }, { "completion_length": 239.9464406967163, "epoch": 0.3031141288402699, "grad_norm": 0.29234930155500477, "kl": 0.020626068115234375, "learning_rate": 2.2320987654320985e-07, "loss": 0.0, "reward": 1.7607143372297287, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143223285675, "rewards/format_reward_func": 1.0, "step": 1808 }, { "completion_length": 235.46875858306885, "epoch": 0.30344943208013747, "grad_norm": 0.1040229140664818, "kl": 0.015781402587890625, "learning_rate": 2.2345679012345677e-07, "loss": 0.0, "reward": 1.7285715118050575, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7285714671015739, "rewards/format_reward_func": 1.0, "step": 1810 }, { "completion_length": 237.58929920196533, "epoch": 0.30378473532000505, "grad_norm": 0.10295620756709041, "kl": 0.16724872589111328, "learning_rate": 2.237037037037037e-07, "loss": 0.0002, "reward": 1.7696429267525673, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7830357439815998, "rewards/format_reward_func": 0.9866071492433548, "step": 1812 }, { "completion_length": 228.08929634094238, "epoch": 0.3041200385598726, "grad_norm": 0.15218741534858393, "kl": 0.2906198501586914, "learning_rate": 2.239506172839506e-07, "loss": 0.0003, "reward": 1.7428572177886963, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7428571805357933, "rewards/format_reward_func": 1.0, "step": 1814 }, { "completion_length": 224.18750762939453, "epoch": 0.30445534179974015, "grad_norm": 0.19074701356213938, "kl": 0.06517219543457031, "learning_rate": 2.241975308641975e-07, "loss": 0.0001, "reward": 1.7142858058214188, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7142857518047094, "rewards/format_reward_func": 1.0, "step": 1816 }, { "completion_length": 222.05804443359375, "epoch": 0.3047906450396077, "grad_norm": 0.1476048923105393, "kl": 0.012349128723144531, "learning_rate": 2.2444444444444442e-07, "loss": 0.0, "reward": 1.742857202887535, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 1818 }, { "completion_length": 233.8750114440918, "epoch": 0.30512594827947526, "grad_norm": 0.17331623039744623, "kl": 0.01285552978515625, "learning_rate": 2.2469135802469137e-07, "loss": 0.0, "reward": 1.8000000566244125, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8000000193715096, "rewards/format_reward_func": 1.0, "step": 1820 }, { "completion_length": 231.1250114440918, "epoch": 0.3054612515193428, "grad_norm": 0.3307768489017182, "kl": 0.1770782470703125, "learning_rate": 2.2493827160493825e-07, "loss": 0.0002, "reward": 1.744642935693264, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7491071708500385, "rewards/format_reward_func": 0.9955357164144516, "step": 1822 }, { "completion_length": 233.92858028411865, "epoch": 0.30579655475921036, "grad_norm": 0.22175695720606406, "kl": 0.16938114166259766, "learning_rate": 2.2518518518518517e-07, "loss": 0.0002, "reward": 1.7892857789993286, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 1824 }, { "completion_length": 230.14733219146729, "epoch": 0.30613185799907794, "grad_norm": 0.27675034844790847, "kl": 0.020910263061523438, "learning_rate": 2.2543209876543208e-07, "loss": 0.0, "reward": 1.8107143640518188, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8107143081724644, "rewards/format_reward_func": 1.0, "step": 1826 }, { "completion_length": 247.6160831451416, "epoch": 0.30646716123894546, "grad_norm": 0.44542823477944804, "kl": 0.24846267700195312, "learning_rate": 2.2567901234567903e-07, "loss": 0.0002, "reward": 1.739285796880722, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7482143267989159, "rewards/format_reward_func": 0.9910714328289032, "step": 1828 }, { "completion_length": 226.4062623977661, "epoch": 0.30680246447881304, "grad_norm": 0.3461096636929037, "kl": 0.05735588073730469, "learning_rate": 2.2592592592592591e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857618629932, "rewards/format_reward_func": 1.0, "step": 1830 }, { "completion_length": 237.54465198516846, "epoch": 0.30713776771868057, "grad_norm": 0.4290936324075917, "kl": 0.08965682983398438, "learning_rate": 2.2617283950617283e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7410714589059353, "rewards/format_reward_func": 0.9910714328289032, "step": 1832 }, { "completion_length": 234.8839406967163, "epoch": 0.30747307095854814, "grad_norm": 0.28822848873036533, "kl": 0.035064697265625, "learning_rate": 2.2641975308641974e-07, "loss": 0.0, "reward": 1.7821428924798965, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428906172514, "rewards/format_reward_func": 1.0, "step": 1834 }, { "completion_length": 226.7634038925171, "epoch": 0.30780837419841567, "grad_norm": 0.14622024873230458, "kl": 0.35060882568359375, "learning_rate": 2.2666666666666663e-07, "loss": 0.0004, "reward": 1.7767857611179352, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7812500484287739, "rewards/format_reward_func": 0.9955357164144516, "step": 1836 }, { "completion_length": 241.6919755935669, "epoch": 0.30814367743828325, "grad_norm": 0.35153050154322957, "kl": 0.012337684631347656, "learning_rate": 2.2691358024691357e-07, "loss": 0.0, "reward": 1.7267858162522316, "reward_std": 0.09343910869210958, "rewards/equation_reward_func": 0.7312500290572643, "rewards/format_reward_func": 0.9955357164144516, "step": 1838 }, { "completion_length": 224.90179634094238, "epoch": 0.3084789806781508, "grad_norm": 0.2655829602093783, "kl": 0.0985097885131836, "learning_rate": 2.271604938271605e-07, "loss": 0.0001, "reward": 1.801785759627819, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8062500357627869, "rewards/format_reward_func": 0.9955357164144516, "step": 1840 }, { "completion_length": 241.04911994934082, "epoch": 0.30881428391801835, "grad_norm": 0.2453221248498777, "kl": 0.4122467041015625, "learning_rate": 2.274074074074074e-07, "loss": 0.0004, "reward": 1.7589286044239998, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7633928991854191, "rewards/format_reward_func": 0.9955357164144516, "step": 1842 }, { "completion_length": 231.6741180419922, "epoch": 0.30914958715788593, "grad_norm": 0.2234689362529436, "kl": 0.09752082824707031, "learning_rate": 2.276543209876543e-07, "loss": 0.0001, "reward": 1.7142857983708382, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7142857518047094, "rewards/format_reward_func": 1.0, "step": 1844 }, { "completion_length": 229.68751049041748, "epoch": 0.30948489039775345, "grad_norm": 0.33753612337173233, "kl": 0.017671585083007812, "learning_rate": 2.2790123456790123e-07, "loss": 0.0, "reward": 1.7642857879400253, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 1846 }, { "completion_length": 236.9821538925171, "epoch": 0.30982019363762103, "grad_norm": 0.34463075474830307, "kl": 0.06949996948242188, "learning_rate": 2.2814814814814815e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 1848 }, { "completion_length": 222.60268783569336, "epoch": 0.31015549687748856, "grad_norm": 0.3017657562993159, "kl": 0.17605209350585938, "learning_rate": 2.2839506172839504e-07, "loss": 0.0002, "reward": 1.7410714998841286, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.745535746216774, "rewards/format_reward_func": 0.9955357164144516, "step": 1850 }, { "completion_length": 233.35268878936768, "epoch": 0.31049080011735614, "grad_norm": 0.19114987608251482, "kl": 0.11333656311035156, "learning_rate": 2.2864197530864195e-07, "loss": 0.0001, "reward": 1.7446429207921028, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.74910718947649, "rewards/format_reward_func": 0.9955357164144516, "step": 1852 }, { "completion_length": 239.08929443359375, "epoch": 0.3108261033572237, "grad_norm": 0.38085970590024615, "kl": 0.1593780517578125, "learning_rate": 2.288888888888889e-07, "loss": 0.0002, "reward": 1.7250000536441803, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7250000461935997, "rewards/format_reward_func": 1.0, "step": 1854 }, { "completion_length": 231.82143878936768, "epoch": 0.31116140659709124, "grad_norm": 0.7838052230054621, "kl": 0.08046340942382812, "learning_rate": 2.291358024691358e-07, "loss": 0.0001, "reward": 1.8071429207921028, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8071428798139095, "rewards/format_reward_func": 1.0, "step": 1856 }, { "completion_length": 231.53126049041748, "epoch": 0.3114967098369588, "grad_norm": 0.22527110312924714, "kl": 0.16312026977539062, "learning_rate": 2.293827160493827e-07, "loss": 0.0002, "reward": 1.8035714775323868, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8035714402794838, "rewards/format_reward_func": 1.0, "step": 1858 }, { "completion_length": 226.75447273254395, "epoch": 0.31183201307682634, "grad_norm": 0.20671855052520396, "kl": 0.08982467651367188, "learning_rate": 2.296296296296296e-07, "loss": 0.0001, "reward": 1.7392857670783997, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857484519482, "rewards/format_reward_func": 1.0, "step": 1860 }, { "completion_length": 229.11161518096924, "epoch": 0.3121673163166939, "grad_norm": 0.23576458309190282, "kl": 0.07481575012207031, "learning_rate": 2.2987654320987655e-07, "loss": 0.0001, "reward": 1.74642863124609, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286107569933, "rewards/format_reward_func": 1.0, "step": 1862 }, { "completion_length": 220.23215293884277, "epoch": 0.31250261955656145, "grad_norm": 0.2948337250310543, "kl": 0.048417091369628906, "learning_rate": 2.3012345679012347e-07, "loss": 0.0, "reward": 1.7910714745521545, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7955357506871223, "rewards/format_reward_func": 0.9955357164144516, "step": 1864 }, { "completion_length": 231.8035831451416, "epoch": 0.312837922796429, "grad_norm": 0.3180397707419142, "kl": 0.17154312133789062, "learning_rate": 2.3037037037037035e-07, "loss": 0.0002, "reward": 1.6535715013742447, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.6535714697092772, "rewards/format_reward_func": 1.0, "step": 1866 }, { "completion_length": 222.42858219146729, "epoch": 0.31317322603629655, "grad_norm": 0.23005009438891452, "kl": 0.07344245910644531, "learning_rate": 2.3061728395061727e-07, "loss": 0.0001, "reward": 1.7321429327130318, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7410714626312256, "rewards/format_reward_func": 0.9910714328289032, "step": 1868 }, { "completion_length": 241.3794765472412, "epoch": 0.3135085292761641, "grad_norm": 0.27632991514710453, "kl": 0.26630401611328125, "learning_rate": 2.3086419753086418e-07, "loss": 0.0003, "reward": 1.807142935693264, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.807142898440361, "rewards/format_reward_func": 1.0, "step": 1870 }, { "completion_length": 231.10268783569336, "epoch": 0.3138438325160317, "grad_norm": 0.24458798323898348, "kl": 0.024705886840820312, "learning_rate": 2.311111111111111e-07, "loss": 0.0, "reward": 1.7160714864730835, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7205357383936644, "rewards/format_reward_func": 0.9955357164144516, "step": 1872 }, { "completion_length": 231.321439743042, "epoch": 0.31417913575589923, "grad_norm": 0.22131545732884386, "kl": 0.034271240234375, "learning_rate": 2.31358024691358e-07, "loss": 0.0, "reward": 1.8000000342726707, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.800000037997961, "rewards/format_reward_func": 1.0, "step": 1874 }, { "completion_length": 229.5089406967163, "epoch": 0.3145144389957668, "grad_norm": 0.22465986688885642, "kl": 0.165924072265625, "learning_rate": 2.3160493827160493e-07, "loss": 0.0002, "reward": 1.7660714834928513, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9955357164144516, "step": 1876 }, { "completion_length": 233.3884038925171, "epoch": 0.31484974223563433, "grad_norm": 0.23910415798432272, "kl": 0.18875694274902344, "learning_rate": 2.3185185185185184e-07, "loss": 0.0002, "reward": 1.7535714879631996, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714656114578, "rewards/format_reward_func": 1.0, "step": 1878 }, { "completion_length": 233.0357265472412, "epoch": 0.3151850454755019, "grad_norm": 0.40358730438180807, "kl": 0.04170417785644531, "learning_rate": 2.3209876543209876e-07, "loss": 0.0, "reward": 1.7607143595814705, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.769642885774374, "rewards/format_reward_func": 0.9910714328289032, "step": 1880 }, { "completion_length": 225.48661994934082, "epoch": 0.31552034871536944, "grad_norm": 0.2931343957440977, "kl": 0.08726978302001953, "learning_rate": 2.3234567901234567e-07, "loss": 0.0001, "reward": 1.7607143446803093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143223285675, "rewards/format_reward_func": 1.0, "step": 1882 }, { "completion_length": 231.7009048461914, "epoch": 0.315855651955237, "grad_norm": 0.5066000684278723, "kl": 0.06025505065917969, "learning_rate": 2.325925925925926e-07, "loss": 0.0001, "reward": 1.7053572237491608, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7098214663565159, "rewards/format_reward_func": 0.9955357164144516, "step": 1884 }, { "completion_length": 225.73661613464355, "epoch": 0.3161909551951046, "grad_norm": 0.11965847860757209, "kl": 0.09498214721679688, "learning_rate": 2.3283950617283948e-07, "loss": 0.0001, "reward": 1.8071429058909416, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8071428872644901, "rewards/format_reward_func": 1.0, "step": 1886 }, { "completion_length": 235.83483123779297, "epoch": 0.3165262584349721, "grad_norm": 0.27708246702194467, "kl": 0.26693153381347656, "learning_rate": 2.3308641975308642e-07, "loss": 0.0003, "reward": 1.7107143625617027, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.71071432903409, "rewards/format_reward_func": 1.0, "step": 1888 }, { "completion_length": 230.55358219146729, "epoch": 0.3168615616748397, "grad_norm": 0.2196535389068709, "kl": 0.1350536346435547, "learning_rate": 2.3333333333333333e-07, "loss": 0.0001, "reward": 1.7357143610715866, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143256813288, "rewards/format_reward_func": 1.0, "step": 1890 }, { "completion_length": 233.94643783569336, "epoch": 0.3171968649147072, "grad_norm": 0.16739288082692094, "kl": 0.018312454223632812, "learning_rate": 2.3358024691358025e-07, "loss": 0.0, "reward": 1.6982143446803093, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.702678618952632, "rewards/format_reward_func": 0.9955357164144516, "step": 1892 }, { "completion_length": 227.74554634094238, "epoch": 0.3175321681545748, "grad_norm": 0.23554998821278342, "kl": 0.04662895202636719, "learning_rate": 2.3382716049382713e-07, "loss": 0.0, "reward": 1.764285758137703, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 1894 }, { "completion_length": 230.40625953674316, "epoch": 0.3178674713944423, "grad_norm": 0.3893511360014083, "kl": 0.03877544403076172, "learning_rate": 2.3407407407407405e-07, "loss": 0.0, "reward": 1.7571429312229156, "reward_std": 0.09091372694820166, "rewards/equation_reward_func": 0.7660714574158192, "rewards/format_reward_func": 0.9910714328289032, "step": 1896 }, { "completion_length": 235.1250114440918, "epoch": 0.3182027746343099, "grad_norm": 0.2841157377433794, "kl": 0.19602394104003906, "learning_rate": 2.34320987654321e-07, "loss": 0.0002, "reward": 1.7428572252392769, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7428571805357933, "rewards/format_reward_func": 1.0, "step": 1898 }, { "completion_length": 226.2366189956665, "epoch": 0.3185380778741775, "grad_norm": 0.17757173837637225, "kl": 0.0174407958984375, "learning_rate": 2.3456790123456788e-07, "loss": 0.0, "reward": 1.7357143387198448, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 1900 }, { "completion_length": 237.0000123977661, "epoch": 0.318873381114045, "grad_norm": 0.43075952842159304, "kl": 0.13301658630371094, "learning_rate": 2.348148148148148e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857301980257, "rewards/format_reward_func": 1.0, "step": 1902 }, { "completion_length": 230.7544755935669, "epoch": 0.3192086843539126, "grad_norm": 0.2751534698278088, "kl": 0.07329368591308594, "learning_rate": 2.350617283950617e-07, "loss": 0.0001, "reward": 1.7053572237491608, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7098214663565159, "rewards/format_reward_func": 0.9955357164144516, "step": 1904 }, { "completion_length": 228.5669755935669, "epoch": 0.3195439875937801, "grad_norm": 0.25202227295897695, "kl": 0.03522491455078125, "learning_rate": 2.3530864197530865e-07, "loss": 0.0, "reward": 1.7500000670552254, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000447034836, "rewards/format_reward_func": 1.0, "step": 1906 }, { "completion_length": 238.35715675354004, "epoch": 0.3198792908336477, "grad_norm": 0.17488179771033238, "kl": 0.021076202392578125, "learning_rate": 2.3555555555555554e-07, "loss": 0.0, "reward": 1.7714286595582962, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 1908 }, { "completion_length": 226.04465579986572, "epoch": 0.3202145940735152, "grad_norm": 0.29303369817907765, "kl": 0.011088371276855469, "learning_rate": 2.3580246913580245e-07, "loss": 0.0, "reward": 1.764285795390606, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 1910 }, { "completion_length": 238.8259048461914, "epoch": 0.3205498973133828, "grad_norm": 0.28677067456421385, "kl": 0.06038475036621094, "learning_rate": 2.3604938271604937e-07, "loss": 0.0001, "reward": 1.7589286267757416, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7633928842842579, "rewards/format_reward_func": 0.9955357164144516, "step": 1912 }, { "completion_length": 230.89286613464355, "epoch": 0.32088520055325037, "grad_norm": 0.23104718016621956, "kl": 0.023987770080566406, "learning_rate": 2.362962962962963e-07, "loss": 0.0, "reward": 1.7535714954137802, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714469850063, "rewards/format_reward_func": 1.0, "step": 1914 }, { "completion_length": 238.77679824829102, "epoch": 0.3212205037931179, "grad_norm": 0.24706180736743527, "kl": 0.2645835876464844, "learning_rate": 2.365432098765432e-07, "loss": 0.0003, "reward": 1.7500000596046448, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 1916 }, { "completion_length": 240.2544755935669, "epoch": 0.3215558070329855, "grad_norm": 0.2133498248289064, "kl": 0.12011337280273438, "learning_rate": 2.367901234567901e-07, "loss": 0.0001, "reward": 1.758928619325161, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7633928954601288, "rewards/format_reward_func": 0.9955357164144516, "step": 1918 }, { "completion_length": 228.94197368621826, "epoch": 0.321891110272853, "grad_norm": 0.43973771460402583, "kl": 0.11147499084472656, "learning_rate": 2.3703703703703703e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 1920 }, { "completion_length": 230.3169755935669, "epoch": 0.3222264135127206, "grad_norm": 0.22943147719990709, "kl": 0.04773712158203125, "learning_rate": 2.3728395061728394e-07, "loss": 0.0, "reward": 1.7803571969270706, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7848214581608772, "rewards/format_reward_func": 0.9955357164144516, "step": 1922 }, { "completion_length": 231.18751049041748, "epoch": 0.3225617167525881, "grad_norm": 0.35686116682750085, "kl": 0.07299137115478516, "learning_rate": 2.3753086419753086e-07, "loss": 0.0001, "reward": 1.7125001028180122, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.716964315623045, "rewards/format_reward_func": 0.9955357164144516, "step": 1924 }, { "completion_length": 224.07143783569336, "epoch": 0.3228970199924557, "grad_norm": 0.30472172697594374, "kl": 0.029535293579101562, "learning_rate": 2.3777777777777777e-07, "loss": 0.0, "reward": 1.7500000819563866, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 1926 }, { "completion_length": 244.20983409881592, "epoch": 0.32323232323232326, "grad_norm": 0.2661952587072812, "kl": 0.019370079040527344, "learning_rate": 2.3802469135802469e-07, "loss": 0.0, "reward": 1.7642857655882835, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 1928 }, { "completion_length": 231.15179824829102, "epoch": 0.3235676264721908, "grad_norm": 0.2501130921501854, "kl": 0.027837753295898438, "learning_rate": 2.3827160493827157e-07, "loss": 0.0, "reward": 1.7928571924567223, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 1930 }, { "completion_length": 233.75001430511475, "epoch": 0.32390292971205836, "grad_norm": 0.44629294557250104, "kl": 0.07960128784179688, "learning_rate": 2.385185185185185e-07, "loss": 0.0001, "reward": 1.6946429312229156, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.6991071682423353, "rewards/format_reward_func": 0.9955357164144516, "step": 1932 }, { "completion_length": 235.3928680419922, "epoch": 0.3242382329519259, "grad_norm": 0.5119444231013779, "kl": 0.10437774658203125, "learning_rate": 2.3876543209876543e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.739285746589303, "rewards/format_reward_func": 1.0, "step": 1934 }, { "completion_length": 236.93304538726807, "epoch": 0.32457353619179347, "grad_norm": 0.2119830430647975, "kl": 0.03171539306640625, "learning_rate": 2.390123456790123e-07, "loss": 0.0, "reward": 1.7464286461472511, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.746428593993187, "rewards/format_reward_func": 1.0, "step": 1936 }, { "completion_length": 228.1294755935669, "epoch": 0.324908839431661, "grad_norm": 0.27788889937507205, "kl": 0.01113128662109375, "learning_rate": 2.3925925925925926e-07, "loss": 0.0, "reward": 1.6607143729925156, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.6607143338769674, "rewards/format_reward_func": 1.0, "step": 1938 }, { "completion_length": 237.4955472946167, "epoch": 0.32524414267152857, "grad_norm": 0.24299818770137305, "kl": 0.013406753540039062, "learning_rate": 2.3950617283950615e-07, "loss": 0.0, "reward": 1.7428572252392769, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 1940 }, { "completion_length": 233.36608409881592, "epoch": 0.3255794459113961, "grad_norm": 0.1810489507798674, "kl": 0.03422355651855469, "learning_rate": 2.397530864197531e-07, "loss": 0.0, "reward": 1.7392857894301414, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857447266579, "rewards/format_reward_func": 1.0, "step": 1942 }, { "completion_length": 229.81697368621826, "epoch": 0.32591474915126367, "grad_norm": 0.1674598005786771, "kl": 0.044902801513671875, "learning_rate": 2.4e-07, "loss": 0.0, "reward": 1.7607143446803093, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143074274063, "rewards/format_reward_func": 1.0, "step": 1944 }, { "completion_length": 229.18304634094238, "epoch": 0.32625005239113125, "grad_norm": 0.1620064245637525, "kl": 0.017425537109375, "learning_rate": 2.402469135802469e-07, "loss": 0.0, "reward": 1.7000000923871994, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.700000025331974, "rewards/format_reward_func": 1.0, "step": 1946 }, { "completion_length": 236.3392972946167, "epoch": 0.3265853556309988, "grad_norm": 0.20727220749960834, "kl": 0.010746002197265625, "learning_rate": 2.404938271604938e-07, "loss": 0.0, "reward": 1.7107143551111221, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7196428794413805, "rewards/format_reward_func": 0.9910714328289032, "step": 1948 }, { "completion_length": 234.52679634094238, "epoch": 0.32692065887086635, "grad_norm": 0.3255722185826685, "kl": 0.07966041564941406, "learning_rate": 2.407407407407407e-07, "loss": 0.0001, "reward": 1.7839286476373672, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7883928865194321, "rewards/format_reward_func": 0.9955357164144516, "step": 1950 }, { "completion_length": 229.72768783569336, "epoch": 0.3272559621107339, "grad_norm": 0.32015743982788275, "kl": 0.08532905578613281, "learning_rate": 2.4098765432098764e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857372760773, "rewards/format_reward_func": 1.0, "step": 1952 }, { "completion_length": 227.7232265472412, "epoch": 0.32759126535060146, "grad_norm": 0.21736216860562788, "kl": 0.04352378845214844, "learning_rate": 2.412345679012346e-07, "loss": 0.0, "reward": 1.785714328289032, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143115252256, "rewards/format_reward_func": 1.0, "step": 1954 }, { "completion_length": 229.696439743042, "epoch": 0.327926568590469, "grad_norm": 0.214901028849602, "kl": 0.054378509521484375, "learning_rate": 2.4148148148148147e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571775555611, "rewards/format_reward_func": 1.0, "step": 1956 }, { "completion_length": 230.96429538726807, "epoch": 0.32826187183033656, "grad_norm": 0.35751988952802904, "kl": 0.04533100128173828, "learning_rate": 2.4172839506172836e-07, "loss": 0.0, "reward": 1.7142857909202576, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7142857611179352, "rewards/format_reward_func": 1.0, "step": 1958 }, { "completion_length": 225.32590293884277, "epoch": 0.32859717507020414, "grad_norm": 0.29350278528039353, "kl": 0.014001846313476562, "learning_rate": 2.419753086419753e-07, "loss": 0.0, "reward": 1.7321429327130318, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7321428991854191, "rewards/format_reward_func": 1.0, "step": 1960 }, { "completion_length": 226.90626049041748, "epoch": 0.32893247831007166, "grad_norm": 0.2693844120927604, "kl": 0.07091999053955078, "learning_rate": 2.4222222222222224e-07, "loss": 0.0001, "reward": 1.79464291036129, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.8080357313156128, "rewards/format_reward_func": 0.9866071455180645, "step": 1962 }, { "completion_length": 232.9732255935669, "epoch": 0.32926778154993924, "grad_norm": 0.36427653784791236, "kl": 0.00850677490234375, "learning_rate": 2.4246913580246913e-07, "loss": 0.0, "reward": 1.7107143476605415, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7196428999304771, "rewards/format_reward_func": 0.9910714328289032, "step": 1964 }, { "completion_length": 238.4821538925171, "epoch": 0.32960308478980677, "grad_norm": 0.18662587745852874, "kl": 0.014192581176757812, "learning_rate": 2.42716049382716e-07, "loss": 0.0, "reward": 1.7178571969270706, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7178571801632643, "rewards/format_reward_func": 1.0, "step": 1966 }, { "completion_length": 235.21429538726807, "epoch": 0.32993838802967435, "grad_norm": 0.3017454432462034, "kl": 0.0271148681640625, "learning_rate": 2.4296296296296296e-07, "loss": 0.0, "reward": 1.755357213318348, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214447498322, "rewards/format_reward_func": 0.9955357164144516, "step": 1968 }, { "completion_length": 222.56697368621826, "epoch": 0.33027369126954187, "grad_norm": 0.3683075178582106, "kl": 0.008819580078125, "learning_rate": 2.432098765432099e-07, "loss": 0.0, "reward": 1.8504464700818062, "reward_std": 0.07007933221757412, "rewards/equation_reward_func": 0.8562500178813934, "rewards/format_reward_func": 0.9941964335739613, "step": 1970 }, { "completion_length": 225.54018688201904, "epoch": 0.33060899450940945, "grad_norm": 0.254113309813442, "kl": 0.015944480895996094, "learning_rate": 2.434567901234568e-07, "loss": 0.0, "reward": 1.7678572088479996, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 1972 }, { "completion_length": 228.3303680419922, "epoch": 0.330944297749277, "grad_norm": 0.3838229118855729, "kl": 0.012636184692382812, "learning_rate": 2.437037037037037e-07, "loss": 0.0, "reward": 1.7071429416537285, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7160714659839869, "rewards/format_reward_func": 0.9910714328289032, "step": 1974 }, { "completion_length": 237.89733028411865, "epoch": 0.33127960098914455, "grad_norm": 0.33245679288182095, "kl": 0.03874683380126953, "learning_rate": 2.439506172839506e-07, "loss": 0.0, "reward": 1.7553572058677673, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7598214522004128, "rewards/format_reward_func": 0.9955357164144516, "step": 1976 }, { "completion_length": 228.4241180419922, "epoch": 0.33161490422901213, "grad_norm": 0.1817437147379376, "kl": 0.06694793701171875, "learning_rate": 2.4419753086419756e-07, "loss": 0.0001, "reward": 1.8107143566012383, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8107143081724644, "rewards/format_reward_func": 1.0, "step": 1978 }, { "completion_length": 231.20090293884277, "epoch": 0.33195020746887965, "grad_norm": 0.23502872535450134, "kl": 0.08605766296386719, "learning_rate": 2.4444444444444445e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000432133675, "rewards/format_reward_func": 1.0, "step": 1980 }, { "completion_length": 228.21429538726807, "epoch": 0.33228551070874723, "grad_norm": 0.1645380719145372, "kl": 0.022317886352539062, "learning_rate": 2.4469135802469133e-07, "loss": 0.0, "reward": 1.7571429386734962, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 1982 }, { "completion_length": 230.04018878936768, "epoch": 0.33262081394861476, "grad_norm": 0.5832458221239087, "kl": 0.07063102722167969, "learning_rate": 2.449382716049383e-07, "loss": 0.0001, "reward": 1.7303572222590446, "reward_std": 0.0883883461356163, "rewards/equation_reward_func": 0.7437500357627869, "rewards/format_reward_func": 0.9866071492433548, "step": 1984 }, { "completion_length": 230.77679634094238, "epoch": 0.33295611718848234, "grad_norm": 0.32034128630556463, "kl": 0.05539894104003906, "learning_rate": 2.4518518518518516e-07, "loss": 0.0001, "reward": 1.7125000581145287, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7169643081724644, "rewards/format_reward_func": 0.9955357164144516, "step": 1986 }, { "completion_length": 221.37054538726807, "epoch": 0.3332914204283499, "grad_norm": 0.3104718829687837, "kl": 0.0174102783203125, "learning_rate": 2.454320987654321e-07, "loss": 0.0, "reward": 1.7571429163217545, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 1988 }, { "completion_length": 230.3973331451416, "epoch": 0.33362672366821744, "grad_norm": 0.28791233501393015, "kl": 0.02490234375, "learning_rate": 2.45679012345679e-07, "loss": 0.0, "reward": 1.814285770058632, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.8142857439815998, "rewards/format_reward_func": 1.0, "step": 1990 }, { "completion_length": 231.79911708831787, "epoch": 0.333962026908085, "grad_norm": 0.1954296500371838, "kl": 0.022606849670410156, "learning_rate": 2.4592592592592593e-07, "loss": 0.0, "reward": 1.757142923772335, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428827941418, "rewards/format_reward_func": 1.0, "step": 1992 }, { "completion_length": 236.7053680419922, "epoch": 0.33429733014795254, "grad_norm": 0.21342070200715885, "kl": 0.10403156280517578, "learning_rate": 2.461728395061728e-07, "loss": 0.0001, "reward": 1.7214286401867867, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7214286029338837, "rewards/format_reward_func": 1.0, "step": 1994 }, { "completion_length": 225.05358409881592, "epoch": 0.3346326333878201, "grad_norm": 0.5948907828800926, "kl": 0.0234222412109375, "learning_rate": 2.4641975308641976e-07, "loss": 0.0, "reward": 1.7357143685221672, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7357143070548773, "rewards/format_reward_func": 1.0, "step": 1996 }, { "completion_length": 229.7366189956665, "epoch": 0.33496793662768765, "grad_norm": 0.14636131178410403, "kl": 0.037792205810546875, "learning_rate": 2.4666666666666665e-07, "loss": 0.0, "reward": 1.7142857983708382, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7142857499420643, "rewards/format_reward_func": 1.0, "step": 1998 }, { "completion_length": 228.67858219146729, "epoch": 0.3353032398675552, "grad_norm": 0.1664830030099182, "kl": 0.01624774932861328, "learning_rate": 2.4691358024691354e-07, "loss": 0.0, "reward": 1.742857202887535, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 2000 }, { "completion_length": 222.88840293884277, "epoch": 0.33563854310742275, "grad_norm": 0.26362387045617086, "kl": 0.012630462646484375, "learning_rate": 2.471604938271605e-07, "loss": 0.0, "reward": 1.796428620815277, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964286133646965, "rewards/format_reward_func": 1.0, "step": 2002 }, { "completion_length": 223.72322273254395, "epoch": 0.33597384634729033, "grad_norm": 0.0014755454691630647, "kl": 0.009225845336914062, "learning_rate": 2.474074074074074e-07, "loss": 0.0, "reward": 1.7285715118050575, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7285714708268642, "rewards/format_reward_func": 1.0, "step": 2004 }, { "completion_length": 220.60715198516846, "epoch": 0.3363091495871579, "grad_norm": 0.310672175130401, "kl": 0.016305923461914062, "learning_rate": 2.476543209876543e-07, "loss": 0.0, "reward": 1.8000000417232513, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8000000230967999, "rewards/format_reward_func": 1.0, "step": 2006 }, { "completion_length": 227.81250953674316, "epoch": 0.33664445282702543, "grad_norm": 0.2702079969416635, "kl": 0.10282135009765625, "learning_rate": 2.479012345679012e-07, "loss": 0.0001, "reward": 1.762500062584877, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643238186836, "rewards/format_reward_func": 0.9955357164144516, "step": 2008 }, { "completion_length": 225.70536613464355, "epoch": 0.336979756066893, "grad_norm": 0.20606098388880464, "kl": 0.018157005310058594, "learning_rate": 2.4814814814814814e-07, "loss": 0.0, "reward": 1.8053572103381157, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8098214641213417, "rewards/format_reward_func": 0.9955357164144516, "step": 2010 }, { "completion_length": 237.58036708831787, "epoch": 0.33731505930676053, "grad_norm": 0.2482946763221189, "kl": 0.05824089050292969, "learning_rate": 2.4839506172839503e-07, "loss": 0.0001, "reward": 1.7035715132951736, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7035714760422707, "rewards/format_reward_func": 1.0, "step": 2012 }, { "completion_length": 233.10269260406494, "epoch": 0.3376503625466281, "grad_norm": 0.28002200040351294, "kl": 0.0264739990234375, "learning_rate": 2.4864197530864197e-07, "loss": 0.0, "reward": 1.7267857864499092, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7312500234693289, "rewards/format_reward_func": 0.9955357164144516, "step": 2014 }, { "completion_length": 236.15626049041748, "epoch": 0.33798566578649564, "grad_norm": 0.47691443247370385, "kl": 0.02268695831298828, "learning_rate": 2.4888888888888886e-07, "loss": 0.0, "reward": 1.757142923772335, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 2016 }, { "completion_length": 220.9732255935669, "epoch": 0.3383209690263632, "grad_norm": 0.4744433084933257, "kl": 0.03647422790527344, "learning_rate": 2.491358024691358e-07, "loss": 0.0, "reward": 1.7857143506407738, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 2018 }, { "completion_length": 227.18304538726807, "epoch": 0.3386562722662308, "grad_norm": 0.16060134070242493, "kl": 0.05447578430175781, "learning_rate": 2.493827160493827e-07, "loss": 0.0001, "reward": 1.7357143685221672, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 2020 }, { "completion_length": 230.07143783569336, "epoch": 0.3389915755060983, "grad_norm": 0.349626308958822, "kl": 0.09692955017089844, "learning_rate": 2.4962962962962963e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 2022 }, { "completion_length": 227.34375953674316, "epoch": 0.3393268787459659, "grad_norm": 0.229310222683427, "kl": 0.022785186767578125, "learning_rate": 2.498765432098765e-07, "loss": 0.0, "reward": 1.7803572192788124, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7848214562982321, "rewards/format_reward_func": 0.9955357164144516, "step": 2024 }, { "completion_length": 224.8839406967163, "epoch": 0.3396621819858334, "grad_norm": 0.13934129466630318, "kl": 0.05133819580078125, "learning_rate": 2.501234567901234e-07, "loss": 0.0001, "reward": 1.7464286461472511, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 2026 }, { "completion_length": 229.83036708831787, "epoch": 0.339997485225701, "grad_norm": 0.11982324549628261, "kl": 0.03446388244628906, "learning_rate": 2.5037037037037035e-07, "loss": 0.0, "reward": 1.7642857804894447, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 2028 }, { "completion_length": 220.665189743042, "epoch": 0.3403327884655685, "grad_norm": 0.14265061273334256, "kl": 0.014574050903320312, "learning_rate": 2.506172839506173e-07, "loss": 0.0, "reward": 1.8107143267989159, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107143118977547, "rewards/format_reward_func": 1.0, "step": 2030 }, { "completion_length": 228.60715293884277, "epoch": 0.3406680917054361, "grad_norm": 0.08317394544730386, "kl": 0.05511188507080078, "learning_rate": 2.508641975308642e-07, "loss": 0.0001, "reward": 1.6892857998609543, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.689285745844245, "rewards/format_reward_func": 1.0, "step": 2032 }, { "completion_length": 224.03572368621826, "epoch": 0.3410033949453037, "grad_norm": 0.13839855068066612, "kl": 0.013246536254882812, "learning_rate": 2.511111111111111e-07, "loss": 0.0, "reward": 1.7714286148548126, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 2034 }, { "completion_length": 236.71876335144043, "epoch": 0.3413386981851712, "grad_norm": 0.46152422665186366, "kl": 0.03157615661621094, "learning_rate": 2.51358024691358e-07, "loss": 0.0, "reward": 1.7232143431901932, "reward_std": 0.10859139822423458, "rewards/equation_reward_func": 0.7366071827709675, "rewards/format_reward_func": 0.9866071492433548, "step": 2036 }, { "completion_length": 219.82143783569336, "epoch": 0.3416740014250388, "grad_norm": 0.2956090340920406, "kl": 0.007686614990234375, "learning_rate": 2.5160493827160495e-07, "loss": 0.0, "reward": 1.792857214808464, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571775555611, "rewards/format_reward_func": 1.0, "step": 2038 }, { "completion_length": 237.73215293884277, "epoch": 0.3420093046649063, "grad_norm": 0.3170164235407117, "kl": 0.04242229461669922, "learning_rate": 2.5185185185185184e-07, "loss": 0.0, "reward": 1.7392857819795609, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7392857596278191, "rewards/format_reward_func": 1.0, "step": 2040 }, { "completion_length": 229.84375858306885, "epoch": 0.3423446079047739, "grad_norm": 0.18831475868217637, "kl": 0.022202491760253906, "learning_rate": 2.520987654320987e-07, "loss": 0.0, "reward": 1.7785715162754059, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 1.0, "step": 2042 }, { "completion_length": 225.55358219146729, "epoch": 0.3426799111446414, "grad_norm": 0.2642266312318678, "kl": 0.07865715026855469, "learning_rate": 2.5234567901234567e-07, "loss": 0.0001, "reward": 1.7589286267757416, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7633928880095482, "rewards/format_reward_func": 0.9955357164144516, "step": 2044 }, { "completion_length": 235.05357933044434, "epoch": 0.343015214384509, "grad_norm": 0.2661198557562209, "kl": 0.028797149658203125, "learning_rate": 2.5259259259259255e-07, "loss": 0.0, "reward": 1.7446429207921028, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7491071708500385, "rewards/format_reward_func": 0.9955357164144516, "step": 2046 }, { "completion_length": 236.3259048461914, "epoch": 0.34335051762437657, "grad_norm": 0.3349485363833583, "kl": 0.029720306396484375, "learning_rate": 2.528395061728395e-07, "loss": 0.0, "reward": 1.7357143759727478, "reward_std": 0.09091372601687908, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 2048 }, { "completion_length": 231.45983123779297, "epoch": 0.3436858208642441, "grad_norm": 0.4207071809083083, "kl": 0.06764984130859375, "learning_rate": 2.5308641975308644e-07, "loss": 0.0001, "reward": 1.789285771548748, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 2050 }, { "completion_length": 229.0580472946167, "epoch": 0.3440211241041117, "grad_norm": 0.2937621352389509, "kl": 0.02901458740234375, "learning_rate": 2.533333333333333e-07, "loss": 0.0, "reward": 1.7321429401636124, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428880095482, "rewards/format_reward_func": 1.0, "step": 2052 }, { "completion_length": 232.07590198516846, "epoch": 0.3443564273439792, "grad_norm": 0.291425140305454, "kl": 0.015499114990234375, "learning_rate": 2.535802469135802e-07, "loss": 0.0, "reward": 1.714285783469677, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7142857387661934, "rewards/format_reward_func": 1.0, "step": 2054 }, { "completion_length": 234.2053680419922, "epoch": 0.3446917305838468, "grad_norm": 0.37230523225702566, "kl": 0.01446533203125, "learning_rate": 2.5382716049382716e-07, "loss": 0.0, "reward": 1.760714367032051, "reward_std": 0.09596448857337236, "rewards/equation_reward_func": 0.7607143223285675, "rewards/format_reward_func": 1.0, "step": 2056 }, { "completion_length": 231.3526906967163, "epoch": 0.3450270338237143, "grad_norm": 0.13254691883933248, "kl": 0.015127182006835938, "learning_rate": 2.5407407407407404e-07, "loss": 0.0, "reward": 1.8571428805589676, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.8571428842842579, "rewards/format_reward_func": 1.0, "step": 2058 }, { "completion_length": 224.11161708831787, "epoch": 0.3453623370635819, "grad_norm": 0.2855218059164058, "kl": 0.021585464477539062, "learning_rate": 2.54320987654321e-07, "loss": 0.0, "reward": 1.7392858043313026, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7392857484519482, "rewards/format_reward_func": 1.0, "step": 2060 }, { "completion_length": 230.78572463989258, "epoch": 0.3456976403034494, "grad_norm": 0.2911414082155755, "kl": 0.01221466064453125, "learning_rate": 2.5456790123456787e-07, "loss": 0.0, "reward": 1.7321429550647736, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7321428991854191, "rewards/format_reward_func": 1.0, "step": 2062 }, { "completion_length": 233.9642972946167, "epoch": 0.346032943543317, "grad_norm": 0.3881993855084975, "kl": 0.0128936767578125, "learning_rate": 2.548148148148148e-07, "loss": 0.0, "reward": 1.7785715237259865, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7785714380443096, "rewards/format_reward_func": 1.0, "step": 2064 }, { "completion_length": 221.6384038925171, "epoch": 0.34636824678318456, "grad_norm": 0.26392784790783114, "kl": 0.0257110595703125, "learning_rate": 2.5506172839506176e-07, "loss": 0.0, "reward": 1.792857214808464, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571775555611, "rewards/format_reward_func": 1.0, "step": 2066 }, { "completion_length": 227.79911613464355, "epoch": 0.3467035500230521, "grad_norm": 0.32096854524746926, "kl": 0.026147842407226562, "learning_rate": 2.553086419753086e-07, "loss": 0.0, "reward": 1.76071435213089, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143204659224, "rewards/format_reward_func": 1.0, "step": 2068 }, { "completion_length": 235.91072750091553, "epoch": 0.34703885326291967, "grad_norm": 0.4583612266863497, "kl": 0.03256988525390625, "learning_rate": 2.5555555555555553e-07, "loss": 0.0, "reward": 1.8196429461240768, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.8241071552038193, "rewards/format_reward_func": 0.9955357164144516, "step": 2070 }, { "completion_length": 228.0000114440918, "epoch": 0.3473741565027872, "grad_norm": 0.18039195795853832, "kl": 0.03345489501953125, "learning_rate": 2.558024691358024e-07, "loss": 0.0, "reward": 1.7500000670552254, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000242143869, "rewards/format_reward_func": 1.0, "step": 2072 }, { "completion_length": 228.9687614440918, "epoch": 0.34770945974265477, "grad_norm": 0.19847492504938696, "kl": 0.011264801025390625, "learning_rate": 2.5604938271604936e-07, "loss": 0.0, "reward": 1.7736607789993286, "reward_std": 0.027147850021719933, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 0.9986607171595097, "step": 2074 }, { "completion_length": 233.9241189956665, "epoch": 0.3480447629825223, "grad_norm": 0.21015001646006334, "kl": 0.009357452392578125, "learning_rate": 2.562962962962963e-07, "loss": 0.0, "reward": 1.7732143476605415, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7776785977184772, "rewards/format_reward_func": 0.9955357164144516, "step": 2076 }, { "completion_length": 232.8482255935669, "epoch": 0.3483800662223899, "grad_norm": 0.33872680822882006, "kl": 0.022771835327148438, "learning_rate": 2.565432098765432e-07, "loss": 0.0, "reward": 1.8000000640749931, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.8089285977184772, "rewards/format_reward_func": 0.9910714328289032, "step": 2078 }, { "completion_length": 229.36161708831787, "epoch": 0.34871536946225745, "grad_norm": 0.28679987177075117, "kl": 0.054790496826171875, "learning_rate": 2.5679012345679013e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 2080 }, { "completion_length": 217.6384038925171, "epoch": 0.349050672702125, "grad_norm": 0.27843672987692414, "kl": 0.00815582275390625, "learning_rate": 2.570370370370371e-07, "loss": 0.0, "reward": 1.7857143431901932, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 2082 }, { "completion_length": 237.37501049041748, "epoch": 0.34938597594199255, "grad_norm": 0.37663868418897395, "kl": 0.09327888488769531, "learning_rate": 2.572839506172839e-07, "loss": 0.0001, "reward": 1.7714286148548126, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714286185801029, "rewards/format_reward_func": 1.0, "step": 2084 }, { "completion_length": 229.415189743042, "epoch": 0.3497212791818601, "grad_norm": 0.2059930556278535, "kl": 0.07577705383300781, "learning_rate": 2.5753086419753085e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 2086 }, { "completion_length": 223.26786613464355, "epoch": 0.35005658242172766, "grad_norm": 0.26716203101438885, "kl": 0.01068878173828125, "learning_rate": 2.5777777777777774e-07, "loss": 0.0, "reward": 1.7482143640518188, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7526785992085934, "rewards/format_reward_func": 0.9955357164144516, "step": 2088 }, { "completion_length": 223.92858505249023, "epoch": 0.3503918856615952, "grad_norm": 0.5845833867815128, "kl": 0.16082000732421875, "learning_rate": 2.580246913580247e-07, "loss": 0.0002, "reward": 1.757142923772335, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.757142897695303, "rewards/format_reward_func": 1.0, "step": 2090 }, { "completion_length": 230.1384038925171, "epoch": 0.35072718890146276, "grad_norm": 0.30983776550084197, "kl": 0.0106658935546875, "learning_rate": 2.582716049382716e-07, "loss": 0.0, "reward": 1.76071435213089, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7607143260538578, "rewards/format_reward_func": 1.0, "step": 2092 }, { "completion_length": 225.102689743042, "epoch": 0.35106249214133034, "grad_norm": 0.342229109593764, "kl": 0.019098281860351562, "learning_rate": 2.585185185185185e-07, "loss": 0.0, "reward": 1.778571479022503, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7785714790225029, "rewards/format_reward_func": 1.0, "step": 2094 }, { "completion_length": 231.91518783569336, "epoch": 0.35139779538119786, "grad_norm": 0.34481677702260877, "kl": 0.04109764099121094, "learning_rate": 2.5876543209876545e-07, "loss": 0.0, "reward": 1.7464286535978317, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.746428620070219, "rewards/format_reward_func": 1.0, "step": 2096 }, { "completion_length": 233.06697368621826, "epoch": 0.35173309862106544, "grad_norm": 0.31827473583036603, "kl": 0.01319122314453125, "learning_rate": 2.590123456790123e-07, "loss": 0.0, "reward": 1.769642911851406, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.7741071656346321, "rewards/format_reward_func": 0.9955357164144516, "step": 2098 }, { "completion_length": 230.33036708831787, "epoch": 0.35206840186093297, "grad_norm": 0.351989871127145, "kl": 0.07884025573730469, "learning_rate": 2.5925925925925923e-07, "loss": 0.0001, "reward": 1.7285715192556381, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7285714540630579, "rewards/format_reward_func": 1.0, "step": 2100 }, { "completion_length": 220.97768783569336, "epoch": 0.35240370510080055, "grad_norm": 0.09114508670738188, "kl": 0.024440765380859375, "learning_rate": 2.5950617283950617e-07, "loss": 0.0, "reward": 1.8107143566012383, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107143044471741, "rewards/format_reward_func": 1.0, "step": 2102 }, { "completion_length": 232.7946538925171, "epoch": 0.35273900834066807, "grad_norm": 0.2583027938351751, "kl": 0.04311180114746094, "learning_rate": 2.5975308641975306e-07, "loss": 0.0, "reward": 1.7607143223285675, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7696428950875998, "rewards/format_reward_func": 0.9910714328289032, "step": 2104 }, { "completion_length": 225.14286613464355, "epoch": 0.35307431158053565, "grad_norm": 0.2558472070866311, "kl": 0.009954452514648438, "learning_rate": 2.6e-07, "loss": 0.0, "reward": 1.8250000402331352, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8250000383704901, "rewards/format_reward_func": 1.0, "step": 2106 }, { "completion_length": 230.54465198516846, "epoch": 0.35340961482040323, "grad_norm": 0.37407279058475107, "kl": 0.0380706787109375, "learning_rate": 2.6024691358024694e-07, "loss": 0.0, "reward": 1.7214286401867867, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7214286141097546, "rewards/format_reward_func": 1.0, "step": 2108 }, { "completion_length": 221.95536518096924, "epoch": 0.35374491806027075, "grad_norm": 0.16814010224897627, "kl": 0.08795928955078125, "learning_rate": 2.6049382716049383e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 2110 }, { "completion_length": 228.25893783569336, "epoch": 0.35408022130013833, "grad_norm": 0.45033267511988145, "kl": 0.026628494262695312, "learning_rate": 2.607407407407407e-07, "loss": 0.0, "reward": 1.730357214808464, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7348214685916901, "rewards/format_reward_func": 0.9955357164144516, "step": 2112 }, { "completion_length": 234.99108219146729, "epoch": 0.35441552454000586, "grad_norm": 0.13415719938476556, "kl": 0.062084197998046875, "learning_rate": 2.609876543209876e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 2114 }, { "completion_length": 224.31697368621826, "epoch": 0.35475082777987343, "grad_norm": 0.4264411382430659, "kl": 0.043720245361328125, "learning_rate": 2.6123456790123455e-07, "loss": 0.0, "reward": 1.7303572073578835, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7348214592784643, "rewards/format_reward_func": 0.9955357164144516, "step": 2116 }, { "completion_length": 226.071439743042, "epoch": 0.35508613101974096, "grad_norm": 0.40260476168590575, "kl": 0.015716552734375, "learning_rate": 2.614814814814815e-07, "loss": 0.0, "reward": 1.7392858043313026, "reward_std": 0.09596448857337236, "rewards/equation_reward_func": 0.7392857521772385, "rewards/format_reward_func": 1.0, "step": 2118 }, { "completion_length": 233.508939743042, "epoch": 0.35542143425960854, "grad_norm": 0.24759868181676284, "kl": 0.024053573608398438, "learning_rate": 2.617283950617284e-07, "loss": 0.0, "reward": 1.7571429312229156, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428790688515, "rewards/format_reward_func": 1.0, "step": 2120 }, { "completion_length": 224.0267972946167, "epoch": 0.35575673749947606, "grad_norm": 0.10767464037699406, "kl": 0.14308738708496094, "learning_rate": 2.619753086419753e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428775787354, "rewards/format_reward_func": 1.0, "step": 2122 }, { "completion_length": 231.52232933044434, "epoch": 0.35609204073934364, "grad_norm": 0.3298464882369941, "kl": 0.01337432861328125, "learning_rate": 2.6222222222222226e-07, "loss": 0.0, "reward": 1.7232143580913544, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7276786006987095, "rewards/format_reward_func": 0.9955357164144516, "step": 2124 }, { "completion_length": 223.696439743042, "epoch": 0.3564273439792112, "grad_norm": 0.24994278995491304, "kl": 0.025163650512695312, "learning_rate": 2.624691358024691e-07, "loss": 0.0, "reward": 1.7928571924567223, "reward_std": 0.06060914974659681, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 2126 }, { "completion_length": 224.03125953674316, "epoch": 0.35676264721907874, "grad_norm": 0.423221266506577, "kl": 0.08066177368164062, "learning_rate": 2.6271604938271604e-07, "loss": 0.0001, "reward": 1.7214286625385284, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7214285973459482, "rewards/format_reward_func": 1.0, "step": 2128 }, { "completion_length": 231.25447273254395, "epoch": 0.3570979504589463, "grad_norm": 0.22862671789859407, "kl": 0.025022506713867188, "learning_rate": 2.629629629629629e-07, "loss": 0.0, "reward": 1.7303571924567223, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.734821455553174, "rewards/format_reward_func": 0.9955357164144516, "step": 2130 }, { "completion_length": 226.540189743042, "epoch": 0.35743325369881385, "grad_norm": 0.19159948818631775, "kl": 0.011707305908203125, "learning_rate": 2.6320987654320986e-07, "loss": 0.0, "reward": 1.7428572326898575, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571693599224, "rewards/format_reward_func": 1.0, "step": 2132 }, { "completion_length": 221.31697463989258, "epoch": 0.3577685569386814, "grad_norm": 0.29371425581013727, "kl": 0.04029273986816406, "learning_rate": 2.634567901234568e-07, "loss": 0.0, "reward": 1.7785714864730835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 2134 }, { "completion_length": 228.50447463989258, "epoch": 0.35810386017854895, "grad_norm": 0.12820217400008221, "kl": 0.10086727142333984, "learning_rate": 2.637037037037037e-07, "loss": 0.0001, "reward": 1.8107143491506577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8107143007218838, "rewards/format_reward_func": 1.0, "step": 2136 }, { "completion_length": 231.02233219146729, "epoch": 0.35843916341841653, "grad_norm": 0.23459826960008642, "kl": 0.043140411376953125, "learning_rate": 2.6395061728395064e-07, "loss": 0.0, "reward": 1.7964286357164383, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964286096394062, "rewards/format_reward_func": 1.0, "step": 2138 }, { "completion_length": 221.94643878936768, "epoch": 0.3587744666582841, "grad_norm": 0.3008130333023876, "kl": 0.010679244995117188, "learning_rate": 2.6419753086419747e-07, "loss": 0.0, "reward": 1.7750000730156898, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 2140 }, { "completion_length": 220.62947273254395, "epoch": 0.35910976989815163, "grad_norm": 0.32973760375580985, "kl": 0.09985923767089844, "learning_rate": 2.644444444444444e-07, "loss": 0.0001, "reward": 1.7928571924567223, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571850061417, "rewards/format_reward_func": 1.0, "step": 2142 }, { "completion_length": 228.62501049041748, "epoch": 0.3594450731380192, "grad_norm": 0.48222064013341237, "kl": 0.10888290405273438, "learning_rate": 2.6469135802469135e-07, "loss": 0.0001, "reward": 1.7000000700354576, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7000000458210707, "rewards/format_reward_func": 1.0, "step": 2144 }, { "completion_length": 229.02233219146729, "epoch": 0.35978037637788673, "grad_norm": 0.23380887912828222, "kl": 0.08852958679199219, "learning_rate": 2.6493827160493824e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857670783997, "rewards/format_reward_func": 1.0, "step": 2146 }, { "completion_length": 228.0669755935669, "epoch": 0.3601156796177543, "grad_norm": 0.32278175684584354, "kl": 0.17383193969726562, "learning_rate": 2.651851851851852e-07, "loss": 0.0002, "reward": 1.735714353621006, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 2148 }, { "completion_length": 227.0759048461914, "epoch": 0.36045098285762184, "grad_norm": 0.26353297086437466, "kl": 0.0684661865234375, "learning_rate": 2.654320987654321e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714477300644, "rewards/format_reward_func": 1.0, "step": 2150 }, { "completion_length": 228.1428680419922, "epoch": 0.3607862860974894, "grad_norm": 0.22570297887110932, "kl": 0.05149078369140625, "learning_rate": 2.65679012345679e-07, "loss": 0.0001, "reward": 1.7000000774860382, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7000000402331352, "rewards/format_reward_func": 1.0, "step": 2152 }, { "completion_length": 220.92858028411865, "epoch": 0.361121589337357, "grad_norm": 0.3708480771834063, "kl": 0.08710098266601562, "learning_rate": 2.659259259259259e-07, "loss": 0.0001, "reward": 1.84285718947649, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8428571671247482, "rewards/format_reward_func": 1.0, "step": 2154 }, { "completion_length": 235.52679443359375, "epoch": 0.3614568925772245, "grad_norm": 0.1637799996002816, "kl": 0.05851173400878906, "learning_rate": 2.661728395061728e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.767857164144516, "rewards/format_reward_func": 1.0, "step": 2156 }, { "completion_length": 226.90626049041748, "epoch": 0.3617921958170921, "grad_norm": 0.3360529398155563, "kl": 0.021284103393554688, "learning_rate": 2.6641975308641973e-07, "loss": 0.0, "reward": 1.7821429371833801, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7821428962051868, "rewards/format_reward_func": 1.0, "step": 2158 }, { "completion_length": 233.8973331451416, "epoch": 0.3621274990569596, "grad_norm": 0.31128516923824884, "kl": 0.15531539916992188, "learning_rate": 2.6666666666666667e-07, "loss": 0.0002, "reward": 1.764285758137703, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 2160 }, { "completion_length": 223.227689743042, "epoch": 0.3624628022968272, "grad_norm": 0.23111936609267744, "kl": 0.038791656494140625, "learning_rate": 2.6691358024691356e-07, "loss": 0.0, "reward": 1.817857176065445, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8178571686148643, "rewards/format_reward_func": 1.0, "step": 2162 }, { "completion_length": 221.3035831451416, "epoch": 0.3627981055366947, "grad_norm": 0.4127253192986052, "kl": 0.04503440856933594, "learning_rate": 2.671604938271605e-07, "loss": 0.0, "reward": 1.7821429446339607, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 2164 }, { "completion_length": 221.6428680419922, "epoch": 0.3631334087765623, "grad_norm": 0.223704776342276, "kl": 0.061389923095703125, "learning_rate": 2.674074074074074e-07, "loss": 0.0001, "reward": 1.7428572252392769, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7428571712225676, "rewards/format_reward_func": 1.0, "step": 2166 }, { "completion_length": 228.24108409881592, "epoch": 0.3634687120164299, "grad_norm": 0.22441233228251525, "kl": 0.03566169738769531, "learning_rate": 2.6765432098765433e-07, "loss": 0.0, "reward": 1.7410714998841286, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7455357387661934, "rewards/format_reward_func": 0.9955357164144516, "step": 2168 }, { "completion_length": 225.6830472946167, "epoch": 0.3638040152562974, "grad_norm": 0.13680117104986803, "kl": 0.015272140502929688, "learning_rate": 2.679012345679012e-07, "loss": 0.0, "reward": 1.7892857789993286, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 2170 }, { "completion_length": 222.26340293884277, "epoch": 0.364139318496165, "grad_norm": 0.26671750198225047, "kl": 0.02013397216796875, "learning_rate": 2.681481481481481e-07, "loss": 0.0, "reward": 1.7214286550879478, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.721428606659174, "rewards/format_reward_func": 1.0, "step": 2172 }, { "completion_length": 221.37947463989258, "epoch": 0.3644746217360325, "grad_norm": 0.2374098871508374, "kl": 0.06538772583007812, "learning_rate": 2.6839506172839505e-07, "loss": 0.0001, "reward": 1.7785715162754059, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 2174 }, { "completion_length": 237.63393878936768, "epoch": 0.3648099249759001, "grad_norm": 0.2467816451908553, "kl": 0.03937530517578125, "learning_rate": 2.68641975308642e-07, "loss": 0.0, "reward": 1.7446429431438446, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7491071708500385, "rewards/format_reward_func": 0.9955357164144516, "step": 2176 }, { "completion_length": 230.14733123779297, "epoch": 0.3651452282157676, "grad_norm": 0.3179348050690457, "kl": 0.01219940185546875, "learning_rate": 2.688888888888889e-07, "loss": 0.0, "reward": 1.8178572207689285, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.8178571499884129, "rewards/format_reward_func": 1.0, "step": 2178 }, { "completion_length": 228.75447368621826, "epoch": 0.3654805314556352, "grad_norm": 0.2525358576089782, "kl": 0.02394866943359375, "learning_rate": 2.691358024691358e-07, "loss": 0.0, "reward": 1.7964286282658577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964286096394062, "rewards/format_reward_func": 1.0, "step": 2180 }, { "completion_length": 222.07590293884277, "epoch": 0.3658158346955028, "grad_norm": 0.2556722238310013, "kl": 0.022491455078125, "learning_rate": 2.693827160493827e-07, "loss": 0.0, "reward": 1.6678572222590446, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.6678571812808514, "rewards/format_reward_func": 1.0, "step": 2182 }, { "completion_length": 236.40179634094238, "epoch": 0.3661511379353703, "grad_norm": 0.29412853954255963, "kl": 0.07938003540039062, "learning_rate": 2.696296296296296e-07, "loss": 0.0001, "reward": 1.7571429461240768, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7571429051458836, "rewards/format_reward_func": 1.0, "step": 2184 }, { "completion_length": 234.1428689956665, "epoch": 0.3664864411752379, "grad_norm": 0.2060815902787529, "kl": 0.10279083251953125, "learning_rate": 2.6987654320987654e-07, "loss": 0.0001, "reward": 1.698214367032051, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7026786133646965, "rewards/format_reward_func": 0.9955357164144516, "step": 2186 }, { "completion_length": 227.27233219146729, "epoch": 0.3668217444151054, "grad_norm": 0.3214990798853585, "kl": 0.06783485412597656, "learning_rate": 2.7012345679012343e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7750000394880772, "rewards/format_reward_func": 1.0, "step": 2188 }, { "completion_length": 232.0803680419922, "epoch": 0.367157047654973, "grad_norm": 0.29116456137569957, "kl": 0.06546974182128906, "learning_rate": 2.7037037037037037e-07, "loss": 0.0001, "reward": 1.8035714849829674, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8035714440047741, "rewards/format_reward_func": 1.0, "step": 2190 }, { "completion_length": 236.8259048461914, "epoch": 0.3674923508948405, "grad_norm": 0.275873402245631, "kl": 0.3006572723388672, "learning_rate": 2.7061728395061726e-07, "loss": 0.0003, "reward": 1.7125000655651093, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7169643267989159, "rewards/format_reward_func": 0.9955357164144516, "step": 2192 }, { "completion_length": 235.76786613464355, "epoch": 0.3678276541347081, "grad_norm": 0.37498091593844435, "kl": 0.12526512145996094, "learning_rate": 2.708641975308642e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714507102966, "rewards/format_reward_func": 1.0, "step": 2194 }, { "completion_length": 238.69197463989258, "epoch": 0.3681629573745756, "grad_norm": 0.19498387354928298, "kl": 0.19458770751953125, "learning_rate": 2.7111111111111114e-07, "loss": 0.0002, "reward": 1.7267857939004898, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7312500365078449, "rewards/format_reward_func": 0.9955357164144516, "step": 2196 }, { "completion_length": 232.6339406967163, "epoch": 0.3684982606144432, "grad_norm": 0.16930298305774352, "kl": 0.04517364501953125, "learning_rate": 2.71358024691358e-07, "loss": 0.0, "reward": 1.7821429148316383, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7910714522004128, "rewards/format_reward_func": 0.9910714328289032, "step": 2198 }, { "completion_length": 225.65179538726807, "epoch": 0.36883356385431076, "grad_norm": 0.2732590185925259, "kl": 0.09507369995117188, "learning_rate": 2.716049382716049e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571697324514, "rewards/format_reward_func": 1.0, "step": 2200 }, { "completion_length": 220.92858028411865, "epoch": 0.3691688670941783, "grad_norm": 0.3480826700798807, "kl": 0.2006244659423828, "learning_rate": 2.7185185185185186e-07, "loss": 0.0002, "reward": 1.7571429386734962, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428827941418, "rewards/format_reward_func": 1.0, "step": 2202 }, { "completion_length": 228.95983028411865, "epoch": 0.36950417033404587, "grad_norm": 0.23230064612406418, "kl": 0.09466743469238281, "learning_rate": 2.7209876543209875e-07, "loss": 0.0001, "reward": 1.7410715073347092, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7455357499420643, "rewards/format_reward_func": 0.9955357164144516, "step": 2204 }, { "completion_length": 225.86161613464355, "epoch": 0.3698394735739134, "grad_norm": 0.37423496124042877, "kl": 0.07233428955078125, "learning_rate": 2.723456790123457e-07, "loss": 0.0001, "reward": 1.7000000849366188, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7000000383704901, "rewards/format_reward_func": 1.0, "step": 2206 }, { "completion_length": 235.37501049041748, "epoch": 0.37017477681378097, "grad_norm": 0.20144566733771846, "kl": 0.08749961853027344, "learning_rate": 2.725925925925926e-07, "loss": 0.0001, "reward": 1.7232143506407738, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7276786044239998, "rewards/format_reward_func": 0.9955357164144516, "step": 2208 }, { "completion_length": 231.55804538726807, "epoch": 0.3705100800536485, "grad_norm": 0.9344322170031661, "kl": 0.2306499481201172, "learning_rate": 2.728395061728395e-07, "loss": 0.0002, "reward": 1.7214286476373672, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7214286029338837, "rewards/format_reward_func": 1.0, "step": 2210 }, { "completion_length": 232.2946538925171, "epoch": 0.3708453832935161, "grad_norm": 0.1253578302593943, "kl": 0.07514762878417969, "learning_rate": 2.730864197530864e-07, "loss": 0.0001, "reward": 1.7535714879631996, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.753571467474103, "rewards/format_reward_func": 1.0, "step": 2212 }, { "completion_length": 232.62054824829102, "epoch": 0.37118068653338365, "grad_norm": 0.20158043710907292, "kl": 0.1405200958251953, "learning_rate": 2.733333333333333e-07, "loss": 0.0001, "reward": 1.7500000521540642, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7589286118745804, "rewards/format_reward_func": 0.9910714328289032, "step": 2214 }, { "completion_length": 231.17411518096924, "epoch": 0.3715159897732512, "grad_norm": 0.5074480366301922, "kl": 0.08559036254882812, "learning_rate": 2.7358024691358023e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 2216 }, { "completion_length": 228.22768783569336, "epoch": 0.37185129301311876, "grad_norm": 0.16069692799159083, "kl": 0.05332183837890625, "learning_rate": 2.738271604938271e-07, "loss": 0.0001, "reward": 1.7714286595582962, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 2218 }, { "completion_length": 233.99108219146729, "epoch": 0.3721865962529863, "grad_norm": 0.45528804532488343, "kl": 0.2368755340576172, "learning_rate": 2.7407407407407406e-07, "loss": 0.0002, "reward": 1.757142923772335, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 2220 }, { "completion_length": 226.79465198516846, "epoch": 0.37252189949285386, "grad_norm": 0.4392680631342267, "kl": 0.0537261962890625, "learning_rate": 2.74320987654321e-07, "loss": 0.0001, "reward": 1.7714286148548126, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 2222 }, { "completion_length": 240.25000858306885, "epoch": 0.3728572027327214, "grad_norm": 0.42132753666272715, "kl": 0.18262672424316406, "learning_rate": 2.745679012345679e-07, "loss": 0.0002, "reward": 1.7696429044008255, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7741071656346321, "rewards/format_reward_func": 0.9955357164144516, "step": 2224 }, { "completion_length": 227.9419755935669, "epoch": 0.37319250597258896, "grad_norm": 0.29602895586363986, "kl": 0.07367324829101562, "learning_rate": 2.748148148148148e-07, "loss": 0.0001, "reward": 1.7571429163217545, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 2226 }, { "completion_length": 222.40179443359375, "epoch": 0.37352780921245654, "grad_norm": 0.18568951191388172, "kl": 0.08404731750488281, "learning_rate": 2.750617283950617e-07, "loss": 0.0001, "reward": 1.7535714879631996, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714469850063, "rewards/format_reward_func": 1.0, "step": 2228 }, { "completion_length": 230.79018878936768, "epoch": 0.37386311245232406, "grad_norm": 0.2065159293766093, "kl": 0.041645050048828125, "learning_rate": 2.753086419753086e-07, "loss": 0.0, "reward": 1.796428643167019, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.796428594738245, "rewards/format_reward_func": 1.0, "step": 2230 }, { "completion_length": 224.0625114440918, "epoch": 0.37419841569219164, "grad_norm": 0.33197065162865186, "kl": 0.0833282470703125, "learning_rate": 2.7555555555555555e-07, "loss": 0.0001, "reward": 1.7821429297327995, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.782142873853445, "rewards/format_reward_func": 1.0, "step": 2232 }, { "completion_length": 235.4776906967163, "epoch": 0.37453371893205917, "grad_norm": 0.25584059088231476, "kl": 0.040119171142578125, "learning_rate": 2.7580246913580244e-07, "loss": 0.0, "reward": 1.7107143625617027, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7107143253087997, "rewards/format_reward_func": 1.0, "step": 2234 }, { "completion_length": 231.67411708831787, "epoch": 0.37486902217192675, "grad_norm": 0.35658627996828857, "kl": 0.6641654968261719, "learning_rate": 2.760493827160494e-07, "loss": 0.0007, "reward": 1.7589286491274834, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7633928917348385, "rewards/format_reward_func": 0.9955357164144516, "step": 2236 }, { "completion_length": 231.4196548461914, "epoch": 0.37520432541179427, "grad_norm": 0.36641847714887776, "kl": 0.057590484619140625, "learning_rate": 2.762962962962963e-07, "loss": 0.0001, "reward": 1.7750000730156898, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 2238 }, { "completion_length": 228.5223331451416, "epoch": 0.37553962865166185, "grad_norm": 0.24546364124161799, "kl": 0.059139251708984375, "learning_rate": 2.7654320987654316e-07, "loss": 0.0001, "reward": 1.7589286267757416, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7633928898721933, "rewards/format_reward_func": 0.9955357164144516, "step": 2240 }, { "completion_length": 233.8750123977661, "epoch": 0.37587493189152943, "grad_norm": 0.2251870227230481, "kl": 0.18067550659179688, "learning_rate": 2.767901234567901e-07, "loss": 0.0002, "reward": 1.7196429297327995, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7241071723401546, "rewards/format_reward_func": 0.9955357164144516, "step": 2242 }, { "completion_length": 225.83036613464355, "epoch": 0.37621023513139695, "grad_norm": 0.18177423931815112, "kl": 0.13021469116210938, "learning_rate": 2.77037037037037e-07, "loss": 0.0001, "reward": 1.7321429178118706, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7321428898721933, "rewards/format_reward_func": 1.0, "step": 2244 }, { "completion_length": 224.49108028411865, "epoch": 0.37654553837126453, "grad_norm": 0.156748013206083, "kl": 0.1608562469482422, "learning_rate": 2.7728395061728393e-07, "loss": 0.0002, "reward": 1.7785714864730835, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 2246 }, { "completion_length": 236.4464406967163, "epoch": 0.37688084161113206, "grad_norm": 0.25260715945203926, "kl": 0.2948493957519531, "learning_rate": 2.7753086419753087e-07, "loss": 0.0003, "reward": 1.6928572282195091, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.6928571742027998, "rewards/format_reward_func": 1.0, "step": 2248 }, { "completion_length": 235.70983409881592, "epoch": 0.37721614485099964, "grad_norm": 0.17868279769604858, "kl": 0.021900177001953125, "learning_rate": 2.7777777777777776e-07, "loss": 0.0, "reward": 1.7464286535978317, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464285902678967, "rewards/format_reward_func": 1.0, "step": 2250 }, { "completion_length": 228.9509038925171, "epoch": 0.37755144809086716, "grad_norm": 0.24309000324636804, "kl": 0.2515907287597656, "learning_rate": 2.780246913580247e-07, "loss": 0.0003, "reward": 1.7696429267525673, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.774107176810503, "rewards/format_reward_func": 0.9955357164144516, "step": 2252 }, { "completion_length": 226.20983219146729, "epoch": 0.37788675133073474, "grad_norm": 0.2029313002323469, "kl": 0.1733074188232422, "learning_rate": 2.782716049382716e-07, "loss": 0.0002, "reward": 1.760714367032051, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.760714303702116, "rewards/format_reward_func": 1.0, "step": 2254 }, { "completion_length": 228.9732265472412, "epoch": 0.37822205457060226, "grad_norm": 0.3040059058343939, "kl": 0.12060165405273438, "learning_rate": 2.785185185185185e-07, "loss": 0.0001, "reward": 1.7392857819795609, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857484519482, "rewards/format_reward_func": 1.0, "step": 2256 }, { "completion_length": 233.83036518096924, "epoch": 0.37855735781046984, "grad_norm": 0.3822294853412589, "kl": 0.17465782165527344, "learning_rate": 2.787654320987654e-07, "loss": 0.0002, "reward": 1.7446429133415222, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7491071783006191, "rewards/format_reward_func": 0.9955357164144516, "step": 2258 }, { "completion_length": 228.77233028411865, "epoch": 0.3788926610503374, "grad_norm": 0.3139393157242552, "kl": 0.5619621276855469, "learning_rate": 2.790123456790123e-07, "loss": 0.0006, "reward": 1.7410714849829674, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7455357499420643, "rewards/format_reward_func": 0.9955357164144516, "step": 2260 }, { "completion_length": 228.11608123779297, "epoch": 0.37922796429020494, "grad_norm": 0.1977537144990948, "kl": 0.09502792358398438, "learning_rate": 2.7925925925925925e-07, "loss": 0.0001, "reward": 1.7000000923871994, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.700000025331974, "rewards/format_reward_func": 1.0, "step": 2262 }, { "completion_length": 231.3035831451416, "epoch": 0.3795632675300725, "grad_norm": 0.262065947559945, "kl": 0.09780120849609375, "learning_rate": 2.795061728395062e-07, "loss": 0.0001, "reward": 1.7035714983940125, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7035714611411095, "rewards/format_reward_func": 1.0, "step": 2264 }, { "completion_length": 230.77679634094238, "epoch": 0.37989857076994005, "grad_norm": 0.37533314523360195, "kl": 0.018627166748046875, "learning_rate": 2.797530864197531e-07, "loss": 0.0, "reward": 1.7571429088711739, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7571428827941418, "rewards/format_reward_func": 1.0, "step": 2266 }, { "completion_length": 229.883939743042, "epoch": 0.3802338740098076, "grad_norm": 0.2793404214719423, "kl": 0.12833023071289062, "learning_rate": 2.8e-07, "loss": 0.0001, "reward": 1.7553571835160255, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7598214708268642, "rewards/format_reward_func": 0.9955357164144516, "step": 2268 }, { "completion_length": 223.883939743042, "epoch": 0.38056917724967515, "grad_norm": 0.12149878381355204, "kl": 0.021205902099609375, "learning_rate": 2.8024691358024685e-07, "loss": 0.0, "reward": 1.7714286148548126, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714286148548126, "rewards/format_reward_func": 1.0, "step": 2270 }, { "completion_length": 225.12947463989258, "epoch": 0.38090448048954273, "grad_norm": 0.20368255516424766, "kl": 0.05774688720703125, "learning_rate": 2.804938271604938e-07, "loss": 0.0001, "reward": 1.6821429431438446, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.6821428947150707, "rewards/format_reward_func": 1.0, "step": 2272 }, { "completion_length": 229.54465293884277, "epoch": 0.3812397837294103, "grad_norm": 0.2472125611890382, "kl": 0.2638816833496094, "learning_rate": 2.8074074074074074e-07, "loss": 0.0003, "reward": 1.7392857819795609, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7482143081724644, "rewards/format_reward_func": 0.9910714328289032, "step": 2274 }, { "completion_length": 224.26340293884277, "epoch": 0.38157508696927783, "grad_norm": 0.33149997659240854, "kl": 0.07962608337402344, "learning_rate": 2.809876543209876e-07, "loss": 0.0001, "reward": 1.7785715162754059, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7785714454948902, "rewards/format_reward_func": 1.0, "step": 2276 }, { "completion_length": 227.98661708831787, "epoch": 0.3819103902091454, "grad_norm": 0.3172056878755379, "kl": 0.09324264526367188, "learning_rate": 2.8123456790123457e-07, "loss": 0.0001, "reward": 1.7464286461472511, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 2278 }, { "completion_length": 225.8660831451416, "epoch": 0.38224569344901294, "grad_norm": 0.5301463048688589, "kl": 0.4278106689453125, "learning_rate": 2.814814814814815e-07, "loss": 0.0004, "reward": 1.74642863124609, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286163449287, "rewards/format_reward_func": 1.0, "step": 2280 }, { "completion_length": 229.35715293884277, "epoch": 0.3825809966888805, "grad_norm": 0.38356873327089946, "kl": 0.5712699890136719, "learning_rate": 2.817283950617284e-07, "loss": 0.0006, "reward": 1.7267857864499092, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.731250049546361, "rewards/format_reward_func": 0.9955357164144516, "step": 2282 }, { "completion_length": 235.3884038925171, "epoch": 0.38291629992874804, "grad_norm": 0.934076897552111, "kl": 0.042568206787109375, "learning_rate": 2.819753086419753e-07, "loss": 0.0, "reward": 1.7892857566475868, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 2284 }, { "completion_length": 227.19643878936768, "epoch": 0.3832516031686156, "grad_norm": 0.16375574573485474, "kl": 0.04029083251953125, "learning_rate": 2.8222222222222217e-07, "loss": 0.0, "reward": 1.789285771548748, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 2286 }, { "completion_length": 227.34375953674316, "epoch": 0.3835869064084832, "grad_norm": 0.3301045316257568, "kl": 0.05418968200683594, "learning_rate": 2.824691358024691e-07, "loss": 0.0001, "reward": 1.7464286535978317, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 2288 }, { "completion_length": 224.05804443359375, "epoch": 0.3839222096483507, "grad_norm": 0.2926858428633974, "kl": 0.021707534790039062, "learning_rate": 2.8271604938271606e-07, "loss": 0.0, "reward": 1.7642857804894447, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7732143104076385, "rewards/format_reward_func": 0.9910714328289032, "step": 2290 }, { "completion_length": 230.9553689956665, "epoch": 0.3842575128882183, "grad_norm": 0.32705642260561363, "kl": 0.13091659545898438, "learning_rate": 2.8296296296296294e-07, "loss": 0.0001, "reward": 1.6982143595814705, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7026786096394062, "rewards/format_reward_func": 0.9955357164144516, "step": 2292 }, { "completion_length": 229.50447273254395, "epoch": 0.3845928161280858, "grad_norm": 0.21564649196585978, "kl": 0.039546966552734375, "learning_rate": 2.832098765432099e-07, "loss": 0.0, "reward": 1.7464286759495735, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.746428593993187, "rewards/format_reward_func": 1.0, "step": 2294 }, { "completion_length": 219.53125953674316, "epoch": 0.3849281193679534, "grad_norm": 0.26692613221581457, "kl": 0.033473968505859375, "learning_rate": 2.834567901234568e-07, "loss": 0.0, "reward": 1.7785714864730835, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 1.0, "step": 2296 }, { "completion_length": 233.65179538726807, "epoch": 0.3852634226078209, "grad_norm": 0.2678121392019473, "kl": 0.06494903564453125, "learning_rate": 2.8370370370370366e-07, "loss": 0.0001, "reward": 1.7285715118050575, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7285714782774448, "rewards/format_reward_func": 1.0, "step": 2298 }, { "completion_length": 228.22322368621826, "epoch": 0.3855987258476885, "grad_norm": 0.39709523911952316, "kl": 0.040225982666015625, "learning_rate": 2.839506172839506e-07, "loss": 0.0, "reward": 1.7250000834465027, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7339286096394062, "rewards/format_reward_func": 0.9910714328289032, "step": 2300 }, { "completion_length": 217.65179634094238, "epoch": 0.3859340290875561, "grad_norm": 0.39208732176897304, "kl": 0.064727783203125, "learning_rate": 2.841975308641975e-07, "loss": 0.0001, "reward": 1.8071429133415222, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428798139095, "rewards/format_reward_func": 1.0, "step": 2302 }, { "completion_length": 229.55804920196533, "epoch": 0.3862693323274236, "grad_norm": 0.31110812623321255, "kl": 0.036376953125, "learning_rate": 2.8444444444444443e-07, "loss": 0.0, "reward": 1.8035714998841286, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8035714663565159, "rewards/format_reward_func": 1.0, "step": 2304 }, { "completion_length": 227.11161708831787, "epoch": 0.3866046355672912, "grad_norm": 0.2816325042184154, "kl": 0.028156280517578125, "learning_rate": 2.846913580246914e-07, "loss": 0.0, "reward": 1.7464286461472511, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 2306 }, { "completion_length": 216.4241189956665, "epoch": 0.3869399388071587, "grad_norm": 0.14324961565346786, "kl": 0.02130126953125, "learning_rate": 2.8493827160493826e-07, "loss": 0.0, "reward": 1.821428619325161, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.821428582072258, "rewards/format_reward_func": 1.0, "step": 2308 }, { "completion_length": 221.38393878936768, "epoch": 0.3872752420470263, "grad_norm": 0.26443223145301653, "kl": 0.026519775390625, "learning_rate": 2.851851851851852e-07, "loss": 0.0, "reward": 1.742857187986374, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571730852127, "rewards/format_reward_func": 1.0, "step": 2310 }, { "completion_length": 234.01340293884277, "epoch": 0.3876105452868938, "grad_norm": 0.16721530630211082, "kl": 0.02655792236328125, "learning_rate": 2.8543209876543204e-07, "loss": 0.0, "reward": 1.7142857909202576, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.714285746216774, "rewards/format_reward_func": 1.0, "step": 2312 }, { "completion_length": 236.1384048461914, "epoch": 0.3879458485267614, "grad_norm": 0.18018195259232184, "kl": 0.025665283203125, "learning_rate": 2.85679012345679e-07, "loss": 0.0, "reward": 1.7142857760190964, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7142857611179352, "rewards/format_reward_func": 1.0, "step": 2314 }, { "completion_length": 231.79465293884277, "epoch": 0.3882811517666289, "grad_norm": 0.2801403066500493, "kl": 0.034091949462890625, "learning_rate": 2.859259259259259e-07, "loss": 0.0, "reward": 1.7464286535978317, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.746428620070219, "rewards/format_reward_func": 1.0, "step": 2316 }, { "completion_length": 222.54465103149414, "epoch": 0.3886164550064965, "grad_norm": 0.19240949356448653, "kl": 0.026641845703125, "learning_rate": 2.861728395061728e-07, "loss": 0.0, "reward": 1.7821429073810577, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 2318 }, { "completion_length": 227.81251335144043, "epoch": 0.3889517582463641, "grad_norm": 0.14613030642705596, "kl": 0.030071258544921875, "learning_rate": 2.8641975308641975e-07, "loss": 0.0, "reward": 1.7035714983940125, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7035714592784643, "rewards/format_reward_func": 1.0, "step": 2320 }, { "completion_length": 227.46876049041748, "epoch": 0.3892870614862316, "grad_norm": 0.39846413926302476, "kl": 0.030303955078125, "learning_rate": 2.866666666666667e-07, "loss": 0.0, "reward": 1.7767857685685158, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7812500223517418, "rewards/format_reward_func": 0.9955357164144516, "step": 2322 }, { "completion_length": 221.89733123779297, "epoch": 0.3896223647260992, "grad_norm": 0.41518256668271264, "kl": 0.040134429931640625, "learning_rate": 2.869135802469136e-07, "loss": 0.0, "reward": 1.7821429073810577, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428775787354, "rewards/format_reward_func": 1.0, "step": 2324 }, { "completion_length": 234.54465579986572, "epoch": 0.3899576679659667, "grad_norm": 0.4979212066521887, "kl": 0.04520416259765625, "learning_rate": 2.8716049382716047e-07, "loss": 0.0, "reward": 1.7500000596046448, "reward_std": 0.09091372601687908, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 2326 }, { "completion_length": 225.66518783569336, "epoch": 0.3902929712058343, "grad_norm": 0.68471046288006, "kl": 0.06603240966796875, "learning_rate": 2.8740740740740736e-07, "loss": 0.0001, "reward": 1.7232143580913544, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7276786118745804, "rewards/format_reward_func": 0.9955357164144516, "step": 2328 }, { "completion_length": 230.5134038925171, "epoch": 0.3906282744457018, "grad_norm": 0.1734750680599172, "kl": 0.022487640380859375, "learning_rate": 2.876543209876543e-07, "loss": 0.0, "reward": 1.7642857879400253, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857357859612, "rewards/format_reward_func": 1.0, "step": 2330 }, { "completion_length": 226.7009038925171, "epoch": 0.3909635776855694, "grad_norm": 0.17619003683755471, "kl": 0.052631378173828125, "learning_rate": 2.8790123456790124e-07, "loss": 0.0001, "reward": 1.6821429207921028, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.682142898440361, "rewards/format_reward_func": 1.0, "step": 2332 }, { "completion_length": 223.3928680419922, "epoch": 0.39129888092543696, "grad_norm": 0.2130792522496975, "kl": 0.0273590087890625, "learning_rate": 2.8814814814814813e-07, "loss": 0.0, "reward": 1.7625000849366188, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643089175224, "rewards/format_reward_func": 0.9955357164144516, "step": 2334 }, { "completion_length": 230.91518878936768, "epoch": 0.3916341841653045, "grad_norm": 0.16563670418519555, "kl": 0.0242462158203125, "learning_rate": 2.8839506172839507e-07, "loss": 0.0, "reward": 1.7607143223285675, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 2336 }, { "completion_length": 223.83036708831787, "epoch": 0.39196948740517207, "grad_norm": 0.2537452887729266, "kl": 0.0279083251953125, "learning_rate": 2.8864197530864196e-07, "loss": 0.0, "reward": 1.7857143506407738, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 2338 }, { "completion_length": 233.96876430511475, "epoch": 0.3923047906450396, "grad_norm": 0.25310078074433373, "kl": 0.04262542724609375, "learning_rate": 2.8888888888888885e-07, "loss": 0.0, "reward": 1.8000000566244125, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000230967999, "rewards/format_reward_func": 1.0, "step": 2340 }, { "completion_length": 231.4732255935669, "epoch": 0.39264009388490717, "grad_norm": 0.6108374179336262, "kl": 0.07244873046875, "learning_rate": 2.891358024691358e-07, "loss": 0.0001, "reward": 1.7750000804662704, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000096857548, "rewards/format_reward_func": 1.0, "step": 2342 }, { "completion_length": 230.8437614440918, "epoch": 0.3929753971247747, "grad_norm": 0.20767365117804806, "kl": 0.031772613525390625, "learning_rate": 2.893827160493827e-07, "loss": 0.0, "reward": 1.6892857998609543, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.6892857402563095, "rewards/format_reward_func": 1.0, "step": 2344 }, { "completion_length": 229.22768783569336, "epoch": 0.3933107003646423, "grad_norm": 0.18029574615767352, "kl": 0.055999755859375, "learning_rate": 2.896296296296296e-07, "loss": 0.0001, "reward": 1.7357143759727478, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.735714316368103, "rewards/format_reward_func": 1.0, "step": 2346 }, { "completion_length": 231.33483123779297, "epoch": 0.39364600360450985, "grad_norm": 0.174887596588021, "kl": 0.020839691162109375, "learning_rate": 2.8987654320987656e-07, "loss": 0.0, "reward": 1.7142857983708382, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7142857443541288, "rewards/format_reward_func": 1.0, "step": 2348 }, { "completion_length": 239.0803680419922, "epoch": 0.3939813068443774, "grad_norm": 0.24591155442162888, "kl": 0.028659820556640625, "learning_rate": 2.9012345679012345e-07, "loss": 0.0, "reward": 1.7821429073810577, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 2350 }, { "completion_length": 232.0580472946167, "epoch": 0.39431661008424496, "grad_norm": 0.16364288828454512, "kl": 0.031280517578125, "learning_rate": 2.903703703703704e-07, "loss": 0.0, "reward": 1.7428572177886963, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7428571823984385, "rewards/format_reward_func": 1.0, "step": 2352 }, { "completion_length": 238.96429634094238, "epoch": 0.3946519133241125, "grad_norm": 0.25228292667784574, "kl": 0.03216552734375, "learning_rate": 2.906172839506173e-07, "loss": 0.0, "reward": 1.7625000700354576, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7669643089175224, "rewards/format_reward_func": 0.9955357164144516, "step": 2354 }, { "completion_length": 237.01786994934082, "epoch": 0.39498721656398006, "grad_norm": 0.2681810554388981, "kl": 0.0478515625, "learning_rate": 2.9086419753086416e-07, "loss": 0.0, "reward": 1.7214286476373672, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.721428606659174, "rewards/format_reward_func": 1.0, "step": 2356 }, { "completion_length": 231.8660831451416, "epoch": 0.3953225198038476, "grad_norm": 0.2985047069341516, "kl": 0.048858642578125, "learning_rate": 2.911111111111111e-07, "loss": 0.0, "reward": 1.7732143327593803, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7776785958558321, "rewards/format_reward_func": 0.9955357164144516, "step": 2358 }, { "completion_length": 240.35268878936768, "epoch": 0.39565782304371516, "grad_norm": 0.2855045435472761, "kl": 0.023204803466796875, "learning_rate": 2.91358024691358e-07, "loss": 0.0, "reward": 1.7428571954369545, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7517857439815998, "rewards/format_reward_func": 0.9910714328289032, "step": 2360 }, { "completion_length": 237.57143878936768, "epoch": 0.39599312628358274, "grad_norm": 0.290504962458472, "kl": 0.030864715576171875, "learning_rate": 2.9160493827160494e-07, "loss": 0.0, "reward": 1.7607143744826317, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 2362 }, { "completion_length": 231.0669755935669, "epoch": 0.39632842952345027, "grad_norm": 0.3066050981425015, "kl": 0.025148391723632812, "learning_rate": 2.918518518518518e-07, "loss": 0.0, "reward": 1.791071504354477, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.795535746961832, "rewards/format_reward_func": 0.9955357164144516, "step": 2364 }, { "completion_length": 238.04465579986572, "epoch": 0.39666373276331784, "grad_norm": 0.1959750175843398, "kl": 0.034114837646484375, "learning_rate": 2.9209876543209877e-07, "loss": 0.0, "reward": 1.6928572431206703, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.692857164889574, "rewards/format_reward_func": 1.0, "step": 2366 }, { "completion_length": 224.20090293884277, "epoch": 0.39699903600318537, "grad_norm": 0.18280818205958912, "kl": 0.0224151611328125, "learning_rate": 2.923456790123457e-07, "loss": 0.0, "reward": 1.753571517765522, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 2368 }, { "completion_length": 228.18750953674316, "epoch": 0.39733433924305295, "grad_norm": 0.40446503207075296, "kl": 0.059787750244140625, "learning_rate": 2.9259259259259254e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 2370 }, { "completion_length": 238.78572750091553, "epoch": 0.39766964248292047, "grad_norm": 0.3511749525167252, "kl": 0.02414703369140625, "learning_rate": 2.928395061728395e-07, "loss": 0.0, "reward": 1.7464286386966705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 2372 }, { "completion_length": 250.41518783569336, "epoch": 0.39800494572278805, "grad_norm": 0.4279324335140687, "kl": 0.04547119140625, "learning_rate": 2.930864197530864e-07, "loss": 0.0, "reward": 1.712500087916851, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7169643044471741, "rewards/format_reward_func": 0.9955357164144516, "step": 2374 }, { "completion_length": 233.25893878936768, "epoch": 0.3983402489626556, "grad_norm": 0.3870702027447503, "kl": 0.03582000732421875, "learning_rate": 2.933333333333333e-07, "loss": 0.0, "reward": 1.6857143640518188, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.6857143249362707, "rewards/format_reward_func": 1.0, "step": 2376 }, { "completion_length": 243.75000953674316, "epoch": 0.39867555220252315, "grad_norm": 0.2756062262293934, "kl": 0.03237152099609375, "learning_rate": 2.9358024691358025e-07, "loss": 0.0, "reward": 1.7607143744826317, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143074274063, "rewards/format_reward_func": 1.0, "step": 2378 }, { "completion_length": 232.4821538925171, "epoch": 0.39901085544239073, "grad_norm": 0.21033752055447466, "kl": 0.022663116455078125, "learning_rate": 2.9382716049382714e-07, "loss": 0.0, "reward": 1.7017858177423477, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7062500212341547, "rewards/format_reward_func": 0.9955357164144516, "step": 2380 }, { "completion_length": 233.0535831451416, "epoch": 0.39934615868225826, "grad_norm": 0.30371212478863435, "kl": 0.02435302734375, "learning_rate": 2.940740740740741e-07, "loss": 0.0, "reward": 1.7821429446339607, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.782142873853445, "rewards/format_reward_func": 1.0, "step": 2382 }, { "completion_length": 233.5491180419922, "epoch": 0.39968146192212584, "grad_norm": 0.29400370741808635, "kl": 0.0242462158203125, "learning_rate": 2.9432098765432097e-07, "loss": 0.0, "reward": 1.7357143387198448, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7357143182307482, "rewards/format_reward_func": 1.0, "step": 2384 }, { "completion_length": 235.47768878936768, "epoch": 0.40001676516199336, "grad_norm": 0.24495863289215739, "kl": 0.0254669189453125, "learning_rate": 2.9456790123456786e-07, "loss": 0.0, "reward": 1.7607143372297287, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7696428783237934, "rewards/format_reward_func": 0.9910714328289032, "step": 2386 }, { "completion_length": 234.45090293884277, "epoch": 0.40035206840186094, "grad_norm": 0.18587902604819076, "kl": 0.0247650146484375, "learning_rate": 2.948148148148148e-07, "loss": 0.0, "reward": 1.807142898440361, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.807142898440361, "rewards/format_reward_func": 1.0, "step": 2388 }, { "completion_length": 233.30358409881592, "epoch": 0.40068737164172846, "grad_norm": 0.41121728830485704, "kl": 0.036991119384765625, "learning_rate": 2.950617283950617e-07, "loss": 0.0, "reward": 1.7928572073578835, "reward_std": 0.07071067485958338, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 2390 }, { "completion_length": 234.9732255935669, "epoch": 0.40102267488159604, "grad_norm": 0.2577178462756106, "kl": 0.027858734130859375, "learning_rate": 2.9530864197530863e-07, "loss": 0.0, "reward": 1.839285746216774, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.839285746216774, "rewards/format_reward_func": 1.0, "step": 2392 }, { "completion_length": 233.92411994934082, "epoch": 0.4013579781214636, "grad_norm": 0.08764588825744794, "kl": 0.02735137939453125, "learning_rate": 2.9555555555555557e-07, "loss": 0.0, "reward": 1.8035714700818062, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 2394 }, { "completion_length": 238.86161708831787, "epoch": 0.40169328136133114, "grad_norm": 0.1686954172881697, "kl": 0.026401519775390625, "learning_rate": 2.9580246913580246e-07, "loss": 0.0, "reward": 1.7428571954369545, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7517857570201159, "rewards/format_reward_func": 0.9910714328289032, "step": 2396 }, { "completion_length": 240.46875953674316, "epoch": 0.4020285846011987, "grad_norm": 0.28063510826087984, "kl": 0.034069061279296875, "learning_rate": 2.9604938271604935e-07, "loss": 0.0, "reward": 1.782142922282219, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 2398 }, { "completion_length": 243.3928680419922, "epoch": 0.40236388784106625, "grad_norm": 0.4889857363092545, "kl": 0.03224945068359375, "learning_rate": 2.962962962962963e-07, "loss": 0.0, "reward": 1.7535714954137802, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.762500025331974, "rewards/format_reward_func": 0.9910714328289032, "step": 2400 }, { "completion_length": 236.2500123977661, "epoch": 0.4026991910809338, "grad_norm": 0.10869538161812134, "kl": 0.03144073486328125, "learning_rate": 2.965432098765432e-07, "loss": 0.0, "reward": 1.7178572118282318, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7178571708500385, "rewards/format_reward_func": 1.0, "step": 2402 }, { "completion_length": 244.38840579986572, "epoch": 0.40303449432080135, "grad_norm": 0.40208088618904964, "kl": 0.033901214599609375, "learning_rate": 2.967901234567901e-07, "loss": 0.0, "reward": 1.7714286372065544, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7803571596741676, "rewards/format_reward_func": 0.9910714328289032, "step": 2404 }, { "completion_length": 232.1964406967163, "epoch": 0.40336979756066893, "grad_norm": 0.290996152868013, "kl": 0.026607513427734375, "learning_rate": 2.97037037037037e-07, "loss": 0.0, "reward": 1.7875000461935997, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7919643074274063, "rewards/format_reward_func": 0.9955357164144516, "step": 2406 }, { "completion_length": 243.1785831451416, "epoch": 0.4037051008005365, "grad_norm": 0.17181050036006004, "kl": 0.06661605834960938, "learning_rate": 2.9728395061728395e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 2408 }, { "completion_length": 239.33483219146729, "epoch": 0.40404040404040403, "grad_norm": 0.20456214359825808, "kl": 0.06856155395507812, "learning_rate": 2.975308641975309e-07, "loss": 0.0001, "reward": 1.7625000402331352, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643051922321, "rewards/format_reward_func": 0.9955357164144516, "step": 2410 }, { "completion_length": 236.35268878936768, "epoch": 0.4043757072802716, "grad_norm": 0.10846825466310321, "kl": 0.02588653564453125, "learning_rate": 2.9777777777777773e-07, "loss": 0.0, "reward": 1.8053571805357933, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.8098214603960514, "rewards/format_reward_func": 0.9955357164144516, "step": 2412 }, { "completion_length": 227.97768783569336, "epoch": 0.40471101052013914, "grad_norm": 0.3076350136465618, "kl": 0.043216705322265625, "learning_rate": 2.9802469135802467e-07, "loss": 0.0, "reward": 1.7839286103844643, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 2414 }, { "completion_length": 241.9509048461914, "epoch": 0.4050463137600067, "grad_norm": 0.22497192914816633, "kl": 0.0537261962890625, "learning_rate": 2.9827160493827156e-07, "loss": 0.0001, "reward": 1.6750000715255737, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.6839286033064127, "rewards/format_reward_func": 0.9910714328289032, "step": 2416 }, { "completion_length": 237.3259048461914, "epoch": 0.40538161699987424, "grad_norm": 0.2892909531482391, "kl": 0.0372314453125, "learning_rate": 2.985185185185185e-07, "loss": 0.0, "reward": 1.7535714954137802, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 2418 }, { "completion_length": 248.46429920196533, "epoch": 0.4057169202397418, "grad_norm": 0.20048739142980532, "kl": 0.025386810302734375, "learning_rate": 2.9876543209876544e-07, "loss": 0.0, "reward": 1.717857226729393, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7178571783006191, "rewards/format_reward_func": 1.0, "step": 2420 }, { "completion_length": 238.56250953674316, "epoch": 0.4060522234796094, "grad_norm": 0.3425412260910249, "kl": 0.06465911865234375, "learning_rate": 2.9901234567901233e-07, "loss": 0.0001, "reward": 1.783928632736206, "reward_std": 0.09343910776078701, "rewards/equation_reward_func": 0.7883928958326578, "rewards/format_reward_func": 0.9955357164144516, "step": 2422 }, { "completion_length": 243.9151906967163, "epoch": 0.4063875267194769, "grad_norm": 0.2759994660120673, "kl": 0.034027099609375, "learning_rate": 2.9925925925925927e-07, "loss": 0.0, "reward": 1.703571505844593, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7035714723169804, "rewards/format_reward_func": 1.0, "step": 2424 }, { "completion_length": 241.92858123779297, "epoch": 0.4067228299593445, "grad_norm": 0.21101376319229112, "kl": 0.023448944091796875, "learning_rate": 2.9950617283950616e-07, "loss": 0.0, "reward": 1.70000009983778, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7000000402331352, "rewards/format_reward_func": 1.0, "step": 2426 }, { "completion_length": 239.5357265472412, "epoch": 0.407058133199212, "grad_norm": 0.1853824828635671, "kl": 0.022472381591796875, "learning_rate": 2.9975308641975305e-07, "loss": 0.0, "reward": 1.719642922282219, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7241071835160255, "rewards/format_reward_func": 0.9955357164144516, "step": 2428 }, { "completion_length": 230.90625953674316, "epoch": 0.4073934364390796, "grad_norm": 0.10183301453604178, "kl": 0.033657073974609375, "learning_rate": 3e-07, "loss": 0.0, "reward": 1.7553572058677673, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7598214540630579, "rewards/format_reward_func": 0.9955357164144516, "step": 2430 }, { "completion_length": 242.81251049041748, "epoch": 0.4077287396789471, "grad_norm": 0.046270323750636944, "kl": 0.024127960205078125, "learning_rate": 3.002469135802469e-07, "loss": 0.0, "reward": 1.7696429193019867, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7741071917116642, "rewards/format_reward_func": 0.9955357164144516, "step": 2432 }, { "completion_length": 240.46875953674316, "epoch": 0.4080640429188147, "grad_norm": 0.3617812467572274, "kl": 0.0596771240234375, "learning_rate": 3.004938271604938e-07, "loss": 0.0001, "reward": 1.7482143640518188, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.752678606659174, "rewards/format_reward_func": 0.9955357164144516, "step": 2434 }, { "completion_length": 238.13840293884277, "epoch": 0.4083993461586823, "grad_norm": 0.474064872648244, "kl": 0.042621612548828125, "learning_rate": 3.0074074074074076e-07, "loss": 0.0, "reward": 1.717857226729393, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7267857417464256, "rewards/format_reward_func": 0.9910714328289032, "step": 2436 }, { "completion_length": 229.0267972946167, "epoch": 0.4087346493985498, "grad_norm": 0.25945331903624114, "kl": 0.061847686767578125, "learning_rate": 3.0098765432098765e-07, "loss": 0.0001, "reward": 1.767857201397419, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 2438 }, { "completion_length": 246.94197845458984, "epoch": 0.4090699526384174, "grad_norm": 0.29150718449013036, "kl": 0.19774627685546875, "learning_rate": 3.012345679012346e-07, "loss": 0.0002, "reward": 1.7267857939004898, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.7401785925030708, "rewards/format_reward_func": 0.9866071492433548, "step": 2440 }, { "completion_length": 243.5937623977661, "epoch": 0.4094052558782849, "grad_norm": 0.47884281229914716, "kl": 0.08807373046875, "learning_rate": 3.014814814814814e-07, "loss": 0.0001, "reward": 1.7017857804894447, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7151786088943481, "rewards/format_reward_func": 0.9866071492433548, "step": 2442 }, { "completion_length": 233.33036994934082, "epoch": 0.4097405591181525, "grad_norm": 0.39819654959775574, "kl": 0.07984161376953125, "learning_rate": 3.0172839506172836e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 2444 }, { "completion_length": 247.2232265472412, "epoch": 0.41007586235802, "grad_norm": 0.2930877870059224, "kl": 0.07854843139648438, "learning_rate": 3.019753086419753e-07, "loss": 0.0001, "reward": 1.6589286550879478, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.6633928846567869, "rewards/format_reward_func": 0.9955357164144516, "step": 2446 }, { "completion_length": 242.20983219146729, "epoch": 0.4104111655978876, "grad_norm": 0.327769941363608, "kl": 0.04544830322265625, "learning_rate": 3.022222222222222e-07, "loss": 0.0, "reward": 1.6892857924103737, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.6982143186032772, "rewards/format_reward_func": 0.9910714328289032, "step": 2448 }, { "completion_length": 234.95090293884277, "epoch": 0.4107464688377551, "grad_norm": 0.2819124888649559, "kl": 0.103240966796875, "learning_rate": 3.0246913580246913e-07, "loss": 0.0001, "reward": 1.710714340209961, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7196428794413805, "rewards/format_reward_func": 0.9910714328289032, "step": 2450 }, { "completion_length": 238.6250114440918, "epoch": 0.4110817720776227, "grad_norm": 0.13389653861381598, "kl": 0.024539947509765625, "learning_rate": 3.027160493827161e-07, "loss": 0.0, "reward": 1.7714286297559738, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285925030708, "rewards/format_reward_func": 1.0, "step": 2452 }, { "completion_length": 249.633939743042, "epoch": 0.4114170753174903, "grad_norm": 0.2228403364562828, "kl": 0.033233642578125, "learning_rate": 3.0296296296296296e-07, "loss": 0.0, "reward": 1.7946429029107094, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7991071715950966, "rewards/format_reward_func": 0.9955357164144516, "step": 2454 }, { "completion_length": 239.16519260406494, "epoch": 0.4117523785573578, "grad_norm": 0.29195230568320907, "kl": 0.02500152587890625, "learning_rate": 3.0320987654320985e-07, "loss": 0.0, "reward": 1.8071429133415222, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8071428909897804, "rewards/format_reward_func": 1.0, "step": 2456 }, { "completion_length": 243.6875114440918, "epoch": 0.4120876817972254, "grad_norm": 0.17428188867849037, "kl": 0.048297882080078125, "learning_rate": 3.0345679012345674e-07, "loss": 0.0, "reward": 1.7571429535746574, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7660714574158192, "rewards/format_reward_func": 0.9910714328289032, "step": 2458 }, { "completion_length": 236.196439743042, "epoch": 0.4124229850370929, "grad_norm": 0.36680589539865105, "kl": 0.061859130859375, "learning_rate": 3.037037037037037e-07, "loss": 0.0001, "reward": 1.785714365541935, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7946428768336773, "rewards/format_reward_func": 0.9910714328289032, "step": 2460 }, { "completion_length": 241.49554824829102, "epoch": 0.4127582882769605, "grad_norm": 0.482576488789002, "kl": 0.05983734130859375, "learning_rate": 3.039506172839506e-07, "loss": 0.0001, "reward": 1.762500062584877, "reward_std": 0.10354063659906387, "rewards/equation_reward_func": 0.7758928947150707, "rewards/format_reward_func": 0.9866071492433548, "step": 2462 }, { "completion_length": 251.9509048461914, "epoch": 0.413093591516828, "grad_norm": 0.5102705709365054, "kl": 0.23926925659179688, "learning_rate": 3.041975308641975e-07, "loss": 0.0002, "reward": 1.7821429148316383, "reward_std": 0.11616754066199064, "rewards/equation_reward_func": 0.8000000156462193, "rewards/format_reward_func": 0.9821428656578064, "step": 2464 }, { "completion_length": 243.87054538726807, "epoch": 0.4134288947566956, "grad_norm": 0.6208793238312089, "kl": 0.6168899536132812, "learning_rate": 3.0444444444444445e-07, "loss": 0.0006, "reward": 1.78035718947649, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7848214507102966, "rewards/format_reward_func": 0.9955357164144516, "step": 2466 }, { "completion_length": 230.75447463989258, "epoch": 0.41376419799656317, "grad_norm": 0.20628736823470975, "kl": 0.09053802490234375, "learning_rate": 3.046913580246914e-07, "loss": 0.0001, "reward": 1.8357143327593803, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8357142955064774, "rewards/format_reward_func": 1.0, "step": 2468 }, { "completion_length": 247.08036708831787, "epoch": 0.4140995012364307, "grad_norm": 0.34160066484399093, "kl": 0.14976119995117188, "learning_rate": 3.0493827160493823e-07, "loss": 0.0001, "reward": 1.726785771548748, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.731250025331974, "rewards/format_reward_func": 0.9955357164144516, "step": 2470 }, { "completion_length": 240.43304824829102, "epoch": 0.41443480447629827, "grad_norm": 0.22342851163494104, "kl": 0.24057769775390625, "learning_rate": 3.0518518518518517e-07, "loss": 0.0002, "reward": 1.7160714864730835, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7205357383936644, "rewards/format_reward_func": 0.9955357164144516, "step": 2472 }, { "completion_length": 241.9062623977661, "epoch": 0.4147701077161658, "grad_norm": 0.1707649901922291, "kl": 0.0610198974609375, "learning_rate": 3.0543209876543206e-07, "loss": 0.0001, "reward": 1.762500062584877, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643219560385, "rewards/format_reward_func": 0.9955357164144516, "step": 2474 }, { "completion_length": 244.6607255935669, "epoch": 0.41510541095603337, "grad_norm": 0.28532945804476273, "kl": 0.042633056640625, "learning_rate": 3.05679012345679e-07, "loss": 0.0, "reward": 1.8250000402331352, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.825000025331974, "rewards/format_reward_func": 1.0, "step": 2476 }, { "completion_length": 246.70983409881592, "epoch": 0.4154407141959009, "grad_norm": 0.20875711813517592, "kl": 0.06849288940429688, "learning_rate": 3.0592592592592594e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286033064127, "rewards/format_reward_func": 1.0, "step": 2478 }, { "completion_length": 242.1428680419922, "epoch": 0.4157760174357685, "grad_norm": 0.16125349654244475, "kl": 0.026233673095703125, "learning_rate": 3.0617283950617283e-07, "loss": 0.0, "reward": 1.7321429252624512, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7321428768336773, "rewards/format_reward_func": 1.0, "step": 2480 }, { "completion_length": 242.0625123977661, "epoch": 0.41611132067563605, "grad_norm": 0.24992527870915915, "kl": 0.032123565673828125, "learning_rate": 3.0641975308641977e-07, "loss": 0.0, "reward": 1.7714286297559738, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 2482 }, { "completion_length": 251.56251335144043, "epoch": 0.4164466239155036, "grad_norm": 0.277668143360467, "kl": 0.05127716064453125, "learning_rate": 3.066666666666666e-07, "loss": 0.0001, "reward": 1.7000001072883606, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7000000290572643, "rewards/format_reward_func": 1.0, "step": 2484 }, { "completion_length": 244.11608123779297, "epoch": 0.41678192715537116, "grad_norm": 0.1966415408927424, "kl": 0.033660888671875, "learning_rate": 3.0691358024691355e-07, "loss": 0.0, "reward": 1.678571529686451, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.6785714775323868, "rewards/format_reward_func": 1.0, "step": 2486 }, { "completion_length": 243.00000858306885, "epoch": 0.4171172303952387, "grad_norm": 0.4143451684580404, "kl": 0.0799713134765625, "learning_rate": 3.071604938271605e-07, "loss": 0.0001, "reward": 1.7665179148316383, "reward_std": 0.06755394861102104, "rewards/equation_reward_func": 0.7678571790456772, "rewards/format_reward_func": 0.9986607171595097, "step": 2488 }, { "completion_length": 237.1473331451416, "epoch": 0.41745253363510626, "grad_norm": 0.26423638528297066, "kl": 0.03952789306640625, "learning_rate": 3.074074074074074e-07, "loss": 0.0, "reward": 1.814285784959793, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8142857402563095, "rewards/format_reward_func": 1.0, "step": 2490 }, { "completion_length": 244.64286708831787, "epoch": 0.4177878368749738, "grad_norm": 0.2312057942169059, "kl": 0.04471588134765625, "learning_rate": 3.076543209876543e-07, "loss": 0.0, "reward": 1.7593750730156898, "reward_std": 0.027147849323228, "rewards/equation_reward_func": 0.7607143074274063, "rewards/format_reward_func": 0.9986607171595097, "step": 2492 }, { "completion_length": 246.00447273254395, "epoch": 0.41812314011484136, "grad_norm": 0.4950536770067875, "kl": 0.04170989990234375, "learning_rate": 3.0790123456790126e-07, "loss": 0.0, "reward": 1.7196429297327995, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7241071779280901, "rewards/format_reward_func": 0.9955357164144516, "step": 2494 }, { "completion_length": 244.5491189956665, "epoch": 0.41845844335470894, "grad_norm": 0.32448324165354375, "kl": 0.02623748779296875, "learning_rate": 3.0814814814814815e-07, "loss": 0.0, "reward": 1.7821429446339607, "reward_std": 0.08586296532303095, "rewards/equation_reward_func": 0.7910714540630579, "rewards/format_reward_func": 0.9910714328289032, "step": 2496 }, { "completion_length": 247.46430015563965, "epoch": 0.41879374659457647, "grad_norm": 0.23471168214360996, "kl": 0.0288848876953125, "learning_rate": 3.0839506172839504e-07, "loss": 0.0, "reward": 1.7267857864499092, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7401786036789417, "rewards/format_reward_func": 0.9866071492433548, "step": 2498 }, { "completion_length": 244.1384038925171, "epoch": 0.41912904983444405, "grad_norm": 0.2761535326936902, "kl": 0.025539398193359375, "learning_rate": 3.086419753086419e-07, "loss": 0.0, "reward": 1.783928632736206, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7883928865194321, "rewards/format_reward_func": 0.9955357164144516, "step": 2500 }, { "completion_length": 248.1651906967163, "epoch": 0.41946435307431157, "grad_norm": 0.2309364519086521, "kl": 0.037647247314453125, "learning_rate": 3.0888888888888887e-07, "loss": 0.0, "reward": 1.7642858028411865, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 2502 }, { "completion_length": 250.14733219146729, "epoch": 0.41979965631417915, "grad_norm": 0.29956528286167283, "kl": 0.07854461669921875, "learning_rate": 3.091358024691358e-07, "loss": 0.0001, "reward": 1.7607143372297287, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.769642885774374, "rewards/format_reward_func": 0.9910714328289032, "step": 2504 }, { "completion_length": 246.00893783569336, "epoch": 0.42013495955404667, "grad_norm": 0.3199713423763226, "kl": 0.08681488037109375, "learning_rate": 3.093827160493827e-07, "loss": 0.0001, "reward": 1.7428572177886963, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7517857439815998, "rewards/format_reward_func": 0.9910714328289032, "step": 2506 }, { "completion_length": 243.0134048461914, "epoch": 0.42047026279391425, "grad_norm": 0.11689773260946061, "kl": 0.06568527221679688, "learning_rate": 3.0962962962962964e-07, "loss": 0.0001, "reward": 1.7357143461704254, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143126428127, "rewards/format_reward_func": 1.0, "step": 2508 }, { "completion_length": 240.65179920196533, "epoch": 0.4208055660337818, "grad_norm": 0.3331850418354822, "kl": 0.0395355224609375, "learning_rate": 3.098765432098765e-07, "loss": 0.0, "reward": 1.7553572058677673, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7598214522004128, "rewards/format_reward_func": 0.9955357164144516, "step": 2510 }, { "completion_length": 239.3125114440918, "epoch": 0.42114086927364935, "grad_norm": 0.24332880806945797, "kl": 0.026187896728515625, "learning_rate": 3.101234567901234e-07, "loss": 0.0, "reward": 1.771428644657135, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7803571745753288, "rewards/format_reward_func": 0.9910714328289032, "step": 2512 }, { "completion_length": 256.1294775009155, "epoch": 0.42147617251351693, "grad_norm": 0.2725284304146817, "kl": 0.04680633544921875, "learning_rate": 3.1037037037037036e-07, "loss": 0.0, "reward": 1.721428632736206, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7214285973459482, "rewards/format_reward_func": 1.0, "step": 2514 }, { "completion_length": 242.76340103149414, "epoch": 0.42181147575338446, "grad_norm": 0.4089842216356524, "kl": 0.0549163818359375, "learning_rate": 3.1061728395061724e-07, "loss": 0.0001, "reward": 1.700000062584877, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7000000327825546, "rewards/format_reward_func": 1.0, "step": 2516 }, { "completion_length": 239.29465103149414, "epoch": 0.42214677899325204, "grad_norm": 0.22043840909969697, "kl": 0.05002593994140625, "learning_rate": 3.108641975308642e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000149011612, "rewards/format_reward_func": 1.0, "step": 2518 }, { "completion_length": 243.62947368621826, "epoch": 0.42248208223311956, "grad_norm": 0.1826835616573898, "kl": 0.0255279541015625, "learning_rate": 3.111111111111111e-07, "loss": 0.0, "reward": 1.7714286670088768, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 2520 }, { "completion_length": 243.39287090301514, "epoch": 0.42281738547298714, "grad_norm": 0.216918530888218, "kl": 0.035877227783203125, "learning_rate": 3.11358024691358e-07, "loss": 0.0, "reward": 1.7750000730156898, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000394880772, "rewards/format_reward_func": 1.0, "step": 2522 }, { "completion_length": 242.32590579986572, "epoch": 0.42315268871285466, "grad_norm": 0.16724150439793084, "kl": 0.03157806396484375, "learning_rate": 3.1160493827160496e-07, "loss": 0.0, "reward": 1.7928571999073029, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 2524 }, { "completion_length": 244.15179634094238, "epoch": 0.42348799195272224, "grad_norm": 0.2824979326339329, "kl": 0.04889678955078125, "learning_rate": 3.118518518518518e-07, "loss": 0.0, "reward": 1.7571429461240768, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7660714536905289, "rewards/format_reward_func": 0.9910714328289032, "step": 2526 }, { "completion_length": 243.45536708831787, "epoch": 0.4238232951925898, "grad_norm": 0.37376178454239156, "kl": 0.03272247314453125, "learning_rate": 3.1209876543209873e-07, "loss": 0.0, "reward": 1.7660714983940125, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7705357521772385, "rewards/format_reward_func": 0.9955357164144516, "step": 2528 }, { "completion_length": 235.4509048461914, "epoch": 0.42415859843245735, "grad_norm": 0.40747437616592286, "kl": 0.04550933837890625, "learning_rate": 3.123456790123457e-07, "loss": 0.0, "reward": 1.757142923772335, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 2530 }, { "completion_length": 248.3750123977661, "epoch": 0.4244939016723249, "grad_norm": 0.2179129790820734, "kl": 0.059764862060546875, "learning_rate": 3.1259259259259256e-07, "loss": 0.0001, "reward": 1.7553571984171867, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7598214577883482, "rewards/format_reward_func": 0.9955357164144516, "step": 2532 }, { "completion_length": 255.93304538726807, "epoch": 0.42482920491219245, "grad_norm": 0.253351262982618, "kl": 0.0557861328125, "learning_rate": 3.128395061728395e-07, "loss": 0.0001, "reward": 1.7642857655882835, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.77321432903409, "rewards/format_reward_func": 0.9910714328289032, "step": 2534 }, { "completion_length": 242.6294755935669, "epoch": 0.42516450815206003, "grad_norm": 0.21174747359151216, "kl": 0.0252227783203125, "learning_rate": 3.130864197530864e-07, "loss": 0.0, "reward": 1.8178572058677673, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.817857176065445, "rewards/format_reward_func": 1.0, "step": 2536 }, { "completion_length": 236.34376049041748, "epoch": 0.42549981139192755, "grad_norm": 0.07787793090487856, "kl": 0.049343109130859375, "learning_rate": 3.1333333333333333e-07, "loss": 0.0, "reward": 1.735714353621006, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7357143182307482, "rewards/format_reward_func": 1.0, "step": 2538 }, { "completion_length": 244.9241189956665, "epoch": 0.42583511463179513, "grad_norm": 0.2622517959405339, "kl": 0.04746246337890625, "learning_rate": 3.135802469135803e-07, "loss": 0.0, "reward": 1.7517858073115349, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7562500238418579, "rewards/format_reward_func": 0.9955357164144516, "step": 2540 }, { "completion_length": 239.76786613464355, "epoch": 0.4261704178716627, "grad_norm": 0.34941696728550165, "kl": 0.02500152587890625, "learning_rate": 3.138271604938271e-07, "loss": 0.0, "reward": 1.7303572073578835, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7348214611411095, "rewards/format_reward_func": 0.9955357164144516, "step": 2542 }, { "completion_length": 237.49554443359375, "epoch": 0.42650572111153023, "grad_norm": 0.20175321039359187, "kl": 0.022891998291015625, "learning_rate": 3.1407407407407405e-07, "loss": 0.0, "reward": 1.7357143461704254, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.735714303329587, "rewards/format_reward_func": 1.0, "step": 2544 }, { "completion_length": 251.40626335144043, "epoch": 0.4268410243513978, "grad_norm": 0.13603414263062696, "kl": 0.08626937866210938, "learning_rate": 3.14320987654321e-07, "loss": 0.0001, "reward": 1.7178572490811348, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7357143126428127, "rewards/format_reward_func": 0.9821428656578064, "step": 2546 }, { "completion_length": 249.23215293884277, "epoch": 0.42717632759126534, "grad_norm": 0.25842443169289875, "kl": 0.039806365966796875, "learning_rate": 3.145679012345679e-07, "loss": 0.0, "reward": 1.723214365541935, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7276785802096128, "rewards/format_reward_func": 0.9955357164144516, "step": 2548 }, { "completion_length": 248.36608219146729, "epoch": 0.4275116308311329, "grad_norm": 0.2714677153001056, "kl": 0.12255859375, "learning_rate": 3.148148148148148e-07, "loss": 0.0001, "reward": 1.7178572192788124, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7178571745753288, "rewards/format_reward_func": 1.0, "step": 2550 }, { "completion_length": 236.23215293884277, "epoch": 0.42784693407100044, "grad_norm": 0.35098529470563555, "kl": 0.03569793701171875, "learning_rate": 3.150617283950617e-07, "loss": 0.0, "reward": 1.7535714879631996, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7535714507102966, "rewards/format_reward_func": 1.0, "step": 2552 }, { "completion_length": 251.02679443359375, "epoch": 0.428182237310868, "grad_norm": 0.12539207610264913, "kl": 0.08616256713867188, "learning_rate": 3.1530864197530865e-07, "loss": 0.0001, "reward": 1.7946429029107094, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.799107164144516, "rewards/format_reward_func": 0.9955357164144516, "step": 2554 }, { "completion_length": 238.7901906967163, "epoch": 0.4285175405507356, "grad_norm": 0.3178237413778879, "kl": 0.023960113525390625, "learning_rate": 3.1555555555555554e-07, "loss": 0.0, "reward": 1.7446429207921028, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7491071745753288, "rewards/format_reward_func": 0.9955357164144516, "step": 2556 }, { "completion_length": 245.50001049041748, "epoch": 0.4288528437906031, "grad_norm": 0.45006753486366646, "kl": 0.10959625244140625, "learning_rate": 3.1580246913580243e-07, "loss": 0.0001, "reward": 1.7267857939004898, "reward_std": 0.0732360566034913, "rewards/equation_reward_func": 0.7312500216066837, "rewards/format_reward_func": 0.9955357164144516, "step": 2558 }, { "completion_length": 247.4196548461914, "epoch": 0.4291881470304707, "grad_norm": 0.8192667075625969, "kl": 0.22243881225585938, "learning_rate": 3.1604938271604937e-07, "loss": 0.0002, "reward": 1.7750000581145287, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7839286047965288, "rewards/format_reward_func": 0.9910714328289032, "step": 2560 }, { "completion_length": 251.14286708831787, "epoch": 0.4295234502703382, "grad_norm": 0.22454144415636515, "kl": 0.024169921875, "learning_rate": 3.1629629629629626e-07, "loss": 0.0, "reward": 1.726785808801651, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7312500290572643, "rewards/format_reward_func": 0.9955357164144516, "step": 2562 }, { "completion_length": 246.77679634094238, "epoch": 0.4298587535102058, "grad_norm": 0.3261496817001146, "kl": 0.19002532958984375, "learning_rate": 3.165432098765432e-07, "loss": 0.0002, "reward": 1.7892857939004898, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 1.0, "step": 2564 }, { "completion_length": 243.477689743042, "epoch": 0.43019405675007333, "grad_norm": 0.33328892831898055, "kl": 0.037197113037109375, "learning_rate": 3.1679012345679014e-07, "loss": 0.0, "reward": 1.737500049173832, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7508929036557674, "rewards/format_reward_func": 0.9866071492433548, "step": 2566 }, { "completion_length": 236.32590198516846, "epoch": 0.4305293599899409, "grad_norm": 0.145820177741581, "kl": 0.02503204345703125, "learning_rate": 3.1703703703703703e-07, "loss": 0.0, "reward": 1.7500000521540642, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7589285969734192, "rewards/format_reward_func": 0.9910714328289032, "step": 2568 }, { "completion_length": 246.85715293884277, "epoch": 0.43086466322980843, "grad_norm": 0.29553026796035753, "kl": 0.06223297119140625, "learning_rate": 3.172839506172839e-07, "loss": 0.0001, "reward": 1.7625000774860382, "reward_std": 0.09343910962343216, "rewards/equation_reward_func": 0.7758928798139095, "rewards/format_reward_func": 0.9866071492433548, "step": 2570 }, { "completion_length": 247.07590198516846, "epoch": 0.431199966469676, "grad_norm": 0.15917956082740858, "kl": 0.30030059814453125, "learning_rate": 3.1753086419753086e-07, "loss": 0.0003, "reward": 1.7714286372065544, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 2572 }, { "completion_length": 259.2946538925171, "epoch": 0.4315352697095436, "grad_norm": 0.45168312035874253, "kl": 0.3671760559082031, "learning_rate": 3.1777777777777775e-07, "loss": 0.0004, "reward": 1.7446429282426834, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7491071783006191, "rewards/format_reward_func": 0.9955357164144516, "step": 2574 }, { "completion_length": 240.11608409881592, "epoch": 0.4318705729494111, "grad_norm": 0.3496117852276753, "kl": 0.02508544921875, "learning_rate": 3.180246913580247e-07, "loss": 0.0, "reward": 1.7375000715255737, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7419643253087997, "rewards/format_reward_func": 0.9955357164144516, "step": 2576 }, { "completion_length": 243.9821538925171, "epoch": 0.4322058761892787, "grad_norm": 0.3225535714095935, "kl": 0.02301025390625, "learning_rate": 3.182716049382716e-07, "loss": 0.0, "reward": 1.7250001057982445, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7428571842610836, "rewards/format_reward_func": 0.9821428656578064, "step": 2578 }, { "completion_length": 250.12054634094238, "epoch": 0.4325411794291462, "grad_norm": 0.26559242472932765, "kl": 0.034000396728515625, "learning_rate": 3.185185185185185e-07, "loss": 0.0, "reward": 1.7857143506407738, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7857143320143223, "rewards/format_reward_func": 1.0, "step": 2580 }, { "completion_length": 250.25447463989258, "epoch": 0.4328764826690138, "grad_norm": 0.2759127477860899, "kl": 0.06327056884765625, "learning_rate": 3.1876543209876546e-07, "loss": 0.0001, "reward": 1.7017857730388641, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7151785958558321, "rewards/format_reward_func": 0.9866071492433548, "step": 2582 }, { "completion_length": 244.22769165039062, "epoch": 0.4332117859088813, "grad_norm": 0.207482672686499, "kl": 0.1029205322265625, "learning_rate": 3.190123456790123e-07, "loss": 0.0001, "reward": 1.7089286521077156, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7133928947150707, "rewards/format_reward_func": 0.9955357164144516, "step": 2584 }, { "completion_length": 245.727689743042, "epoch": 0.4335470891487489, "grad_norm": 0.5793295166969608, "kl": 0.1691436767578125, "learning_rate": 3.1925925925925924e-07, "loss": 0.0002, "reward": 1.7196429371833801, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7241071723401546, "rewards/format_reward_func": 0.9955357164144516, "step": 2586 }, { "completion_length": 251.60715293884277, "epoch": 0.4338823923886165, "grad_norm": 0.5828281237924078, "kl": 0.9253044128417969, "learning_rate": 3.195061728395061e-07, "loss": 0.0009, "reward": 1.7607143223285675, "reward_std": 0.07576144114136696, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 0.9821428656578064, "step": 2588 }, { "completion_length": 255.88394260406494, "epoch": 0.434217695628484, "grad_norm": 0.25950153452146346, "kl": 0.05170440673828125, "learning_rate": 3.1975308641975307e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 2590 }, { "completion_length": 251.83929443359375, "epoch": 0.4345529988683516, "grad_norm": 0.31718516852729894, "kl": 0.11458206176757812, "learning_rate": 3.2e-07, "loss": 0.0001, "reward": 1.7803571820259094, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7848214656114578, "rewards/format_reward_func": 0.9955357164144516, "step": 2592 }, { "completion_length": 251.4419765472412, "epoch": 0.4348883021082191, "grad_norm": 0.1884583384444149, "kl": 0.12668609619140625, "learning_rate": 3.202469135802469e-07, "loss": 0.0001, "reward": 1.7339286357164383, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.73839289881289, "rewards/format_reward_func": 0.9955357164144516, "step": 2594 }, { "completion_length": 252.9375123977661, "epoch": 0.4352236053480867, "grad_norm": 0.22589381749613982, "kl": 0.035221099853515625, "learning_rate": 3.2049382716049384e-07, "loss": 0.0, "reward": 1.8482143506407738, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.852678582072258, "rewards/format_reward_func": 0.9955357164144516, "step": 2596 }, { "completion_length": 246.90179538726807, "epoch": 0.4355589085879542, "grad_norm": 0.15749758438094288, "kl": 0.04557037353515625, "learning_rate": 3.207407407407407e-07, "loss": 0.0, "reward": 1.7821429073810577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 2598 }, { "completion_length": 265.1696529388428, "epoch": 0.4358942118278218, "grad_norm": 0.22695902010162736, "kl": 0.19362640380859375, "learning_rate": 3.209876543209876e-07, "loss": 0.0002, "reward": 1.7160714864730835, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7294643111526966, "rewards/format_reward_func": 0.9866071492433548, "step": 2600 }, { "completion_length": 246.68751049041748, "epoch": 0.43622951506768937, "grad_norm": 0.3494971527672536, "kl": 0.2670745849609375, "learning_rate": 3.2123456790123455e-07, "loss": 0.0003, "reward": 1.7857143729925156, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7857142984867096, "rewards/format_reward_func": 1.0, "step": 2602 }, { "completion_length": 252.92858505249023, "epoch": 0.4365648183075569, "grad_norm": 0.2570008866666796, "kl": 0.08303451538085938, "learning_rate": 3.2148148148148144e-07, "loss": 0.0001, "reward": 1.7446429207921028, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7491071745753288, "rewards/format_reward_func": 0.9955357164144516, "step": 2604 }, { "completion_length": 254.10269165039062, "epoch": 0.43690012154742447, "grad_norm": 0.3532176636632374, "kl": 1.0090484619140625, "learning_rate": 3.217283950617284e-07, "loss": 0.001, "reward": 1.7375000640749931, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419643215835094, "rewards/format_reward_func": 0.9955357164144516, "step": 2606 }, { "completion_length": 252.3526906967163, "epoch": 0.437235424787292, "grad_norm": 0.1619935140323748, "kl": 0.17085647583007812, "learning_rate": 3.219753086419753e-07, "loss": 0.0002, "reward": 1.7375000789761543, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.750892873853445, "rewards/format_reward_func": 0.9866071492433548, "step": 2608 }, { "completion_length": 253.14286708831787, "epoch": 0.4375707280271596, "grad_norm": 0.19724744628716312, "kl": 0.15912628173828125, "learning_rate": 3.222222222222222e-07, "loss": 0.0002, "reward": 1.7232143878936768, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7276786081492901, "rewards/format_reward_func": 0.9955357164144516, "step": 2610 }, { "completion_length": 257.7009057998657, "epoch": 0.4379060312670271, "grad_norm": 0.500823014531597, "kl": 0.095916748046875, "learning_rate": 3.224691358024691e-07, "loss": 0.0001, "reward": 1.741071492433548, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7455357424914837, "rewards/format_reward_func": 0.9955357164144516, "step": 2612 }, { "completion_length": 242.83929824829102, "epoch": 0.4382413345068947, "grad_norm": 0.30820036330593226, "kl": 0.0233306884765625, "learning_rate": 3.22716049382716e-07, "loss": 0.0, "reward": 1.76071435213089, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143055647612, "rewards/format_reward_func": 1.0, "step": 2614 }, { "completion_length": 253.11608219146729, "epoch": 0.43857663774676225, "grad_norm": 0.2536925262111839, "kl": 0.04438018798828125, "learning_rate": 3.2296296296296293e-07, "loss": 0.0, "reward": 1.708928644657135, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7133928816765547, "rewards/format_reward_func": 0.9955357164144516, "step": 2616 }, { "completion_length": 258.24554920196533, "epoch": 0.4389119409866298, "grad_norm": 0.28150699335408913, "kl": 0.10482025146484375, "learning_rate": 3.2320987654320987e-07, "loss": 0.0001, "reward": 1.7339286357164383, "reward_std": 0.09343910962343216, "rewards/equation_reward_func": 0.7473214454948902, "rewards/format_reward_func": 0.9866071492433548, "step": 2618 }, { "completion_length": 254.0580472946167, "epoch": 0.43924724422649736, "grad_norm": 0.12402202969919433, "kl": 0.6347122192382812, "learning_rate": 3.2345679012345676e-07, "loss": 0.0006, "reward": 1.7178572192788124, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7267857454717159, "rewards/format_reward_func": 0.9910714328289032, "step": 2620 }, { "completion_length": 257.6473321914673, "epoch": 0.4395825474663649, "grad_norm": 0.3870609356353709, "kl": 0.027385711669921875, "learning_rate": 3.237037037037037e-07, "loss": 0.0, "reward": 1.7107143551111221, "reward_std": 0.08586296439170837, "rewards/equation_reward_func": 0.7196428887546062, "rewards/format_reward_func": 0.9910714328289032, "step": 2622 }, { "completion_length": 250.04912090301514, "epoch": 0.43991785070623246, "grad_norm": 0.4512728132951566, "kl": 0.14263916015625, "learning_rate": 3.2395061728395064e-07, "loss": 0.0001, "reward": 1.7303572297096252, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7348214723169804, "rewards/format_reward_func": 0.9955357164144516, "step": 2624 }, { "completion_length": 266.42858695983887, "epoch": 0.4402531539461, "grad_norm": 0.2475325032392611, "kl": 0.025585174560546875, "learning_rate": 3.2419753086419753e-07, "loss": 0.0, "reward": 1.7392857670783997, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.748214315623045, "rewards/format_reward_func": 0.9910714328289032, "step": 2626 }, { "completion_length": 240.79019260406494, "epoch": 0.44058845718596756, "grad_norm": 0.24164859505408506, "kl": 0.3480949401855469, "learning_rate": 3.244444444444444e-07, "loss": 0.0003, "reward": 1.7964286357164383, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8053571805357933, "rewards/format_reward_func": 0.9910714328289032, "step": 2628 }, { "completion_length": 254.13393878936768, "epoch": 0.4409237604258351, "grad_norm": 0.31436550452414275, "kl": 0.12886428833007812, "learning_rate": 3.246913580246913e-07, "loss": 0.0001, "reward": 1.74642863124609, "reward_std": 0.08586296532303095, "rewards/equation_reward_func": 0.7553571872413158, "rewards/format_reward_func": 0.9910714328289032, "step": 2630 }, { "completion_length": 251.0580472946167, "epoch": 0.44125906366570267, "grad_norm": 0.29791950286400054, "kl": 0.037143707275390625, "learning_rate": 3.2493827160493825e-07, "loss": 0.0, "reward": 1.783928632736206, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7883928865194321, "rewards/format_reward_func": 0.9955357164144516, "step": 2632 }, { "completion_length": 270.63840675354004, "epoch": 0.44159436690557025, "grad_norm": 0.24728104547594096, "kl": 0.06447219848632812, "learning_rate": 3.251851851851852e-07, "loss": 0.0001, "reward": 1.7410714998841286, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7544643133878708, "rewards/format_reward_func": 0.9866071492433548, "step": 2634 }, { "completion_length": 248.92858409881592, "epoch": 0.44192967014543777, "grad_norm": 0.31478920467178306, "kl": 0.051105499267578125, "learning_rate": 3.254320987654321e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7696428894996643, "rewards/format_reward_func": 0.9910714328289032, "step": 2636 }, { "completion_length": 249.5759038925171, "epoch": 0.44226497338530535, "grad_norm": 0.24993023096821998, "kl": 0.13302993774414062, "learning_rate": 3.25679012345679e-07, "loss": 0.0001, "reward": 1.7714286297559738, "reward_std": 0.045456865802407265, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 0.9821428656578064, "step": 2638 }, { "completion_length": 260.30804920196533, "epoch": 0.4426002766251729, "grad_norm": 1.3821262018529439, "kl": 0.258209228515625, "learning_rate": 3.2592592592592596e-07, "loss": 0.0003, "reward": 1.7089286297559738, "reward_std": 0.08838834799826145, "rewards/equation_reward_func": 0.7312500216066837, "rewards/format_reward_func": 0.977678582072258, "step": 2640 }, { "completion_length": 262.3973340988159, "epoch": 0.44293557986504045, "grad_norm": 0.23587870226894336, "kl": 0.12923049926757812, "learning_rate": 3.261728395061728e-07, "loss": 0.0001, "reward": 1.7589286044239998, "reward_std": 0.10859139915555716, "rewards/equation_reward_func": 0.7812500335276127, "rewards/format_reward_func": 0.977678582072258, "step": 2642 }, { "completion_length": 265.0803699493408, "epoch": 0.443270883104908, "grad_norm": 0.260573659490083, "kl": 0.06291961669921875, "learning_rate": 3.2641975308641974e-07, "loss": 0.0001, "reward": 1.7142857983708382, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7232143115252256, "rewards/format_reward_func": 0.9910714328289032, "step": 2644 }, { "completion_length": 266.5848331451416, "epoch": 0.44360618634477555, "grad_norm": 0.29066419116889264, "kl": 0.4171791076660156, "learning_rate": 3.2666666666666663e-07, "loss": 0.0004, "reward": 1.7571428939700127, "reward_std": 0.09091372787952423, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 0.9821428656578064, "step": 2646 }, { "completion_length": 255.9464406967163, "epoch": 0.44394148958464313, "grad_norm": 0.2812603360108659, "kl": 0.05731964111328125, "learning_rate": 3.2691358024691357e-07, "loss": 0.0001, "reward": 1.7303571701049805, "reward_std": 0.1085914010182023, "rewards/equation_reward_func": 0.7616071719676256, "rewards/format_reward_func": 0.9687500149011612, "step": 2648 }, { "completion_length": 262.03572940826416, "epoch": 0.44427679282451066, "grad_norm": 0.30695546284, "kl": 0.057254791259765625, "learning_rate": 3.271604938271605e-07, "loss": 0.0001, "reward": 1.7053572461009026, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7187500298023224, "rewards/format_reward_func": 0.9866071492433548, "step": 2650 }, { "completion_length": 264.9910831451416, "epoch": 0.44461209606437824, "grad_norm": 0.3474524782514886, "kl": 0.06933212280273438, "learning_rate": 3.274074074074074e-07, "loss": 0.0001, "reward": 1.7571429461240768, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7660714648663998, "rewards/format_reward_func": 0.9910714328289032, "step": 2652 }, { "completion_length": 249.60268878936768, "epoch": 0.44494739930424576, "grad_norm": 0.2084635078884685, "kl": 0.029857635498046875, "learning_rate": 3.2765432098765434e-07, "loss": 0.0, "reward": 1.7553572058677673, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214596509933, "rewards/format_reward_func": 0.9955357164144516, "step": 2654 }, { "completion_length": 247.1919755935669, "epoch": 0.44528270254411334, "grad_norm": 0.1889165270559453, "kl": 0.0886993408203125, "learning_rate": 3.279012345679012e-07, "loss": 0.0001, "reward": 1.78035718947649, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7848214618861675, "rewards/format_reward_func": 0.9955357164144516, "step": 2656 }, { "completion_length": 245.50894165039062, "epoch": 0.44561800578398086, "grad_norm": 0.1117686353305998, "kl": 0.03253936767578125, "learning_rate": 3.281481481481481e-07, "loss": 0.0, "reward": 1.7714286521077156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 2658 }, { "completion_length": 250.4375123977661, "epoch": 0.44595330902384844, "grad_norm": 0.2595808740548275, "kl": 0.1915740966796875, "learning_rate": 3.2839506172839506e-07, "loss": 0.0002, "reward": 1.750000074505806, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 2660 }, { "completion_length": 251.02233600616455, "epoch": 0.446288612263716, "grad_norm": 0.2233516502035073, "kl": 0.02516937255859375, "learning_rate": 3.2864197530864195e-07, "loss": 0.0, "reward": 1.7214286401867867, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7214285992085934, "rewards/format_reward_func": 1.0, "step": 2662 }, { "completion_length": 257.571439743042, "epoch": 0.44662391550358355, "grad_norm": 0.2758449396807126, "kl": 0.046142578125, "learning_rate": 3.288888888888889e-07, "loss": 0.0, "reward": 1.7553572207689285, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7598214596509933, "rewards/format_reward_func": 0.9955357164144516, "step": 2664 }, { "completion_length": 270.15179920196533, "epoch": 0.4469592187434511, "grad_norm": 0.16203992665582476, "kl": 0.20227813720703125, "learning_rate": 3.2913580246913583e-07, "loss": 0.0002, "reward": 1.748214341700077, "reward_std": 0.09343910962343216, "rewards/equation_reward_func": 0.761607164517045, "rewards/format_reward_func": 0.9866071492433548, "step": 2666 }, { "completion_length": 257.2455472946167, "epoch": 0.44729452198331865, "grad_norm": 0.31669039361071327, "kl": 0.17920303344726562, "learning_rate": 3.293827160493827e-07, "loss": 0.0002, "reward": 1.803571492433548, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.803571455180645, "rewards/format_reward_func": 1.0, "step": 2668 }, { "completion_length": 248.63393783569336, "epoch": 0.44762982522318623, "grad_norm": 0.14210074166673994, "kl": 0.06937408447265625, "learning_rate": 3.296296296296296e-07, "loss": 0.0001, "reward": 1.7625000774860382, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.766964316368103, "rewards/format_reward_func": 0.9955357164144516, "step": 2670 }, { "completion_length": 250.87947368621826, "epoch": 0.44796512846305375, "grad_norm": 0.6957656299935766, "kl": 0.3000144958496094, "learning_rate": 3.298765432098765e-07, "loss": 0.0003, "reward": 1.7718750685453415, "reward_std": 0.059977806406095624, "rewards/equation_reward_func": 0.7776785902678967, "rewards/format_reward_func": 0.9941964335739613, "step": 2672 }, { "completion_length": 249.4732265472412, "epoch": 0.44830043170292133, "grad_norm": 0.28613974030082806, "kl": 0.361419677734375, "learning_rate": 3.3012345679012343e-07, "loss": 0.0004, "reward": 1.82321435213089, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.8276785984635353, "rewards/format_reward_func": 0.9955357164144516, "step": 2674 }, { "completion_length": 255.8571538925171, "epoch": 0.4486357349427889, "grad_norm": 0.15867424298767457, "kl": 0.026119232177734375, "learning_rate": 3.303703703703704e-07, "loss": 0.0, "reward": 1.7232143431901932, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7366071715950966, "rewards/format_reward_func": 0.9866071492433548, "step": 2676 }, { "completion_length": 253.98215579986572, "epoch": 0.44897103818265643, "grad_norm": 0.7509986400487191, "kl": 0.14146804809570312, "learning_rate": 3.3061728395061726e-07, "loss": 0.0001, "reward": 1.7553572207689285, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7598214522004128, "rewards/format_reward_func": 0.9955357164144516, "step": 2678 }, { "completion_length": 252.8482255935669, "epoch": 0.449306341422524, "grad_norm": 0.38029749797861767, "kl": 0.07612991333007812, "learning_rate": 3.308641975308642e-07, "loss": 0.0001, "reward": 1.7267857864499092, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7312500365078449, "rewards/format_reward_func": 0.9955357164144516, "step": 2680 }, { "completion_length": 251.9285831451416, "epoch": 0.44964164466239154, "grad_norm": 0.39717612267346547, "kl": 0.026416778564453125, "learning_rate": 3.311111111111111e-07, "loss": 0.0, "reward": 1.7428572252392769, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7428571619093418, "rewards/format_reward_func": 1.0, "step": 2682 }, { "completion_length": 253.34822463989258, "epoch": 0.4499769479022591, "grad_norm": 0.643773820689036, "kl": 1.5482711791992188, "learning_rate": 3.31358024691358e-07, "loss": 0.0016, "reward": 1.7107143700122833, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7196429036557674, "rewards/format_reward_func": 0.9910714328289032, "step": 2684 }, { "completion_length": 251.17858600616455, "epoch": 0.45031225114212664, "grad_norm": 0.34670280364640704, "kl": 0.042018890380859375, "learning_rate": 3.316049382716049e-07, "loss": 0.0, "reward": 1.7267857864499092, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7312500346451998, "rewards/format_reward_func": 0.9955357164144516, "step": 2686 }, { "completion_length": 245.5312614440918, "epoch": 0.4506475543819942, "grad_norm": 0.25670407143953416, "kl": 0.06372833251953125, "learning_rate": 3.318518518518518e-07, "loss": 0.0001, "reward": 1.8107143640518188, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8107143081724644, "rewards/format_reward_func": 1.0, "step": 2688 }, { "completion_length": 244.03572463989258, "epoch": 0.45098285762186174, "grad_norm": 0.23765421348038518, "kl": 0.027862548828125, "learning_rate": 3.3209876543209875e-07, "loss": 0.0, "reward": 1.7857143357396126, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143059372902, "rewards/format_reward_func": 1.0, "step": 2690 }, { "completion_length": 247.9151906967163, "epoch": 0.4513181608617293, "grad_norm": 0.33600943743007694, "kl": 0.559478759765625, "learning_rate": 3.323456790123457e-07, "loss": 0.0006, "reward": 1.7571429461240768, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428790688515, "rewards/format_reward_func": 1.0, "step": 2692 }, { "completion_length": 237.571439743042, "epoch": 0.4516534641015969, "grad_norm": 0.5969202483502274, "kl": 0.6981925964355469, "learning_rate": 3.325925925925926e-07, "loss": 0.0007, "reward": 1.7446429505944252, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7491071671247482, "rewards/format_reward_func": 0.9955357164144516, "step": 2694 }, { "completion_length": 244.20090103149414, "epoch": 0.4519887673414644, "grad_norm": 1.2490766104629545, "kl": 0.19190216064453125, "learning_rate": 3.328395061728395e-07, "loss": 0.0002, "reward": 1.7553572207689285, "reward_std": 0.07323605939745903, "rewards/equation_reward_func": 0.7687500305473804, "rewards/format_reward_func": 0.9866071492433548, "step": 2696 }, { "completion_length": 256.09375953674316, "epoch": 0.452324070581332, "grad_norm": 0.3169020842125548, "kl": 0.19821548461914062, "learning_rate": 3.3308641975308636e-07, "loss": 0.0002, "reward": 1.744642935693264, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7491071671247482, "rewards/format_reward_func": 0.9955357164144516, "step": 2698 }, { "completion_length": 251.383939743042, "epoch": 0.45265937382119953, "grad_norm": 0.17454350064179114, "kl": 0.209808349609375, "learning_rate": 3.333333333333333e-07, "loss": 0.0002, "reward": 1.796428643167019, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964286096394062, "rewards/format_reward_func": 1.0, "step": 2700 }, { "completion_length": 250.05805110931396, "epoch": 0.4529946770610671, "grad_norm": 0.270399027082715, "kl": 0.09576034545898438, "learning_rate": 3.3358024691358024e-07, "loss": 0.0001, "reward": 1.7964286655187607, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964285910129547, "rewards/format_reward_func": 1.0, "step": 2702 }, { "completion_length": 235.66072463989258, "epoch": 0.45332998030093463, "grad_norm": 0.25215380030085344, "kl": 0.033405303955078125, "learning_rate": 3.3382716049382713e-07, "loss": 0.0, "reward": 1.7535715028643608, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 2704 }, { "completion_length": 258.839298248291, "epoch": 0.4536652835408022, "grad_norm": 0.507861698652949, "kl": 0.32860565185546875, "learning_rate": 3.3407407407407407e-07, "loss": 0.0003, "reward": 1.7625000923871994, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7669643126428127, "rewards/format_reward_func": 0.9955357164144516, "step": 2706 }, { "completion_length": 248.53572368621826, "epoch": 0.4540005867806698, "grad_norm": 0.30873013848794667, "kl": 0.034900665283203125, "learning_rate": 3.3432098765432096e-07, "loss": 0.0, "reward": 1.7625000551342964, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7669643200933933, "rewards/format_reward_func": 0.9955357164144516, "step": 2708 }, { "completion_length": 242.1696548461914, "epoch": 0.4543358900205373, "grad_norm": 0.36290902428851235, "kl": 0.1857452392578125, "learning_rate": 3.345679012345679e-07, "loss": 0.0002, "reward": 1.7875000685453415, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7919643111526966, "rewards/format_reward_func": 0.9955357164144516, "step": 2710 }, { "completion_length": 265.44197273254395, "epoch": 0.4546711932604049, "grad_norm": 0.18967670433015887, "kl": 0.16097640991210938, "learning_rate": 3.348148148148148e-07, "loss": 0.0002, "reward": 1.7464286461472511, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286033064127, "rewards/format_reward_func": 1.0, "step": 2712 }, { "completion_length": 266.29019260406494, "epoch": 0.4550064965002724, "grad_norm": 0.2752528674594683, "kl": 0.06867218017578125, "learning_rate": 3.350617283950617e-07, "loss": 0.0001, "reward": 1.6803572475910187, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.6848214659839869, "rewards/format_reward_func": 0.9955357164144516, "step": 2714 }, { "completion_length": 249.27233505249023, "epoch": 0.45534179974014, "grad_norm": 0.17644565403173249, "kl": 0.35277557373046875, "learning_rate": 3.353086419753086e-07, "loss": 0.0004, "reward": 1.7946429252624512, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7991071715950966, "rewards/format_reward_func": 0.9955357164144516, "step": 2716 }, { "completion_length": 257.3259038925171, "epoch": 0.4556771029800075, "grad_norm": 0.18222821575997314, "kl": 0.06207275390625, "learning_rate": 3.3555555555555556e-07, "loss": 0.0001, "reward": 1.7410715147852898, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7455357424914837, "rewards/format_reward_func": 0.9955357164144516, "step": 2718 }, { "completion_length": 258.589298248291, "epoch": 0.4560124062198751, "grad_norm": 0.40252505186651977, "kl": 0.049884796142578125, "learning_rate": 3.3580246913580245e-07, "loss": 0.0, "reward": 1.7500000819563866, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7589285857975483, "rewards/format_reward_func": 0.9910714328289032, "step": 2720 }, { "completion_length": 244.1696548461914, "epoch": 0.4563477094597427, "grad_norm": 0.44376256831190036, "kl": 0.5896148681640625, "learning_rate": 3.360493827160494e-07, "loss": 0.0006, "reward": 1.7410715147852898, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7455357406288385, "rewards/format_reward_func": 0.9955357164144516, "step": 2722 }, { "completion_length": 246.1919765472412, "epoch": 0.4566830126996102, "grad_norm": 0.29343191916555844, "kl": 0.07892227172851562, "learning_rate": 3.362962962962963e-07, "loss": 0.0001, "reward": 1.700000062584877, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7000000327825546, "rewards/format_reward_func": 1.0, "step": 2724 }, { "completion_length": 245.77679443359375, "epoch": 0.4570183159394778, "grad_norm": 0.2667512601424003, "kl": 0.09537887573242188, "learning_rate": 3.365432098765432e-07, "loss": 0.0001, "reward": 1.7660714909434319, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7705357372760773, "rewards/format_reward_func": 0.9955357164144516, "step": 2726 }, { "completion_length": 249.2634038925171, "epoch": 0.4573536191793453, "grad_norm": 0.38442369886561234, "kl": 0.36904144287109375, "learning_rate": 3.367901234567901e-07, "loss": 0.0004, "reward": 1.7571429386734962, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428790688515, "rewards/format_reward_func": 1.0, "step": 2728 }, { "completion_length": 254.82144260406494, "epoch": 0.4576889224192129, "grad_norm": 0.3137500522359342, "kl": 0.061252593994140625, "learning_rate": 3.37037037037037e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7625000402331352, "rewards/format_reward_func": 0.9910714328289032, "step": 2730 }, { "completion_length": 246.1785831451416, "epoch": 0.4580242256590804, "grad_norm": 0.2809572400397591, "kl": 0.11748123168945312, "learning_rate": 3.3728395061728394e-07, "loss": 0.0001, "reward": 1.79464291036129, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7991071753203869, "rewards/format_reward_func": 0.9955357164144516, "step": 2732 }, { "completion_length": 255.915189743042, "epoch": 0.458359528898948, "grad_norm": 0.2971358870435252, "kl": 0.028736114501953125, "learning_rate": 3.375308641975308e-07, "loss": 0.0, "reward": 1.8017857819795609, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.8062500320374966, "rewards/format_reward_func": 0.9955357164144516, "step": 2734 }, { "completion_length": 257.7544765472412, "epoch": 0.45869483213881557, "grad_norm": 0.3195023386657291, "kl": 0.030345916748046875, "learning_rate": 3.3777777777777777e-07, "loss": 0.0, "reward": 1.7517857551574707, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7562500201165676, "rewards/format_reward_func": 0.9955357164144516, "step": 2736 }, { "completion_length": 246.2366180419922, "epoch": 0.4590301353786831, "grad_norm": 0.9720315208811928, "kl": 0.024379730224609375, "learning_rate": 3.380246913580247e-07, "loss": 0.0, "reward": 1.79464291036129, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7991071715950966, "rewards/format_reward_func": 0.9955357164144516, "step": 2738 }, { "completion_length": 250.5803689956665, "epoch": 0.45936543861855067, "grad_norm": 0.26845835890331887, "kl": 0.0423736572265625, "learning_rate": 3.382716049382716e-07, "loss": 0.0, "reward": 1.7696429267525673, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7741071656346321, "rewards/format_reward_func": 0.9955357164144516, "step": 2740 }, { "completion_length": 261.7544765472412, "epoch": 0.4597007418584182, "grad_norm": 0.19155715201137047, "kl": 0.08993148803710938, "learning_rate": 3.385185185185185e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285887777805, "rewards/format_reward_func": 1.0, "step": 2742 }, { "completion_length": 255.4375123977661, "epoch": 0.4600360450982858, "grad_norm": 0.24483028716397143, "kl": 0.07487106323242188, "learning_rate": 3.387654320987654e-07, "loss": 0.0001, "reward": 1.6910715326666832, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.704464316368103, "rewards/format_reward_func": 0.9866071492433548, "step": 2744 }, { "completion_length": 251.3973331451416, "epoch": 0.4603713483381533, "grad_norm": 0.208316662402822, "kl": 0.02557373046875, "learning_rate": 3.390123456790123e-07, "loss": 0.0, "reward": 1.7714286372065544, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 2746 }, { "completion_length": 259.19197845458984, "epoch": 0.4607066515780209, "grad_norm": 0.1999658236725068, "kl": 0.037628173828125, "learning_rate": 3.3925925925925926e-07, "loss": 0.0, "reward": 1.7767857536673546, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7812500298023224, "rewards/format_reward_func": 0.9955357164144516, "step": 2748 }, { "completion_length": 247.6741180419922, "epoch": 0.46104195481788846, "grad_norm": 0.26139475662583994, "kl": 0.0295257568359375, "learning_rate": 3.3950617283950614e-07, "loss": 0.0, "reward": 1.7464286461472511, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 2750 }, { "completion_length": 258.54018783569336, "epoch": 0.461377258057756, "grad_norm": 0.13583613800255978, "kl": 0.0282440185546875, "learning_rate": 3.397530864197531e-07, "loss": 0.0, "reward": 1.7642857730388641, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7732143141329288, "rewards/format_reward_func": 0.9910714328289032, "step": 2752 }, { "completion_length": 251.55358123779297, "epoch": 0.46171256129762356, "grad_norm": 0.24682817309475594, "kl": 0.0307464599609375, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "reward": 1.8017857745289803, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.8062500357627869, "rewards/format_reward_func": 0.9955357164144516, "step": 2754 }, { "completion_length": 257.0848321914673, "epoch": 0.4620478645374911, "grad_norm": 0.15096030885914136, "kl": 0.02770233154296875, "learning_rate": 3.4024691358024686e-07, "loss": 0.0, "reward": 1.7678571790456772, "reward_std": 0.03535533882677555, "rewards/equation_reward_func": 0.7767857424914837, "rewards/format_reward_func": 0.9910714328289032, "step": 2756 }, { "completion_length": 252.02680206298828, "epoch": 0.46238316777735866, "grad_norm": 0.23076502570985755, "kl": 0.026927947998046875, "learning_rate": 3.404938271604938e-07, "loss": 0.0, "reward": 1.7375000640749931, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419643215835094, "rewards/format_reward_func": 0.9955357164144516, "step": 2758 }, { "completion_length": 260.3348331451416, "epoch": 0.4627184710172262, "grad_norm": 0.34134054770080197, "kl": 0.050151824951171875, "learning_rate": 3.407407407407407e-07, "loss": 0.0001, "reward": 1.758928619325161, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7633928917348385, "rewards/format_reward_func": 0.9955357164144516, "step": 2760 }, { "completion_length": 255.1071548461914, "epoch": 0.46305377425709376, "grad_norm": 0.43200431098358355, "kl": 0.082275390625, "learning_rate": 3.4098765432098763e-07, "loss": 0.0001, "reward": 1.7482143715023994, "reward_std": 0.09343910962343216, "rewards/equation_reward_func": 0.7616071663796902, "rewards/format_reward_func": 0.9866071492433548, "step": 2762 }, { "completion_length": 262.69643783569336, "epoch": 0.4633890774969613, "grad_norm": 0.37310963538113023, "kl": 0.16982269287109375, "learning_rate": 3.412345679012346e-07, "loss": 0.0002, "reward": 1.8035714700818062, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 2764 }, { "completion_length": 251.98215579986572, "epoch": 0.46372438073682887, "grad_norm": 0.5049977633001252, "kl": 0.03232574462890625, "learning_rate": 3.4148148148148146e-07, "loss": 0.0, "reward": 1.7392857745289803, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7392857521772385, "rewards/format_reward_func": 1.0, "step": 2766 }, { "completion_length": 249.11608219146729, "epoch": 0.46405968397669645, "grad_norm": 0.16067851846385553, "kl": 0.03436279296875, "learning_rate": 3.417283950617284e-07, "loss": 0.0, "reward": 1.7410714775323868, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7455357480794191, "rewards/format_reward_func": 0.9955357164144516, "step": 2768 }, { "completion_length": 247.83929634094238, "epoch": 0.46439498721656397, "grad_norm": 0.23395496207649558, "kl": 0.054126739501953125, "learning_rate": 3.419753086419753e-07, "loss": 0.0001, "reward": 1.7482143566012383, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7526786029338837, "rewards/format_reward_func": 0.9955357164144516, "step": 2770 }, { "completion_length": 250.0937623977661, "epoch": 0.46473029045643155, "grad_norm": 0.18523182105694547, "kl": 0.024806976318359375, "learning_rate": 3.422222222222222e-07, "loss": 0.0, "reward": 1.821428619325161, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8214285895228386, "rewards/format_reward_func": 1.0, "step": 2772 }, { "completion_length": 256.6875114440918, "epoch": 0.4650655936962991, "grad_norm": 0.24536718633510723, "kl": 0.19739151000976562, "learning_rate": 3.424691358024691e-07, "loss": 0.0002, "reward": 1.7464286461472511, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.746428593993187, "rewards/format_reward_func": 1.0, "step": 2774 }, { "completion_length": 256.1562614440918, "epoch": 0.46540089693616665, "grad_norm": 0.3292800179362696, "kl": 0.12223052978515625, "learning_rate": 3.42716049382716e-07, "loss": 0.0001, "reward": 1.714285783469677, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7232143152505159, "rewards/format_reward_func": 0.9910714328289032, "step": 2776 }, { "completion_length": 242.8571538925171, "epoch": 0.4657362001760342, "grad_norm": 0.40450119722326106, "kl": 0.03858184814453125, "learning_rate": 3.4296296296296295e-07, "loss": 0.0, "reward": 1.8250000551342964, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8250000216066837, "rewards/format_reward_func": 1.0, "step": 2778 }, { "completion_length": 250.78572463989258, "epoch": 0.46607150341590176, "grad_norm": 0.40163561926349534, "kl": 0.05098724365234375, "learning_rate": 3.432098765432099e-07, "loss": 0.0001, "reward": 1.7250000834465027, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7250000350177288, "rewards/format_reward_func": 1.0, "step": 2780 }, { "completion_length": 247.16072750091553, "epoch": 0.46640680665576933, "grad_norm": 0.2388646035496535, "kl": 0.050304412841796875, "learning_rate": 3.434567901234568e-07, "loss": 0.0001, "reward": 1.7982143610715866, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.8026785925030708, "rewards/format_reward_func": 0.9955357164144516, "step": 2782 }, { "completion_length": 251.508939743042, "epoch": 0.46674210989563686, "grad_norm": 0.23875992787468778, "kl": 0.04918670654296875, "learning_rate": 3.4370370370370367e-07, "loss": 0.0, "reward": 1.7571429088711739, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7660714592784643, "rewards/format_reward_func": 0.9910714328289032, "step": 2784 }, { "completion_length": 251.3437623977661, "epoch": 0.46707741313550444, "grad_norm": 0.188319172046665, "kl": 0.3220100402832031, "learning_rate": 3.4395061728395056e-07, "loss": 0.0003, "reward": 1.7785715013742447, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714510828257, "rewards/format_reward_func": 1.0, "step": 2786 }, { "completion_length": 247.6607255935669, "epoch": 0.46741271637537196, "grad_norm": 0.3122224174936983, "kl": 0.0294189453125, "learning_rate": 3.441975308641975e-07, "loss": 0.0, "reward": 1.776785746216774, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7812500167638063, "rewards/format_reward_func": 0.9955357164144516, "step": 2788 }, { "completion_length": 254.26787090301514, "epoch": 0.46774801961523954, "grad_norm": 0.22040518881569834, "kl": 0.098358154296875, "learning_rate": 3.4444444444444444e-07, "loss": 0.0001, "reward": 1.7178572341799736, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7178571745753288, "rewards/format_reward_func": 1.0, "step": 2790 }, { "completion_length": 242.29911708831787, "epoch": 0.46808332285510706, "grad_norm": 0.3139530838726713, "kl": 0.039592742919921875, "learning_rate": 3.4469135802469133e-07, "loss": 0.0, "reward": 1.7714286297559738, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 2792 }, { "completion_length": 254.37947463989258, "epoch": 0.46841862609497464, "grad_norm": 0.2681590517820234, "kl": 0.03741455078125, "learning_rate": 3.4493827160493827e-07, "loss": 0.0, "reward": 1.7857143506407738, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7946428842842579, "rewards/format_reward_func": 0.9910714328289032, "step": 2794 }, { "completion_length": 256.9241180419922, "epoch": 0.4687539293348422, "grad_norm": 0.17318988896415513, "kl": 0.1388397216796875, "learning_rate": 3.451851851851852e-07, "loss": 0.0001, "reward": 1.7446429133415222, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.749107176437974, "rewards/format_reward_func": 0.9955357164144516, "step": 2796 }, { "completion_length": 264.62500953674316, "epoch": 0.46908923257470975, "grad_norm": 0.5645059371676063, "kl": 0.10883331298828125, "learning_rate": 3.4543209876543205e-07, "loss": 0.0001, "reward": 1.7857143357396126, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7946428842842579, "rewards/format_reward_func": 0.9910714328289032, "step": 2798 }, { "completion_length": 264.3080463409424, "epoch": 0.4694245358145773, "grad_norm": 0.3930393143745379, "kl": 0.0624847412109375, "learning_rate": 3.45679012345679e-07, "loss": 0.0001, "reward": 1.698214367032051, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7026786059141159, "rewards/format_reward_func": 0.9955357164144516, "step": 2800 }, { "completion_length": 257.2098340988159, "epoch": 0.46975983905444485, "grad_norm": 0.2631303652587504, "kl": 0.04084014892578125, "learning_rate": 3.459259259259259e-07, "loss": 0.0, "reward": 1.7107143625617027, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7196428962051868, "rewards/format_reward_func": 0.9910714328289032, "step": 2802 }, { "completion_length": 257.65179443359375, "epoch": 0.47009514229431243, "grad_norm": 0.22529626467128386, "kl": 0.0889739990234375, "learning_rate": 3.461728395061728e-07, "loss": 0.0001, "reward": 1.7732143625617027, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7776785921305418, "rewards/format_reward_func": 0.9955357164144516, "step": 2804 }, { "completion_length": 254.2678689956665, "epoch": 0.47043044553417995, "grad_norm": 0.6203961669806676, "kl": 0.20491790771484375, "learning_rate": 3.4641975308641976e-07, "loss": 0.0002, "reward": 1.7517857924103737, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500387430191, "rewards/format_reward_func": 0.9955357164144516, "step": 2806 }, { "completion_length": 256.9107275009155, "epoch": 0.47076574877404753, "grad_norm": 0.24761485503433714, "kl": 0.09076690673828125, "learning_rate": 3.4666666666666665e-07, "loss": 0.0001, "reward": 1.7839286252856255, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 2808 }, { "completion_length": 258.24108028411865, "epoch": 0.4711010520139151, "grad_norm": 0.2496260686532331, "kl": 0.03143310546875, "learning_rate": 3.469135802469136e-07, "loss": 0.0, "reward": 1.7482143864035606, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7526785917580128, "rewards/format_reward_func": 0.9955357164144516, "step": 2810 }, { "completion_length": 254.9509038925171, "epoch": 0.47143635525378264, "grad_norm": 0.6028251611292678, "kl": 0.3297767639160156, "learning_rate": 3.4716049382716053e-07, "loss": 0.0003, "reward": 1.7464286237955093, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.755357164889574, "rewards/format_reward_func": 0.9910714328289032, "step": 2812 }, { "completion_length": 269.39286708831787, "epoch": 0.4717716584936502, "grad_norm": 0.3415395676744013, "kl": 0.1169281005859375, "learning_rate": 3.4740740740740737e-07, "loss": 0.0001, "reward": 1.7392857745289803, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7571428753435612, "rewards/format_reward_func": 0.9821428656578064, "step": 2814 }, { "completion_length": 263.53125953674316, "epoch": 0.47210696173351774, "grad_norm": 0.23960314502568703, "kl": 0.050708770751953125, "learning_rate": 3.476543209876543e-07, "loss": 0.0001, "reward": 1.7571429386734962, "reward_std": 0.0909137288108468, "rewards/equation_reward_func": 0.7750000171363354, "rewards/format_reward_func": 0.9821428656578064, "step": 2816 }, { "completion_length": 268.6651887893677, "epoch": 0.4724422649733853, "grad_norm": 0.22502083720491892, "kl": 0.3942832946777344, "learning_rate": 3.479012345679012e-07, "loss": 0.0004, "reward": 1.7125000581145287, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7258929070085287, "rewards/format_reward_func": 0.9866071492433548, "step": 2818 }, { "completion_length": 266.383939743042, "epoch": 0.47277756821325284, "grad_norm": 0.3852008894067241, "kl": 0.5472488403320312, "learning_rate": 3.4814814814814814e-07, "loss": 0.0005, "reward": 1.7785715088248253, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 2820 }, { "completion_length": 277.2678737640381, "epoch": 0.4731128714531204, "grad_norm": 0.4898862429233096, "kl": 0.44490814208984375, "learning_rate": 3.483950617283951e-07, "loss": 0.0004, "reward": 1.7464286535978317, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 0.9821428656578064, "step": 2822 }, { "completion_length": 271.20983505249023, "epoch": 0.47344817469298794, "grad_norm": 0.3050807925241347, "kl": 0.32114410400390625, "learning_rate": 3.4864197530864197e-07, "loss": 0.0003, "reward": 1.7107143327593803, "reward_std": 0.05555839091539383, "rewards/equation_reward_func": 0.7285714615136385, "rewards/format_reward_func": 0.9821428656578064, "step": 2824 }, { "completion_length": 262.2857255935669, "epoch": 0.4737834779328555, "grad_norm": 0.32417959842074145, "kl": 0.27597808837890625, "learning_rate": 3.488888888888889e-07, "loss": 0.0003, "reward": 1.7196429446339607, "reward_std": 0.06313453428447247, "rewards/equation_reward_func": 0.7330357395112514, "rewards/format_reward_func": 0.9866071492433548, "step": 2826 }, { "completion_length": 256.62947940826416, "epoch": 0.4741187811727231, "grad_norm": 0.43158639531492315, "kl": 0.3496856689453125, "learning_rate": 3.4913580246913574e-07, "loss": 0.0003, "reward": 1.7857143506407738, "reward_std": 0.09091372787952423, "rewards/equation_reward_func": 0.7946428786963224, "rewards/format_reward_func": 0.9910714328289032, "step": 2828 }, { "completion_length": 257.06251430511475, "epoch": 0.4744540844125906, "grad_norm": 0.16878279995351914, "kl": 0.02948760986328125, "learning_rate": 3.493827160493827e-07, "loss": 0.0, "reward": 1.7964286133646965, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 2830 }, { "completion_length": 268.3035821914673, "epoch": 0.4747893876524582, "grad_norm": 1.3822497181999542, "kl": 1.5914840698242188, "learning_rate": 3.496296296296296e-07, "loss": 0.0016, "reward": 1.723214365541935, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7366071864962578, "rewards/format_reward_func": 0.9866071492433548, "step": 2832 }, { "completion_length": 272.46876430511475, "epoch": 0.47512469089232573, "grad_norm": 0.31405924815564323, "kl": 0.60906982421875, "learning_rate": 3.498765432098765e-07, "loss": 0.0006, "reward": 1.721428632736206, "reward_std": 0.08586296439170837, "rewards/equation_reward_func": 0.7392857484519482, "rewards/format_reward_func": 0.9821428656578064, "step": 2834 }, { "completion_length": 265.4419765472412, "epoch": 0.4754599941321933, "grad_norm": 0.3334541089600988, "kl": 0.1193695068359375, "learning_rate": 3.5012345679012345e-07, "loss": 0.0001, "reward": 1.7196429297327995, "reward_std": 0.09343910869210958, "rewards/equation_reward_func": 0.7330357357859612, "rewards/format_reward_func": 0.9866071492433548, "step": 2836 }, { "completion_length": 261.68751430511475, "epoch": 0.47579529737206083, "grad_norm": 0.2359020883684203, "kl": 0.02530670166015625, "learning_rate": 3.503703703703704e-07, "loss": 0.0, "reward": 1.7392857819795609, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7392857521772385, "rewards/format_reward_func": 1.0, "step": 2838 }, { "completion_length": 270.17858123779297, "epoch": 0.4761306006119284, "grad_norm": 0.19232319568658027, "kl": 0.028003692626953125, "learning_rate": 3.506172839506173e-07, "loss": 0.0, "reward": 1.6857143491506577, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7035714518278837, "rewards/format_reward_func": 0.9821428656578064, "step": 2840 }, { "completion_length": 248.52679824829102, "epoch": 0.476465903851796, "grad_norm": 0.14569116729208106, "kl": 0.08068084716796875, "learning_rate": 3.5086419753086417e-07, "loss": 0.0001, "reward": 1.8071429133415222, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8071428835391998, "rewards/format_reward_func": 1.0, "step": 2842 }, { "completion_length": 254.58929634094238, "epoch": 0.4768012070916635, "grad_norm": 1.0018321660589764, "kl": 0.047954559326171875, "learning_rate": 3.5111111111111106e-07, "loss": 0.0, "reward": 1.8000000640749931, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.8089286014437675, "rewards/format_reward_func": 0.9910714328289032, "step": 2844 }, { "completion_length": 242.83483219146729, "epoch": 0.4771365103315311, "grad_norm": 0.1876792247373185, "kl": 0.19379425048828125, "learning_rate": 3.51358024691358e-07, "loss": 0.0002, "reward": 1.7428572177886963, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7428571693599224, "rewards/format_reward_func": 1.0, "step": 2846 }, { "completion_length": 257.9866199493408, "epoch": 0.4774718135713986, "grad_norm": 0.21949993388796094, "kl": 0.054508209228515625, "learning_rate": 3.5160493827160494e-07, "loss": 0.0001, "reward": 1.7696429416537285, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7741071656346321, "rewards/format_reward_func": 0.9955357164144516, "step": 2848 }, { "completion_length": 248.25447463989258, "epoch": 0.4778071168112662, "grad_norm": 0.47622915884966727, "kl": 0.5040206909179688, "learning_rate": 3.5185185185185183e-07, "loss": 0.0005, "reward": 1.7625000849366188, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643089175224, "rewards/format_reward_func": 0.9955357164144516, "step": 2850 }, { "completion_length": 249.97768783569336, "epoch": 0.4781424200511337, "grad_norm": 0.40265000030374465, "kl": 0.05511474609375, "learning_rate": 3.5209876543209877e-07, "loss": 0.0001, "reward": 1.7803572043776512, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7848214618861675, "rewards/format_reward_func": 0.9955357164144516, "step": 2852 }, { "completion_length": 264.2812614440918, "epoch": 0.4784777232910013, "grad_norm": 0.38387369362331086, "kl": 0.31607818603515625, "learning_rate": 3.5234567901234566e-07, "loss": 0.0003, "reward": 1.7267857939004898, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.7401785887777805, "rewards/format_reward_func": 0.9866071492433548, "step": 2854 }, { "completion_length": 257.3392972946167, "epoch": 0.4788130265308689, "grad_norm": 0.5951750074514578, "kl": 0.13494873046875, "learning_rate": 3.5259259259259255e-07, "loss": 0.0001, "reward": 1.741071492433548, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7455357499420643, "rewards/format_reward_func": 0.9955357164144516, "step": 2856 }, { "completion_length": 254.5134048461914, "epoch": 0.4791483297707364, "grad_norm": 0.17450689785228546, "kl": 0.03231048583984375, "learning_rate": 3.528395061728395e-07, "loss": 0.0, "reward": 1.741071492433548, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7455357536673546, "rewards/format_reward_func": 0.9955357164144516, "step": 2858 }, { "completion_length": 257.2321529388428, "epoch": 0.479483633010604, "grad_norm": 0.1944576284948148, "kl": 0.288116455078125, "learning_rate": 3.530864197530864e-07, "loss": 0.0003, "reward": 1.7428571954369545, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7517857421189547, "rewards/format_reward_func": 0.9910714328289032, "step": 2860 }, { "completion_length": 260.0357275009155, "epoch": 0.4798189362504715, "grad_norm": 0.19695027022142356, "kl": 0.067718505859375, "learning_rate": 3.533333333333333e-07, "loss": 0.0001, "reward": 1.805357202887535, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8098214454948902, "rewards/format_reward_func": 0.9955357164144516, "step": 2862 }, { "completion_length": 255.81251430511475, "epoch": 0.4801542394903391, "grad_norm": 0.23358145138386263, "kl": 0.33197021484375, "learning_rate": 3.5358024691358026e-07, "loss": 0.0003, "reward": 1.733928643167019, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7383928894996643, "rewards/format_reward_func": 0.9955357164144516, "step": 2864 }, { "completion_length": 248.89733505249023, "epoch": 0.4804895427302066, "grad_norm": 0.349013914148386, "kl": 0.0921783447265625, "learning_rate": 3.5382716049382715e-07, "loss": 0.0001, "reward": 1.725000075995922, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7339286021888256, "rewards/format_reward_func": 0.9910714328289032, "step": 2866 }, { "completion_length": 246.35715675354004, "epoch": 0.4808248459700742, "grad_norm": 0.5646698298864009, "kl": 0.31024169921875, "learning_rate": 3.540740740740741e-07, "loss": 0.0003, "reward": 1.6839286461472511, "reward_std": 0.10354063473641872, "rewards/equation_reward_func": 0.6973214633762836, "rewards/format_reward_func": 0.9866071492433548, "step": 2868 }, { "completion_length": 258.14287090301514, "epoch": 0.48116014920994177, "grad_norm": 0.52150068072927, "kl": 0.208526611328125, "learning_rate": 3.5432098765432093e-07, "loss": 0.0002, "reward": 1.73214291036129, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7410714700818062, "rewards/format_reward_func": 0.9910714328289032, "step": 2870 }, { "completion_length": 249.07590198516846, "epoch": 0.4814954524498093, "grad_norm": 0.31669922163507147, "kl": 0.0349273681640625, "learning_rate": 3.5456790123456787e-07, "loss": 0.0, "reward": 1.7464286237955093, "reward_std": 0.07576144114136696, "rewards/equation_reward_func": 0.755357189103961, "rewards/format_reward_func": 0.9910714328289032, "step": 2872 }, { "completion_length": 249.3392972946167, "epoch": 0.48183075568967687, "grad_norm": 0.25581827702063864, "kl": 0.05368804931640625, "learning_rate": 3.548148148148148e-07, "loss": 0.0001, "reward": 1.7321429327130318, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428954601288, "rewards/format_reward_func": 1.0, "step": 2874 }, { "completion_length": 259.40179538726807, "epoch": 0.4821660589295444, "grad_norm": 0.2812439109626435, "kl": 1.0510787963867188, "learning_rate": 3.550617283950617e-07, "loss": 0.0011, "reward": 1.6892857775092125, "reward_std": 0.08586296532303095, "rewards/equation_reward_func": 0.6982143223285675, "rewards/format_reward_func": 0.9910714328289032, "step": 2876 }, { "completion_length": 250.75447273254395, "epoch": 0.482501362169412, "grad_norm": 0.19207748486652276, "kl": 0.322998046875, "learning_rate": 3.5530864197530864e-07, "loss": 0.0003, "reward": 1.7035715207457542, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7035714648663998, "rewards/format_reward_func": 1.0, "step": 2878 }, { "completion_length": 260.4017963409424, "epoch": 0.4828366654092795, "grad_norm": 0.22444574368780876, "kl": 0.17897415161132812, "learning_rate": 3.5555555555555553e-07, "loss": 0.0002, "reward": 1.7642857730388641, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7732143364846706, "rewards/format_reward_func": 0.9910714328289032, "step": 2880 }, { "completion_length": 250.5178680419922, "epoch": 0.4831719686491471, "grad_norm": 0.5271841775692403, "kl": 0.5541343688964844, "learning_rate": 3.5580246913580247e-07, "loss": 0.0006, "reward": 1.7732143476605415, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7776785977184772, "rewards/format_reward_func": 0.9955357164144516, "step": 2882 }, { "completion_length": 255.05804634094238, "epoch": 0.4835072718890146, "grad_norm": 0.465173051193132, "kl": 0.31304931640625, "learning_rate": 3.5604938271604936e-07, "loss": 0.0003, "reward": 1.694642923772335, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.6991071719676256, "rewards/format_reward_func": 0.9955357164144516, "step": 2884 }, { "completion_length": 261.0982255935669, "epoch": 0.4838425751288822, "grad_norm": 0.3209358019665937, "kl": 0.6431045532226562, "learning_rate": 3.5629629629629625e-07, "loss": 0.0006, "reward": 1.742857202887535, "reward_std": 0.1010152529925108, "rewards/equation_reward_func": 0.7607143074274063, "rewards/format_reward_func": 0.9821428656578064, "step": 2886 }, { "completion_length": 253.11608409881592, "epoch": 0.48417787836874976, "grad_norm": 0.2680813222428663, "kl": 0.21021652221679688, "learning_rate": 3.565432098765432e-07, "loss": 0.0002, "reward": 1.7303572222590446, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7348214685916901, "rewards/format_reward_func": 0.9955357164144516, "step": 2888 }, { "completion_length": 253.08929443359375, "epoch": 0.4845131816086173, "grad_norm": 0.5583623489192696, "kl": 0.2989044189453125, "learning_rate": 3.5679012345679013e-07, "loss": 0.0003, "reward": 1.753571480512619, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7625000402331352, "rewards/format_reward_func": 0.9910714328289032, "step": 2890 }, { "completion_length": 248.2455472946167, "epoch": 0.48484848484848486, "grad_norm": 0.2820007992710127, "kl": 0.31207275390625, "learning_rate": 3.57037037037037e-07, "loss": 0.0003, "reward": 1.7875000536441803, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7910714484751225, "rewards/format_reward_func": 0.9964285790920258, "step": 2892 }, { "completion_length": 253.58483409881592, "epoch": 0.4851837880883524, "grad_norm": 0.7035434638322244, "kl": 0.6702957153320312, "learning_rate": 3.5728395061728396e-07, "loss": 0.0007, "reward": 1.7000000923871994, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7089285925030708, "rewards/format_reward_func": 0.9910714328289032, "step": 2894 }, { "completion_length": 256.2410831451416, "epoch": 0.48551909132821996, "grad_norm": 0.3080211232059757, "kl": 0.267303466796875, "learning_rate": 3.5753086419753085e-07, "loss": 0.0003, "reward": 1.7410714998841286, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7544643096625805, "rewards/format_reward_func": 0.9866071492433548, "step": 2896 }, { "completion_length": 257.6384029388428, "epoch": 0.4858543945680875, "grad_norm": 0.28553151949283306, "kl": 0.15687179565429688, "learning_rate": 3.5777777777777773e-07, "loss": 0.0002, "reward": 1.7339286133646965, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.7473214752972126, "rewards/format_reward_func": 0.9866071492433548, "step": 2898 }, { "completion_length": 255.65625667572021, "epoch": 0.48618969780795507, "grad_norm": 0.21765843503980004, "kl": 0.5928115844726562, "learning_rate": 3.580246913580247e-07, "loss": 0.0006, "reward": 1.6803572103381157, "reward_std": 0.06818529684096575, "rewards/equation_reward_func": 0.6937500424683094, "rewards/format_reward_func": 0.9866071492433548, "step": 2900 }, { "completion_length": 254.33929920196533, "epoch": 0.48652500104782265, "grad_norm": 0.44552416758581037, "kl": 0.3790931701660156, "learning_rate": 3.5827160493827156e-07, "loss": 0.0004, "reward": 1.6964286491274834, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7053571753203869, "rewards/format_reward_func": 0.9910714328289032, "step": 2902 }, { "completion_length": 253.09376335144043, "epoch": 0.48686030428769017, "grad_norm": 0.9647608524452777, "kl": 1.1598663330078125, "learning_rate": 3.585185185185185e-07, "loss": 0.0012, "reward": 1.714285783469677, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7142857518047094, "rewards/format_reward_func": 1.0, "step": 2904 }, { "completion_length": 255.12054920196533, "epoch": 0.48719560752755775, "grad_norm": 0.29048605992776716, "kl": 0.21368408203125, "learning_rate": 3.587654320987654e-07, "loss": 0.0002, "reward": 1.7589286342263222, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7723214663565159, "rewards/format_reward_func": 0.9866071492433548, "step": 2906 }, { "completion_length": 255.02679920196533, "epoch": 0.4875309107674253, "grad_norm": 0.2027351239625565, "kl": 0.7143478393554688, "learning_rate": 3.5901234567901234e-07, "loss": 0.0007, "reward": 1.7660714983940125, "reward_std": 0.06818529684096575, "rewards/equation_reward_func": 0.7794643118977547, "rewards/format_reward_func": 0.9866071492433548, "step": 2908 }, { "completion_length": 252.3973331451416, "epoch": 0.48786621400729285, "grad_norm": 0.19049513041306962, "kl": 0.08922576904296875, "learning_rate": 3.592592592592593e-07, "loss": 0.0001, "reward": 1.791071504354477, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.795535746961832, "rewards/format_reward_func": 0.9955357164144516, "step": 2910 }, { "completion_length": 256.40625953674316, "epoch": 0.4882015172471604, "grad_norm": 0.35376853781284395, "kl": 0.48523712158203125, "learning_rate": 3.5950617283950616e-07, "loss": 0.0005, "reward": 1.769642911851406, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7741071805357933, "rewards/format_reward_func": 0.9955357164144516, "step": 2912 }, { "completion_length": 263.09822273254395, "epoch": 0.48853682048702796, "grad_norm": 0.36369623998455325, "kl": 0.08524322509765625, "learning_rate": 3.5975308641975305e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 2914 }, { "completion_length": 245.4553689956665, "epoch": 0.48887212372689554, "grad_norm": 0.1363731743599683, "kl": 0.03830718994140625, "learning_rate": 3.6e-07, "loss": 0.0, "reward": 1.7232143729925156, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7276785969734192, "rewards/format_reward_func": 0.9955357164144516, "step": 2916 }, { "completion_length": 259.6875123977661, "epoch": 0.48920742696676306, "grad_norm": 0.22073589348512726, "kl": 0.175567626953125, "learning_rate": 3.602469135802469e-07, "loss": 0.0002, "reward": 1.7821428999304771, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.791071455925703, "rewards/format_reward_func": 0.9910714328289032, "step": 2918 }, { "completion_length": 256.28572845458984, "epoch": 0.48954273020663064, "grad_norm": 0.2817423794616662, "kl": 0.23409271240234375, "learning_rate": 3.604938271604938e-07, "loss": 0.0002, "reward": 1.782142922282219, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428775787354, "rewards/format_reward_func": 1.0, "step": 2920 }, { "completion_length": 262.0446500778198, "epoch": 0.48987803344649816, "grad_norm": 0.5882471776016766, "kl": 0.3173255920410156, "learning_rate": 3.607407407407407e-07, "loss": 0.0003, "reward": 1.6803572252392769, "reward_std": 0.06818529777228832, "rewards/equation_reward_func": 0.6937500387430191, "rewards/format_reward_func": 0.9866071492433548, "step": 2922 }, { "completion_length": 247.25447463989258, "epoch": 0.49021333668636574, "grad_norm": 0.2754932860486546, "kl": 0.6283416748046875, "learning_rate": 3.6098765432098765e-07, "loss": 0.0006, "reward": 1.758928656578064, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7633928880095482, "rewards/format_reward_func": 0.9955357164144516, "step": 2924 }, { "completion_length": 242.98661994934082, "epoch": 0.49054863992623327, "grad_norm": 0.31778700835528595, "kl": 0.11323165893554688, "learning_rate": 3.612345679012346e-07, "loss": 0.0001, "reward": 1.7339286729693413, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7383928932249546, "rewards/format_reward_func": 0.9955357164144516, "step": 2926 }, { "completion_length": 255.58929920196533, "epoch": 0.49088394316610084, "grad_norm": 0.2705317676366948, "kl": 1.1042404174804688, "learning_rate": 3.6148148148148143e-07, "loss": 0.0011, "reward": 1.7803571820259094, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7848214693367481, "rewards/format_reward_func": 0.9955357164144516, "step": 2928 }, { "completion_length": 248.96430015563965, "epoch": 0.4912192464059684, "grad_norm": 0.1774856462598626, "kl": 0.22403717041015625, "learning_rate": 3.6172839506172837e-07, "loss": 0.0002, "reward": 1.810714341700077, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8107143193483353, "rewards/format_reward_func": 1.0, "step": 2930 }, { "completion_length": 250.04911613464355, "epoch": 0.49155454964583595, "grad_norm": 0.2009737631916188, "kl": 0.3294029235839844, "learning_rate": 3.6197530864197526e-07, "loss": 0.0003, "reward": 1.7714286297559738, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 2932 }, { "completion_length": 252.9062623977661, "epoch": 0.4918898528857035, "grad_norm": 0.24142261752824526, "kl": 0.37865447998046875, "learning_rate": 3.622222222222222e-07, "loss": 0.0004, "reward": 1.7660714760422707, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7705357484519482, "rewards/format_reward_func": 0.9955357164144516, "step": 2934 }, { "completion_length": 245.23215198516846, "epoch": 0.49222515612557105, "grad_norm": 0.3224585490033016, "kl": 1.2538604736328125, "learning_rate": 3.6246913580246914e-07, "loss": 0.0013, "reward": 1.8285714834928513, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8285714499652386, "rewards/format_reward_func": 1.0, "step": 2936 }, { "completion_length": 242.7098331451416, "epoch": 0.49256045936543863, "grad_norm": 0.07482152881354606, "kl": 0.6891937255859375, "learning_rate": 3.6271604938271603e-07, "loss": 0.0007, "reward": 1.7821429446339607, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 2938 }, { "completion_length": 250.76340293884277, "epoch": 0.49289576260530615, "grad_norm": 0.3688486923883066, "kl": 0.46811676025390625, "learning_rate": 3.6296296296296297e-07, "loss": 0.0005, "reward": 1.7053572237491608, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.709821468219161, "rewards/format_reward_func": 0.9955357164144516, "step": 2940 }, { "completion_length": 247.9598331451416, "epoch": 0.49323106584517373, "grad_norm": 0.38334694612665543, "kl": 0.5134353637695312, "learning_rate": 3.6320987654320986e-07, "loss": 0.0005, "reward": 1.814285770058632, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8142857346683741, "rewards/format_reward_func": 1.0, "step": 2942 }, { "completion_length": 253.24108505249023, "epoch": 0.49356636908504126, "grad_norm": 0.2466986529569209, "kl": 0.073577880859375, "learning_rate": 3.6345679012345675e-07, "loss": 0.0001, "reward": 1.682142935693264, "reward_std": 0.09596449043601751, "rewards/equation_reward_func": 0.7000000309199095, "rewards/format_reward_func": 0.9821428656578064, "step": 2944 }, { "completion_length": 253.5044755935669, "epoch": 0.49390167232490884, "grad_norm": 0.22385096048864217, "kl": 0.6382865905761719, "learning_rate": 3.637037037037037e-07, "loss": 0.0006, "reward": 1.7464286461472511, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7553571835160255, "rewards/format_reward_func": 0.9910714328289032, "step": 2946 }, { "completion_length": 244.7544755935669, "epoch": 0.4942369755647764, "grad_norm": 0.3057270423350129, "kl": 0.124908447265625, "learning_rate": 3.639506172839506e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 1.0, "step": 2948 }, { "completion_length": 251.68304634094238, "epoch": 0.49457227880464394, "grad_norm": 0.8529840796001801, "kl": 1.887176513671875, "learning_rate": 3.641975308641975e-07, "loss": 0.0019, "reward": 1.7017858028411865, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7062500342726707, "rewards/format_reward_func": 0.9955357164144516, "step": 2950 }, { "completion_length": 250.33929824829102, "epoch": 0.4949075820445115, "grad_norm": 0.32756161445028503, "kl": 0.14838409423828125, "learning_rate": 3.6444444444444446e-07, "loss": 0.0001, "reward": 1.7089286372065544, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7133928779512644, "rewards/format_reward_func": 0.9955357164144516, "step": 2952 }, { "completion_length": 259.4419775009155, "epoch": 0.49524288528437904, "grad_norm": 0.7213068467496084, "kl": 0.8767471313476562, "learning_rate": 3.6469135802469135e-07, "loss": 0.0009, "reward": 1.7714286148548126, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7803571783006191, "rewards/format_reward_func": 0.9910714328289032, "step": 2954 }, { "completion_length": 253.06250953674316, "epoch": 0.4955781885242466, "grad_norm": 0.3223780611086043, "kl": 0.16445159912109375, "learning_rate": 3.6493827160493824e-07, "loss": 0.0002, "reward": 1.7607143223285675, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.769642885774374, "rewards/format_reward_func": 0.9910714328289032, "step": 2956 }, { "completion_length": 250.9732265472412, "epoch": 0.49591349176411414, "grad_norm": 0.31015288975976857, "kl": 0.32781982421875, "learning_rate": 3.651851851851851e-07, "loss": 0.0003, "reward": 1.7000000774860382, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7089285999536514, "rewards/format_reward_func": 0.9910714328289032, "step": 2958 }, { "completion_length": 244.3750123977661, "epoch": 0.4962487950039817, "grad_norm": 0.24747093123257072, "kl": 0.4291877746582031, "learning_rate": 3.6543209876543207e-07, "loss": 0.0004, "reward": 1.7482143491506577, "reward_std": 0.06313453428447247, "rewards/equation_reward_func": 0.7616071701049805, "rewards/format_reward_func": 0.9866071492433548, "step": 2960 }, { "completion_length": 257.76786708831787, "epoch": 0.4965840982438493, "grad_norm": 0.3119423738396043, "kl": 0.40831756591796875, "learning_rate": 3.65679012345679e-07, "loss": 0.0004, "reward": 1.732142947614193, "reward_std": 0.08586296532303095, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 0.9821428656578064, "step": 2962 }, { "completion_length": 259.17858505249023, "epoch": 0.4969194014837168, "grad_norm": 0.1830517098733492, "kl": 0.05135345458984375, "learning_rate": 3.659259259259259e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.8035714589059353, "rewards/format_reward_func": 0.9821428619325161, "step": 2964 }, { "completion_length": 243.9419755935669, "epoch": 0.4972547047235844, "grad_norm": 0.21681102140064856, "kl": 0.10626220703125, "learning_rate": 3.6617283950617284e-07, "loss": 0.0001, "reward": 1.7964286133646965, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.8053571581840515, "rewards/format_reward_func": 0.9910714328289032, "step": 2966 }, { "completion_length": 255.5312614440918, "epoch": 0.49759000796345193, "grad_norm": 0.30420774625152075, "kl": 0.27170562744140625, "learning_rate": 3.664197530864198e-07, "loss": 0.0003, "reward": 1.7839286178350449, "reward_std": 0.07323605939745903, "rewards/equation_reward_func": 0.7973214574158192, "rewards/format_reward_func": 0.9866071492433548, "step": 2968 }, { "completion_length": 260.1562623977661, "epoch": 0.4979253112033195, "grad_norm": 0.5679008529737339, "kl": 0.6922416687011719, "learning_rate": 3.666666666666666e-07, "loss": 0.0007, "reward": 1.703571505844593, "reward_std": 0.09091372601687908, "rewards/equation_reward_func": 0.7303571589291096, "rewards/format_reward_func": 0.9732142984867096, "step": 2970 }, { "completion_length": 256.95983123779297, "epoch": 0.49826061444318703, "grad_norm": 0.21251929924314933, "kl": 0.3111114501953125, "learning_rate": 3.6691358024691356e-07, "loss": 0.0003, "reward": 1.7339286282658577, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7473214603960514, "rewards/format_reward_func": 0.9866071492433548, "step": 2972 }, { "completion_length": 250.0044755935669, "epoch": 0.4985959176830546, "grad_norm": 0.2855125484052691, "kl": 0.6980438232421875, "learning_rate": 3.6716049382716044e-07, "loss": 0.0007, "reward": 1.7303571924567223, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7437500283122063, "rewards/format_reward_func": 0.9866071492433548, "step": 2974 }, { "completion_length": 266.54019260406494, "epoch": 0.4989312209229222, "grad_norm": 0.4551964805783462, "kl": 0.3116912841796875, "learning_rate": 3.674074074074074e-07, "loss": 0.0003, "reward": 1.8089286163449287, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.8223214447498322, "rewards/format_reward_func": 0.9866071492433548, "step": 2976 }, { "completion_length": 257.308048248291, "epoch": 0.4992665241627897, "grad_norm": 0.38176881311264943, "kl": 0.36710357666015625, "learning_rate": 3.6765432098765433e-07, "loss": 0.0004, "reward": 1.7035714983940125, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7214285992085934, "rewards/format_reward_func": 0.9821428656578064, "step": 2978 }, { "completion_length": 260.9598340988159, "epoch": 0.4996018274026573, "grad_norm": 0.14641630559689583, "kl": 0.6771926879882812, "learning_rate": 3.679012345679012e-07, "loss": 0.0007, "reward": 1.74642863124609, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.755357176065445, "rewards/format_reward_func": 0.9910714328289032, "step": 2980 }, { "completion_length": 258.0625114440918, "epoch": 0.4999371306425248, "grad_norm": 0.2595195319935882, "kl": 0.0545196533203125, "learning_rate": 3.6814814814814816e-07, "loss": 0.0001, "reward": 1.7357143685221672, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7446428909897804, "rewards/format_reward_func": 0.9910714328289032, "step": 2982 }, { "completion_length": 263.83929920196533, "epoch": 0.5002724338823924, "grad_norm": 0.24109920616356117, "kl": 0.05385589599609375, "learning_rate": 3.68395061728395e-07, "loss": 0.0001, "reward": 1.7982143461704254, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.8116071783006191, "rewards/format_reward_func": 0.9866071492433548, "step": 2984 }, { "completion_length": 252.6160831451416, "epoch": 0.5006077371222599, "grad_norm": 0.31178103313116434, "kl": 0.11287689208984375, "learning_rate": 3.6864197530864193e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.8107143193483353, "rewards/format_reward_func": 0.9821428656578064, "step": 2986 }, { "completion_length": 258.7276906967163, "epoch": 0.5009430403621274, "grad_norm": 0.257026450835211, "kl": 0.25646209716796875, "learning_rate": 3.688888888888889e-07, "loss": 0.0003, "reward": 1.7714286297559738, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7803571596741676, "rewards/format_reward_func": 0.9910714328289032, "step": 2988 }, { "completion_length": 251.8125123977661, "epoch": 0.5012783436019951, "grad_norm": 0.26271403970330076, "kl": 0.6702995300292969, "learning_rate": 3.6913580246913576e-07, "loss": 0.0007, "reward": 1.8125000521540642, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.8169642947614193, "rewards/format_reward_func": 0.9955357164144516, "step": 2990 }, { "completion_length": 256.6384038925171, "epoch": 0.5016136468418626, "grad_norm": 0.27592010057674143, "kl": 0.1610565185546875, "learning_rate": 3.693827160493827e-07, "loss": 0.0002, "reward": 1.6785715073347092, "reward_std": 0.08081220369786024, "rewards/equation_reward_func": 0.6964286081492901, "rewards/format_reward_func": 0.9821428656578064, "step": 2992 }, { "completion_length": 257.120548248291, "epoch": 0.5019489500817301, "grad_norm": 0.5610497082441622, "kl": 0.25261688232421875, "learning_rate": 3.6962962962962965e-07, "loss": 0.0003, "reward": 1.680357240140438, "reward_std": 0.10859139636158943, "rewards/equation_reward_func": 0.7026785984635353, "rewards/format_reward_func": 0.977678582072258, "step": 2994 }, { "completion_length": 268.7410840988159, "epoch": 0.5022842533215978, "grad_norm": 0.5487386942065536, "kl": 0.42702484130859375, "learning_rate": 3.6987654320987653e-07, "loss": 0.0004, "reward": 1.7446429282426834, "reward_std": 0.08838834799826145, "rewards/equation_reward_func": 0.7580357454717159, "rewards/format_reward_func": 0.9866071492433548, "step": 2996 }, { "completion_length": 250.97322463989258, "epoch": 0.5026195565614653, "grad_norm": 0.3198474395173545, "kl": 0.5860786437988281, "learning_rate": 3.701234567901235e-07, "loss": 0.0006, "reward": 1.7339286506175995, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7383928876370192, "rewards/format_reward_func": 0.9955357164144516, "step": 2998 }, { "completion_length": 255.59822463989258, "epoch": 0.5029548598013328, "grad_norm": 0.26785004048529404, "kl": 1.1002006530761719, "learning_rate": 3.703703703703703e-07, "loss": 0.0011, "reward": 1.748214341700077, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7616071663796902, "rewards/format_reward_func": 0.9866071492433548, "step": 3000 }, { "completion_length": 261.35269260406494, "epoch": 0.5032901630412003, "grad_norm": 0.3310002273693556, "kl": 0.20697784423828125, "learning_rate": 3.7061728395061725e-07, "loss": 0.0002, "reward": 1.6982143595814705, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7116071823984385, "rewards/format_reward_func": 0.9866071492433548, "step": 3002 }, { "completion_length": 253.9732255935669, "epoch": 0.503625466281068, "grad_norm": 0.13514736768128646, "kl": 0.20084381103515625, "learning_rate": 3.708641975308642e-07, "loss": 0.0002, "reward": 1.7875000461935997, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.800892872735858, "rewards/format_reward_func": 0.9866071492433548, "step": 3004 }, { "completion_length": 259.071439743042, "epoch": 0.5039607695209355, "grad_norm": 0.3078015700685397, "kl": 0.37459564208984375, "learning_rate": 3.711111111111111e-07, "loss": 0.0004, "reward": 1.7839286252856255, "reward_std": 0.06313453428447247, "rewards/equation_reward_func": 0.7973214499652386, "rewards/format_reward_func": 0.9866071492433548, "step": 3006 }, { "completion_length": 263.93751335144043, "epoch": 0.504296072760803, "grad_norm": 0.36645424767648727, "kl": 0.47135162353515625, "learning_rate": 3.71358024691358e-07, "loss": 0.0005, "reward": 1.7571429163217545, "reward_std": 0.09091372787952423, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 0.9821428656578064, "step": 3008 }, { "completion_length": 255.758939743042, "epoch": 0.5046313760006707, "grad_norm": 0.7864030190498497, "kl": 1.5985565185546875, "learning_rate": 3.7160493827160496e-07, "loss": 0.0016, "reward": 1.7482143342494965, "reward_std": 0.07828682288527489, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9776785783469677, "step": 3010 }, { "completion_length": 248.2634038925171, "epoch": 0.5049666792405382, "grad_norm": 0.2667122406171907, "kl": 0.03884124755859375, "learning_rate": 3.7185185185185185e-07, "loss": 0.0, "reward": 1.7642857804894447, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7732143178582191, "rewards/format_reward_func": 0.9910714328289032, "step": 3012 }, { "completion_length": 267.8482303619385, "epoch": 0.5053019824804057, "grad_norm": 0.3173331245909394, "kl": 1.0220565795898438, "learning_rate": 3.7209876543209874e-07, "loss": 0.001, "reward": 1.7321429252624512, "reward_std": 0.09596449043601751, "rewards/equation_reward_func": 0.7589285932481289, "rewards/format_reward_func": 0.9732142984867096, "step": 3014 }, { "completion_length": 243.81697368621826, "epoch": 0.5056372857202732, "grad_norm": 0.34077550562632575, "kl": 0.040744781494140625, "learning_rate": 3.7234567901234563e-07, "loss": 0.0, "reward": 1.7589286267757416, "reward_std": 0.09848987217992544, "rewards/equation_reward_func": 0.7812500298023224, "rewards/format_reward_func": 0.977678582072258, "step": 3016 }, { "completion_length": 261.33929347991943, "epoch": 0.5059725889601409, "grad_norm": 0.15313021699348667, "kl": 0.43424224853515625, "learning_rate": 3.7259259259259257e-07, "loss": 0.0004, "reward": 1.716071479022503, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7294643260538578, "rewards/format_reward_func": 0.9866071492433548, "step": 3018 }, { "completion_length": 258.4509029388428, "epoch": 0.5063078922000084, "grad_norm": 0.5652791164362287, "kl": 0.9497528076171875, "learning_rate": 3.728395061728395e-07, "loss": 0.0009, "reward": 1.6750000938773155, "reward_std": 0.11616754159331322, "rewards/equation_reward_func": 0.6928571835160255, "rewards/format_reward_func": 0.9821428656578064, "step": 3020 }, { "completion_length": 267.790189743042, "epoch": 0.5066431954398759, "grad_norm": 0.2861186061735038, "kl": 0.532745361328125, "learning_rate": 3.730864197530864e-07, "loss": 0.0005, "reward": 1.7785714715719223, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7875000201165676, "rewards/format_reward_func": 0.9910714328289032, "step": 3022 }, { "completion_length": 266.2500162124634, "epoch": 0.5069784986797435, "grad_norm": 0.4827885536321817, "kl": 0.32605743408203125, "learning_rate": 3.7333333333333334e-07, "loss": 0.0003, "reward": 1.6928572058677673, "reward_std": 0.1010152529925108, "rewards/equation_reward_func": 0.7107143234461546, "rewards/format_reward_func": 0.9821428656578064, "step": 3024 }, { "completion_length": 256.75447368621826, "epoch": 0.5073138019196111, "grad_norm": 0.29762340493815626, "kl": 0.08817291259765625, "learning_rate": 3.7358024691358023e-07, "loss": 0.0001, "reward": 1.6732143610715866, "reward_std": 0.08838834706693888, "rewards/equation_reward_func": 0.6955357305705547, "rewards/format_reward_func": 0.977678582072258, "step": 3026 }, { "completion_length": 250.3794765472412, "epoch": 0.5076491051594786, "grad_norm": 0.29653367322563745, "kl": 0.04032135009765625, "learning_rate": 3.738271604938271e-07, "loss": 0.0, "reward": 1.717857226729393, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7267857417464256, "rewards/format_reward_func": 0.9910714328289032, "step": 3028 }, { "completion_length": 256.73661708831787, "epoch": 0.5079844083993461, "grad_norm": 0.19002438532993526, "kl": 0.0325164794921875, "learning_rate": 3.7407407407407406e-07, "loss": 0.0, "reward": 1.7125000953674316, "reward_std": 0.0328299580141902, "rewards/equation_reward_func": 0.716964315623045, "rewards/format_reward_func": 0.9955357164144516, "step": 3030 }, { "completion_length": 264.87947940826416, "epoch": 0.5083197116392137, "grad_norm": 0.47697001771334524, "kl": 0.1249542236328125, "learning_rate": 3.7432098765432095e-07, "loss": 0.0001, "reward": 1.739285796880722, "reward_std": 0.08586296532303095, "rewards/equation_reward_func": 0.748214315623045, "rewards/format_reward_func": 0.9910714328289032, "step": 3032 }, { "completion_length": 252.37501335144043, "epoch": 0.5086550148790813, "grad_norm": 0.19164757321243217, "kl": 0.16481781005859375, "learning_rate": 3.745679012345679e-07, "loss": 0.0002, "reward": 1.6839286610484123, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.6973214689642191, "rewards/format_reward_func": 0.9866071492433548, "step": 3034 }, { "completion_length": 242.11608409881592, "epoch": 0.5089903181189488, "grad_norm": 0.2965385453786233, "kl": 0.102325439453125, "learning_rate": 3.7481481481481483e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.769642885774374, "rewards/format_reward_func": 0.9910714328289032, "step": 3036 }, { "completion_length": 264.0134038925171, "epoch": 0.5093256213588164, "grad_norm": 0.26828654844694805, "kl": 0.50030517578125, "learning_rate": 3.750617283950617e-07, "loss": 0.0005, "reward": 1.7232143431901932, "reward_std": 0.0833375845104456, "rewards/equation_reward_func": 0.7633928880095482, "rewards/format_reward_func": 0.9598214477300644, "step": 3038 }, { "completion_length": 254.5625114440918, "epoch": 0.509660924598684, "grad_norm": 0.193990185560616, "kl": 0.0839996337890625, "learning_rate": 3.7530864197530866e-07, "loss": 0.0001, "reward": 1.739285759627819, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7482143323868513, "rewards/format_reward_func": 0.9910714328289032, "step": 3040 }, { "completion_length": 246.9419755935669, "epoch": 0.5099962278385515, "grad_norm": 0.19313521551747367, "kl": 0.03554534912109375, "learning_rate": 3.755555555555555e-07, "loss": 0.0, "reward": 1.7839286252856255, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7883928753435612, "rewards/format_reward_func": 0.9955357164144516, "step": 3042 }, { "completion_length": 255.01340675354004, "epoch": 0.510331531078419, "grad_norm": 0.7765180616026391, "kl": 0.51654052734375, "learning_rate": 3.7580246913580244e-07, "loss": 0.0005, "reward": 1.7482143342494965, "reward_std": 0.09343910962343216, "rewards/equation_reward_func": 0.7705357372760773, "rewards/format_reward_func": 0.977678582072258, "step": 3044 }, { "completion_length": 256.6339416503906, "epoch": 0.5106668343182866, "grad_norm": 1.1364308614205811, "kl": 0.6205825805664062, "learning_rate": 3.760493827160494e-07, "loss": 0.0006, "reward": 1.7375000715255737, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7508928962051868, "rewards/format_reward_func": 0.9866071492433548, "step": 3046 }, { "completion_length": 246.1830472946167, "epoch": 0.5110021375581542, "grad_norm": 0.2493496018800458, "kl": 0.158294677734375, "learning_rate": 3.7629629629629627e-07, "loss": 0.0002, "reward": 1.7750000432133675, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7839285992085934, "rewards/format_reward_func": 0.9910714328289032, "step": 3048 }, { "completion_length": 261.41072940826416, "epoch": 0.5113374407980217, "grad_norm": 0.19985846821705897, "kl": 0.2151336669921875, "learning_rate": 3.765432098765432e-07, "loss": 0.0002, "reward": 1.7446428835391998, "reward_std": 0.08838834706693888, "rewards/equation_reward_func": 0.766964316368103, "rewards/format_reward_func": 0.977678582072258, "step": 3050 }, { "completion_length": 257.71429920196533, "epoch": 0.5116727440378893, "grad_norm": 0.22633980489201774, "kl": 0.155548095703125, "learning_rate": 3.767901234567901e-07, "loss": 0.0002, "reward": 1.694642961025238, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7071428969502449, "rewards/format_reward_func": 0.9875000044703484, "step": 3052 }, { "completion_length": 259.6160840988159, "epoch": 0.5120080472777568, "grad_norm": 0.13005143693972465, "kl": 0.12020111083984375, "learning_rate": 3.7703703703703704e-07, "loss": 0.0001, "reward": 1.7982143387198448, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.8026785999536514, "rewards/format_reward_func": 0.9955357164144516, "step": 3054 }, { "completion_length": 252.43751049041748, "epoch": 0.5123433505176244, "grad_norm": 0.29216567176643266, "kl": 0.304046630859375, "learning_rate": 3.772839506172839e-07, "loss": 0.0003, "reward": 1.7129464820027351, "reward_std": 0.08270623860880733, "rewards/equation_reward_func": 0.7232143189758062, "rewards/format_reward_func": 0.9897321499884129, "step": 3056 }, { "completion_length": 251.09376525878906, "epoch": 0.5126786537574919, "grad_norm": 0.6033130029832788, "kl": 0.08933258056640625, "learning_rate": 3.775308641975308e-07, "loss": 0.0001, "reward": 1.7767857685685158, "reward_std": 0.09343910962343216, "rewards/equation_reward_func": 0.7901786081492901, "rewards/format_reward_func": 0.9866071492433548, "step": 3058 }, { "completion_length": 252.43751335144043, "epoch": 0.5130139569973595, "grad_norm": 0.20902483387252335, "kl": 0.0396728515625, "learning_rate": 3.7777777777777775e-07, "loss": 0.0, "reward": 1.7375000715255737, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419643178582191, "rewards/format_reward_func": 0.9955357164144516, "step": 3060 }, { "completion_length": 249.22769165039062, "epoch": 0.513349260237227, "grad_norm": 0.260245315584681, "kl": 0.18543243408203125, "learning_rate": 3.780246913580247e-07, "loss": 0.0002, "reward": 1.7446429207921028, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7491071820259094, "rewards/format_reward_func": 0.9955357164144516, "step": 3062 }, { "completion_length": 254.43751049041748, "epoch": 0.5136845634770946, "grad_norm": 0.26409884638490183, "kl": 0.037139892578125, "learning_rate": 3.782716049382716e-07, "loss": 0.0, "reward": 1.696428656578064, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.6964286081492901, "rewards/format_reward_func": 1.0, "step": 3064 }, { "completion_length": 260.54465103149414, "epoch": 0.5140198667169622, "grad_norm": 0.1428687655533802, "kl": 0.0744171142578125, "learning_rate": 3.785185185185185e-07, "loss": 0.0001, "reward": 1.7482143640518188, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7526786029338837, "rewards/format_reward_func": 0.9955357164144516, "step": 3066 }, { "completion_length": 246.53572368621826, "epoch": 0.5143551699568297, "grad_norm": 0.16185149934506265, "kl": 0.0672454833984375, "learning_rate": 3.787654320987654e-07, "loss": 0.0001, "reward": 1.7589286491274834, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7633928917348385, "rewards/format_reward_func": 0.9955357164144516, "step": 3068 }, { "completion_length": 258.9821557998657, "epoch": 0.5146904731966973, "grad_norm": 0.2306501331554964, "kl": 0.0682220458984375, "learning_rate": 3.790123456790123e-07, "loss": 0.0001, "reward": 1.7035715207457542, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7125000543892384, "rewards/format_reward_func": 0.9910714328289032, "step": 3070 }, { "completion_length": 264.87947845458984, "epoch": 0.5150257764365648, "grad_norm": 0.22456347881762204, "kl": 0.06907272338867188, "learning_rate": 3.7925925925925924e-07, "loss": 0.0001, "reward": 1.6892858147621155, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.6892857514321804, "rewards/format_reward_func": 1.0, "step": 3072 }, { "completion_length": 258.65179538726807, "epoch": 0.5153610796764324, "grad_norm": 0.35293272384908336, "kl": 0.2332611083984375, "learning_rate": 3.7950617283950613e-07, "loss": 0.0002, "reward": 1.739285796880722, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7482143044471741, "rewards/format_reward_func": 0.9910714328289032, "step": 3074 }, { "completion_length": 251.87054443359375, "epoch": 0.5156963829162999, "grad_norm": 0.1800318548596281, "kl": 0.1234283447265625, "learning_rate": 3.7975308641975307e-07, "loss": 0.0001, "reward": 1.7839286252856255, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7883928902447224, "rewards/format_reward_func": 0.9955357164144516, "step": 3076 }, { "completion_length": 246.4553689956665, "epoch": 0.5160316861561675, "grad_norm": 0.1740786371284468, "kl": 0.02825164794921875, "learning_rate": 3.7999999999999996e-07, "loss": 0.0, "reward": 1.7267857939004898, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7312500402331352, "rewards/format_reward_func": 0.9955357164144516, "step": 3078 }, { "completion_length": 258.22769260406494, "epoch": 0.5163669893960351, "grad_norm": 0.3160476219985606, "kl": 0.057590484619140625, "learning_rate": 3.802469135802469e-07, "loss": 0.0001, "reward": 1.7589286342263222, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7633928805589676, "rewards/format_reward_func": 0.9955357164144516, "step": 3080 }, { "completion_length": 252.28572750091553, "epoch": 0.5167022926359026, "grad_norm": 0.4762047675136293, "kl": 0.20147705078125, "learning_rate": 3.8049382716049384e-07, "loss": 0.0002, "reward": 1.751785770058632, "reward_std": 0.06818529684096575, "rewards/equation_reward_func": 0.7651785984635353, "rewards/format_reward_func": 0.9866071492433548, "step": 3082 }, { "completion_length": 257.0401906967163, "epoch": 0.5170375958757701, "grad_norm": 0.4295341728220505, "kl": 0.029018402099609375, "learning_rate": 3.8074074074074073e-07, "loss": 0.0, "reward": 1.7089286372065544, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7133928965777159, "rewards/format_reward_func": 0.9955357164144516, "step": 3084 }, { "completion_length": 258.2321529388428, "epoch": 0.5173728991156377, "grad_norm": 0.2085002371188389, "kl": 0.2578277587890625, "learning_rate": 3.809876543209876e-07, "loss": 0.0003, "reward": 1.7178572043776512, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7178571913391352, "rewards/format_reward_func": 1.0, "step": 3086 }, { "completion_length": 263.6384038925171, "epoch": 0.5177082023555053, "grad_norm": 0.564076640872581, "kl": 0.08950042724609375, "learning_rate": 3.8123456790123456e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 3088 }, { "completion_length": 252.54465675354004, "epoch": 0.5180435055953728, "grad_norm": 0.18742434123139506, "kl": 0.1312103271484375, "learning_rate": 3.8148148148148145e-07, "loss": 0.0001, "reward": 1.751785784959793, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7562500163912773, "rewards/format_reward_func": 0.9955357164144516, "step": 3090 }, { "completion_length": 250.01786708831787, "epoch": 0.5183788088352403, "grad_norm": 0.20853683279872454, "kl": 0.11835479736328125, "learning_rate": 3.817283950617284e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964285835623741, "rewards/format_reward_func": 1.0, "step": 3092 }, { "completion_length": 261.95983695983887, "epoch": 0.518714112075108, "grad_norm": 0.20069159944437373, "kl": 0.052577972412109375, "learning_rate": 3.819753086419753e-07, "loss": 0.0001, "reward": 1.7750000432133675, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.783928606659174, "rewards/format_reward_func": 0.9910714328289032, "step": 3094 }, { "completion_length": 248.60268878936768, "epoch": 0.5190494153149755, "grad_norm": 0.18308556647430432, "kl": 0.05411529541015625, "learning_rate": 3.822222222222222e-07, "loss": 0.0001, "reward": 1.7196429520845413, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7241071797907352, "rewards/format_reward_func": 0.9955357164144516, "step": 3096 }, { "completion_length": 252.9107255935669, "epoch": 0.519384718554843, "grad_norm": 0.23069855632685662, "kl": 0.0457916259765625, "learning_rate": 3.8246913580246916e-07, "loss": 0.0, "reward": 1.744642935693264, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7491071671247482, "rewards/format_reward_func": 0.9955357164144516, "step": 3098 }, { "completion_length": 250.46429920196533, "epoch": 0.5197200217947106, "grad_norm": 0.259615918588719, "kl": 0.05632781982421875, "learning_rate": 3.82716049382716e-07, "loss": 0.0001, "reward": 1.7589286342263222, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7633929066359997, "rewards/format_reward_func": 0.9955357164144516, "step": 3100 }, { "completion_length": 255.2142972946167, "epoch": 0.5200553250345782, "grad_norm": 0.2773907131668989, "kl": 0.36530303955078125, "learning_rate": 3.8296296296296294e-07, "loss": 0.0004, "reward": 1.758928656578064, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7633928693830967, "rewards/format_reward_func": 0.9955357164144516, "step": 3102 }, { "completion_length": 248.92412090301514, "epoch": 0.5203906282744457, "grad_norm": 0.37992031338955395, "kl": 0.17604446411132812, "learning_rate": 3.8320987654320983e-07, "loss": 0.0002, "reward": 1.7678572162985802, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7767857499420643, "rewards/format_reward_func": 0.9910714328289032, "step": 3104 }, { "completion_length": 258.3125114440918, "epoch": 0.5207259315143132, "grad_norm": 0.394132585956962, "kl": 0.433685302734375, "learning_rate": 3.8345679012345677e-07, "loss": 0.0004, "reward": 1.6821429505944252, "reward_std": 0.08586296532303095, "rewards/equation_reward_func": 0.6910714581608772, "rewards/format_reward_func": 0.9910714328289032, "step": 3106 }, { "completion_length": 264.12501430511475, "epoch": 0.5210612347541808, "grad_norm": 0.18801212145262286, "kl": 0.64044189453125, "learning_rate": 3.837037037037037e-07, "loss": 0.0006, "reward": 1.7093750983476639, "reward_std": 0.037249374436214566, "rewards/equation_reward_func": 0.7107143178582191, "rewards/format_reward_func": 0.9986607171595097, "step": 3108 }, { "completion_length": 251.20983409881592, "epoch": 0.5213965379940484, "grad_norm": 0.25513628192429766, "kl": 0.08748626708984375, "learning_rate": 3.839506172839506e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428906172514, "rewards/format_reward_func": 1.0, "step": 3110 }, { "completion_length": 255.2857265472412, "epoch": 0.5217318412339159, "grad_norm": 0.28046900188641055, "kl": 0.3914947509765625, "learning_rate": 3.8419753086419754e-07, "loss": 0.0004, "reward": 1.7464286386966705, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 3112 }, { "completion_length": 256.214298248291, "epoch": 0.5220671444737834, "grad_norm": 0.22531422784150337, "kl": 0.24646377563476562, "learning_rate": 3.8444444444444443e-07, "loss": 0.0002, "reward": 1.796428605914116, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964286170899868, "rewards/format_reward_func": 1.0, "step": 3114 }, { "completion_length": 257.6250104904175, "epoch": 0.5224024477136511, "grad_norm": 0.2730117358782168, "kl": 0.4022064208984375, "learning_rate": 3.846913580246913e-07, "loss": 0.0004, "reward": 1.7392857745289803, "reward_std": 0.08586296532303095, "rewards/equation_reward_func": 0.7482143193483353, "rewards/format_reward_func": 0.9910714328289032, "step": 3116 }, { "completion_length": 250.9687623977661, "epoch": 0.5227377509535186, "grad_norm": 0.6576136637774147, "kl": 0.7910614013671875, "learning_rate": 3.8493827160493826e-07, "loss": 0.0008, "reward": 1.7821428924798965, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 3118 }, { "completion_length": 260.790189743042, "epoch": 0.5230730541933861, "grad_norm": 0.24315733472477172, "kl": 0.04315185546875, "learning_rate": 3.8518518518518515e-07, "loss": 0.0, "reward": 1.7857143431901932, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7946428805589676, "rewards/format_reward_func": 0.9910714328289032, "step": 3120 }, { "completion_length": 264.5000114440918, "epoch": 0.5234083574332536, "grad_norm": 0.22778397283566784, "kl": 0.7026290893554688, "learning_rate": 3.854320987654321e-07, "loss": 0.0007, "reward": 1.823214329779148, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.8276785910129547, "rewards/format_reward_func": 0.9955357164144516, "step": 3122 }, { "completion_length": 262.43751335144043, "epoch": 0.5237436606731213, "grad_norm": 0.21985933881844663, "kl": 0.5026016235351562, "learning_rate": 3.8567901234567903e-07, "loss": 0.0005, "reward": 1.742857202887535, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 3124 }, { "completion_length": 249.99108219146729, "epoch": 0.5240789639129888, "grad_norm": 0.33885053896790346, "kl": 0.574188232421875, "learning_rate": 3.859259259259259e-07, "loss": 0.0006, "reward": 1.7892857789993286, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857566475868, "rewards/format_reward_func": 1.0, "step": 3126 }, { "completion_length": 257.49108505249023, "epoch": 0.5244142671528563, "grad_norm": 0.2792327924735609, "kl": 0.32171630859375, "learning_rate": 3.861728395061728e-07, "loss": 0.0003, "reward": 1.750000074505806, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 3128 }, { "completion_length": 263.89733505249023, "epoch": 0.524749570392724, "grad_norm": 0.5251332741259391, "kl": 0.3920097351074219, "learning_rate": 3.864197530864197e-07, "loss": 0.0004, "reward": 1.7375000715255737, "reward_std": 0.08838834706693888, "rewards/equation_reward_func": 0.750892898067832, "rewards/format_reward_func": 0.9866071492433548, "step": 3130 }, { "completion_length": 258.620548248291, "epoch": 0.5250848736325915, "grad_norm": 0.17825779621215138, "kl": 0.44925689697265625, "learning_rate": 3.8666666666666664e-07, "loss": 0.0004, "reward": 1.7982143387198448, "reward_std": 0.07323605939745903, "rewards/equation_reward_func": 0.8116071708500385, "rewards/format_reward_func": 0.9866071492433548, "step": 3132 }, { "completion_length": 259.758939743042, "epoch": 0.525420176872459, "grad_norm": 0.22491108314728606, "kl": 0.089263916015625, "learning_rate": 3.869135802469136e-07, "loss": 0.0001, "reward": 1.7517857924103737, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7562500238418579, "rewards/format_reward_func": 0.9955357164144516, "step": 3134 }, { "completion_length": 265.76787090301514, "epoch": 0.5257554801123265, "grad_norm": 0.25530360444858247, "kl": 0.16037750244140625, "learning_rate": 3.8716049382716046e-07, "loss": 0.0002, "reward": 1.7232143506407738, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7276786155998707, "rewards/format_reward_func": 0.9955357164144516, "step": 3136 }, { "completion_length": 266.25894260406494, "epoch": 0.5260907833521942, "grad_norm": 0.21909712127541298, "kl": 0.12729644775390625, "learning_rate": 3.874074074074074e-07, "loss": 0.0001, "reward": 1.7250000983476639, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7250000238418579, "rewards/format_reward_func": 1.0, "step": 3138 }, { "completion_length": 254.43304538726807, "epoch": 0.5264260865920617, "grad_norm": 0.36150859827321014, "kl": 0.08214187622070312, "learning_rate": 3.8765432098765435e-07, "loss": 0.0001, "reward": 1.7232143506407738, "reward_std": 0.07828682195395231, "rewards/equation_reward_func": 0.7366071734577417, "rewards/format_reward_func": 0.9866071492433548, "step": 3140 }, { "completion_length": 264.1160831451416, "epoch": 0.5267613898319292, "grad_norm": 0.2555141927292338, "kl": 0.055820465087890625, "learning_rate": 3.879012345679012e-07, "loss": 0.0001, "reward": 1.698214367032051, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7116071879863739, "rewards/format_reward_func": 0.9866071492433548, "step": 3142 }, { "completion_length": 258.9196548461914, "epoch": 0.5270966930717969, "grad_norm": 0.1265249629832738, "kl": 0.09015274047851562, "learning_rate": 3.881481481481481e-07, "loss": 0.0001, "reward": 1.8392857387661934, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8392857313156128, "rewards/format_reward_func": 1.0, "step": 3144 }, { "completion_length": 262.70983600616455, "epoch": 0.5274319963116644, "grad_norm": 0.2757642721961975, "kl": 0.04498291015625, "learning_rate": 3.88395061728395e-07, "loss": 0.0, "reward": 1.7678572237491608, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571753203869, "rewards/format_reward_func": 1.0, "step": 3146 }, { "completion_length": 265.4866199493408, "epoch": 0.5277672995515319, "grad_norm": 0.21547518845810912, "kl": 0.05883026123046875, "learning_rate": 3.8864197530864195e-07, "loss": 0.0001, "reward": 1.6821429505944252, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.6821428947150707, "rewards/format_reward_func": 1.0, "step": 3148 }, { "completion_length": 262.4687623977661, "epoch": 0.5281026027913994, "grad_norm": 0.11776663498490876, "kl": 0.15918731689453125, "learning_rate": 3.888888888888889e-07, "loss": 0.0002, "reward": 1.8321428894996643, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8321428801864386, "rewards/format_reward_func": 1.0, "step": 3150 }, { "completion_length": 259.6116247177124, "epoch": 0.5284379060312671, "grad_norm": 0.35745340475596804, "kl": 0.0491790771484375, "learning_rate": 3.891358024691358e-07, "loss": 0.0, "reward": 1.7142857983708382, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7321428880095482, "rewards/format_reward_func": 0.9821428656578064, "step": 3152 }, { "completion_length": 262.5401906967163, "epoch": 0.5287732092711346, "grad_norm": 0.21916166918525395, "kl": 0.047977447509765625, "learning_rate": 3.893827160493827e-07, "loss": 0.0, "reward": 1.7660714760422707, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7705357521772385, "rewards/format_reward_func": 0.9955357164144516, "step": 3154 }, { "completion_length": 264.5000123977661, "epoch": 0.5291085125110021, "grad_norm": 0.30194854312892355, "kl": 0.06650543212890625, "learning_rate": 3.8962962962962956e-07, "loss": 0.0001, "reward": 1.7303572297096252, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7348214648663998, "rewards/format_reward_func": 0.9955357164144516, "step": 3156 }, { "completion_length": 260.9910821914673, "epoch": 0.5294438157508697, "grad_norm": 0.08596848806407578, "kl": 0.05260467529296875, "learning_rate": 3.898765432098765e-07, "loss": 0.0001, "reward": 1.8125000670552254, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8169643096625805, "rewards/format_reward_func": 0.9955357164144516, "step": 3158 }, { "completion_length": 259.5000104904175, "epoch": 0.5297791189907373, "grad_norm": 0.16451427183843081, "kl": 0.037384033203125, "learning_rate": 3.9012345679012344e-07, "loss": 0.0, "reward": 1.791071467101574, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7955357432365417, "rewards/format_reward_func": 0.9955357164144516, "step": 3160 }, { "completion_length": 267.2187614440918, "epoch": 0.5301144222306048, "grad_norm": 0.2709270291715736, "kl": 0.031284332275390625, "learning_rate": 3.9037037037037033e-07, "loss": 0.0, "reward": 1.8000000566244125, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 3162 }, { "completion_length": 271.5491180419922, "epoch": 0.5304497254704723, "grad_norm": 0.3913483799722536, "kl": 0.08670806884765625, "learning_rate": 3.9061728395061727e-07, "loss": 0.0001, "reward": 1.7875000536441803, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7919643223285675, "rewards/format_reward_func": 0.9955357164144516, "step": 3164 }, { "completion_length": 257.41965675354004, "epoch": 0.53078502871034, "grad_norm": 0.17074821649440147, "kl": 0.03655242919921875, "learning_rate": 3.908641975308642e-07, "loss": 0.0, "reward": 1.7785714864730835, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 3166 }, { "completion_length": 268.9821538925171, "epoch": 0.5311203319502075, "grad_norm": 0.19454170410731372, "kl": 0.0432891845703125, "learning_rate": 3.911111111111111e-07, "loss": 0.0, "reward": 1.6910715103149414, "reward_std": 0.0732360603287816, "rewards/equation_reward_func": 0.7044643256813288, "rewards/format_reward_func": 0.9866071492433548, "step": 3168 }, { "completion_length": 259.4464406967163, "epoch": 0.531455635190075, "grad_norm": 0.2802742714672685, "kl": 0.03183746337890625, "learning_rate": 3.91358024691358e-07, "loss": 0.0, "reward": 1.7357143685221672, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7357143312692642, "rewards/format_reward_func": 1.0, "step": 3170 }, { "completion_length": 255.79465579986572, "epoch": 0.5317909384299426, "grad_norm": 0.24431833873124947, "kl": 0.07769012451171875, "learning_rate": 3.916049382716049e-07, "loss": 0.0001, "reward": 1.7821429073810577, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7821428943425417, "rewards/format_reward_func": 1.0, "step": 3172 }, { "completion_length": 261.43305015563965, "epoch": 0.5321262416698102, "grad_norm": 0.2838905343856962, "kl": 0.0420074462890625, "learning_rate": 3.918518518518518e-07, "loss": 0.0, "reward": 1.7053572162985802, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7098214700818062, "rewards/format_reward_func": 0.9955357164144516, "step": 3174 }, { "completion_length": 261.9464406967163, "epoch": 0.5324615449096777, "grad_norm": 0.383031704099419, "kl": 0.052783966064453125, "learning_rate": 3.9209876543209876e-07, "loss": 0.0001, "reward": 1.7660715132951736, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9955357164144516, "step": 3176 }, { "completion_length": 253.23661994934082, "epoch": 0.5327968481495452, "grad_norm": 0.3703123406378154, "kl": 0.0786895751953125, "learning_rate": 3.9234567901234565e-07, "loss": 0.0001, "reward": 1.773214340209961, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7776786014437675, "rewards/format_reward_func": 0.9955357164144516, "step": 3178 }, { "completion_length": 267.46876525878906, "epoch": 0.5331321513894128, "grad_norm": 0.2207114618581663, "kl": 0.0382232666015625, "learning_rate": 3.925925925925926e-07, "loss": 0.0, "reward": 1.725000061094761, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7339286021888256, "rewards/format_reward_func": 0.9910714328289032, "step": 3180 }, { "completion_length": 260.8259086608887, "epoch": 0.5334674546292804, "grad_norm": 0.3279488891762705, "kl": 0.03820037841796875, "learning_rate": 3.9283950617283953e-07, "loss": 0.0, "reward": 1.7464286386966705, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 3182 }, { "completion_length": 253.90625858306885, "epoch": 0.5338027578691479, "grad_norm": 0.2417637339334101, "kl": 0.031524658203125, "learning_rate": 3.930864197530864e-07, "loss": 0.0, "reward": 1.7375000789761543, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7419643253087997, "rewards/format_reward_func": 0.9955357164144516, "step": 3184 }, { "completion_length": 255.61608600616455, "epoch": 0.5341380611090155, "grad_norm": 0.20489918160535506, "kl": 0.056060791015625, "learning_rate": 3.933333333333333e-07, "loss": 0.0001, "reward": 1.7875000461935997, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7919643074274063, "rewards/format_reward_func": 0.9955357164144516, "step": 3186 }, { "completion_length": 257.433048248291, "epoch": 0.534473364348883, "grad_norm": 0.25187838214239483, "kl": 0.026729583740234375, "learning_rate": 3.935802469135802e-07, "loss": 0.0, "reward": 1.7714286595582962, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7803571820259094, "rewards/format_reward_func": 0.9910714328289032, "step": 3188 }, { "completion_length": 269.12501335144043, "epoch": 0.5348086675887506, "grad_norm": 0.28126525645205525, "kl": 0.05770111083984375, "learning_rate": 3.9382716049382714e-07, "loss": 0.0001, "reward": 1.6446429342031479, "reward_std": 0.10859139915555716, "rewards/equation_reward_func": 0.666964327916503, "rewards/format_reward_func": 0.977678582072258, "step": 3190 }, { "completion_length": 264.58929920196533, "epoch": 0.5351439708286181, "grad_norm": 0.21792582285765727, "kl": 0.02896881103515625, "learning_rate": 3.940740740740741e-07, "loss": 0.0, "reward": 1.7214286401867867, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7214286141097546, "rewards/format_reward_func": 1.0, "step": 3192 }, { "completion_length": 261.4419775009155, "epoch": 0.5354792740684857, "grad_norm": 0.26582125873677714, "kl": 0.1292572021484375, "learning_rate": 3.9432098765432097e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7607143297791481, "rewards/format_reward_func": 1.0, "step": 3194 }, { "completion_length": 254.1830472946167, "epoch": 0.5358145773083532, "grad_norm": 0.12337232155198748, "kl": 0.09566116333007812, "learning_rate": 3.945679012345679e-07, "loss": 0.0001, "reward": 1.7767857611179352, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7812500167638063, "rewards/format_reward_func": 0.9955357164144516, "step": 3196 }, { "completion_length": 246.31250858306885, "epoch": 0.5361498805482208, "grad_norm": 0.13809997347069525, "kl": 0.02978515625, "learning_rate": 3.948148148148148e-07, "loss": 0.0, "reward": 1.7107143551111221, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7107143253087997, "rewards/format_reward_func": 1.0, "step": 3198 }, { "completion_length": 252.37055015563965, "epoch": 0.5364851837880884, "grad_norm": 0.2090765586833693, "kl": 0.037384033203125, "learning_rate": 3.950617283950617e-07, "loss": 0.0, "reward": 1.7642858028411865, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 3200 }, { "completion_length": 256.308048248291, "epoch": 0.5368204870279559, "grad_norm": 0.2867788682863003, "kl": 0.027191162109375, "learning_rate": 3.9530864197530863e-07, "loss": 0.0, "reward": 1.7428572326898575, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7428571619093418, "rewards/format_reward_func": 1.0, "step": 3202 }, { "completion_length": 242.46876049041748, "epoch": 0.5371557902678235, "grad_norm": 0.19248161610849127, "kl": 0.0301055908203125, "learning_rate": 3.955555555555555e-07, "loss": 0.0, "reward": 1.7857143431901932, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 3204 }, { "completion_length": 253.46876049041748, "epoch": 0.537491093507691, "grad_norm": 0.3401432023399993, "kl": 0.047245025634765625, "learning_rate": 3.9580246913580246e-07, "loss": 0.0, "reward": 1.7553572207689285, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7598214633762836, "rewards/format_reward_func": 0.9955357164144516, "step": 3206 }, { "completion_length": 251.1205472946167, "epoch": 0.5378263967475586, "grad_norm": 0.3760622304921827, "kl": 0.032135009765625, "learning_rate": 3.960493827160494e-07, "loss": 0.0, "reward": 1.7767857611179352, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.7812500353902578, "rewards/format_reward_func": 0.9955357164144516, "step": 3208 }, { "completion_length": 252.9330472946167, "epoch": 0.5381616999874261, "grad_norm": 0.3604519919978038, "kl": 0.10263824462890625, "learning_rate": 3.962962962962963e-07, "loss": 0.0001, "reward": 1.7071429565548897, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.707142885774374, "rewards/format_reward_func": 1.0, "step": 3210 }, { "completion_length": 251.75893878936768, "epoch": 0.5384970032272937, "grad_norm": 0.22457165395344272, "kl": 0.15355682373046875, "learning_rate": 3.9654320987654323e-07, "loss": 0.0002, "reward": 1.7464286237955093, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.746428620070219, "rewards/format_reward_func": 1.0, "step": 3212 }, { "completion_length": 256.7812623977661, "epoch": 0.5388323064671613, "grad_norm": 0.21129731942518568, "kl": 0.11245346069335938, "learning_rate": 3.9679012345679006e-07, "loss": 0.0001, "reward": 1.762500062584877, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643200933933, "rewards/format_reward_func": 0.9955357164144516, "step": 3214 }, { "completion_length": 246.5000114440918, "epoch": 0.5391676097070288, "grad_norm": 0.18051497905212877, "kl": 0.03157806396484375, "learning_rate": 3.97037037037037e-07, "loss": 0.0, "reward": 1.7625000849366188, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643200933933, "rewards/format_reward_func": 0.9955357164144516, "step": 3216 }, { "completion_length": 245.3794765472412, "epoch": 0.5395029129468963, "grad_norm": 0.9203589714968138, "kl": 0.261016845703125, "learning_rate": 3.9728395061728395e-07, "loss": 0.0003, "reward": 1.6696429401636124, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.6741071753203869, "rewards/format_reward_func": 0.9955357164144516, "step": 3218 }, { "completion_length": 250.7232255935669, "epoch": 0.5398382161867639, "grad_norm": 0.3017788178994304, "kl": 0.03530120849609375, "learning_rate": 3.9753086419753083e-07, "loss": 0.0, "reward": 1.677678644657135, "reward_std": 0.07197336759418249, "rewards/equation_reward_func": 0.683928593993187, "rewards/format_reward_func": 0.9937500059604645, "step": 3220 }, { "completion_length": 249.57144165039062, "epoch": 0.5401735194266315, "grad_norm": 0.5284210803368328, "kl": 0.11692047119140625, "learning_rate": 3.977777777777778e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143223285675, "rewards/format_reward_func": 1.0, "step": 3222 }, { "completion_length": 246.1919765472412, "epoch": 0.540508822666499, "grad_norm": 0.13747975941449442, "kl": 0.24882888793945312, "learning_rate": 3.9802469135802466e-07, "loss": 0.0002, "reward": 1.8142857626080513, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857477068901, "rewards/format_reward_func": 1.0, "step": 3224 }, { "completion_length": 255.8080472946167, "epoch": 0.5408441259063665, "grad_norm": 0.2599070366859934, "kl": 0.08871078491210938, "learning_rate": 3.982716049382716e-07, "loss": 0.0001, "reward": 1.7803572043776512, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7848214544355869, "rewards/format_reward_func": 0.9955357164144516, "step": 3226 }, { "completion_length": 250.17411613464355, "epoch": 0.5411794291462341, "grad_norm": 0.5749702489024938, "kl": 0.1636505126953125, "learning_rate": 3.985185185185185e-07, "loss": 0.0002, "reward": 1.7000000923871994, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7000000309199095, "rewards/format_reward_func": 1.0, "step": 3228 }, { "completion_length": 248.71429347991943, "epoch": 0.5415147323861017, "grad_norm": 0.4718805170337936, "kl": 0.0569000244140625, "learning_rate": 3.987654320987654e-07, "loss": 0.0001, "reward": 1.6589286774396896, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.6633928883820772, "rewards/format_reward_func": 0.9955357164144516, "step": 3230 }, { "completion_length": 251.4196538925171, "epoch": 0.5418500356259692, "grad_norm": 0.36286334724653574, "kl": 0.04682159423828125, "learning_rate": 3.990123456790123e-07, "loss": 0.0, "reward": 1.796428620815277, "reward_std": 0.0858629634603858, "rewards/equation_reward_func": 0.7964286133646965, "rewards/format_reward_func": 1.0, "step": 3232 }, { "completion_length": 253.1026906967163, "epoch": 0.5421853388658368, "grad_norm": 0.24718759183417172, "kl": 0.08046340942382812, "learning_rate": 3.9925925925925926e-07, "loss": 0.0001, "reward": 1.7875000461935997, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7919643297791481, "rewards/format_reward_func": 0.9955357164144516, "step": 3234 }, { "completion_length": 253.43750953674316, "epoch": 0.5425206421057044, "grad_norm": 0.14230902666933232, "kl": 0.028270721435546875, "learning_rate": 3.9950617283950615e-07, "loss": 0.0, "reward": 1.7857143506407738, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 3236 }, { "completion_length": 254.06697940826416, "epoch": 0.5428559453455719, "grad_norm": 0.6630361438819221, "kl": 0.07397842407226562, "learning_rate": 3.997530864197531e-07, "loss": 0.0001, "reward": 1.7107143551111221, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7196429036557674, "rewards/format_reward_func": 0.9910714328289032, "step": 3238 }, { "completion_length": 243.4062623977661, "epoch": 0.5431912485854394, "grad_norm": 0.15745796244051208, "kl": 0.023761749267578125, "learning_rate": 4e-07, "loss": 0.0, "reward": 1.8089286088943481, "reward_std": 0.017677669413387775, "rewards/equation_reward_func": 0.8133928924798965, "rewards/format_reward_func": 0.9955357164144516, "step": 3240 }, { "completion_length": 247.17411994934082, "epoch": 0.543526551825307, "grad_norm": 0.11801541444411404, "kl": 0.045162200927734375, "learning_rate": 4.0024691358024687e-07, "loss": 0.0, "reward": 1.667857214808464, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.6678571905940771, "rewards/format_reward_func": 1.0, "step": 3242 }, { "completion_length": 246.5937614440918, "epoch": 0.5438618550651746, "grad_norm": 0.24052847857381288, "kl": 0.038970947265625, "learning_rate": 4.004938271604938e-07, "loss": 0.0, "reward": 1.7142858058214188, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7142857480794191, "rewards/format_reward_func": 1.0, "step": 3244 }, { "completion_length": 246.37054824829102, "epoch": 0.5441971583050421, "grad_norm": 0.20182668701059084, "kl": 0.0252532958984375, "learning_rate": 4.007407407407407e-07, "loss": 0.0, "reward": 1.791071467101574, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.795535746961832, "rewards/format_reward_func": 0.9955357164144516, "step": 3246 }, { "completion_length": 248.64733505249023, "epoch": 0.5445324615449096, "grad_norm": 0.25054728863777276, "kl": 0.023464202880859375, "learning_rate": 4.0098765432098764e-07, "loss": 0.0, "reward": 1.7571429312229156, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7660714536905289, "rewards/format_reward_func": 0.9910714328289032, "step": 3248 }, { "completion_length": 252.946439743042, "epoch": 0.5448677647847773, "grad_norm": 0.2787722432356983, "kl": 0.05184173583984375, "learning_rate": 4.0123456790123453e-07, "loss": 0.0001, "reward": 1.7125001102685928, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7169643118977547, "rewards/format_reward_func": 0.9955357164144516, "step": 3250 }, { "completion_length": 242.03572463989258, "epoch": 0.5452030680246448, "grad_norm": 0.18001654363123426, "kl": 0.0496063232421875, "learning_rate": 4.0148148148148147e-07, "loss": 0.0, "reward": 1.8392857611179352, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8392857238650322, "rewards/format_reward_func": 1.0, "step": 3252 }, { "completion_length": 244.8839406967163, "epoch": 0.5455383712645123, "grad_norm": 0.3107235931047464, "kl": 0.032939910888671875, "learning_rate": 4.017283950617284e-07, "loss": 0.0, "reward": 1.7642857879400253, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 3254 }, { "completion_length": 254.87054443359375, "epoch": 0.5458736745043798, "grad_norm": 0.5024745088879808, "kl": 0.0463104248046875, "learning_rate": 4.0197530864197525e-07, "loss": 0.0, "reward": 1.7660714983940125, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7705357447266579, "rewards/format_reward_func": 0.9955357164144516, "step": 3256 }, { "completion_length": 243.50447368621826, "epoch": 0.5462089777442475, "grad_norm": 0.19769072115630126, "kl": 0.028018951416015625, "learning_rate": 4.022222222222222e-07, "loss": 0.0, "reward": 1.782142922282219, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 3258 }, { "completion_length": 252.5134038925171, "epoch": 0.546544280984115, "grad_norm": 0.25804087534525366, "kl": 0.037353515625, "learning_rate": 4.0246913580246913e-07, "loss": 0.0, "reward": 1.7875000685453415, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7919643111526966, "rewards/format_reward_func": 0.9955357164144516, "step": 3260 }, { "completion_length": 249.3125114440918, "epoch": 0.5468795842239825, "grad_norm": 0.3788881858860527, "kl": 0.036830902099609375, "learning_rate": 4.02716049382716e-07, "loss": 0.0, "reward": 1.7178572192788124, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.717857176437974, "rewards/format_reward_func": 1.0, "step": 3262 }, { "completion_length": 249.19643878936768, "epoch": 0.5472148874638502, "grad_norm": 0.15403221802227582, "kl": 0.03723907470703125, "learning_rate": 4.0296296296296296e-07, "loss": 0.0, "reward": 1.757142923772335, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 3264 }, { "completion_length": 242.4509048461914, "epoch": 0.5475501907037177, "grad_norm": 0.4151835339606924, "kl": 0.03864288330078125, "learning_rate": 4.0320987654320985e-07, "loss": 0.0, "reward": 1.7982143461704254, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.8026785999536514, "rewards/format_reward_func": 0.9955357164144516, "step": 3266 }, { "completion_length": 257.71876335144043, "epoch": 0.5478854939435852, "grad_norm": 0.28616567137191395, "kl": 0.03456878662109375, "learning_rate": 4.034567901234568e-07, "loss": 0.0, "reward": 1.6892858296632767, "reward_std": 0.08586296439170837, "rewards/equation_reward_func": 0.698214303702116, "rewards/format_reward_func": 0.9910714328289032, "step": 3268 }, { "completion_length": 247.76786613464355, "epoch": 0.5482207971834527, "grad_norm": 0.33138120271755234, "kl": 0.027130126953125, "learning_rate": 4.0370370370370373e-07, "loss": 0.0, "reward": 1.807142898440361, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8071428835391998, "rewards/format_reward_func": 1.0, "step": 3270 }, { "completion_length": 253.633939743042, "epoch": 0.5485561004233204, "grad_norm": 0.2025119387803373, "kl": 0.07606124877929688, "learning_rate": 4.0395061728395057e-07, "loss": 0.0001, "reward": 1.762500062584877, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7669643126428127, "rewards/format_reward_func": 0.9955357164144516, "step": 3272 }, { "completion_length": 249.10269165039062, "epoch": 0.5488914036631879, "grad_norm": 0.3509880306028421, "kl": 0.06534576416015625, "learning_rate": 4.041975308641975e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143260538578, "rewards/format_reward_func": 1.0, "step": 3274 }, { "completion_length": 248.8884048461914, "epoch": 0.5492267069030554, "grad_norm": 0.3225348638068555, "kl": 0.06076812744140625, "learning_rate": 4.044444444444444e-07, "loss": 0.0001, "reward": 1.767857201397419, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.767857164144516, "rewards/format_reward_func": 1.0, "step": 3276 }, { "completion_length": 246.52679347991943, "epoch": 0.549562010142923, "grad_norm": 0.1652321697093135, "kl": 0.045623779296875, "learning_rate": 4.0469135802469134e-07, "loss": 0.0, "reward": 1.7678572237491608, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7678571790456772, "rewards/format_reward_func": 1.0, "step": 3278 }, { "completion_length": 248.196439743042, "epoch": 0.5498973133827906, "grad_norm": 0.27022884678968045, "kl": 0.03060150146484375, "learning_rate": 4.049382716049383e-07, "loss": 0.0, "reward": 1.8160714954137802, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.8205357268452644, "rewards/format_reward_func": 0.9955357164144516, "step": 3280 }, { "completion_length": 257.5267972946167, "epoch": 0.5502326166226581, "grad_norm": 0.4939771906485696, "kl": 0.14651107788085938, "learning_rate": 4.0518518518518517e-07, "loss": 0.0001, "reward": 1.7410714849829674, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7544643189758062, "rewards/format_reward_func": 0.9866071492433548, "step": 3282 }, { "completion_length": 248.93750858306885, "epoch": 0.5505679198625256, "grad_norm": 0.3348022744898028, "kl": 0.032135009765625, "learning_rate": 4.054320987654321e-07, "loss": 0.0, "reward": 1.7125000655651093, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.716964315623045, "rewards/format_reward_func": 0.9955357164144516, "step": 3284 }, { "completion_length": 253.27679538726807, "epoch": 0.5509032231023933, "grad_norm": 0.4286314695204813, "kl": 0.06396102905273438, "learning_rate": 4.05679012345679e-07, "loss": 0.0001, "reward": 1.7125000804662704, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7169643193483353, "rewards/format_reward_func": 0.9955357164144516, "step": 3286 }, { "completion_length": 244.12947750091553, "epoch": 0.5512385263422608, "grad_norm": 0.23906097739214408, "kl": 0.027721405029296875, "learning_rate": 4.059259259259259e-07, "loss": 0.0, "reward": 1.7767857983708382, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7812500298023224, "rewards/format_reward_func": 0.9955357164144516, "step": 3288 }, { "completion_length": 248.7991189956665, "epoch": 0.5515738295821283, "grad_norm": 0.3127992260300639, "kl": 0.031581878662109375, "learning_rate": 4.061728395061728e-07, "loss": 0.0, "reward": 1.7071429342031479, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7071428820490837, "rewards/format_reward_func": 1.0, "step": 3290 }, { "completion_length": 249.4330472946167, "epoch": 0.5519091328219959, "grad_norm": 0.20008099014907435, "kl": 0.0303955078125, "learning_rate": 4.064197530864197e-07, "loss": 0.0, "reward": 1.748214341700077, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.7526785936206579, "rewards/format_reward_func": 0.9955357164144516, "step": 3292 }, { "completion_length": 256.7366180419922, "epoch": 0.5522444360618635, "grad_norm": 0.2468886215141364, "kl": 0.029834747314453125, "learning_rate": 4.0666666666666666e-07, "loss": 0.0, "reward": 1.7267858013510704, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.731250025331974, "rewards/format_reward_func": 0.9955357164144516, "step": 3294 }, { "completion_length": 255.7053680419922, "epoch": 0.552579739301731, "grad_norm": 0.34286991600304806, "kl": 0.032001495361328125, "learning_rate": 4.069135802469136e-07, "loss": 0.0, "reward": 1.792857177555561, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 3296 }, { "completion_length": 246.50001049041748, "epoch": 0.5529150425415985, "grad_norm": 0.24654089324264858, "kl": 0.024234771728515625, "learning_rate": 4.071604938271605e-07, "loss": 0.0, "reward": 1.7500000819563866, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 3298 }, { "completion_length": 256.11162090301514, "epoch": 0.5532503457814661, "grad_norm": 0.28781653670523644, "kl": 0.028278350830078125, "learning_rate": 4.0740740740740737e-07, "loss": 0.0, "reward": 1.7642857655882835, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7732143141329288, "rewards/format_reward_func": 0.9910714328289032, "step": 3300 }, { "completion_length": 249.9642972946167, "epoch": 0.5535856490213337, "grad_norm": 0.3905942900862037, "kl": 0.026691436767578125, "learning_rate": 4.0765432098765426e-07, "loss": 0.0, "reward": 1.778571493923664, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 3302 }, { "completion_length": 241.17858219146729, "epoch": 0.5539209522612012, "grad_norm": 0.29323430298274383, "kl": 0.028812408447265625, "learning_rate": 4.079012345679012e-07, "loss": 0.0, "reward": 1.814285770058632, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857477068901, "rewards/format_reward_func": 1.0, "step": 3304 }, { "completion_length": 242.20983219146729, "epoch": 0.5542562555010688, "grad_norm": 0.15078015109364112, "kl": 0.033367156982421875, "learning_rate": 4.0814814814814814e-07, "loss": 0.0, "reward": 1.8178571835160255, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8178571723401546, "rewards/format_reward_func": 1.0, "step": 3306 }, { "completion_length": 265.4196557998657, "epoch": 0.5545915587409364, "grad_norm": 0.3085516247377834, "kl": 0.03081512451171875, "learning_rate": 4.0839506172839503e-07, "loss": 0.0, "reward": 1.7428572103381157, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 3308 }, { "completion_length": 247.75447463989258, "epoch": 0.5549268619808039, "grad_norm": 0.3167820411329688, "kl": 0.0263214111328125, "learning_rate": 4.08641975308642e-07, "loss": 0.0, "reward": 1.735714390873909, "reward_std": 0.09091372787952423, "rewards/equation_reward_func": 0.7446428947150707, "rewards/format_reward_func": 0.9910714328289032, "step": 3310 }, { "completion_length": 253.93304538726807, "epoch": 0.5552621652206714, "grad_norm": 0.26818302974849845, "kl": 0.02646636962890625, "learning_rate": 4.088888888888889e-07, "loss": 0.0, "reward": 1.7250000685453415, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7250000443309546, "rewards/format_reward_func": 1.0, "step": 3312 }, { "completion_length": 244.9107255935669, "epoch": 0.555597468460539, "grad_norm": 0.18918188197499258, "kl": 0.030887603759765625, "learning_rate": 4.0913580246913575e-07, "loss": 0.0, "reward": 1.7750000655651093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 3314 }, { "completion_length": 253.24108123779297, "epoch": 0.5559327717004066, "grad_norm": 0.1587575101234306, "kl": 0.03311920166015625, "learning_rate": 4.093827160493827e-07, "loss": 0.0, "reward": 1.7482143267989159, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7526785954833031, "rewards/format_reward_func": 0.9955357164144516, "step": 3316 }, { "completion_length": 243.60715579986572, "epoch": 0.5562680749402741, "grad_norm": 0.1467960955689759, "kl": 0.02486419677734375, "learning_rate": 4.096296296296296e-07, "loss": 0.0, "reward": 1.7714286521077156, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 3318 }, { "completion_length": 266.9151906967163, "epoch": 0.5566033781801417, "grad_norm": 0.13892360946264437, "kl": 0.045444488525390625, "learning_rate": 4.098765432098765e-07, "loss": 0.0, "reward": 1.7714286595582962, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 3320 }, { "completion_length": 254.00447463989258, "epoch": 0.5569386814200092, "grad_norm": 0.24898486187793703, "kl": 0.030483245849609375, "learning_rate": 4.1012345679012346e-07, "loss": 0.0, "reward": 1.7946429327130318, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7991071678698063, "rewards/format_reward_func": 0.9955357164144516, "step": 3322 }, { "completion_length": 250.87501335144043, "epoch": 0.5572739846598768, "grad_norm": 0.2500029856545992, "kl": 0.03379058837890625, "learning_rate": 4.1037037037037035e-07, "loss": 0.0, "reward": 1.7535714879631996, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714507102966, "rewards/format_reward_func": 1.0, "step": 3324 }, { "completion_length": 255.415189743042, "epoch": 0.5576092878997443, "grad_norm": 0.29364029372225736, "kl": 0.020549774169921875, "learning_rate": 4.106172839506173e-07, "loss": 0.0, "reward": 1.7285715192556381, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7285714596509933, "rewards/format_reward_func": 1.0, "step": 3326 }, { "completion_length": 266.12054920196533, "epoch": 0.5579445911396119, "grad_norm": 0.25937008559805713, "kl": 0.05391693115234375, "learning_rate": 4.1086419753086413e-07, "loss": 0.0001, "reward": 1.6928572207689285, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7017857618629932, "rewards/format_reward_func": 0.9910714328289032, "step": 3328 }, { "completion_length": 254.53126525878906, "epoch": 0.5582798943794794, "grad_norm": 0.25669521443692506, "kl": 0.018245697021484375, "learning_rate": 4.1111111111111107e-07, "loss": 0.0, "reward": 1.805357187986374, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.809821454808116, "rewards/format_reward_func": 0.9955357164144516, "step": 3330 }, { "completion_length": 271.227689743042, "epoch": 0.558615197619347, "grad_norm": 0.3242229226510621, "kl": 0.025936126708984375, "learning_rate": 4.11358024691358e-07, "loss": 0.0, "reward": 1.7857143431901932, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7946428842842579, "rewards/format_reward_func": 0.9910714328289032, "step": 3332 }, { "completion_length": 257.14733123779297, "epoch": 0.5589505008592146, "grad_norm": 0.16505049686226236, "kl": 0.01825714111328125, "learning_rate": 4.116049382716049e-07, "loss": 0.0, "reward": 1.733928643167019, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7383929006755352, "rewards/format_reward_func": 0.9955357164144516, "step": 3334 }, { "completion_length": 243.60268878936768, "epoch": 0.5592858040990821, "grad_norm": 0.24723268697187017, "kl": 0.021701812744140625, "learning_rate": 4.1185185185185184e-07, "loss": 0.0, "reward": 1.7571429386734962, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428921073675, "rewards/format_reward_func": 1.0, "step": 3336 }, { "completion_length": 253.2500114440918, "epoch": 0.5596211073389497, "grad_norm": 0.2512410261507857, "kl": 0.032398223876953125, "learning_rate": 4.120987654320988e-07, "loss": 0.0, "reward": 1.7517857924103737, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500201165676, "rewards/format_reward_func": 0.9955357164144516, "step": 3338 }, { "completion_length": 247.8928680419922, "epoch": 0.5599564105788172, "grad_norm": 0.20871903585274287, "kl": 0.02616119384765625, "learning_rate": 4.1234567901234567e-07, "loss": 0.0, "reward": 1.8214286267757416, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8214285932481289, "rewards/format_reward_func": 1.0, "step": 3340 }, { "completion_length": 262.75893783569336, "epoch": 0.5602917138186848, "grad_norm": 0.3195999706923693, "kl": 0.020320892333984375, "learning_rate": 4.1259259259259256e-07, "loss": 0.0, "reward": 1.769642911851406, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7741071693599224, "rewards/format_reward_func": 0.9955357164144516, "step": 3342 }, { "completion_length": 258.5803699493408, "epoch": 0.5606270170585523, "grad_norm": 0.12534649378964235, "kl": 0.021961212158203125, "learning_rate": 4.1283950617283945e-07, "loss": 0.0, "reward": 1.7589286267757416, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7633928768336773, "rewards/format_reward_func": 0.9955357164144516, "step": 3344 }, { "completion_length": 269.0892963409424, "epoch": 0.5609623202984199, "grad_norm": 0.18819239134083926, "kl": 0.022518157958984375, "learning_rate": 4.130864197530864e-07, "loss": 0.0, "reward": 1.7785715088248253, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 3346 }, { "completion_length": 256.68304443359375, "epoch": 0.5612976235382875, "grad_norm": 0.17556385032649469, "kl": 0.034328460693359375, "learning_rate": 4.1333333333333333e-07, "loss": 0.0, "reward": 1.714285783469677, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.722321467474103, "rewards/format_reward_func": 0.9919642880558968, "step": 3348 }, { "completion_length": 258.80804538726807, "epoch": 0.561632926778155, "grad_norm": 0.19901055099402243, "kl": 0.05564117431640625, "learning_rate": 4.135802469135802e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8000000193715096, "rewards/format_reward_func": 1.0, "step": 3350 }, { "completion_length": 258.51786708831787, "epoch": 0.5619682300180225, "grad_norm": 0.2753161669446979, "kl": 0.25748443603515625, "learning_rate": 4.1382716049382716e-07, "loss": 0.0003, "reward": 1.7946429252624512, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.8080357350409031, "rewards/format_reward_func": 0.9866071492433548, "step": 3352 }, { "completion_length": 268.1294775009155, "epoch": 0.5623035332578901, "grad_norm": 0.25622783243812475, "kl": 0.02396392822265625, "learning_rate": 4.140740740740741e-07, "loss": 0.0, "reward": 1.7250000685453415, "reward_std": 0.10606601554900408, "rewards/equation_reward_func": 0.7428571581840515, "rewards/format_reward_func": 0.9821428656578064, "step": 3354 }, { "completion_length": 259.7812614440918, "epoch": 0.5626388364977577, "grad_norm": 0.11022749186091951, "kl": 0.02692413330078125, "learning_rate": 4.1432098765432094e-07, "loss": 0.0, "reward": 1.771428644657135, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 3356 }, { "completion_length": 251.7053689956665, "epoch": 0.5629741397376252, "grad_norm": 0.25546924480034083, "kl": 0.02170562744140625, "learning_rate": 4.145679012345679e-07, "loss": 0.0, "reward": 1.8000000640749931, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.8000000156462193, "rewards/format_reward_func": 1.0, "step": 3358 }, { "completion_length": 268.7857246398926, "epoch": 0.5633094429774927, "grad_norm": 0.26079986354717527, "kl": 0.06954193115234375, "learning_rate": 4.1481481481481476e-07, "loss": 0.0001, "reward": 1.7910714820027351, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7955357357859612, "rewards/format_reward_func": 0.9955357164144516, "step": 3360 }, { "completion_length": 254.7366180419922, "epoch": 0.5636447462173603, "grad_norm": 0.10312987099632315, "kl": 0.0501708984375, "learning_rate": 4.150617283950617e-07, "loss": 0.0001, "reward": 1.8660714626312256, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.8705357313156128, "rewards/format_reward_func": 0.9955357164144516, "step": 3362 }, { "completion_length": 271.85715770721436, "epoch": 0.5639800494572279, "grad_norm": 0.21244909731089875, "kl": 0.06446456909179688, "learning_rate": 4.1530864197530865e-07, "loss": 0.0001, "reward": 1.666071504354477, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.67946432903409, "rewards/format_reward_func": 0.9866071492433548, "step": 3364 }, { "completion_length": 263.2009057998657, "epoch": 0.5643153526970954, "grad_norm": 0.282676138928249, "kl": 0.0485687255859375, "learning_rate": 4.1555555555555554e-07, "loss": 0.0, "reward": 1.7142857983708382, "reward_std": 0.09091372694820166, "rewards/equation_reward_func": 0.723214328289032, "rewards/format_reward_func": 0.9910714328289032, "step": 3366 }, { "completion_length": 249.79465579986572, "epoch": 0.564650655936963, "grad_norm": 0.2782074907909122, "kl": 0.0198974609375, "learning_rate": 4.158024691358025e-07, "loss": 0.0, "reward": 1.782142922282219, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 3368 }, { "completion_length": 265.97768783569336, "epoch": 0.5649859591768306, "grad_norm": 0.28559346638349703, "kl": 0.0273895263671875, "learning_rate": 4.1604938271604937e-07, "loss": 0.0, "reward": 1.7500000670552254, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000447034836, "rewards/format_reward_func": 1.0, "step": 3370 }, { "completion_length": 257.76786708831787, "epoch": 0.5653212624166981, "grad_norm": 0.318607705266382, "kl": 0.022426605224609375, "learning_rate": 4.1629629629629625e-07, "loss": 0.0, "reward": 1.7642858028411865, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7642857450991869, "rewards/format_reward_func": 1.0, "step": 3372 }, { "completion_length": 269.2634057998657, "epoch": 0.5656565656565656, "grad_norm": 0.23426763199945552, "kl": 0.02051544189453125, "learning_rate": 4.165432098765432e-07, "loss": 0.0, "reward": 1.7928571924567223, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 3374 }, { "completion_length": 273.3705463409424, "epoch": 0.5659918688964332, "grad_norm": 0.23627110930730033, "kl": 0.06494903564453125, "learning_rate": 4.167901234567901e-07, "loss": 0.0001, "reward": 1.667857214808464, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.6767857521772385, "rewards/format_reward_func": 0.9910714328289032, "step": 3376 }, { "completion_length": 261.133939743042, "epoch": 0.5663271721363008, "grad_norm": 0.2841826348282849, "kl": 0.051708221435546875, "learning_rate": 4.17037037037037e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 3378 }, { "completion_length": 258.04911708831787, "epoch": 0.5666624753761683, "grad_norm": 0.12243006146358627, "kl": 0.0408172607421875, "learning_rate": 4.1728395061728397e-07, "loss": 0.0, "reward": 1.7357143238186836, "reward_std": 0.03030457627028227, "rewards/equation_reward_func": 0.744642898440361, "rewards/format_reward_func": 0.9910714328289032, "step": 3380 }, { "completion_length": 264.5535831451416, "epoch": 0.5669977786160358, "grad_norm": 0.185726261550338, "kl": 0.03836822509765625, "learning_rate": 4.1753086419753085e-07, "loss": 0.0, "reward": 1.7839286178350449, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7883928902447224, "rewards/format_reward_func": 0.9955357164144516, "step": 3382 }, { "completion_length": 268.5580472946167, "epoch": 0.5673330818559035, "grad_norm": 0.646482019678551, "kl": 0.1957550048828125, "learning_rate": 4.177777777777778e-07, "loss": 0.0002, "reward": 1.725000075995922, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7428571581840515, "rewards/format_reward_func": 0.9821428656578064, "step": 3384 }, { "completion_length": 262.4910840988159, "epoch": 0.567668385095771, "grad_norm": 0.28343271824101846, "kl": 0.032367706298828125, "learning_rate": 4.1802469135802463e-07, "loss": 0.0, "reward": 1.769642911851406, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7741071823984385, "rewards/format_reward_func": 0.9955357164144516, "step": 3386 }, { "completion_length": 267.2009048461914, "epoch": 0.5680036883356385, "grad_norm": 0.30762902087198446, "kl": 0.22566604614257812, "learning_rate": 4.1827160493827157e-07, "loss": 0.0002, "reward": 1.7607143372297287, "reward_std": 0.09596448950469494, "rewards/equation_reward_func": 0.769642885774374, "rewards/format_reward_func": 0.9910714328289032, "step": 3388 }, { "completion_length": 261.4509029388428, "epoch": 0.568338991575506, "grad_norm": 0.11987462910732502, "kl": 0.06043243408203125, "learning_rate": 4.185185185185185e-07, "loss": 0.0001, "reward": 1.7285714969038963, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.7375000324100256, "rewards/format_reward_func": 0.9910714328289032, "step": 3390 }, { "completion_length": 264.91072368621826, "epoch": 0.5686742948153737, "grad_norm": 0.19351392333550568, "kl": 0.05767250061035156, "learning_rate": 4.187654320987654e-07, "loss": 0.0001, "reward": 1.7803572043776512, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7848214618861675, "rewards/format_reward_func": 0.9955357164144516, "step": 3392 }, { "completion_length": 259.0089406967163, "epoch": 0.5690095980552412, "grad_norm": 0.19611064120910207, "kl": 0.3055915832519531, "learning_rate": 4.1901234567901234e-07, "loss": 0.0003, "reward": 1.6803572252392769, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.684821467846632, "rewards/format_reward_func": 0.9955357164144516, "step": 3394 }, { "completion_length": 255.05804824829102, "epoch": 0.5693449012951087, "grad_norm": 0.23378041111291098, "kl": 0.034976959228515625, "learning_rate": 4.1925925925925923e-07, "loss": 0.0, "reward": 1.7642857655882835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857562750578, "rewards/format_reward_func": 1.0, "step": 3396 }, { "completion_length": 255.34822463989258, "epoch": 0.5696802045349764, "grad_norm": 0.20520753045702905, "kl": 0.12027740478515625, "learning_rate": 4.1950617283950617e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 3398 }, { "completion_length": 263.2142972946167, "epoch": 0.5700155077748439, "grad_norm": 0.23000060551566762, "kl": 0.09436416625976562, "learning_rate": 4.1975308641975306e-07, "loss": 0.0001, "reward": 1.7571429163217545, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571429014205933, "rewards/format_reward_func": 1.0, "step": 3400 }, { "completion_length": 257.0535840988159, "epoch": 0.5703508110147114, "grad_norm": 0.27011411244907235, "kl": 0.02211761474609375, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "reward": 1.7678572162985802, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 1.0, "step": 3402 }, { "completion_length": 260.5580472946167, "epoch": 0.5706861142545789, "grad_norm": 0.35785158580122634, "kl": 0.030055999755859375, "learning_rate": 4.202469135802469e-07, "loss": 0.0, "reward": 1.773214377462864, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.777678593993187, "rewards/format_reward_func": 0.9955357164144516, "step": 3404 }, { "completion_length": 269.73662090301514, "epoch": 0.5710214174944466, "grad_norm": 0.2128801567734277, "kl": 0.246490478515625, "learning_rate": 4.2049382716049383e-07, "loss": 0.0002, "reward": 1.7785715013742447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 3406 }, { "completion_length": 270.5312623977661, "epoch": 0.5713567207343141, "grad_norm": 0.21184046708081816, "kl": 0.039031982421875, "learning_rate": 4.207407407407407e-07, "loss": 0.0, "reward": 1.7464286610484123, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286163449287, "rewards/format_reward_func": 1.0, "step": 3408 }, { "completion_length": 268.50001335144043, "epoch": 0.5716920239741816, "grad_norm": 0.46570394726523234, "kl": 0.2350921630859375, "learning_rate": 4.2098765432098766e-07, "loss": 0.0002, "reward": 1.735267922282219, "reward_std": 0.08144354820251465, "rewards/equation_reward_func": 0.7473214603960514, "rewards/format_reward_func": 0.9879464358091354, "step": 3410 }, { "completion_length": 267.25894260406494, "epoch": 0.5720273272140493, "grad_norm": 0.2918455764955246, "kl": 0.12889862060546875, "learning_rate": 4.2123456790123455e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000447034836, "rewards/format_reward_func": 1.0, "step": 3412 }, { "completion_length": 262.6294765472412, "epoch": 0.5723626304539168, "grad_norm": 0.24566651888157798, "kl": 0.15311050415039062, "learning_rate": 4.2148148148148144e-07, "loss": 0.0002, "reward": 1.7660714983940125, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9955357164144516, "step": 3414 }, { "completion_length": 265.6651906967163, "epoch": 0.5726979336937843, "grad_norm": 0.3001935923900465, "kl": 0.13809585571289062, "learning_rate": 4.217283950617284e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7732143215835094, "rewards/format_reward_func": 0.9910714328289032, "step": 3416 }, { "completion_length": 257.58483505249023, "epoch": 0.5730332369336518, "grad_norm": 0.2690218679515261, "kl": 0.030422210693359375, "learning_rate": 4.2197530864197527e-07, "loss": 0.0, "reward": 1.7464286461472511, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7553571704775095, "rewards/format_reward_func": 0.9910714328289032, "step": 3418 }, { "completion_length": 260.55358600616455, "epoch": 0.5733685401735195, "grad_norm": 0.5002997706686756, "kl": 0.168212890625, "learning_rate": 4.222222222222222e-07, "loss": 0.0002, "reward": 1.6875000596046448, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.6919643338769674, "rewards/format_reward_func": 0.9955357164144516, "step": 3420 }, { "completion_length": 254.7410831451416, "epoch": 0.573703843413387, "grad_norm": 0.2889726297534045, "kl": 0.024105072021484375, "learning_rate": 4.224691358024691e-07, "loss": 0.0, "reward": 1.7678572088479996, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7678571902215481, "rewards/format_reward_func": 1.0, "step": 3422 }, { "completion_length": 253.49108600616455, "epoch": 0.5740391466532545, "grad_norm": 0.21100531871607167, "kl": 0.02407073974609375, "learning_rate": 4.2271604938271604e-07, "loss": 0.0, "reward": 1.8285714983940125, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8285714499652386, "rewards/format_reward_func": 1.0, "step": 3424 }, { "completion_length": 262.3169775009155, "epoch": 0.5743744498931221, "grad_norm": 0.33953151575700286, "kl": 0.038204193115234375, "learning_rate": 4.22962962962963e-07, "loss": 0.0, "reward": 1.7232143580913544, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7276786006987095, "rewards/format_reward_func": 0.9955357164144516, "step": 3426 }, { "completion_length": 265.00447845458984, "epoch": 0.5747097531329897, "grad_norm": 0.33966484859625834, "kl": 0.032154083251953125, "learning_rate": 4.232098765432098e-07, "loss": 0.0, "reward": 1.7285714820027351, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7285714633762836, "rewards/format_reward_func": 1.0, "step": 3428 }, { "completion_length": 263.45537185668945, "epoch": 0.5750450563728572, "grad_norm": 0.2174688881241481, "kl": 0.09495925903320312, "learning_rate": 4.2345679012345676e-07, "loss": 0.0001, "reward": 1.7464286535978317, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7553571686148643, "rewards/format_reward_func": 0.9910714328289032, "step": 3430 }, { "completion_length": 267.9419765472412, "epoch": 0.5753803596127247, "grad_norm": 0.18092360650560677, "kl": 0.2818183898925781, "learning_rate": 4.237037037037037e-07, "loss": 0.0003, "reward": 1.757142923772335, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 3432 }, { "completion_length": 268.40180015563965, "epoch": 0.5757156628525923, "grad_norm": 0.36947735776015517, "kl": 0.067230224609375, "learning_rate": 4.239506172839506e-07, "loss": 0.0001, "reward": 1.7517858073115349, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7562500312924385, "rewards/format_reward_func": 0.9955357164144516, "step": 3434 }, { "completion_length": 264.8303737640381, "epoch": 0.5760509660924599, "grad_norm": 0.18742152942426193, "kl": 0.06931304931640625, "learning_rate": 4.2419753086419753e-07, "loss": 0.0001, "reward": 1.8160714954137802, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.8205357491970062, "rewards/format_reward_func": 0.9955357164144516, "step": 3436 }, { "completion_length": 264.5312614440918, "epoch": 0.5763862693323274, "grad_norm": 0.2804896692149197, "kl": 0.05013275146484375, "learning_rate": 4.244444444444444e-07, "loss": 0.0001, "reward": 1.771428644657135, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 3438 }, { "completion_length": 271.4732275009155, "epoch": 0.576721572572195, "grad_norm": 0.18135283016552467, "kl": 0.050182342529296875, "learning_rate": 4.2469135802469136e-07, "loss": 0.0001, "reward": 1.7107143849134445, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7107143141329288, "rewards/format_reward_func": 1.0, "step": 3440 }, { "completion_length": 269.2544765472412, "epoch": 0.5770568758120626, "grad_norm": 0.2010359293113445, "kl": 0.035064697265625, "learning_rate": 4.2493827160493825e-07, "loss": 0.0, "reward": 1.7428572177886963, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571842610836, "rewards/format_reward_func": 1.0, "step": 3442 }, { "completion_length": 271.80358505249023, "epoch": 0.5773921790519301, "grad_norm": 0.23671808515711454, "kl": 0.025234222412109375, "learning_rate": 4.2518518518518513e-07, "loss": 0.0, "reward": 1.725000075995922, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7250000275671482, "rewards/format_reward_func": 1.0, "step": 3444 }, { "completion_length": 252.01340579986572, "epoch": 0.5777274822917976, "grad_norm": 0.1996366433463103, "kl": 0.024082183837890625, "learning_rate": 4.254320987654321e-07, "loss": 0.0, "reward": 1.785714328289032, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 3446 }, { "completion_length": 273.3348340988159, "epoch": 0.5780627855316652, "grad_norm": 0.14140392980556388, "kl": 0.033966064453125, "learning_rate": 4.2567901234567896e-07, "loss": 0.0, "reward": 1.803571492433548, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 3448 }, { "completion_length": 261.0446548461914, "epoch": 0.5783980887715328, "grad_norm": 0.25123462292319676, "kl": 0.0379486083984375, "learning_rate": 4.259259259259259e-07, "loss": 0.0, "reward": 1.7392857670783997, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7392857298254967, "rewards/format_reward_func": 1.0, "step": 3450 }, { "completion_length": 267.0893020629883, "epoch": 0.5787333920114003, "grad_norm": 0.2699059485478521, "kl": 0.11783599853515625, "learning_rate": 4.2617283950617285e-07, "loss": 0.0001, "reward": 1.69821435213089, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7026785928755999, "rewards/format_reward_func": 0.9955357164144516, "step": 3452 }, { "completion_length": 268.49108600616455, "epoch": 0.5790686952512679, "grad_norm": 0.2084339363468767, "kl": 0.033172607421875, "learning_rate": 4.2641975308641973e-07, "loss": 0.0, "reward": 1.758482187986374, "reward_std": 0.03851206600666046, "rewards/equation_reward_func": 0.7660714592784643, "rewards/format_reward_func": 0.9924107193946838, "step": 3454 }, { "completion_length": 259.3839406967163, "epoch": 0.5794039984911354, "grad_norm": 0.17434411741363384, "kl": 0.0621795654296875, "learning_rate": 4.266666666666667e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 3456 }, { "completion_length": 265.0089387893677, "epoch": 0.579739301731003, "grad_norm": 0.3167726445807968, "kl": 0.03658294677734375, "learning_rate": 4.2691358024691356e-07, "loss": 0.0, "reward": 1.7785714715719223, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 3458 }, { "completion_length": 259.3928699493408, "epoch": 0.5800746049708705, "grad_norm": 0.2945341184109623, "kl": 0.0406341552734375, "learning_rate": 4.2716049382716045e-07, "loss": 0.0, "reward": 1.7821429297327995, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 3460 }, { "completion_length": 265.3660821914673, "epoch": 0.5804099082107381, "grad_norm": 0.0889414542345476, "kl": 0.028598785400390625, "learning_rate": 4.274074074074074e-07, "loss": 0.0, "reward": 1.7857143431901932, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 3462 }, { "completion_length": 262.91072940826416, "epoch": 0.5807452114506056, "grad_norm": 0.13428184690834213, "kl": 0.03705596923828125, "learning_rate": 4.276543209876543e-07, "loss": 0.0, "reward": 1.7196429371833801, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7241071723401546, "rewards/format_reward_func": 0.9955357164144516, "step": 3464 }, { "completion_length": 263.2098340988159, "epoch": 0.5810805146904732, "grad_norm": 0.2829331040542585, "kl": 0.05352783203125, "learning_rate": 4.279012345679012e-07, "loss": 0.0001, "reward": 1.7464286610484123, "reward_std": 0.08586296532303095, "rewards/equation_reward_func": 0.7553571686148643, "rewards/format_reward_func": 0.9910714328289032, "step": 3466 }, { "completion_length": 272.1339416503906, "epoch": 0.5814158179303408, "grad_norm": 0.15916034359450193, "kl": 0.05171966552734375, "learning_rate": 4.2814814814814816e-07, "loss": 0.0001, "reward": 1.7196429446339607, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7241071723401546, "rewards/format_reward_func": 0.9955357164144516, "step": 3468 }, { "completion_length": 256.48215770721436, "epoch": 0.5817511211702083, "grad_norm": 0.3651088268384066, "kl": 0.0333251953125, "learning_rate": 4.2839506172839505e-07, "loss": 0.0, "reward": 1.7107143700122833, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7196428962051868, "rewards/format_reward_func": 0.9910714328289032, "step": 3470 }, { "completion_length": 259.0134029388428, "epoch": 0.5820864244100759, "grad_norm": 0.41088836334113304, "kl": 0.04825592041015625, "learning_rate": 4.2864197530864194e-07, "loss": 0.0, "reward": 1.7553572282195091, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7598214671015739, "rewards/format_reward_func": 0.9955357164144516, "step": 3472 }, { "completion_length": 256.48662090301514, "epoch": 0.5824217276499434, "grad_norm": 0.14851198065644858, "kl": 0.032390594482421875, "learning_rate": 4.2888888888888883e-07, "loss": 0.0, "reward": 1.79464291036129, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7991071790456772, "rewards/format_reward_func": 0.9955357164144516, "step": 3474 }, { "completion_length": 258.7901916503906, "epoch": 0.582757030889811, "grad_norm": 0.1969503633329373, "kl": 0.0368194580078125, "learning_rate": 4.2913580246913577e-07, "loss": 0.0, "reward": 1.7089286595582962, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7133928835391998, "rewards/format_reward_func": 0.9955357164144516, "step": 3476 }, { "completion_length": 257.8928699493408, "epoch": 0.5830923341296785, "grad_norm": 0.3246895776320101, "kl": 0.12652206420898438, "learning_rate": 4.293827160493827e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 3478 }, { "completion_length": 258.3392963409424, "epoch": 0.5834276373695461, "grad_norm": 0.28282227891625317, "kl": 0.02780914306640625, "learning_rate": 4.296296296296296e-07, "loss": 0.0, "reward": 1.7375000789761543, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7508928887546062, "rewards/format_reward_func": 0.9866071492433548, "step": 3480 }, { "completion_length": 257.0491189956665, "epoch": 0.5837629406094136, "grad_norm": 0.18441674849853126, "kl": 0.02960205078125, "learning_rate": 4.2987654320987654e-07, "loss": 0.0, "reward": 1.7785714864730835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 1.0, "step": 3482 }, { "completion_length": 260.7812614440918, "epoch": 0.5840982438492812, "grad_norm": 0.23324259920480803, "kl": 0.03562164306640625, "learning_rate": 4.301234567901235e-07, "loss": 0.0, "reward": 1.725000075995922, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7250000238418579, "rewards/format_reward_func": 1.0, "step": 3484 }, { "completion_length": 263.1651906967163, "epoch": 0.5844335470891487, "grad_norm": 0.41779317202713057, "kl": 0.08310699462890625, "learning_rate": 4.303703703703703e-07, "loss": 0.0001, "reward": 1.6625000908970833, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.6758929044008255, "rewards/format_reward_func": 0.9866071492433548, "step": 3486 }, { "completion_length": 254.05804824829102, "epoch": 0.5847688503290163, "grad_norm": 0.21896304164973182, "kl": 0.16289520263671875, "learning_rate": 4.3061728395061726e-07, "loss": 0.0002, "reward": 1.7589286267757416, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7633928880095482, "rewards/format_reward_func": 0.9955357164144516, "step": 3488 }, { "completion_length": 250.95090103149414, "epoch": 0.5851041535688839, "grad_norm": 0.15230906377968959, "kl": 0.1225433349609375, "learning_rate": 4.3086419753086415e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857544124126, "rewards/format_reward_func": 1.0, "step": 3490 }, { "completion_length": 253.61161708831787, "epoch": 0.5854394568087514, "grad_norm": 0.13634640897201905, "kl": 0.06240081787109375, "learning_rate": 4.311111111111111e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.792857164517045, "rewards/format_reward_func": 1.0, "step": 3492 }, { "completion_length": 259.2410831451416, "epoch": 0.585774760048619, "grad_norm": 0.20498559394312932, "kl": 0.02706146240234375, "learning_rate": 4.3135802469135803e-07, "loss": 0.0, "reward": 1.8125000298023224, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.8169643245637417, "rewards/format_reward_func": 0.9955357164144516, "step": 3494 }, { "completion_length": 249.1384048461914, "epoch": 0.5861100632884865, "grad_norm": 0.31684540637992265, "kl": 0.05062103271484375, "learning_rate": 4.316049382716049e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 3496 }, { "completion_length": 265.8660821914673, "epoch": 0.5864453665283541, "grad_norm": 0.2820708311134308, "kl": 0.08450698852539062, "learning_rate": 4.3185185185185186e-07, "loss": 0.0001, "reward": 1.8250000402331352, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.8339285887777805, "rewards/format_reward_func": 0.9910714328289032, "step": 3498 }, { "completion_length": 258.276798248291, "epoch": 0.5867806697682216, "grad_norm": 0.18988002197728343, "kl": 0.0531158447265625, "learning_rate": 4.320987654320987e-07, "loss": 0.0001, "reward": 1.792857214808464, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.8017857559025288, "rewards/format_reward_func": 0.9910714328289032, "step": 3500 }, { "completion_length": 253.03572940826416, "epoch": 0.5871159730080892, "grad_norm": 0.17893344507554287, "kl": 0.02783966064453125, "learning_rate": 4.3234567901234564e-07, "loss": 0.0, "reward": 1.8071429133415222, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428909897804, "rewards/format_reward_func": 1.0, "step": 3502 }, { "completion_length": 259.8973321914673, "epoch": 0.5874512762479568, "grad_norm": 0.22936645917658965, "kl": 0.02904510498046875, "learning_rate": 4.325925925925926e-07, "loss": 0.0, "reward": 1.7857143580913544, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 3504 }, { "completion_length": 260.5357275009155, "epoch": 0.5877865794878243, "grad_norm": 0.41946924390943746, "kl": 0.3809051513671875, "learning_rate": 4.3283950617283947e-07, "loss": 0.0004, "reward": 1.7357143387198448, "reward_std": 0.10606601741164923, "rewards/equation_reward_func": 0.7625000216066837, "rewards/format_reward_func": 0.9732142984867096, "step": 3506 }, { "completion_length": 259.0982275009155, "epoch": 0.5881218827276918, "grad_norm": 0.22516693768991086, "kl": 0.03868865966796875, "learning_rate": 4.330864197530864e-07, "loss": 0.0, "reward": 1.76071435213089, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7696428932249546, "rewards/format_reward_func": 0.9910714328289032, "step": 3508 }, { "completion_length": 246.68304634094238, "epoch": 0.5884571859675594, "grad_norm": 0.16891949258278482, "kl": 0.02500152587890625, "learning_rate": 4.3333333333333335e-07, "loss": 0.0, "reward": 1.7660715207457542, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9955357164144516, "step": 3510 }, { "completion_length": 256.5044746398926, "epoch": 0.588792489207427, "grad_norm": 0.16682964616175375, "kl": 0.03672027587890625, "learning_rate": 4.3358024691358024e-07, "loss": 0.0, "reward": 1.7732143551111221, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7776785865426064, "rewards/format_reward_func": 0.9955357164144516, "step": 3512 }, { "completion_length": 262.71429538726807, "epoch": 0.5891277924472945, "grad_norm": 0.22590571064361892, "kl": 0.036014556884765625, "learning_rate": 4.338271604938271e-07, "loss": 0.0, "reward": 1.7714286297559738, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7803571689873934, "rewards/format_reward_func": 0.9910714328289032, "step": 3514 }, { "completion_length": 262.2366199493408, "epoch": 0.589463095687162, "grad_norm": 0.2227653303159639, "kl": 0.05431365966796875, "learning_rate": 4.34074074074074e-07, "loss": 0.0001, "reward": 1.716071493923664, "reward_std": 0.06818529684096575, "rewards/equation_reward_func": 0.7294643130153418, "rewards/format_reward_func": 0.9866071492433548, "step": 3516 }, { "completion_length": 263.89287090301514, "epoch": 0.5897983989270297, "grad_norm": 0.22453191240129117, "kl": 0.161773681640625, "learning_rate": 4.3432098765432096e-07, "loss": 0.0002, "reward": 1.7642857730388641, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 3518 }, { "completion_length": 261.9509029388428, "epoch": 0.5901337021668972, "grad_norm": 0.25398918227897027, "kl": 0.0648956298828125, "learning_rate": 4.345679012345679e-07, "loss": 0.0001, "reward": 1.7357143834233284, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 3520 }, { "completion_length": 262.4598340988159, "epoch": 0.5904690054067647, "grad_norm": 0.2944405516763985, "kl": 0.025665283203125, "learning_rate": 4.348148148148148e-07, "loss": 0.0, "reward": 1.7642857730388641, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7732143104076385, "rewards/format_reward_func": 0.9910714328289032, "step": 3522 }, { "completion_length": 271.6696586608887, "epoch": 0.5908043086466322, "grad_norm": 0.24076922620858562, "kl": 0.0668487548828125, "learning_rate": 4.350617283950617e-07, "loss": 0.0001, "reward": 1.7482143566012383, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526786103844643, "rewards/format_reward_func": 0.9955357164144516, "step": 3524 }, { "completion_length": 262.7812604904175, "epoch": 0.5911396118864999, "grad_norm": 0.3099566866844783, "kl": 0.07646942138671875, "learning_rate": 4.3530864197530867e-07, "loss": 0.0001, "reward": 1.746428668498993, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7642857357859612, "rewards/format_reward_func": 0.9821428656578064, "step": 3526 }, { "completion_length": 258.2232255935669, "epoch": 0.5914749151263674, "grad_norm": 0.17710092998351407, "kl": 0.031280517578125, "learning_rate": 4.355555555555555e-07, "loss": 0.0, "reward": 1.8071429207921028, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8160714544355869, "rewards/format_reward_func": 0.9910714328289032, "step": 3528 }, { "completion_length": 249.54465579986572, "epoch": 0.5918102183662349, "grad_norm": 0.2899026154248438, "kl": 0.0284576416015625, "learning_rate": 4.3580246913580244e-07, "loss": 0.0, "reward": 1.7875000685453415, "reward_std": 0.06818529777228832, "rewards/equation_reward_func": 0.8008928876370192, "rewards/format_reward_func": 0.9866071492433548, "step": 3530 }, { "completion_length": 255.883939743042, "epoch": 0.5921455216061026, "grad_norm": 0.15538153919110007, "kl": 0.02648162841796875, "learning_rate": 4.3604938271604933e-07, "loss": 0.0, "reward": 1.800000049173832, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 3532 }, { "completion_length": 263.08929920196533, "epoch": 0.5924808248459701, "grad_norm": 0.3173318092500211, "kl": 0.06317138671875, "learning_rate": 4.362962962962963e-07, "loss": 0.0001, "reward": 1.7875000685453415, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7919643074274063, "rewards/format_reward_func": 0.9955357164144516, "step": 3534 }, { "completion_length": 252.59376049041748, "epoch": 0.5928161280858376, "grad_norm": 0.345473218389504, "kl": 0.054718017578125, "learning_rate": 4.365432098765432e-07, "loss": 0.0001, "reward": 1.787500061094761, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7919643148779869, "rewards/format_reward_func": 0.9955357164144516, "step": 3536 }, { "completion_length": 266.7723331451416, "epoch": 0.5931514313257051, "grad_norm": 0.1637067972754746, "kl": 0.04750823974609375, "learning_rate": 4.367901234567901e-07, "loss": 0.0, "reward": 1.7357143461704254, "reward_std": 0.03030457627028227, "rewards/equation_reward_func": 0.7446428872644901, "rewards/format_reward_func": 0.9910714328289032, "step": 3538 }, { "completion_length": 259.96875858306885, "epoch": 0.5934867345655728, "grad_norm": 0.30374336459795204, "kl": 0.037841796875, "learning_rate": 4.3703703703703704e-07, "loss": 0.0, "reward": 1.762500062584877, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.766964316368103, "rewards/format_reward_func": 0.9955357164144516, "step": 3540 }, { "completion_length": 257.20537090301514, "epoch": 0.5938220378054403, "grad_norm": 0.18775558730289618, "kl": 0.18558502197265625, "learning_rate": 4.372839506172839e-07, "loss": 0.0002, "reward": 1.7482143566012383, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7526786029338837, "rewards/format_reward_func": 0.9955357164144516, "step": 3542 }, { "completion_length": 267.7589416503906, "epoch": 0.5941573410453078, "grad_norm": 0.27798905796014955, "kl": 0.0243682861328125, "learning_rate": 4.375308641975308e-07, "loss": 0.0, "reward": 1.7464286237955093, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7553571723401546, "rewards/format_reward_func": 0.9910714328289032, "step": 3544 }, { "completion_length": 254.3839406967163, "epoch": 0.5944926442851755, "grad_norm": 0.16482240300876788, "kl": 0.2323150634765625, "learning_rate": 4.3777777777777776e-07, "loss": 0.0002, "reward": 1.7535714954137802, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7535714767873287, "rewards/format_reward_func": 1.0, "step": 3546 }, { "completion_length": 269.70090103149414, "epoch": 0.594827947525043, "grad_norm": 0.21117970471010705, "kl": 0.17474365234375, "learning_rate": 4.3802469135802465e-07, "loss": 0.0002, "reward": 1.7589286267757416, "reward_std": 0.06818529684096575, "rewards/equation_reward_func": 0.772321455180645, "rewards/format_reward_func": 0.9866071492433548, "step": 3548 }, { "completion_length": 260.1517972946167, "epoch": 0.5951632507649105, "grad_norm": 0.2455256251100981, "kl": 0.029510498046875, "learning_rate": 4.382716049382716e-07, "loss": 0.0, "reward": 1.8000000715255737, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.817857164889574, "rewards/format_reward_func": 0.9821428656578064, "step": 3550 }, { "completion_length": 267.6428699493408, "epoch": 0.595498554004778, "grad_norm": 0.21551056799708354, "kl": 0.03183746337890625, "learning_rate": 4.3851851851851853e-07, "loss": 0.0, "reward": 1.7196429520845413, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.724107176065445, "rewards/format_reward_func": 0.9955357164144516, "step": 3552 }, { "completion_length": 258.040189743042, "epoch": 0.5958338572446457, "grad_norm": 0.1783330798048646, "kl": 0.14756011962890625, "learning_rate": 4.387654320987654e-07, "loss": 0.0001, "reward": 1.707142949104309, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7071428969502449, "rewards/format_reward_func": 1.0, "step": 3554 }, { "completion_length": 249.8259038925171, "epoch": 0.5961691604845132, "grad_norm": 0.21122598255990901, "kl": 0.02812957763671875, "learning_rate": 4.3901234567901236e-07, "loss": 0.0, "reward": 1.773214340209961, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7776786088943481, "rewards/format_reward_func": 0.9955357164144516, "step": 3556 }, { "completion_length": 250.7991132736206, "epoch": 0.5965044637243807, "grad_norm": 0.1805206880065137, "kl": 0.0556182861328125, "learning_rate": 4.392592592592592e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 3558 }, { "completion_length": 250.87054634094238, "epoch": 0.5968397669642483, "grad_norm": 0.7938875469714126, "kl": 0.3453216552734375, "learning_rate": 4.3950617283950614e-07, "loss": 0.0003, "reward": 1.7946429401636124, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.799107164144516, "rewards/format_reward_func": 0.9955357164144516, "step": 3560 }, { "completion_length": 260.9732265472412, "epoch": 0.5971750702041159, "grad_norm": 0.19210387107786725, "kl": 0.04297637939453125, "learning_rate": 4.397530864197531e-07, "loss": 0.0, "reward": 1.7464286386966705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464286144822836, "rewards/format_reward_func": 1.0, "step": 3562 }, { "completion_length": 245.0669755935669, "epoch": 0.5975103734439834, "grad_norm": 0.2801304059499941, "kl": 0.0260467529296875, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "reward": 1.7928571924567223, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571775555611, "rewards/format_reward_func": 1.0, "step": 3564 }, { "completion_length": 271.33037090301514, "epoch": 0.5978456766838509, "grad_norm": 0.11718352236281365, "kl": 0.06449508666992188, "learning_rate": 4.402469135802469e-07, "loss": 0.0001, "reward": 1.741071492433548, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7544643245637417, "rewards/format_reward_func": 0.9866071492433548, "step": 3566 }, { "completion_length": 260.3348321914673, "epoch": 0.5981809799237185, "grad_norm": 0.3075517351831485, "kl": 0.08880615234375, "learning_rate": 4.404938271604938e-07, "loss": 0.0001, "reward": 1.687500074505806, "reward_std": 0.07323605939745903, "rewards/equation_reward_func": 0.7098214663565159, "rewards/format_reward_func": 0.977678582072258, "step": 3568 }, { "completion_length": 263.1026906967163, "epoch": 0.5985162831635861, "grad_norm": 0.267592381406727, "kl": 0.07773208618164062, "learning_rate": 4.4074074074074074e-07, "loss": 0.0001, "reward": 1.7357143685221672, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7357143014669418, "rewards/format_reward_func": 1.0, "step": 3570 }, { "completion_length": 259.83036708831787, "epoch": 0.5988515864034536, "grad_norm": 0.16486227865906747, "kl": 0.0298004150390625, "learning_rate": 4.4098765432098763e-07, "loss": 0.0, "reward": 1.7589286342263222, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7633928880095482, "rewards/format_reward_func": 0.9955357164144516, "step": 3572 }, { "completion_length": 263.4285831451416, "epoch": 0.5991868896433212, "grad_norm": 0.2593585175782869, "kl": 0.02993011474609375, "learning_rate": 4.412345679012345e-07, "loss": 0.0, "reward": 1.717857226729393, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7267857454717159, "rewards/format_reward_func": 0.9910714328289032, "step": 3574 }, { "completion_length": 253.36161994934082, "epoch": 0.5995221928831888, "grad_norm": 0.22218048407968422, "kl": 0.029876708984375, "learning_rate": 4.4148148148148146e-07, "loss": 0.0, "reward": 1.8071429133415222, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8071428909897804, "rewards/format_reward_func": 1.0, "step": 3576 }, { "completion_length": 255.6785831451416, "epoch": 0.5998574961230563, "grad_norm": 0.27713844671160104, "kl": 0.02791595458984375, "learning_rate": 4.417283950617284e-07, "loss": 0.0, "reward": 1.7214286550879478, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.721428606659174, "rewards/format_reward_func": 1.0, "step": 3578 }, { "completion_length": 258.5223331451416, "epoch": 0.6001927993629238, "grad_norm": 0.2268908937876009, "kl": 0.14574432373046875, "learning_rate": 4.419753086419753e-07, "loss": 0.0001, "reward": 1.7839286252856255, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7973214574158192, "rewards/format_reward_func": 0.9866071492433548, "step": 3580 }, { "completion_length": 260.96876430511475, "epoch": 0.6005281026027914, "grad_norm": 0.008560615600363496, "kl": 0.04407501220703125, "learning_rate": 4.4222222222222223e-07, "loss": 0.0, "reward": 1.7892857789993286, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 3582 }, { "completion_length": 263.3794746398926, "epoch": 0.600863405842659, "grad_norm": 0.3125592782204781, "kl": 0.039703369140625, "learning_rate": 4.424691358024691e-07, "loss": 0.0, "reward": 1.7321429401636124, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7410714589059353, "rewards/format_reward_func": 0.9910714328289032, "step": 3584 }, { "completion_length": 265.0401945114136, "epoch": 0.6011987090825265, "grad_norm": 0.23566926170906846, "kl": 0.064239501953125, "learning_rate": 4.42716049382716e-07, "loss": 0.0001, "reward": 1.7482143566012383, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526786029338837, "rewards/format_reward_func": 0.9955357164144516, "step": 3586 }, { "completion_length": 260.8169765472412, "epoch": 0.6015340123223941, "grad_norm": 0.18104321652659505, "kl": 0.06757354736328125, "learning_rate": 4.4296296296296295e-07, "loss": 0.0001, "reward": 1.7142857685685158, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7142857518047094, "rewards/format_reward_func": 1.0, "step": 3588 }, { "completion_length": 247.33929824829102, "epoch": 0.6018693155622616, "grad_norm": 0.2856698746219867, "kl": 0.051727294921875, "learning_rate": 4.4320987654320984e-07, "loss": 0.0001, "reward": 1.7250000834465027, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7250000312924385, "rewards/format_reward_func": 1.0, "step": 3590 }, { "completion_length": 258.3259029388428, "epoch": 0.6022046188021292, "grad_norm": 0.4413945663867954, "kl": 0.070465087890625, "learning_rate": 4.434567901234568e-07, "loss": 0.0001, "reward": 1.7428572177886963, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 3592 }, { "completion_length": 260.0669746398926, "epoch": 0.6025399220419967, "grad_norm": 0.20788258491713757, "kl": 0.076507568359375, "learning_rate": 4.4370370370370367e-07, "loss": 0.0001, "reward": 1.7464286535978317, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7642857320606709, "rewards/format_reward_func": 0.9821428656578064, "step": 3594 }, { "completion_length": 257.54019260406494, "epoch": 0.6028752252818643, "grad_norm": 0.35534726550307105, "kl": 0.02875518798828125, "learning_rate": 4.439506172839506e-07, "loss": 0.0, "reward": 1.7821429297327995, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7821428775787354, "rewards/format_reward_func": 1.0, "step": 3596 }, { "completion_length": 265.495548248291, "epoch": 0.6032105285217318, "grad_norm": 0.32475422464455267, "kl": 0.25457000732421875, "learning_rate": 4.4419753086419755e-07, "loss": 0.0003, "reward": 1.691071517765522, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.6955357398837805, "rewards/format_reward_func": 0.9955357164144516, "step": 3598 }, { "completion_length": 256.9910821914673, "epoch": 0.6035458317615994, "grad_norm": 0.3155386853226163, "kl": 0.033954620361328125, "learning_rate": 4.444444444444444e-07, "loss": 0.0, "reward": 1.7339286506175995, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7383928932249546, "rewards/format_reward_func": 0.9955357164144516, "step": 3600 }, { "completion_length": 249.37501049041748, "epoch": 0.603881135001467, "grad_norm": 0.3366549119682878, "kl": 0.15158843994140625, "learning_rate": 4.446913580246913e-07, "loss": 0.0002, "reward": 1.8053571954369545, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8098214510828257, "rewards/format_reward_func": 0.9955357164144516, "step": 3602 }, { "completion_length": 261.1160831451416, "epoch": 0.6042164382413345, "grad_norm": 0.2374163328003629, "kl": 0.10363006591796875, "learning_rate": 4.4493827160493827e-07, "loss": 0.0001, "reward": 1.7964286506175995, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.796428594738245, "rewards/format_reward_func": 1.0, "step": 3604 }, { "completion_length": 265.6518039703369, "epoch": 0.604551741481202, "grad_norm": 0.24493017949672824, "kl": 0.33367919921875, "learning_rate": 4.4518518518518515e-07, "loss": 0.0003, "reward": 1.7375001013278961, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7508928775787354, "rewards/format_reward_func": 0.9866071492433548, "step": 3606 }, { "completion_length": 267.93304920196533, "epoch": 0.6048870447210696, "grad_norm": 0.4602830429080869, "kl": 0.5742149353027344, "learning_rate": 4.454320987654321e-07, "loss": 0.0006, "reward": 1.778571479022503, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7875000312924385, "rewards/format_reward_func": 0.9910714328289032, "step": 3608 }, { "completion_length": 269.2678699493408, "epoch": 0.6052223479609372, "grad_norm": 0.3401300966547623, "kl": 0.16431427001953125, "learning_rate": 4.45679012345679e-07, "loss": 0.0002, "reward": 1.7714286372065544, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7803571745753288, "rewards/format_reward_func": 0.9910714328289032, "step": 3610 }, { "completion_length": 274.7009029388428, "epoch": 0.6055576512008047, "grad_norm": 0.32991585835717985, "kl": 0.12230682373046875, "learning_rate": 4.459259259259259e-07, "loss": 0.0001, "reward": 1.7196429297327995, "reward_std": 0.09343910869210958, "rewards/equation_reward_func": 0.7330357395112514, "rewards/format_reward_func": 0.9866071492433548, "step": 3612 }, { "completion_length": 270.4866189956665, "epoch": 0.6058929544406723, "grad_norm": 0.255009936150431, "kl": 0.03687286376953125, "learning_rate": 4.461728395061728e-07, "loss": 0.0, "reward": 1.7625000551342964, "reward_std": 0.07323605939745903, "rewards/equation_reward_func": 0.7758928798139095, "rewards/format_reward_func": 0.9866071492433548, "step": 3614 }, { "completion_length": 275.08929920196533, "epoch": 0.6062282576805398, "grad_norm": 0.18435769219493214, "kl": 0.14650726318359375, "learning_rate": 4.464197530864197e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7589286118745804, "rewards/format_reward_func": 0.9910714328289032, "step": 3616 }, { "completion_length": 275.9955472946167, "epoch": 0.6065635609204074, "grad_norm": 0.50317108046287, "kl": 0.8645095825195312, "learning_rate": 4.4666666666666664e-07, "loss": 0.0009, "reward": 1.7982143312692642, "reward_std": 0.09343910869210958, "rewards/equation_reward_func": 0.8116071671247482, "rewards/format_reward_func": 0.9866071492433548, "step": 3618 }, { "completion_length": 265.30804538726807, "epoch": 0.6068988641602749, "grad_norm": 0.2825123505770023, "kl": 0.03264617919921875, "learning_rate": 4.4691358024691353e-07, "loss": 0.0, "reward": 1.7464286386966705, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.755357176065445, "rewards/format_reward_func": 0.9910714328289032, "step": 3620 }, { "completion_length": 271.2857246398926, "epoch": 0.6072341674001425, "grad_norm": 0.18990244417672877, "kl": 0.08856964111328125, "learning_rate": 4.4716049382716047e-07, "loss": 0.0001, "reward": 1.7053572162985802, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7187500335276127, "rewards/format_reward_func": 0.9866071492433548, "step": 3622 }, { "completion_length": 255.42411994934082, "epoch": 0.6075694706400101, "grad_norm": 0.28196388269226436, "kl": 0.16127777099609375, "learning_rate": 4.474074074074074e-07, "loss": 0.0002, "reward": 1.8125000521540642, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8169643022119999, "rewards/format_reward_func": 0.9955357164144516, "step": 3624 }, { "completion_length": 273.1428689956665, "epoch": 0.6079047738798776, "grad_norm": 0.4959674196608172, "kl": 1.2173080444335938, "learning_rate": 4.476543209876543e-07, "loss": 0.0012, "reward": 1.714285783469677, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.732142886146903, "rewards/format_reward_func": 0.9821428656578064, "step": 3626 }, { "completion_length": 264.4330472946167, "epoch": 0.6082400771197451, "grad_norm": 0.18397191856889522, "kl": 0.22011566162109375, "learning_rate": 4.479012345679012e-07, "loss": 0.0002, "reward": 1.7714286521077156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 3628 }, { "completion_length": 265.06697845458984, "epoch": 0.6085753803596127, "grad_norm": 0.27923133250657345, "kl": 0.48724365234375, "learning_rate": 4.4814814814814813e-07, "loss": 0.0005, "reward": 1.7446429207921028, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7580357510596514, "rewards/format_reward_func": 0.9866071492433548, "step": 3630 }, { "completion_length": 263.29912185668945, "epoch": 0.6089106835994803, "grad_norm": 0.2536893755418448, "kl": 0.7797088623046875, "learning_rate": 4.48395061728395e-07, "loss": 0.0008, "reward": 1.8160715252161026, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.820535734295845, "rewards/format_reward_func": 0.9955357164144516, "step": 3632 }, { "completion_length": 263.401798248291, "epoch": 0.6092459868393478, "grad_norm": 0.18328565704488098, "kl": 0.48796844482421875, "learning_rate": 4.4864197530864196e-07, "loss": 0.0005, "reward": 1.7196429297327995, "reward_std": 0.0328299580141902, "rewards/equation_reward_func": 0.7241071872413158, "rewards/format_reward_func": 0.9955357164144516, "step": 3634 }, { "completion_length": 257.8973331451416, "epoch": 0.6095812900792154, "grad_norm": 0.4203365404663292, "kl": 0.1136932373046875, "learning_rate": 4.4888888888888885e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7732142955064774, "rewards/format_reward_func": 0.9910714328289032, "step": 3636 }, { "completion_length": 261.6384038925171, "epoch": 0.609916593319083, "grad_norm": 0.27518745976881687, "kl": 0.206268310546875, "learning_rate": 4.491358024691358e-07, "loss": 0.0002, "reward": 1.6767858192324638, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.681250024586916, "rewards/format_reward_func": 0.9955357164144516, "step": 3638 }, { "completion_length": 258.3660840988159, "epoch": 0.6102518965589505, "grad_norm": 0.28078115457804403, "kl": 0.970550537109375, "learning_rate": 4.4938271604938273e-07, "loss": 0.001, "reward": 1.7750000655651093, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 3640 }, { "completion_length": 261.7500104904175, "epoch": 0.610587199798818, "grad_norm": 0.38733232869582146, "kl": 1.2825469970703125, "learning_rate": 4.496296296296296e-07, "loss": 0.0013, "reward": 1.7160715013742447, "reward_std": 0.09848987404257059, "rewards/equation_reward_func": 0.7383928783237934, "rewards/format_reward_func": 0.977678582072258, "step": 3642 }, { "completion_length": 254.3928680419922, "epoch": 0.6109225030386856, "grad_norm": 0.22281246987673115, "kl": 0.1315765380859375, "learning_rate": 4.498765432098765e-07, "loss": 0.0001, "reward": 1.7642857655882835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 3644 }, { "completion_length": 275.2098331451416, "epoch": 0.6112578062785532, "grad_norm": 0.3146735029654327, "kl": 1.4842376708984375, "learning_rate": 4.501234567901234e-07, "loss": 0.0015, "reward": 1.676785796880722, "reward_std": 0.11364216171205044, "rewards/equation_reward_func": 0.6991071626543999, "rewards/format_reward_func": 0.977678582072258, "step": 3646 }, { "completion_length": 255.4241180419922, "epoch": 0.6115931095184207, "grad_norm": 0.2005355282523996, "kl": 0.8999481201171875, "learning_rate": 4.5037037037037034e-07, "loss": 0.0009, "reward": 1.7816964611411095, "reward_std": 0.03598668519407511, "rewards/equation_reward_func": 0.7937500290572643, "rewards/format_reward_func": 0.9879464358091354, "step": 3648 }, { "completion_length": 274.8169755935669, "epoch": 0.6119284127582882, "grad_norm": 0.14003841077332135, "kl": 2.5995941162109375, "learning_rate": 4.506172839506173e-07, "loss": 0.0026, "reward": 1.7392857894301414, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7482143137603998, "rewards/format_reward_func": 0.9910714328289032, "step": 3650 }, { "completion_length": 259.9642972946167, "epoch": 0.6122637159981559, "grad_norm": 0.4061581313184363, "kl": 0.16374969482421875, "learning_rate": 4.5086419753086417e-07, "loss": 0.0002, "reward": 1.7892857491970062, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7982143089175224, "rewards/format_reward_func": 0.9910714328289032, "step": 3652 }, { "completion_length": 268.370548248291, "epoch": 0.6125990192380234, "grad_norm": 0.38859381055339315, "kl": 1.5375289916992188, "learning_rate": 4.511111111111111e-07, "loss": 0.0015, "reward": 1.7678572162985802, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7767857424914837, "rewards/format_reward_func": 0.9910714328289032, "step": 3654 }, { "completion_length": 253.7812623977661, "epoch": 0.6129343224778909, "grad_norm": 0.22522029722982645, "kl": 0.46714019775390625, "learning_rate": 4.5135802469135805e-07, "loss": 0.0005, "reward": 1.7160715013742447, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7294643335044384, "rewards/format_reward_func": 0.9866071492433548, "step": 3656 }, { "completion_length": 257.25001430511475, "epoch": 0.6132696257177584, "grad_norm": 0.32075938946960136, "kl": 0.14533233642578125, "learning_rate": 4.516049382716049e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.8258928768336773, "rewards/format_reward_func": 0.9866071492433548, "step": 3658 }, { "completion_length": 262.0535840988159, "epoch": 0.6136049289576261, "grad_norm": 0.16659359770005583, "kl": 0.3695220947265625, "learning_rate": 4.5185185185185183e-07, "loss": 0.0004, "reward": 1.744642935693264, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7491071745753288, "rewards/format_reward_func": 0.9955357164144516, "step": 3660 }, { "completion_length": 259.53126430511475, "epoch": 0.6139402321974936, "grad_norm": 0.3132566015456766, "kl": 0.03826141357421875, "learning_rate": 4.520987654320987e-07, "loss": 0.0, "reward": 1.7339286357164383, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7473214641213417, "rewards/format_reward_func": 0.9866071492433548, "step": 3662 }, { "completion_length": 256.51787090301514, "epoch": 0.6142755354373611, "grad_norm": 0.2757417864081997, "kl": 0.06600189208984375, "learning_rate": 4.5234567901234566e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 3664 }, { "completion_length": 245.51340198516846, "epoch": 0.6146108386772288, "grad_norm": 0.17796460260570873, "kl": 0.2266845703125, "learning_rate": 4.525925925925926e-07, "loss": 0.0002, "reward": 1.751785784959793, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7562500238418579, "rewards/format_reward_func": 0.9955357164144516, "step": 3666 }, { "completion_length": 258.04912281036377, "epoch": 0.6149461419170963, "grad_norm": 0.3287939137579827, "kl": 0.10906219482421875, "learning_rate": 4.528395061728395e-07, "loss": 0.0001, "reward": 1.758928656578064, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7633928898721933, "rewards/format_reward_func": 0.9955357164144516, "step": 3668 }, { "completion_length": 248.98661994934082, "epoch": 0.6152814451569638, "grad_norm": 0.12255448020441631, "kl": 0.041259765625, "learning_rate": 4.5308641975308643e-07, "loss": 0.0, "reward": 1.7946429029107094, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7991071753203869, "rewards/format_reward_func": 0.9955357164144516, "step": 3670 }, { "completion_length": 260.6919746398926, "epoch": 0.6156167483968313, "grad_norm": 0.26965476202319116, "kl": 0.07506561279296875, "learning_rate": 4.5333333333333326e-07, "loss": 0.0001, "reward": 1.7392857670783997, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.748214315623045, "rewards/format_reward_func": 0.9910714328289032, "step": 3672 }, { "completion_length": 253.81697940826416, "epoch": 0.615952051636699, "grad_norm": 0.22782034526467285, "kl": 0.09051513671875, "learning_rate": 4.535802469135802e-07, "loss": 0.0001, "reward": 1.778571479022503, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7875000275671482, "rewards/format_reward_func": 0.9910714328289032, "step": 3674 }, { "completion_length": 254.91519165039062, "epoch": 0.6162873548765665, "grad_norm": 0.2151209777437395, "kl": 0.23415374755859375, "learning_rate": 4.5382716049382715e-07, "loss": 0.0002, "reward": 1.7642857655882835, "reward_std": 0.06060915347188711, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 0.9821428656578064, "step": 3676 }, { "completion_length": 257.40626335144043, "epoch": 0.616622658116434, "grad_norm": 0.16959510394683283, "kl": 0.04297637939453125, "learning_rate": 4.5407407407407403e-07, "loss": 0.0, "reward": 1.7196429371833801, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7241071723401546, "rewards/format_reward_func": 0.9955357164144516, "step": 3678 }, { "completion_length": 264.46876335144043, "epoch": 0.6169579613563017, "grad_norm": 0.4565196614990558, "kl": 0.9423294067382812, "learning_rate": 4.54320987654321e-07, "loss": 0.0009, "reward": 1.6928572207689285, "reward_std": 0.09091372694820166, "rewards/equation_reward_func": 0.7196428887546062, "rewards/format_reward_func": 0.9732142984867096, "step": 3680 }, { "completion_length": 249.86161708831787, "epoch": 0.6172932645961692, "grad_norm": 0.20489827746347586, "kl": 0.0494842529296875, "learning_rate": 4.545679012345679e-07, "loss": 0.0, "reward": 1.728571504354477, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7285714522004128, "rewards/format_reward_func": 1.0, "step": 3682 }, { "completion_length": 260.1294765472412, "epoch": 0.6176285678360367, "grad_norm": 0.8204189816885677, "kl": 1.1546859741210938, "learning_rate": 4.548148148148148e-07, "loss": 0.0012, "reward": 1.7500000596046448, "reward_std": 0.07071067858487368, "rewards/equation_reward_func": 0.767857164144516, "rewards/format_reward_func": 0.9821428656578064, "step": 3684 }, { "completion_length": 250.92858219146729, "epoch": 0.6179638710759042, "grad_norm": 0.3160806438632435, "kl": 0.0600128173828125, "learning_rate": 4.550617283950617e-07, "loss": 0.0001, "reward": 1.7160715088248253, "reward_std": 0.0883883461356163, "rewards/equation_reward_func": 0.7294643092900515, "rewards/format_reward_func": 0.9866071492433548, "step": 3686 }, { "completion_length": 257.26786708831787, "epoch": 0.6182991743157719, "grad_norm": 0.36269127757830905, "kl": 0.04349517822265625, "learning_rate": 4.553086419753086e-07, "loss": 0.0, "reward": 1.7750000655651093, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 1.0, "step": 3688 }, { "completion_length": 253.61608219146729, "epoch": 0.6186344775556394, "grad_norm": 0.5216867037004947, "kl": 1.238555908203125, "learning_rate": 4.555555555555555e-07, "loss": 0.0012, "reward": 1.7321429252624512, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.741071468219161, "rewards/format_reward_func": 0.9910714328289032, "step": 3690 }, { "completion_length": 259.3839387893677, "epoch": 0.6189697807955069, "grad_norm": 0.8136101314294178, "kl": 3.5810470581054688, "learning_rate": 4.5580246913580246e-07, "loss": 0.0036, "reward": 1.7357143834233284, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7446428909897804, "rewards/format_reward_func": 0.9910714328289032, "step": 3692 }, { "completion_length": 260.73215675354004, "epoch": 0.6193050840353745, "grad_norm": 0.2764728454179333, "kl": 0.43869781494140625, "learning_rate": 4.5604938271604935e-07, "loss": 0.0004, "reward": 1.7339286506175995, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7383928913623095, "rewards/format_reward_func": 0.9955357164144516, "step": 3694 }, { "completion_length": 255.40625953674316, "epoch": 0.6196403872752421, "grad_norm": 0.27911070506951186, "kl": 0.2676239013671875, "learning_rate": 4.562962962962963e-07, "loss": 0.0003, "reward": 1.748214341700077, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7526786029338837, "rewards/format_reward_func": 0.9955357164144516, "step": 3696 }, { "completion_length": 243.78572368621826, "epoch": 0.6199756905151096, "grad_norm": 0.23063148507154832, "kl": 0.04782867431640625, "learning_rate": 4.5654320987654324e-07, "loss": 0.0, "reward": 1.7517857998609543, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7562500275671482, "rewards/format_reward_func": 0.9955357164144516, "step": 3698 }, { "completion_length": 249.63394260406494, "epoch": 0.6203109937549771, "grad_norm": 0.250609945184822, "kl": 0.130401611328125, "learning_rate": 4.5679012345679007e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7696428932249546, "rewards/format_reward_func": 0.9910714328289032, "step": 3700 }, { "completion_length": 252.46876430511475, "epoch": 0.6206462969948447, "grad_norm": 0.3030883228478924, "kl": 0.6184768676757812, "learning_rate": 4.57037037037037e-07, "loss": 0.0006, "reward": 1.7785714864730835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 1.0, "step": 3702 }, { "completion_length": 245.9821538925171, "epoch": 0.6209816002347123, "grad_norm": 0.22697660760153765, "kl": 0.03972625732421875, "learning_rate": 4.572839506172839e-07, "loss": 0.0, "reward": 1.7428572177886963, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571693599224, "rewards/format_reward_func": 1.0, "step": 3704 }, { "completion_length": 246.85268878936768, "epoch": 0.6213169034745798, "grad_norm": 0.692975794353227, "kl": 1.3213653564453125, "learning_rate": 4.5753086419753084e-07, "loss": 0.0013, "reward": 1.7714286148548126, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7803571857511997, "rewards/format_reward_func": 0.9910714328289032, "step": 3706 }, { "completion_length": 246.72768878936768, "epoch": 0.6216522067144474, "grad_norm": 0.2097287152304815, "kl": 0.0494232177734375, "learning_rate": 4.577777777777778e-07, "loss": 0.0, "reward": 1.7160715162754059, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7205357514321804, "rewards/format_reward_func": 0.9955357164144516, "step": 3708 }, { "completion_length": 252.20536708831787, "epoch": 0.621987509954315, "grad_norm": 0.3316795839811799, "kl": 0.046783447265625, "learning_rate": 4.5802469135802467e-07, "loss": 0.0, "reward": 1.6767858117818832, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.6812500283122063, "rewards/format_reward_func": 0.9955357164144516, "step": 3710 }, { "completion_length": 247.56251049041748, "epoch": 0.6223228131941825, "grad_norm": 0.1709766239519439, "kl": 0.240081787109375, "learning_rate": 4.582716049382716e-07, "loss": 0.0002, "reward": 1.7843750640749931, "reward_std": 0.02209708606824279, "rewards/equation_reward_func": 0.7857143059372902, "rewards/format_reward_func": 0.9986607171595097, "step": 3712 }, { "completion_length": 255.37947750091553, "epoch": 0.62265811643405, "grad_norm": 0.24275119545561563, "kl": 0.20494842529296875, "learning_rate": 4.5851851851851845e-07, "loss": 0.0002, "reward": 1.782142922282219, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7910714596509933, "rewards/format_reward_func": 0.9910714328289032, "step": 3714 }, { "completion_length": 257.3348331451416, "epoch": 0.6229934196739176, "grad_norm": 0.1281208930031389, "kl": 0.1625823974609375, "learning_rate": 4.587654320987654e-07, "loss": 0.0002, "reward": 1.7625000551342964, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669642996042967, "rewards/format_reward_func": 0.9955357164144516, "step": 3716 }, { "completion_length": 257.07590103149414, "epoch": 0.6233287229137852, "grad_norm": 0.30500522924730594, "kl": 0.0452117919921875, "learning_rate": 4.5901234567901233e-07, "loss": 0.0, "reward": 1.7750000655651093, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 3718 }, { "completion_length": 263.6384048461914, "epoch": 0.6236640261536527, "grad_norm": 0.21559576261696073, "kl": 0.29811859130859375, "learning_rate": 4.592592592592592e-07, "loss": 0.0003, "reward": 1.7428572177886963, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571805357933, "rewards/format_reward_func": 1.0, "step": 3720 }, { "completion_length": 250.7053689956665, "epoch": 0.6239993293935203, "grad_norm": 0.2677336786068505, "kl": 0.135406494140625, "learning_rate": 4.5950617283950616e-07, "loss": 0.0001, "reward": 1.7125000581145287, "reward_std": 0.07323605939745903, "rewards/equation_reward_func": 0.7258928865194321, "rewards/format_reward_func": 0.9866071492433548, "step": 3722 }, { "completion_length": 254.31697750091553, "epoch": 0.6243346326333878, "grad_norm": 0.20902682284429114, "kl": 0.03919219970703125, "learning_rate": 4.597530864197531e-07, "loss": 0.0, "reward": 1.7375000789761543, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7508928887546062, "rewards/format_reward_func": 0.9866071492433548, "step": 3724 }, { "completion_length": 257.21876335144043, "epoch": 0.6246699358732554, "grad_norm": 0.31779276105932713, "kl": 0.034637451171875, "learning_rate": 4.6e-07, "loss": 0.0, "reward": 1.701785795390606, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7062500342726707, "rewards/format_reward_func": 0.9955357164144516, "step": 3726 }, { "completion_length": 262.53572368621826, "epoch": 0.6250052391131229, "grad_norm": 0.2653994448875676, "kl": 0.065643310546875, "learning_rate": 4.6024691358024693e-07, "loss": 0.0001, "reward": 1.78035718947649, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7848214693367481, "rewards/format_reward_func": 0.9955357164144516, "step": 3728 }, { "completion_length": 254.70536994934082, "epoch": 0.6253405423529905, "grad_norm": 0.19647143960852814, "kl": 0.039215087890625, "learning_rate": 4.6049382716049377e-07, "loss": 0.0, "reward": 1.7392858117818832, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857410013676, "rewards/format_reward_func": 1.0, "step": 3730 }, { "completion_length": 252.8794755935669, "epoch": 0.625675845592858, "grad_norm": 0.2663136311118988, "kl": 0.04831695556640625, "learning_rate": 4.607407407407407e-07, "loss": 0.0, "reward": 1.791071467101574, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7955357320606709, "rewards/format_reward_func": 0.9955357164144516, "step": 3732 }, { "completion_length": 261.4866180419922, "epoch": 0.6260111488327256, "grad_norm": 0.5976991428641791, "kl": 0.09447479248046875, "learning_rate": 4.6098765432098765e-07, "loss": 0.0001, "reward": 1.6808036416769028, "reward_std": 0.05745242489501834, "rewards/equation_reward_func": 0.6910714711993933, "rewards/format_reward_func": 0.9897321499884129, "step": 3734 }, { "completion_length": 249.4062623977661, "epoch": 0.6263464520725931, "grad_norm": 0.20909877453812703, "kl": 0.03856658935546875, "learning_rate": 4.6123456790123454e-07, "loss": 0.0, "reward": 1.7517857551574707, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7562500406056643, "rewards/format_reward_func": 0.9955357164144516, "step": 3736 }, { "completion_length": 258.77679920196533, "epoch": 0.6266817553124607, "grad_norm": 0.20655282498276678, "kl": 0.04911041259765625, "learning_rate": 4.614814814814815e-07, "loss": 0.0, "reward": 1.7875000685453415, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7919643148779869, "rewards/format_reward_func": 0.9955357164144516, "step": 3738 }, { "completion_length": 247.31697463989258, "epoch": 0.6270170585523283, "grad_norm": 0.2317755787603404, "kl": 0.07225799560546875, "learning_rate": 4.6172839506172837e-07, "loss": 0.0001, "reward": 1.7767857685685158, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7812500223517418, "rewards/format_reward_func": 0.9955357164144516, "step": 3740 }, { "completion_length": 241.24554824829102, "epoch": 0.6273523617921958, "grad_norm": 0.24892404972568108, "kl": 0.21956634521484375, "learning_rate": 4.619753086419753e-07, "loss": 0.0002, "reward": 1.750000074505806, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7589286044239998, "rewards/format_reward_func": 0.9910714328289032, "step": 3742 }, { "completion_length": 250.0357265472412, "epoch": 0.6276876650320634, "grad_norm": 0.2655812732823818, "kl": 0.07500457763671875, "learning_rate": 4.622222222222222e-07, "loss": 0.0001, "reward": 1.7517857626080513, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7562500387430191, "rewards/format_reward_func": 0.9955357164144516, "step": 3744 }, { "completion_length": 255.49108505249023, "epoch": 0.6280229682719309, "grad_norm": 0.37752015835330077, "kl": 0.08672332763671875, "learning_rate": 4.624691358024691e-07, "loss": 0.0001, "reward": 1.7910714894533157, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7955357432365417, "rewards/format_reward_func": 0.9955357164144516, "step": 3746 }, { "completion_length": 242.4598331451416, "epoch": 0.6283582715117985, "grad_norm": 0.22297982995630966, "kl": 0.07720947265625, "learning_rate": 4.62716049382716e-07, "loss": 0.0001, "reward": 1.7196429297327995, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7241071574389935, "rewards/format_reward_func": 0.9955357164144516, "step": 3748 }, { "completion_length": 247.51340579986572, "epoch": 0.628693574751666, "grad_norm": 0.2722172443493719, "kl": 0.3848876953125, "learning_rate": 4.6296296296296297e-07, "loss": 0.0004, "reward": 1.7553572207689285, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7687500268220901, "rewards/format_reward_func": 0.9866071492433548, "step": 3750 }, { "completion_length": 244.4509048461914, "epoch": 0.6290288779915336, "grad_norm": 0.23104650402682889, "kl": 0.04361724853515625, "learning_rate": 4.6320987654320986e-07, "loss": 0.0, "reward": 1.769642911851406, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7741071693599224, "rewards/format_reward_func": 0.9955357164144516, "step": 3752 }, { "completion_length": 248.19197845458984, "epoch": 0.6293641812314011, "grad_norm": 0.2429922187286504, "kl": 0.41692352294921875, "learning_rate": 4.634567901234568e-07, "loss": 0.0004, "reward": 1.7375000566244125, "reward_std": 0.10859139543026686, "rewards/equation_reward_func": 0.7508928887546062, "rewards/format_reward_func": 0.9866071492433548, "step": 3754 }, { "completion_length": 243.48661994934082, "epoch": 0.6296994844712687, "grad_norm": 0.5067240377764414, "kl": 1.2078857421875, "learning_rate": 4.637037037037037e-07, "loss": 0.0012, "reward": 1.8008929044008255, "reward_std": 0.049244935158640146, "rewards/equation_reward_func": 0.8026785850524902, "rewards/format_reward_func": 0.9982142895460129, "step": 3756 }, { "completion_length": 254.66965770721436, "epoch": 0.6300347877111363, "grad_norm": 0.20975011767382296, "kl": 0.07564544677734375, "learning_rate": 4.639506172839506e-07, "loss": 0.0001, "reward": 1.7571429163217545, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.7660714611411095, "rewards/format_reward_func": 0.9910714328289032, "step": 3758 }, { "completion_length": 262.8035840988159, "epoch": 0.6303700909510038, "grad_norm": 0.9492404850582921, "kl": 3.7341232299804688, "learning_rate": 4.641975308641975e-07, "loss": 0.0037, "reward": 1.7589286118745804, "reward_std": 0.09343910962343216, "rewards/equation_reward_func": 0.7812500186264515, "rewards/format_reward_func": 0.977678582072258, "step": 3760 }, { "completion_length": 273.1250123977661, "epoch": 0.6307053941908713, "grad_norm": 0.4768543484406165, "kl": 0.6936798095703125, "learning_rate": 4.644444444444444e-07, "loss": 0.0007, "reward": 1.6446429416537285, "reward_std": 0.13889597728848457, "rewards/equation_reward_func": 0.6937500238418579, "rewards/format_reward_func": 0.9508928805589676, "step": 3762 }, { "completion_length": 253.3169765472412, "epoch": 0.6310406974307389, "grad_norm": 0.545748958630044, "kl": 0.9039535522460938, "learning_rate": 4.6469135802469134e-07, "loss": 0.0009, "reward": 1.6625000536441803, "reward_std": 0.09343910869210958, "rewards/equation_reward_func": 0.7026786003261805, "rewards/format_reward_func": 0.9598214477300644, "step": 3764 }, { "completion_length": 262.6830463409424, "epoch": 0.6313760006706065, "grad_norm": 0.39882267008820893, "kl": 0.4140472412109375, "learning_rate": 4.6493827160493823e-07, "loss": 0.0004, "reward": 1.708928644657135, "reward_std": 0.12879444938153028, "rewards/equation_reward_func": 0.7491071783006191, "rewards/format_reward_func": 0.9598214477300644, "step": 3766 }, { "completion_length": 275.0089416503906, "epoch": 0.631711303910474, "grad_norm": 0.4620834014351106, "kl": 0.78497314453125, "learning_rate": 4.651851851851852e-07, "loss": 0.0008, "reward": 1.6575893387198448, "reward_std": 0.20139916171319783, "rewards/equation_reward_func": 0.7080357428640127, "rewards/format_reward_func": 0.9495535977184772, "step": 3768 }, { "completion_length": 261.3437623977661, "epoch": 0.6320466071503416, "grad_norm": 0.40476944953139327, "kl": 2.376129150390625, "learning_rate": 4.654320987654321e-07, "loss": 0.0024, "reward": 1.6946428939700127, "reward_std": 0.10859139915555716, "rewards/equation_reward_func": 0.734821442514658, "rewards/format_reward_func": 0.9598214477300644, "step": 3770 }, { "completion_length": 248.98215579986572, "epoch": 0.6323819103902092, "grad_norm": 0.18001335211723607, "kl": 1.412933349609375, "learning_rate": 4.6567901234567895e-07, "loss": 0.0014, "reward": 1.7330357730388641, "reward_std": 0.08460027631372213, "rewards/equation_reward_func": 0.7526786029338837, "rewards/format_reward_func": 0.9803571552038193, "step": 3772 }, { "completion_length": 263.4330463409424, "epoch": 0.6327172136300767, "grad_norm": 0.3190957941370173, "kl": 0.8990097045898438, "learning_rate": 4.659259259259259e-07, "loss": 0.0009, "reward": 1.68392863124609, "reward_std": 0.0833375845104456, "rewards/equation_reward_func": 0.706250037997961, "rewards/format_reward_func": 0.977678582072258, "step": 3774 }, { "completion_length": 254.8794755935669, "epoch": 0.6330525168699442, "grad_norm": 0.31257213744887996, "kl": 0.89825439453125, "learning_rate": 4.6617283950617283e-07, "loss": 0.0009, "reward": 1.7276786416769028, "reward_std": 0.07197337062098086, "rewards/equation_reward_func": 0.7437500357627869, "rewards/format_reward_func": 0.9839285798370838, "step": 3776 }, { "completion_length": 258.2276916503906, "epoch": 0.6333878201098118, "grad_norm": 0.7055444014800766, "kl": 8.225692749023438, "learning_rate": 4.664197530864197e-07, "loss": 0.0082, "reward": 1.6964286342263222, "reward_std": 0.10606601648032665, "rewards/equation_reward_func": 0.7232143171131611, "rewards/format_reward_func": 0.9732142984867096, "step": 3778 }, { "completion_length": 265.3482275009155, "epoch": 0.6337231233496794, "grad_norm": 0.9790034779343154, "kl": 4.100105285644531, "learning_rate": 4.6666666666666666e-07, "loss": 0.0041, "reward": 1.657142959535122, "reward_std": 0.1616244027391076, "rewards/equation_reward_func": 0.6928571723401546, "rewards/format_reward_func": 0.9642857313156128, "step": 3780 }, { "completion_length": 256.03572845458984, "epoch": 0.6340584265895469, "grad_norm": 0.29187955878990074, "kl": 0.35442352294921875, "learning_rate": 4.6691358024691355e-07, "loss": 0.0004, "reward": 1.6660715118050575, "reward_std": 0.10859139915555716, "rewards/equation_reward_func": 0.6883928924798965, "rewards/format_reward_func": 0.977678582072258, "step": 3782 }, { "completion_length": 252.56697750091553, "epoch": 0.6343937298294144, "grad_norm": 0.41425872484802523, "kl": 0.2930908203125, "learning_rate": 4.671604938271605e-07, "loss": 0.0003, "reward": 1.7535714730620384, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7625000216066837, "rewards/format_reward_func": 0.9910714328289032, "step": 3784 }, { "completion_length": 244.29018878936768, "epoch": 0.6347290330692821, "grad_norm": 0.4734383263821688, "kl": 0.1266326904296875, "learning_rate": 4.674074074074074e-07, "loss": 0.0001, "reward": 1.6875000819563866, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.6919643208384514, "rewards/format_reward_func": 0.9955357164144516, "step": 3786 }, { "completion_length": 255.27679634094238, "epoch": 0.6350643363091496, "grad_norm": 0.5475088235706634, "kl": 0.07224273681640625, "learning_rate": 4.6765432098765427e-07, "loss": 0.0001, "reward": 1.657142959535122, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.6741071790456772, "rewards/format_reward_func": 0.9830357283353806, "step": 3788 }, { "completion_length": 241.01340293884277, "epoch": 0.6353996395490171, "grad_norm": 0.20196700802888495, "kl": 0.046722412109375, "learning_rate": 4.679012345679012e-07, "loss": 0.0, "reward": 1.7357143759727478, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 3790 }, { "completion_length": 260.00893783569336, "epoch": 0.6357349427888846, "grad_norm": 0.6002225994492422, "kl": 0.1472930908203125, "learning_rate": 4.681481481481481e-07, "loss": 0.0001, "reward": 1.7357143610715866, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7446428835391998, "rewards/format_reward_func": 0.9910714328289032, "step": 3792 }, { "completion_length": 253.6026906967163, "epoch": 0.6360702460287523, "grad_norm": 0.23931617298852403, "kl": 0.0593109130859375, "learning_rate": 4.6839506172839504e-07, "loss": 0.0001, "reward": 1.741071492433548, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.745535746216774, "rewards/format_reward_func": 0.9955357164144516, "step": 3794 }, { "completion_length": 250.92411708831787, "epoch": 0.6364055492686198, "grad_norm": 0.2321628612327699, "kl": 0.057647705078125, "learning_rate": 4.68641975308642e-07, "loss": 0.0001, "reward": 1.7214286401867867, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7303571663796902, "rewards/format_reward_func": 0.9910714328289032, "step": 3796 }, { "completion_length": 253.66965103149414, "epoch": 0.6367408525084873, "grad_norm": 0.34411481195861043, "kl": 0.0452880859375, "learning_rate": 4.6888888888888887e-07, "loss": 0.0, "reward": 1.7250000685453415, "reward_std": 0.08586296439170837, "rewards/equation_reward_func": 0.7339286170899868, "rewards/format_reward_func": 0.9910714328289032, "step": 3798 }, { "completion_length": 249.4017972946167, "epoch": 0.637076155748355, "grad_norm": 0.3156697298150423, "kl": 0.0444183349609375, "learning_rate": 4.6913580246913576e-07, "loss": 0.0, "reward": 1.8125000596046448, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.8169643133878708, "rewards/format_reward_func": 0.9955357164144516, "step": 3800 }, { "completion_length": 251.35268688201904, "epoch": 0.6374114589882225, "grad_norm": 0.39588853568629256, "kl": 0.0497589111328125, "learning_rate": 4.693827160493827e-07, "loss": 0.0, "reward": 1.7366072237491608, "reward_std": 0.06944798771291971, "rewards/equation_reward_func": 0.7473214641213417, "rewards/format_reward_func": 0.9892857223749161, "step": 3802 }, { "completion_length": 269.97769260406494, "epoch": 0.63774676222809, "grad_norm": 0.2850366769440957, "kl": 0.05326080322265625, "learning_rate": 4.696296296296296e-07, "loss": 0.0001, "reward": 1.692857250571251, "reward_std": 0.11111677810549736, "rewards/equation_reward_func": 0.7107143178582191, "rewards/format_reward_func": 0.9821428656578064, "step": 3804 }, { "completion_length": 244.63840293884277, "epoch": 0.6380820654679575, "grad_norm": 0.27088312156428124, "kl": 0.0478973388671875, "learning_rate": 4.6987654320987653e-07, "loss": 0.0, "reward": 1.7982143461704254, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.8026785925030708, "rewards/format_reward_func": 0.9955357164144516, "step": 3806 }, { "completion_length": 255.5223331451416, "epoch": 0.6384173687078252, "grad_norm": 0.15589935664162494, "kl": 0.05022430419921875, "learning_rate": 4.701234567901234e-07, "loss": 0.0001, "reward": 1.6910715103149414, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.6955357454717159, "rewards/format_reward_func": 0.9955357164144516, "step": 3808 }, { "completion_length": 260.58483123779297, "epoch": 0.6387526719476927, "grad_norm": 0.46828900118421973, "kl": 0.0674285888671875, "learning_rate": 4.7037037037037036e-07, "loss": 0.0001, "reward": 1.7767857760190964, "reward_std": 0.07323605939745903, "rewards/equation_reward_func": 0.7991071566939354, "rewards/format_reward_func": 0.977678582072258, "step": 3810 }, { "completion_length": 243.7053689956665, "epoch": 0.6390879751875602, "grad_norm": 0.1797720332442718, "kl": 0.05417633056640625, "learning_rate": 4.706172839506173e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 3812 }, { "completion_length": 252.46429634094238, "epoch": 0.6394232784274279, "grad_norm": 1.0245365579568664, "kl": 0.049041748046875, "learning_rate": 4.7086419753086414e-07, "loss": 0.0, "reward": 1.7839286252856255, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7883928939700127, "rewards/format_reward_func": 0.9955357164144516, "step": 3814 }, { "completion_length": 248.0803680419922, "epoch": 0.6397585816672954, "grad_norm": 0.1690612877408516, "kl": 0.04561614990234375, "learning_rate": 4.711111111111111e-07, "loss": 0.0, "reward": 1.8089286237955093, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.813392885029316, "rewards/format_reward_func": 0.9955357164144516, "step": 3816 }, { "completion_length": 242.28125953674316, "epoch": 0.6400938849071629, "grad_norm": 0.1799869429711767, "kl": 0.04736328125, "learning_rate": 4.7135802469135797e-07, "loss": 0.0, "reward": 1.8000000566244125, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.8089286014437675, "rewards/format_reward_func": 0.9910714328289032, "step": 3818 }, { "completion_length": 247.89286994934082, "epoch": 0.6404291881470304, "grad_norm": 0.2382277237406641, "kl": 0.04730224609375, "learning_rate": 4.716049382716049e-07, "loss": 0.0, "reward": 1.7089286744594574, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7133928835391998, "rewards/format_reward_func": 0.9955357164144516, "step": 3820 }, { "completion_length": 253.94643878936768, "epoch": 0.6407644913868981, "grad_norm": 0.28596620542102036, "kl": 0.0505523681640625, "learning_rate": 4.7185185185185185e-07, "loss": 0.0001, "reward": 1.7571429163217545, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7660714574158192, "rewards/format_reward_func": 0.9910714328289032, "step": 3822 }, { "completion_length": 247.9732255935669, "epoch": 0.6410997946267656, "grad_norm": 0.11720032512746174, "kl": 0.05326080322265625, "learning_rate": 4.7209876543209874e-07, "loss": 0.0001, "reward": 1.744642935693264, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7580357398837805, "rewards/format_reward_func": 0.9866071492433548, "step": 3824 }, { "completion_length": 263.95090675354004, "epoch": 0.6414350978666331, "grad_norm": 0.17071112444417788, "kl": 0.06207275390625, "learning_rate": 4.723456790123457e-07, "loss": 0.0001, "reward": 1.7303572073578835, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7437500283122063, "rewards/format_reward_func": 0.9866071492433548, "step": 3826 }, { "completion_length": 259.48661613464355, "epoch": 0.6417704011065007, "grad_norm": 0.2601322695367908, "kl": 0.06282806396484375, "learning_rate": 4.725925925925926e-07, "loss": 0.0001, "reward": 1.7633929252624512, "reward_std": 0.10227794293314219, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9750000089406967, "step": 3828 }, { "completion_length": 258.39733600616455, "epoch": 0.6421057043463683, "grad_norm": 0.22779481358853262, "kl": 0.04955291748046875, "learning_rate": 4.7283950617283945e-07, "loss": 0.0, "reward": 1.7535714879631996, "reward_std": 0.07576144207268953, "rewards/equation_reward_func": 0.771428594365716, "rewards/format_reward_func": 0.9821428656578064, "step": 3830 }, { "completion_length": 250.21876430511475, "epoch": 0.6424410075862358, "grad_norm": 0.43527318397332027, "kl": 0.05060577392578125, "learning_rate": 4.730864197530864e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 0.9821428656578064, "step": 3832 }, { "completion_length": 262.4598340988159, "epoch": 0.6427763108261033, "grad_norm": 0.21911360098756538, "kl": 0.06902313232421875, "learning_rate": 4.733333333333333e-07, "loss": 0.0001, "reward": 1.6392858177423477, "reward_std": 0.09596449136734009, "rewards/equation_reward_func": 0.6660714633762836, "rewards/format_reward_func": 0.9732142984867096, "step": 3834 }, { "completion_length": 273.68304920196533, "epoch": 0.643111614065971, "grad_norm": 1.615608755048004, "kl": 0.154632568359375, "learning_rate": 4.735802469135802e-07, "loss": 0.0002, "reward": 1.6790179163217545, "reward_std": 0.12058696104213595, "rewards/equation_reward_func": 0.7294643055647612, "rewards/format_reward_func": 0.949553593993187, "step": 3836 }, { "completion_length": 264.53572273254395, "epoch": 0.6434469173058385, "grad_norm": 0.2778056266442193, "kl": 0.07929229736328125, "learning_rate": 4.7382716049382717e-07, "loss": 0.0001, "reward": 1.680357240140438, "reward_std": 0.13384521193802357, "rewards/equation_reward_func": 0.7294643186032772, "rewards/format_reward_func": 0.9508928768336773, "step": 3838 }, { "completion_length": 267.4062623977661, "epoch": 0.643782220545706, "grad_norm": 0.18156955523379578, "kl": 0.06781005859375, "learning_rate": 4.7407407407407405e-07, "loss": 0.0001, "reward": 1.7428572103381157, "reward_std": 0.08586296625435352, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 0.9642857275903225, "step": 3840 }, { "completion_length": 260.8839406967163, "epoch": 0.6441175237855736, "grad_norm": 0.31977780335247097, "kl": 0.153656005859375, "learning_rate": 4.74320987654321e-07, "loss": 0.0002, "reward": 1.7285714894533157, "reward_std": 0.07071068044751883, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 0.9821428656578064, "step": 3842 }, { "completion_length": 261.85269355773926, "epoch": 0.6444528270254412, "grad_norm": 0.3417284451794752, "kl": 0.09317779541015625, "learning_rate": 4.745679012345679e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7678571790456772, "rewards/format_reward_func": 0.9821428656578064, "step": 3844 }, { "completion_length": 246.83036994934082, "epoch": 0.6447881302653087, "grad_norm": 0.17477561853506282, "kl": 0.06887054443359375, "learning_rate": 4.7481481481481477e-07, "loss": 0.0001, "reward": 1.7625000327825546, "reward_std": 0.0833375845104456, "rewards/equation_reward_func": 0.7758928872644901, "rewards/format_reward_func": 0.9866071492433548, "step": 3846 }, { "completion_length": 255.99554634094238, "epoch": 0.6451234335051762, "grad_norm": 0.44253711066314283, "kl": 0.1705474853515625, "learning_rate": 4.750617283950617e-07, "loss": 0.0002, "reward": 1.7821429073810577, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428794413805, "rewards/format_reward_func": 1.0, "step": 3848 }, { "completion_length": 248.18750953674316, "epoch": 0.6454587367450438, "grad_norm": 0.2387987965129388, "kl": 0.067352294921875, "learning_rate": 4.753086419753086e-07, "loss": 0.0001, "reward": 1.7642858028411865, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7732143215835094, "rewards/format_reward_func": 0.9910714328289032, "step": 3850 }, { "completion_length": 251.83483409881592, "epoch": 0.6457940399849114, "grad_norm": 0.18727904381625438, "kl": 0.0418701171875, "learning_rate": 4.7555555555555554e-07, "loss": 0.0, "reward": 1.7571429312229156, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7660714574158192, "rewards/format_reward_func": 0.9910714328289032, "step": 3852 }, { "completion_length": 253.39733219146729, "epoch": 0.6461293432247789, "grad_norm": 0.18840059404995516, "kl": 0.23563385009765625, "learning_rate": 4.758024691358025e-07, "loss": 0.0002, "reward": 1.7321429252624512, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428973227739, "rewards/format_reward_func": 1.0, "step": 3854 }, { "completion_length": 259.3571557998657, "epoch": 0.6464646464646465, "grad_norm": 0.24891815306132903, "kl": 0.1553192138671875, "learning_rate": 4.7604938271604937e-07, "loss": 0.0002, "reward": 1.7517857551574707, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7562500536441803, "rewards/format_reward_func": 0.9955357164144516, "step": 3856 }, { "completion_length": 249.96875953674316, "epoch": 0.646799949704514, "grad_norm": 0.1939251343094505, "kl": 0.10715484619140625, "learning_rate": 4.7629629629629626e-07, "loss": 0.0001, "reward": 1.7321429327130318, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.73214291036129, "rewards/format_reward_func": 1.0, "step": 3858 }, { "completion_length": 264.3616189956665, "epoch": 0.6471352529443816, "grad_norm": 0.31045812888398727, "kl": 0.240264892578125, "learning_rate": 4.7654320987654315e-07, "loss": 0.0002, "reward": 1.7089286521077156, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7133929040282965, "rewards/format_reward_func": 0.9955357164144516, "step": 3860 }, { "completion_length": 248.1741189956665, "epoch": 0.6474705561842491, "grad_norm": 0.29967271897328407, "kl": 0.142913818359375, "learning_rate": 4.767901234567901e-07, "loss": 0.0001, "reward": 1.7553572058677673, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.759821455925703, "rewards/format_reward_func": 0.9955357164144516, "step": 3862 }, { "completion_length": 244.29465293884277, "epoch": 0.6478058594241167, "grad_norm": 0.34126496872835815, "kl": 0.0392608642578125, "learning_rate": 4.77037037037037e-07, "loss": 0.0, "reward": 1.7178572192788124, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7178571820259094, "rewards/format_reward_func": 1.0, "step": 3864 }, { "completion_length": 253.07144165039062, "epoch": 0.6481411626639842, "grad_norm": 5.56472514014859, "kl": 5.6567535400390625, "learning_rate": 4.772839506172839e-07, "loss": 0.0057, "reward": 1.775000050663948, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.783928606659174, "rewards/format_reward_func": 0.9910714328289032, "step": 3866 }, { "completion_length": 260.7812614440918, "epoch": 0.6484764659038518, "grad_norm": 0.2560663927017658, "kl": 0.22957611083984375, "learning_rate": 4.775308641975309e-07, "loss": 0.0002, "reward": 1.7428571954369545, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7517857402563095, "rewards/format_reward_func": 0.9910714328289032, "step": 3868 }, { "completion_length": 254.96876335144043, "epoch": 0.6488117691437193, "grad_norm": 0.21644056151909313, "kl": 0.15191650390625, "learning_rate": 4.777777777777778e-07, "loss": 0.0002, "reward": 1.764285795390606, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857357859612, "rewards/format_reward_func": 1.0, "step": 3870 }, { "completion_length": 254.66072463989258, "epoch": 0.6491470723835869, "grad_norm": 0.346480276669036, "kl": 0.04920196533203125, "learning_rate": 4.780246913580246e-07, "loss": 0.0, "reward": 1.748214341700077, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526786029338837, "rewards/format_reward_func": 0.9955357164144516, "step": 3872 }, { "completion_length": 267.8482265472412, "epoch": 0.6494823756234545, "grad_norm": 0.29899998204476824, "kl": 0.03923797607421875, "learning_rate": 4.782716049382716e-07, "loss": 0.0, "reward": 1.7321429550647736, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7321428917348385, "rewards/format_reward_func": 1.0, "step": 3874 }, { "completion_length": 258.6026906967163, "epoch": 0.649817678863322, "grad_norm": 0.08752132121753611, "kl": 0.0488739013671875, "learning_rate": 4.785185185185185e-07, "loss": 0.0, "reward": 1.7267858013510704, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7312500216066837, "rewards/format_reward_func": 0.9955357164144516, "step": 3876 }, { "completion_length": 261.4687662124634, "epoch": 0.6501529821031896, "grad_norm": 0.32540356794934916, "kl": 0.0558624267578125, "learning_rate": 4.787654320987654e-07, "loss": 0.0001, "reward": 1.7428572326898575, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7517857421189547, "rewards/format_reward_func": 0.9910714328289032, "step": 3878 }, { "completion_length": 239.008939743042, "epoch": 0.6504882853430571, "grad_norm": 0.23190866512432148, "kl": 0.072418212890625, "learning_rate": 4.790123456790123e-07, "loss": 0.0001, "reward": 1.7821429297327995, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 3880 }, { "completion_length": 254.6875123977661, "epoch": 0.6508235885829247, "grad_norm": 0.6270072702489933, "kl": 0.17505645751953125, "learning_rate": 4.792592592592592e-07, "loss": 0.0002, "reward": 1.726785808801651, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.731250025331974, "rewards/format_reward_func": 0.9955357164144516, "step": 3882 }, { "completion_length": 260.0134048461914, "epoch": 0.6511588918227922, "grad_norm": 0.23875093940047093, "kl": 0.08642578125, "learning_rate": 4.795061728395062e-07, "loss": 0.0001, "reward": 1.7464286535978317, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7553571797907352, "rewards/format_reward_func": 0.9910714328289032, "step": 3884 }, { "completion_length": 245.8928689956665, "epoch": 0.6514941950626598, "grad_norm": 0.15408522982548276, "kl": 0.04544830322265625, "learning_rate": 4.79753086419753e-07, "loss": 0.0, "reward": 1.7160715237259865, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7205357477068901, "rewards/format_reward_func": 0.9955357164144516, "step": 3886 }, { "completion_length": 260.94644355773926, "epoch": 0.6518294983025273, "grad_norm": 0.1560889557012829, "kl": 0.0451812744140625, "learning_rate": 4.8e-07, "loss": 0.0, "reward": 1.7142857760190964, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7142857443541288, "rewards/format_reward_func": 1.0, "step": 3888 }, { "completion_length": 250.03125953674316, "epoch": 0.6521648015423949, "grad_norm": 0.1550896631696827, "kl": 0.04175567626953125, "learning_rate": 4.802469135802469e-07, "loss": 0.0, "reward": 1.74821437895298, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7526785992085934, "rewards/format_reward_func": 0.9955357164144516, "step": 3890 }, { "completion_length": 244.3660831451416, "epoch": 0.6525001047822625, "grad_norm": 0.3224570837771038, "kl": 0.05951690673828125, "learning_rate": 4.804938271604938e-07, "loss": 0.0001, "reward": 1.7821429073810577, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7821428999304771, "rewards/format_reward_func": 1.0, "step": 3892 }, { "completion_length": 233.0625114440918, "epoch": 0.65283540802213, "grad_norm": 0.23181470359460926, "kl": 0.04352569580078125, "learning_rate": 4.807407407407407e-07, "loss": 0.0, "reward": 1.7535714879631996, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714488476515, "rewards/format_reward_func": 1.0, "step": 3894 }, { "completion_length": 247.50447273254395, "epoch": 0.6531707112619975, "grad_norm": 0.4284565414178115, "kl": 0.0639801025390625, "learning_rate": 4.809876543209876e-07, "loss": 0.0001, "reward": 1.778571479022503, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 3896 }, { "completion_length": 254.9419755935669, "epoch": 0.6535060145018651, "grad_norm": 0.2229631953694362, "kl": 0.09334564208984375, "learning_rate": 4.812345679012346e-07, "loss": 0.0001, "reward": 1.7285714969038963, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.728571455925703, "rewards/format_reward_func": 1.0, "step": 3898 }, { "completion_length": 244.6071548461914, "epoch": 0.6538413177417327, "grad_norm": 0.1938204139192448, "kl": 0.06211090087890625, "learning_rate": 4.814814814814814e-07, "loss": 0.0001, "reward": 1.733928643167019, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7383928801864386, "rewards/format_reward_func": 0.9955357164144516, "step": 3900 }, { "completion_length": 257.8392963409424, "epoch": 0.6541766209816002, "grad_norm": 0.21032308816890635, "kl": 0.29449462890625, "learning_rate": 4.817283950617283e-07, "loss": 0.0003, "reward": 1.7571429088711739, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.757142897695303, "rewards/format_reward_func": 1.0, "step": 3902 }, { "completion_length": 247.4509038925171, "epoch": 0.6545119242214678, "grad_norm": 0.3046733976307481, "kl": 0.14829254150390625, "learning_rate": 4.819753086419753e-07, "loss": 0.0001, "reward": 1.8089286163449287, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8133928775787354, "rewards/format_reward_func": 0.9955357164144516, "step": 3904 }, { "completion_length": 246.89733695983887, "epoch": 0.6548472274613354, "grad_norm": 0.39941927413480616, "kl": 0.2657928466796875, "learning_rate": 4.822222222222222e-07, "loss": 0.0003, "reward": 1.7285714969038963, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7285714671015739, "rewards/format_reward_func": 1.0, "step": 3906 }, { "completion_length": 251.4241180419922, "epoch": 0.6551825307012029, "grad_norm": 0.16084090620029584, "kl": 0.05809783935546875, "learning_rate": 4.824691358024692e-07, "loss": 0.0001, "reward": 1.755357213318348, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7598214522004128, "rewards/format_reward_func": 0.9955357164144516, "step": 3908 }, { "completion_length": 253.85269165039062, "epoch": 0.6555178339410704, "grad_norm": 0.19861177988293185, "kl": 0.0594329833984375, "learning_rate": 4.82716049382716e-07, "loss": 0.0001, "reward": 1.7642858028411865, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 3910 }, { "completion_length": 246.08929634094238, "epoch": 0.655853137180938, "grad_norm": 0.24120335029514853, "kl": 0.047882080078125, "learning_rate": 4.829629629629629e-07, "loss": 0.0, "reward": 1.707142949104309, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7071428913623095, "rewards/format_reward_func": 1.0, "step": 3912 }, { "completion_length": 245.1428689956665, "epoch": 0.6561884404208056, "grad_norm": 0.30892198013938815, "kl": 0.05008697509765625, "learning_rate": 4.832098765432099e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 3914 }, { "completion_length": 256.74554920196533, "epoch": 0.6565237436606731, "grad_norm": 0.2089904775804934, "kl": 0.05450439453125, "learning_rate": 4.834567901234567e-07, "loss": 0.0001, "reward": 1.7750000432133675, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7839286141097546, "rewards/format_reward_func": 0.9910714328289032, "step": 3916 }, { "completion_length": 250.4196548461914, "epoch": 0.6568590469005406, "grad_norm": 0.26592058060680673, "kl": 0.0504913330078125, "learning_rate": 4.837037037037037e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7607143074274063, "rewards/format_reward_func": 1.0, "step": 3918 }, { "completion_length": 249.6785831451416, "epoch": 0.6571943501404083, "grad_norm": 0.040893189818078074, "kl": 0.04575347900390625, "learning_rate": 4.839506172839506e-07, "loss": 0.0, "reward": 1.782142922282219, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7910714522004128, "rewards/format_reward_func": 0.9910714328289032, "step": 3920 }, { "completion_length": 259.4509057998657, "epoch": 0.6575296533802758, "grad_norm": 0.13055749625194799, "kl": 0.18926239013671875, "learning_rate": 4.841975308641975e-07, "loss": 0.0002, "reward": 1.7000000700354576, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7089285999536514, "rewards/format_reward_func": 0.9910714328289032, "step": 3922 }, { "completion_length": 257.4285821914673, "epoch": 0.6578649566201433, "grad_norm": 0.2842193792194698, "kl": 0.5179595947265625, "learning_rate": 4.844444444444445e-07, "loss": 0.0005, "reward": 1.7035714983940125, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.712500037625432, "rewards/format_reward_func": 0.9910714328289032, "step": 3924 }, { "completion_length": 257.8616199493408, "epoch": 0.6582002598600109, "grad_norm": 0.21372307018495712, "kl": 0.04400634765625, "learning_rate": 4.846913580246913e-07, "loss": 0.0, "reward": 1.6982143595814705, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.702678607776761, "rewards/format_reward_func": 0.9955357164144516, "step": 3926 }, { "completion_length": 250.1741189956665, "epoch": 0.6585355630998785, "grad_norm": 0.2179283235343074, "kl": 0.0731353759765625, "learning_rate": 4.849382716049383e-07, "loss": 0.0001, "reward": 1.778571479022503, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 1.0, "step": 3928 }, { "completion_length": 255.47769165039062, "epoch": 0.658870866339746, "grad_norm": 0.20049233410239328, "kl": 0.0641937255859375, "learning_rate": 4.851851851851852e-07, "loss": 0.0001, "reward": 1.7535714879631996, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 3930 }, { "completion_length": 248.6294755935669, "epoch": 0.6592061695796135, "grad_norm": 0.41126545761337224, "kl": 0.062530517578125, "learning_rate": 4.85432098765432e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7589285969734192, "rewards/format_reward_func": 0.9910714328289032, "step": 3932 }, { "completion_length": 243.2500114440918, "epoch": 0.6595414728194812, "grad_norm": 0.3831697730735232, "kl": 0.14061737060546875, "learning_rate": 4.85679012345679e-07, "loss": 0.0001, "reward": 1.7517857998609543, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500350177288, "rewards/format_reward_func": 0.9955357164144516, "step": 3934 }, { "completion_length": 244.65625953674316, "epoch": 0.6598767760593487, "grad_norm": 0.19433989843152025, "kl": 0.05853271484375, "learning_rate": 4.859259259259259e-07, "loss": 0.0001, "reward": 1.723214365541935, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7276786118745804, "rewards/format_reward_func": 0.9955357164144516, "step": 3936 }, { "completion_length": 247.46429824829102, "epoch": 0.6602120792992162, "grad_norm": 0.27797701510862327, "kl": 0.07159423828125, "learning_rate": 4.861728395061729e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 3938 }, { "completion_length": 245.87054824829102, "epoch": 0.6605473825390837, "grad_norm": 0.3248319516246433, "kl": 0.05426025390625, "learning_rate": 4.864197530864198e-07, "loss": 0.0001, "reward": 1.7000000923871994, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7000000365078449, "rewards/format_reward_func": 1.0, "step": 3940 }, { "completion_length": 256.95536613464355, "epoch": 0.6608826857789514, "grad_norm": 0.21364917257066787, "kl": 0.9769058227539062, "learning_rate": 4.866666666666666e-07, "loss": 0.001, "reward": 1.7607143446803093, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143260538578, "rewards/format_reward_func": 1.0, "step": 3942 }, { "completion_length": 242.3303680419922, "epoch": 0.6612179890188189, "grad_norm": 0.6023822317409888, "kl": 1.2734832763671875, "learning_rate": 4.869135802469136e-07, "loss": 0.0013, "reward": 1.7776786386966705, "reward_std": 0.08207489224150777, "rewards/equation_reward_func": 0.7839285992085934, "rewards/format_reward_func": 0.9937500059604645, "step": 3944 }, { "completion_length": 247.9821538925171, "epoch": 0.6615532922586864, "grad_norm": 0.21272371324088035, "kl": 0.08065032958984375, "learning_rate": 4.871604938271604e-07, "loss": 0.0001, "reward": 1.8071429133415222, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8071428872644901, "rewards/format_reward_func": 1.0, "step": 3946 }, { "completion_length": 255.55804824829102, "epoch": 0.661888595498554, "grad_norm": 0.2834225170622278, "kl": 0.0803070068359375, "learning_rate": 4.874074074074073e-07, "loss": 0.0001, "reward": 1.7767857760190964, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7812500335276127, "rewards/format_reward_func": 0.9955357164144516, "step": 3948 }, { "completion_length": 249.44197845458984, "epoch": 0.6622238987384216, "grad_norm": 0.16218911032112202, "kl": 0.06241607666015625, "learning_rate": 4.876543209876543e-07, "loss": 0.0001, "reward": 1.814285770058632, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857439815998, "rewards/format_reward_func": 1.0, "step": 3950 }, { "completion_length": 241.6071548461914, "epoch": 0.6625592019782891, "grad_norm": 0.24746330003861983, "kl": 0.06597900390625, "learning_rate": 4.879012345679012e-07, "loss": 0.0001, "reward": 1.8058036267757416, "reward_std": 0.03219861118122935, "rewards/equation_reward_func": 0.8071428760886192, "rewards/format_reward_func": 0.9986607171595097, "step": 3952 }, { "completion_length": 256.14733505249023, "epoch": 0.6628945052181566, "grad_norm": 0.26130082812710603, "kl": 0.0624542236328125, "learning_rate": 4.881481481481482e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 3954 }, { "completion_length": 238.16519165039062, "epoch": 0.6632298084580243, "grad_norm": 0.5290773175802812, "kl": 0.268890380859375, "learning_rate": 4.883950617283951e-07, "loss": 0.0003, "reward": 1.7696429267525673, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7741071693599224, "rewards/format_reward_func": 0.9955357164144516, "step": 3956 }, { "completion_length": 247.38840675354004, "epoch": 0.6635651116978918, "grad_norm": 0.18398423923323337, "kl": 0.072296142578125, "learning_rate": 4.88641975308642e-07, "loss": 0.0001, "reward": 1.7178572192788124, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7178571745753288, "rewards/format_reward_func": 1.0, "step": 3958 }, { "completion_length": 238.4776906967163, "epoch": 0.6639004149377593, "grad_norm": 0.9794345870204398, "kl": 0.08960723876953125, "learning_rate": 4.888888888888889e-07, "loss": 0.0001, "reward": 1.742857240140438, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7428571656346321, "rewards/format_reward_func": 1.0, "step": 3960 }, { "completion_length": 239.95983409881592, "epoch": 0.6642357181776269, "grad_norm": 0.27360081177453655, "kl": 0.092071533203125, "learning_rate": 4.891358024691357e-07, "loss": 0.0001, "reward": 1.7964286357164383, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 3962 }, { "completion_length": 251.196439743042, "epoch": 0.6645710214174945, "grad_norm": 0.31778832714402766, "kl": 0.05353546142578125, "learning_rate": 4.893827160493827e-07, "loss": 0.0001, "reward": 1.7321429327130318, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7321428880095482, "rewards/format_reward_func": 1.0, "step": 3964 }, { "completion_length": 242.4687623977661, "epoch": 0.664906324657362, "grad_norm": 0.17589893133387763, "kl": 0.10141754150390625, "learning_rate": 4.896296296296296e-07, "loss": 0.0001, "reward": 1.7714286521077156, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 3966 }, { "completion_length": 233.05357933044434, "epoch": 0.6652416278972295, "grad_norm": 0.21849168483193168, "kl": 0.052154541015625, "learning_rate": 4.898765432098765e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571660071611, "rewards/format_reward_func": 1.0, "step": 3968 }, { "completion_length": 245.81697273254395, "epoch": 0.6655769311370971, "grad_norm": 0.17133072243592265, "kl": 0.060943603515625, "learning_rate": 4.901234567901235e-07, "loss": 0.0001, "reward": 1.801785759627819, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8062500283122063, "rewards/format_reward_func": 0.9955357164144516, "step": 3970 }, { "completion_length": 230.18304920196533, "epoch": 0.6659122343769647, "grad_norm": 0.28829507057273934, "kl": 0.063507080078125, "learning_rate": 4.903703703703703e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 3972 }, { "completion_length": 245.37500953674316, "epoch": 0.6662475376168322, "grad_norm": 0.38017604329471877, "kl": 0.047882080078125, "learning_rate": 4.906172839506173e-07, "loss": 0.0, "reward": 1.773214340209961, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7776786051690578, "rewards/format_reward_func": 0.9955357164144516, "step": 3974 }, { "completion_length": 243.52233123779297, "epoch": 0.6665828408566998, "grad_norm": 0.15516051295757663, "kl": 0.05037689208984375, "learning_rate": 4.908641975308642e-07, "loss": 0.0001, "reward": 1.716071531176567, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7205357439815998, "rewards/format_reward_func": 0.9955357164144516, "step": 3976 }, { "completion_length": 238.6875123977661, "epoch": 0.6669181440965674, "grad_norm": 0.24182324189802537, "kl": 0.04228973388671875, "learning_rate": 4.91111111111111e-07, "loss": 0.0, "reward": 1.7285714820027351, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7375000454485416, "rewards/format_reward_func": 0.9910714328289032, "step": 3978 }, { "completion_length": 234.69643688201904, "epoch": 0.6672534473364349, "grad_norm": 0.2698467568318753, "kl": 0.0405731201171875, "learning_rate": 4.91358024691358e-07, "loss": 0.0, "reward": 1.7500000819563866, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 3980 }, { "completion_length": 236.25893783569336, "epoch": 0.6675887505763024, "grad_norm": 0.23898061740767884, "kl": 0.0444488525390625, "learning_rate": 4.916049382716049e-07, "loss": 0.0, "reward": 1.7750000581145287, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 3982 }, { "completion_length": 246.40626049041748, "epoch": 0.66792405381617, "grad_norm": 0.20696305255706168, "kl": 0.05846405029296875, "learning_rate": 4.918518518518519e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7321428824216127, "rewards/format_reward_func": 1.0, "step": 3984 }, { "completion_length": 248.3214406967163, "epoch": 0.6682593570560376, "grad_norm": 0.21276370391828747, "kl": 0.04009246826171875, "learning_rate": 4.920987654320987e-07, "loss": 0.0, "reward": 1.8107143342494965, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8107143118977547, "rewards/format_reward_func": 1.0, "step": 3986 }, { "completion_length": 254.3259038925171, "epoch": 0.6685946602959051, "grad_norm": 0.10395895819260763, "kl": 0.0443267822265625, "learning_rate": 4.923456790123456e-07, "loss": 0.0, "reward": 1.68571437895298, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.6857143174856901, "rewards/format_reward_func": 1.0, "step": 3988 }, { "completion_length": 233.83929824829102, "epoch": 0.6689299635357726, "grad_norm": 0.3865026619681927, "kl": 0.055450439453125, "learning_rate": 4.925925925925926e-07, "loss": 0.0001, "reward": 1.8285714983940125, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.8375000171363354, "rewards/format_reward_func": 0.9910714328289032, "step": 3990 }, { "completion_length": 247.30358219146729, "epoch": 0.6692652667756402, "grad_norm": 0.1731516426047963, "kl": 0.0422210693359375, "learning_rate": 4.928395061728395e-07, "loss": 0.0, "reward": 1.8142857626080513, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857514321804, "rewards/format_reward_func": 1.0, "step": 3992 }, { "completion_length": 242.26340579986572, "epoch": 0.6696005700155078, "grad_norm": 0.3130980713610786, "kl": 0.0402679443359375, "learning_rate": 4.930864197530864e-07, "loss": 0.0, "reward": 1.7785715013742447, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 3994 }, { "completion_length": 240.4776906967163, "epoch": 0.6699358732553753, "grad_norm": 0.18480171777833662, "kl": 0.04180145263671875, "learning_rate": 4.933333333333333e-07, "loss": 0.0, "reward": 1.7464286386966705, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 3996 }, { "completion_length": 239.3214406967163, "epoch": 0.6702711764952429, "grad_norm": 0.2287792806277298, "kl": 0.05632781982421875, "learning_rate": 4.935802469135802e-07, "loss": 0.0001, "reward": 1.8071428909897804, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8071428872644901, "rewards/format_reward_func": 1.0, "step": 3998 }, { "completion_length": 233.94643783569336, "epoch": 0.6706064797351105, "grad_norm": 0.05918938448798421, "kl": 0.038482666015625, "learning_rate": 4.938271604938271e-07, "loss": 0.0, "reward": 1.7642857730388641, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7732143104076385, "rewards/format_reward_func": 0.9910714328289032, "step": 4000 }, { "completion_length": 253.99554920196533, "epoch": 0.670941782974978, "grad_norm": 0.1558838101555109, "kl": 0.039703369140625, "learning_rate": 4.94074074074074e-07, "loss": 0.0, "reward": 1.730357214808464, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7348214518278837, "rewards/format_reward_func": 0.9955357164144516, "step": 4002 }, { "completion_length": 237.4062623977661, "epoch": 0.6712770862148455, "grad_norm": 0.23672466601064102, "kl": 0.03436279296875, "learning_rate": 4.94320987654321e-07, "loss": 0.0, "reward": 1.755357213318348, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214596509933, "rewards/format_reward_func": 0.9955357164144516, "step": 4004 }, { "completion_length": 235.45983028411865, "epoch": 0.6716123894547131, "grad_norm": 0.23630688175269002, "kl": 0.04888153076171875, "learning_rate": 4.945679012345679e-07, "loss": 0.0, "reward": 1.7625000774860382, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7669643275439739, "rewards/format_reward_func": 0.9955357164144516, "step": 4006 }, { "completion_length": 234.4107255935669, "epoch": 0.6719476926945807, "grad_norm": 0.1509615780935135, "kl": 0.0428009033203125, "learning_rate": 4.948148148148148e-07, "loss": 0.0, "reward": 1.7142857909202576, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7142857424914837, "rewards/format_reward_func": 1.0, "step": 4008 }, { "completion_length": 248.5669755935669, "epoch": 0.6722829959344482, "grad_norm": 0.24903427922550173, "kl": 0.03661346435546875, "learning_rate": 4.950617283950617e-07, "loss": 0.0, "reward": 1.7750000804662704, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7839285917580128, "rewards/format_reward_func": 0.9910714328289032, "step": 4010 }, { "completion_length": 239.47322463989258, "epoch": 0.6726182991743158, "grad_norm": 0.3243591425526177, "kl": 0.0417327880859375, "learning_rate": 4.953086419753086e-07, "loss": 0.0, "reward": 1.7803572118282318, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7848214469850063, "rewards/format_reward_func": 0.9955357164144516, "step": 4012 }, { "completion_length": 245.58483028411865, "epoch": 0.6729536024141833, "grad_norm": 0.19513335100841234, "kl": 0.055511474609375, "learning_rate": 4.955555555555556e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 4014 }, { "completion_length": 232.09822273254395, "epoch": 0.6732889056540509, "grad_norm": 0.2645182804473894, "kl": 0.0370941162109375, "learning_rate": 4.958024691358024e-07, "loss": 0.0, "reward": 1.8321428894996643, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8321428894996643, "rewards/format_reward_func": 1.0, "step": 4016 }, { "completion_length": 232.3392972946167, "epoch": 0.6736242088939184, "grad_norm": 0.2761508130046895, "kl": 0.04465484619140625, "learning_rate": 4.960493827160493e-07, "loss": 0.0, "reward": 1.7642857804894447, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 4018 }, { "completion_length": 244.05358123779297, "epoch": 0.673959512133786, "grad_norm": 0.29287932155581453, "kl": 0.085784912109375, "learning_rate": 4.962962962962963e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 4020 }, { "completion_length": 240.5223331451416, "epoch": 0.6742948153736535, "grad_norm": 0.224555054040133, "kl": 0.03826904296875, "learning_rate": 4.965432098765432e-07, "loss": 0.0, "reward": 1.7928571999073029, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 4022 }, { "completion_length": 234.50001335144043, "epoch": 0.6746301186135211, "grad_norm": 0.24682979395578233, "kl": 0.037628173828125, "learning_rate": 4.967901234567901e-07, "loss": 0.0, "reward": 1.7964286506175995, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 4024 }, { "completion_length": 233.65179634094238, "epoch": 0.6749654218533887, "grad_norm": 0.1787319429847436, "kl": 0.04058074951171875, "learning_rate": 4.97037037037037e-07, "loss": 0.0, "reward": 1.7892857566475868, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 4026 }, { "completion_length": 252.76340675354004, "epoch": 0.6753007250932562, "grad_norm": 0.39406210419938975, "kl": 0.0468902587890625, "learning_rate": 4.972839506172839e-07, "loss": 0.0, "reward": 1.7428572103381157, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7428571656346321, "rewards/format_reward_func": 1.0, "step": 4028 }, { "completion_length": 242.0937614440918, "epoch": 0.6756360283331238, "grad_norm": 0.4016790575300383, "kl": 0.043731689453125, "learning_rate": 4.975308641975308e-07, "loss": 0.0, "reward": 1.764285758137703, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857544124126, "rewards/format_reward_func": 1.0, "step": 4030 }, { "completion_length": 244.1384048461914, "epoch": 0.6759713315729913, "grad_norm": 0.33463115537827764, "kl": 0.04061126708984375, "learning_rate": 4.977777777777777e-07, "loss": 0.0, "reward": 1.7857143431901932, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7946428768336773, "rewards/format_reward_func": 0.9910714328289032, "step": 4032 }, { "completion_length": 252.40179443359375, "epoch": 0.6763066348128589, "grad_norm": 0.16931830679373155, "kl": 0.0411834716796875, "learning_rate": 4.980246913580247e-07, "loss": 0.0, "reward": 1.8017857521772385, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.8062500357627869, "rewards/format_reward_func": 0.9955357164144516, "step": 4034 }, { "completion_length": 241.4687614440918, "epoch": 0.6766419380527264, "grad_norm": 0.3814898253396864, "kl": 0.0712890625, "learning_rate": 4.982716049382716e-07, "loss": 0.0001, "reward": 1.767857201397419, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 4036 }, { "completion_length": 247.8750123977661, "epoch": 0.676977241292594, "grad_norm": 0.16397861442788428, "kl": 0.0486602783203125, "learning_rate": 4.985185185185185e-07, "loss": 0.0, "reward": 1.7410715147852898, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7544643171131611, "rewards/format_reward_func": 0.9866071492433548, "step": 4038 }, { "completion_length": 252.11608409881592, "epoch": 0.6773125445324616, "grad_norm": 0.6544560134450959, "kl": 0.1189422607421875, "learning_rate": 4.987654320987654e-07, "loss": 0.0001, "reward": 1.7446429207921028, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7482143267989159, "rewards/format_reward_func": 0.9964285716414452, "step": 4040 }, { "completion_length": 245.38394165039062, "epoch": 0.6776478477723291, "grad_norm": 0.2508024090704286, "kl": 0.0443267822265625, "learning_rate": 4.990123456790123e-07, "loss": 0.0, "reward": 1.7250000834465027, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7250000350177288, "rewards/format_reward_func": 1.0, "step": 4042 }, { "completion_length": 251.5044765472412, "epoch": 0.6779831510121966, "grad_norm": 0.2267515588216866, "kl": 0.03842926025390625, "learning_rate": 4.992592592592593e-07, "loss": 0.0, "reward": 1.7410714849829674, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7455357573926449, "rewards/format_reward_func": 0.9955357164144516, "step": 4044 }, { "completion_length": 248.20090579986572, "epoch": 0.6783184542520642, "grad_norm": 0.24181546344204055, "kl": 0.04335784912109375, "learning_rate": 4.995061728395061e-07, "loss": 0.0, "reward": 1.7535715103149414, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 4046 }, { "completion_length": 243.383939743042, "epoch": 0.6786537574919318, "grad_norm": 0.1358614247171582, "kl": 0.03560638427734375, "learning_rate": 4.99753086419753e-07, "loss": 0.0, "reward": 1.7392858043313026, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857410013676, "rewards/format_reward_func": 1.0, "step": 4048 }, { "completion_length": 253.8303689956665, "epoch": 0.6789890607317993, "grad_norm": 0.17895015083217394, "kl": 0.0395355224609375, "learning_rate": 5e-07, "loss": 0.0, "reward": 1.7450893595814705, "reward_std": 0.037249373737722635, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 0.9986607171595097, "step": 4050 }, { "completion_length": 255.852689743042, "epoch": 0.6793243639716668, "grad_norm": 0.28552055154012596, "kl": 0.038604736328125, "learning_rate": 4.999999997122036e-07, "loss": 0.0, "reward": 1.7285715192556381, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.728571442887187, "rewards/format_reward_func": 1.0, "step": 4052 }, { "completion_length": 243.11608123779297, "epoch": 0.6796596672115345, "grad_norm": 0.4019371586263685, "kl": 0.04984283447265625, "learning_rate": 4.999999988488147e-07, "loss": 0.0, "reward": 1.682142972946167, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.6821428947150707, "rewards/format_reward_func": 1.0, "step": 4054 }, { "completion_length": 255.87947845458984, "epoch": 0.679994970451402, "grad_norm": 0.21289432083989482, "kl": 0.04183197021484375, "learning_rate": 4.999999974098329e-07, "loss": 0.0, "reward": 1.7607143744826317, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.760714303702116, "rewards/format_reward_func": 1.0, "step": 4056 }, { "completion_length": 247.18304538726807, "epoch": 0.6803302736912695, "grad_norm": 0.5173896017748014, "kl": 0.05936431884765625, "learning_rate": 4.999999953952585e-07, "loss": 0.0001, "reward": 1.771428644657135, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 4058 }, { "completion_length": 254.18751335144043, "epoch": 0.680665576931137, "grad_norm": 0.21114889600510645, "kl": 0.03630828857421875, "learning_rate": 4.999999928050913e-07, "loss": 0.0, "reward": 1.6946429163217545, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.6991071812808514, "rewards/format_reward_func": 0.9955357164144516, "step": 4060 }, { "completion_length": 263.2276916503906, "epoch": 0.6810008801710047, "grad_norm": 0.7612326196090565, "kl": 0.044281005859375, "learning_rate": 4.999999896393315e-07, "loss": 0.0, "reward": 1.7892857864499092, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.789285734295845, "rewards/format_reward_func": 1.0, "step": 4062 }, { "completion_length": 249.9509038925171, "epoch": 0.6813361834108722, "grad_norm": 0.3232769892257542, "kl": 0.0448150634765625, "learning_rate": 4.99999985897979e-07, "loss": 0.0, "reward": 1.7696429193019867, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.7741071693599224, "rewards/format_reward_func": 0.9955357164144516, "step": 4064 }, { "completion_length": 250.33483409881592, "epoch": 0.6816714866507397, "grad_norm": 0.2483480478469988, "kl": 0.03652191162109375, "learning_rate": 4.99999981581034e-07, "loss": 0.0, "reward": 1.76071435213089, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 4066 }, { "completion_length": 254.665189743042, "epoch": 0.6820067898906074, "grad_norm": 0.18635947460012328, "kl": 0.03978729248046875, "learning_rate": 4.999999766884962e-07, "loss": 0.0, "reward": 1.739285796880722, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857484519482, "rewards/format_reward_func": 1.0, "step": 4068 }, { "completion_length": 251.08483123779297, "epoch": 0.6823420931304749, "grad_norm": 0.16341314696001333, "kl": 0.03818511962890625, "learning_rate": 4.999999712203659e-07, "loss": 0.0, "reward": 1.7321429252624512, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7321428805589676, "rewards/format_reward_func": 1.0, "step": 4070 }, { "completion_length": 248.5803689956665, "epoch": 0.6826773963703424, "grad_norm": 0.22882997275793895, "kl": 0.03765106201171875, "learning_rate": 4.999999651766429e-07, "loss": 0.0, "reward": 1.8285714909434319, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8285714499652386, "rewards/format_reward_func": 1.0, "step": 4072 }, { "completion_length": 247.45983028411865, "epoch": 0.6830126996102099, "grad_norm": 0.30347725721095986, "kl": 0.03338623046875, "learning_rate": 4.999999585573273e-07, "loss": 0.0, "reward": 1.769642911851406, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.774107176810503, "rewards/format_reward_func": 0.9955357164144516, "step": 4074 }, { "completion_length": 243.571439743042, "epoch": 0.6833480028500776, "grad_norm": 0.09929448446690448, "kl": 0.0332489013671875, "learning_rate": 4.99999951362419e-07, "loss": 0.0, "reward": 1.7821429148316383, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428999304771, "rewards/format_reward_func": 1.0, "step": 4076 }, { "completion_length": 248.22322463989258, "epoch": 0.6836833060899451, "grad_norm": 0.23674906761103792, "kl": 0.03179168701171875, "learning_rate": 4.999999435919181e-07, "loss": 0.0, "reward": 1.8196429163217545, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8241071738302708, "rewards/format_reward_func": 0.9955357164144516, "step": 4078 }, { "completion_length": 249.00001430511475, "epoch": 0.6840186093298126, "grad_norm": 0.2803521117329266, "kl": 0.03696441650390625, "learning_rate": 4.999999352458248e-07, "loss": 0.0, "reward": 1.7571429461240768, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.757142873480916, "rewards/format_reward_func": 1.0, "step": 4080 }, { "completion_length": 255.54465293884277, "epoch": 0.6843539125696803, "grad_norm": 0.18009514947831426, "kl": 0.08721160888671875, "learning_rate": 4.999999263241389e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7803571708500385, "rewards/format_reward_func": 0.9910714328289032, "step": 4082 }, { "completion_length": 255.66072463989258, "epoch": 0.6846892158095478, "grad_norm": 0.2365843923361676, "kl": 0.04498291015625, "learning_rate": 4.999999168268604e-07, "loss": 0.0, "reward": 1.7642857730388641, "reward_std": 0.09091372601687908, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 4084 }, { "completion_length": 246.0134048461914, "epoch": 0.6850245190494153, "grad_norm": 0.4343459100999908, "kl": 0.03946685791015625, "learning_rate": 4.999999067539895e-07, "loss": 0.0, "reward": 1.7660714909434319, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9955357164144516, "step": 4086 }, { "completion_length": 255.75001335144043, "epoch": 0.6853598222892828, "grad_norm": 0.22611496179012103, "kl": 0.03882598876953125, "learning_rate": 4.99999896105526e-07, "loss": 0.0, "reward": 1.7589286491274834, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.763392873108387, "rewards/format_reward_func": 0.9955357164144516, "step": 4088 }, { "completion_length": 245.85268878936768, "epoch": 0.6856951255291505, "grad_norm": 0.1820744807846706, "kl": 0.055633544921875, "learning_rate": 4.999998848814701e-07, "loss": 0.0001, "reward": 1.742857202887535, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571805357933, "rewards/format_reward_func": 1.0, "step": 4090 }, { "completion_length": 245.1785831451416, "epoch": 0.686030428769018, "grad_norm": 0.2699339435142035, "kl": 0.0371856689453125, "learning_rate": 4.999998730818219e-07, "loss": 0.0, "reward": 1.7767857760190964, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7812500409781933, "rewards/format_reward_func": 0.9955357164144516, "step": 4092 }, { "completion_length": 250.8794755935669, "epoch": 0.6863657320088855, "grad_norm": 0.18753002933538254, "kl": 0.06398773193359375, "learning_rate": 4.999998607065812e-07, "loss": 0.0001, "reward": 1.7285714820027351, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7285714726895094, "rewards/format_reward_func": 1.0, "step": 4094 }, { "completion_length": 250.4509038925171, "epoch": 0.6867010352487531, "grad_norm": 0.23536728398067072, "kl": 0.0369110107421875, "learning_rate": 4.999998477557482e-07, "loss": 0.0, "reward": 1.7250000834465027, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7250000275671482, "rewards/format_reward_func": 1.0, "step": 4096 }, { "completion_length": 249.23215770721436, "epoch": 0.6870363384886207, "grad_norm": 0.2694385693254488, "kl": 0.04195404052734375, "learning_rate": 4.999998342293226e-07, "loss": 0.0, "reward": 1.7125000655651093, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7169643379747868, "rewards/format_reward_func": 0.9955357164144516, "step": 4098 }, { "completion_length": 237.4642972946167, "epoch": 0.6873716417284882, "grad_norm": 0.20593764322751265, "kl": 0.05100250244140625, "learning_rate": 4.999998201273049e-07, "loss": 0.0001, "reward": 1.7892857864499092, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7982143014669418, "rewards/format_reward_func": 0.9910714328289032, "step": 4100 }, { "completion_length": 243.3303689956665, "epoch": 0.6877069449683557, "grad_norm": 0.2052447336531401, "kl": 0.03682708740234375, "learning_rate": 4.999998054496949e-07, "loss": 0.0, "reward": 1.7892857864499092, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857491970062, "rewards/format_reward_func": 1.0, "step": 4102 }, { "completion_length": 248.73215293884277, "epoch": 0.6880422482082234, "grad_norm": 0.19766473428498785, "kl": 0.0457916259765625, "learning_rate": 4.999997901964926e-07, "loss": 0.0, "reward": 1.7357143461704254, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.735714316368103, "rewards/format_reward_func": 1.0, "step": 4104 }, { "completion_length": 252.6250123977661, "epoch": 0.6883775514480909, "grad_norm": 0.2928526606003355, "kl": 0.04019927978515625, "learning_rate": 4.999997743676982e-07, "loss": 0.0, "reward": 1.7214286625385284, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7303571812808514, "rewards/format_reward_func": 0.9910714328289032, "step": 4106 }, { "completion_length": 237.3973331451416, "epoch": 0.6887128546879584, "grad_norm": 0.11667197186742236, "kl": 0.034725189208984375, "learning_rate": 4.999997579633115e-07, "loss": 0.0, "reward": 1.8178571909666061, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.8267857357859612, "rewards/format_reward_func": 0.9910714328289032, "step": 4108 }, { "completion_length": 244.5446538925171, "epoch": 0.689048157927826, "grad_norm": 0.2764998637390368, "kl": 0.06385040283203125, "learning_rate": 4.999997409833327e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7928571775555611, "rewards/format_reward_func": 1.0, "step": 4110 }, { "completion_length": 244.508939743042, "epoch": 0.6893834611676936, "grad_norm": 0.2871939727209474, "kl": 0.038726806640625, "learning_rate": 4.999997234277618e-07, "loss": 0.0, "reward": 1.725000075995922, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7250000312924385, "rewards/format_reward_func": 1.0, "step": 4112 }, { "completion_length": 240.6428680419922, "epoch": 0.6897187644075611, "grad_norm": 0.26143479711552936, "kl": 0.0357818603515625, "learning_rate": 4.999997052965989e-07, "loss": 0.0, "reward": 1.767857201397419, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.767857177183032, "rewards/format_reward_func": 1.0, "step": 4114 }, { "completion_length": 251.58929824829102, "epoch": 0.6900540676474286, "grad_norm": 0.3161348591284228, "kl": 0.09564971923828125, "learning_rate": 4.99999686589844e-07, "loss": 0.0001, "reward": 1.7107143551111221, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7107143215835094, "rewards/format_reward_func": 1.0, "step": 4116 }, { "completion_length": 247.52232933044434, "epoch": 0.6903893708872962, "grad_norm": 0.45746669207629176, "kl": 0.18476104736328125, "learning_rate": 4.999996673074971e-07, "loss": 0.0002, "reward": 1.7696429193019867, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7741071600466967, "rewards/format_reward_func": 0.9955357164144516, "step": 4118 }, { "completion_length": 249.89733123779297, "epoch": 0.6907246741271638, "grad_norm": 0.275198361958013, "kl": 0.0482330322265625, "learning_rate": 4.999996474495583e-07, "loss": 0.0, "reward": 1.796428643167019, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7964285910129547, "rewards/format_reward_func": 1.0, "step": 4120 }, { "completion_length": 250.66072940826416, "epoch": 0.6910599773670313, "grad_norm": 0.1616753946632268, "kl": 0.0462493896484375, "learning_rate": 4.999996270160275e-07, "loss": 0.0, "reward": 1.7839286103844643, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 4122 }, { "completion_length": 242.08929920196533, "epoch": 0.6913952806068988, "grad_norm": 0.28224358954519824, "kl": 0.03643035888671875, "learning_rate": 4.999996060069049e-07, "loss": 0.0, "reward": 1.739285796880722, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857521772385, "rewards/format_reward_func": 1.0, "step": 4124 }, { "completion_length": 239.62054538726807, "epoch": 0.6917305838467664, "grad_norm": 0.2959435225661601, "kl": 0.32910919189453125, "learning_rate": 4.999995844221906e-07, "loss": 0.0003, "reward": 1.7642857506871223, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 4126 }, { "completion_length": 256.1160840988159, "epoch": 0.692065887086634, "grad_norm": 0.16159897450649705, "kl": 0.04415130615234375, "learning_rate": 4.999995622618846e-07, "loss": 0.0, "reward": 1.7196428999304771, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.724107176065445, "rewards/format_reward_func": 0.9955357164144516, "step": 4128 }, { "completion_length": 241.51787185668945, "epoch": 0.6924011903265015, "grad_norm": 0.16545788625877894, "kl": 0.0444793701171875, "learning_rate": 4.999995395259868e-07, "loss": 0.0, "reward": 1.7250000685453415, "reward_std": 0.025253813713788986, "rewards/equation_reward_func": 0.7339285910129547, "rewards/format_reward_func": 0.9910714328289032, "step": 4130 }, { "completion_length": 245.4241180419922, "epoch": 0.6927364935663691, "grad_norm": 0.1501842352304599, "kl": 0.6236724853515625, "learning_rate": 4.999995162144974e-07, "loss": 0.0006, "reward": 1.79464291036129, "reward_std": 0.06818529684096575, "rewards/equation_reward_func": 0.8080357499420643, "rewards/format_reward_func": 0.9866071492433548, "step": 4132 }, { "completion_length": 248.04911708831787, "epoch": 0.6930717968062367, "grad_norm": 0.12794145815186433, "kl": 0.04578399658203125, "learning_rate": 4.999994923274164e-07, "loss": 0.0, "reward": 1.8071429133415222, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8071428798139095, "rewards/format_reward_func": 1.0, "step": 4134 }, { "completion_length": 253.52233219146729, "epoch": 0.6934071000461042, "grad_norm": 0.24779154668910555, "kl": 0.340484619140625, "learning_rate": 4.999994678647439e-07, "loss": 0.0003, "reward": 1.817857213318348, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8178571611642838, "rewards/format_reward_func": 1.0, "step": 4136 }, { "completion_length": 252.89733409881592, "epoch": 0.6937424032859717, "grad_norm": 0.06910974345915508, "kl": 0.4642486572265625, "learning_rate": 4.999994428264799e-07, "loss": 0.0005, "reward": 1.7196429371833801, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7241071704775095, "rewards/format_reward_func": 0.9955357164144516, "step": 4138 }, { "completion_length": 245.62501430511475, "epoch": 0.6940777065258393, "grad_norm": 0.24250637427227054, "kl": 0.04045867919921875, "learning_rate": 4.999994172126245e-07, "loss": 0.0, "reward": 1.7607143595814705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 4140 }, { "completion_length": 252.32143783569336, "epoch": 0.6944130097657069, "grad_norm": 0.21264235248878327, "kl": 0.040924072265625, "learning_rate": 4.999993910231778e-07, "loss": 0.0, "reward": 1.7875000685453415, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.791964303702116, "rewards/format_reward_func": 0.9955357164144516, "step": 4142 }, { "completion_length": 256.4776887893677, "epoch": 0.6947483130055744, "grad_norm": 0.18883792480654485, "kl": 0.0831146240234375, "learning_rate": 4.999993642581397e-07, "loss": 0.0001, "reward": 1.7803571969270706, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.793750025331974, "rewards/format_reward_func": 0.9866071492433548, "step": 4144 }, { "completion_length": 256.1026906967163, "epoch": 0.695083616245442, "grad_norm": 0.4030051813383355, "kl": 0.08502197265625, "learning_rate": 4.999993369175105e-07, "loss": 0.0001, "reward": 1.7750000953674316, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7839285954833031, "rewards/format_reward_func": 0.9910714328289032, "step": 4146 }, { "completion_length": 250.8705472946167, "epoch": 0.6954189194853095, "grad_norm": 0.25329291872625403, "kl": 0.055938720703125, "learning_rate": 4.999993090012901e-07, "loss": 0.0001, "reward": 1.723214365541935, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7276785969734192, "rewards/format_reward_func": 0.9955357164144516, "step": 4148 }, { "completion_length": 261.0134029388428, "epoch": 0.6957542227251771, "grad_norm": 0.28460622009753217, "kl": 0.0702972412109375, "learning_rate": 4.999992805094786e-07, "loss": 0.0001, "reward": 1.764285758137703, "reward_std": 0.11111677717417479, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 0.9821428656578064, "step": 4150 }, { "completion_length": 260.727689743042, "epoch": 0.6960895259650446, "grad_norm": 0.24220639520506346, "kl": 0.05754852294921875, "learning_rate": 4.99999251442076e-07, "loss": 0.0001, "reward": 1.7517857775092125, "reward_std": 0.08838834706693888, "rewards/equation_reward_func": 0.7651786059141159, "rewards/format_reward_func": 0.9866071492433548, "step": 4152 }, { "completion_length": 253.7053680419922, "epoch": 0.6964248292049122, "grad_norm": 0.20059391948110178, "kl": 0.0672760009765625, "learning_rate": 4.999992217990825e-07, "loss": 0.0001, "reward": 1.7571429163217545, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 4154 }, { "completion_length": 257.23661708831787, "epoch": 0.6967601324447797, "grad_norm": 0.17352793125955543, "kl": 0.09174346923828125, "learning_rate": 4.99999191580498e-07, "loss": 0.0001, "reward": 1.7660714909434319, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7705357521772385, "rewards/format_reward_func": 0.9955357164144516, "step": 4156 }, { "completion_length": 247.43304538726807, "epoch": 0.6970954356846473, "grad_norm": 0.4455503723352921, "kl": 0.055694580078125, "learning_rate": 4.999991607863228e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 4158 }, { "completion_length": 249.290189743042, "epoch": 0.6974307389245149, "grad_norm": 0.207070863422582, "kl": 0.05780029296875, "learning_rate": 4.999991294165567e-07, "loss": 0.0001, "reward": 1.7482143566012383, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7526785843074322, "rewards/format_reward_func": 0.9955357164144516, "step": 4160 }, { "completion_length": 243.20536613464355, "epoch": 0.6977660421643824, "grad_norm": 0.30572698157206896, "kl": 0.05780029296875, "learning_rate": 4.999990974712001e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 4162 }, { "completion_length": 250.08483219146729, "epoch": 0.69810134540425, "grad_norm": 0.2265343376034305, "kl": 0.0560302734375, "learning_rate": 4.999990649502528e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 4164 }, { "completion_length": 250.10715579986572, "epoch": 0.6984366486441175, "grad_norm": 0.204340719128157, "kl": 0.04798126220703125, "learning_rate": 4.999990318537149e-07, "loss": 0.0, "reward": 1.7571429014205933, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 4166 }, { "completion_length": 261.6071548461914, "epoch": 0.6987719518839851, "grad_norm": 0.2263993650313919, "kl": 0.0576019287109375, "learning_rate": 4.999989981815865e-07, "loss": 0.0001, "reward": 1.7660715132951736, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7705357372760773, "rewards/format_reward_func": 0.9955357164144516, "step": 4168 }, { "completion_length": 258.6785840988159, "epoch": 0.6991072551238526, "grad_norm": 0.5688840890288541, "kl": 0.08251953125, "learning_rate": 4.999989639338678e-07, "loss": 0.0001, "reward": 1.6928572356700897, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.6928571872413158, "rewards/format_reward_func": 1.0, "step": 4170 }, { "completion_length": 251.9866189956665, "epoch": 0.6994425583637202, "grad_norm": 0.26599224494018464, "kl": 0.06134796142578125, "learning_rate": 4.999989291105588e-07, "loss": 0.0001, "reward": 1.714285783469677, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7232143376022577, "rewards/format_reward_func": 0.9910714328289032, "step": 4172 }, { "completion_length": 243.78572463989258, "epoch": 0.6997778616035878, "grad_norm": 0.2615405086321927, "kl": 0.0572052001953125, "learning_rate": 4.999988937116595e-07, "loss": 0.0001, "reward": 1.744642935693264, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7491071708500385, "rewards/format_reward_func": 0.9955357164144516, "step": 4174 }, { "completion_length": 246.68751335144043, "epoch": 0.7001131648434553, "grad_norm": 0.27148599882970154, "kl": 0.060150146484375, "learning_rate": 4.999988577371702e-07, "loss": 0.0001, "reward": 1.7089286372065544, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7133928909897804, "rewards/format_reward_func": 0.9955357164144516, "step": 4176 }, { "completion_length": 246.977689743042, "epoch": 0.7004484680833228, "grad_norm": 0.28624932053045216, "kl": 0.0514373779296875, "learning_rate": 4.999988211870907e-07, "loss": 0.0001, "reward": 1.817857213318348, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.8178571611642838, "rewards/format_reward_func": 1.0, "step": 4178 }, { "completion_length": 247.38840579986572, "epoch": 0.7007837713231904, "grad_norm": 0.23016414642039873, "kl": 0.06423187255859375, "learning_rate": 4.999987840614212e-07, "loss": 0.0001, "reward": 1.7660714909434319, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7705357372760773, "rewards/format_reward_func": 0.9955357164144516, "step": 4180 }, { "completion_length": 244.47322845458984, "epoch": 0.701119074563058, "grad_norm": 0.35210518296275867, "kl": 0.0606842041015625, "learning_rate": 4.999987463601619e-07, "loss": 0.0001, "reward": 1.775000050663948, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 4182 }, { "completion_length": 235.9598331451416, "epoch": 0.7014543778029255, "grad_norm": 0.19201963590977644, "kl": 0.0580291748046875, "learning_rate": 4.999987080833128e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714507102966, "rewards/format_reward_func": 1.0, "step": 4184 }, { "completion_length": 240.67411518096924, "epoch": 0.701789681042793, "grad_norm": 0.13459371154887387, "kl": 0.04982757568359375, "learning_rate": 4.999986692308739e-07, "loss": 0.0, "reward": 1.7392857745289803, "reward_std": 0.025253813713788986, "rewards/equation_reward_func": 0.748214315623045, "rewards/format_reward_func": 0.9910714328289032, "step": 4186 }, { "completion_length": 248.06697750091553, "epoch": 0.7021249842826607, "grad_norm": 0.3074579710955277, "kl": 0.05918121337890625, "learning_rate": 4.999986298028454e-07, "loss": 0.0001, "reward": 1.733928643167019, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.738392885774374, "rewards/format_reward_func": 0.9955357164144516, "step": 4188 }, { "completion_length": 246.7500114440918, "epoch": 0.7024602875225282, "grad_norm": 0.3280048381695006, "kl": 0.0514984130859375, "learning_rate": 4.999985897992274e-07, "loss": 0.0001, "reward": 1.7214286252856255, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7214286029338837, "rewards/format_reward_func": 1.0, "step": 4190 }, { "completion_length": 237.38840293884277, "epoch": 0.7027955907623957, "grad_norm": 0.2873894479997041, "kl": 0.0552215576171875, "learning_rate": 4.999985492200199e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7500000111758709, "rewards/format_reward_func": 1.0, "step": 4192 }, { "completion_length": 235.9821538925171, "epoch": 0.7031308940022633, "grad_norm": 0.2801387774210403, "kl": 0.0472564697265625, "learning_rate": 4.999985080652232e-07, "loss": 0.0, "reward": 1.7357143461704254, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7446428835391998, "rewards/format_reward_func": 0.9910714328289032, "step": 4194 }, { "completion_length": 241.6071548461914, "epoch": 0.7034661972421309, "grad_norm": 0.2864929528982983, "kl": 0.0567474365234375, "learning_rate": 4.99998466334837e-07, "loss": 0.0001, "reward": 1.7285714969038963, "reward_std": 0.09091372787952423, "rewards/equation_reward_func": 0.7375000268220901, "rewards/format_reward_func": 0.9910714328289032, "step": 4196 }, { "completion_length": 233.7232255935669, "epoch": 0.7038015004819984, "grad_norm": 0.27030146938564104, "kl": 0.0601806640625, "learning_rate": 4.999984240288618e-07, "loss": 0.0001, "reward": 1.792857214808464, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7928571682423353, "rewards/format_reward_func": 1.0, "step": 4198 }, { "completion_length": 229.2187614440918, "epoch": 0.7041368037218659, "grad_norm": 0.12054623921859627, "kl": 0.061126708984375, "learning_rate": 4.999983811472975e-07, "loss": 0.0001, "reward": 1.7285715118050575, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7285714671015739, "rewards/format_reward_func": 1.0, "step": 4200 }, { "completion_length": 234.73215293884277, "epoch": 0.7044721069617336, "grad_norm": 0.19157688673168496, "kl": 0.05464935302734375, "learning_rate": 4.999983376901442e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.800000037997961, "rewards/format_reward_func": 1.0, "step": 4202 }, { "completion_length": 232.62500953674316, "epoch": 0.7048074102016011, "grad_norm": 0.31481143427392383, "kl": 0.05657958984375, "learning_rate": 4.99998293657402e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7812500223517418, "rewards/format_reward_func": 0.9955357164144516, "step": 4204 }, { "completion_length": 241.8125114440918, "epoch": 0.7051427134414686, "grad_norm": 0.7161200664730465, "kl": 0.06140899658203125, "learning_rate": 4.999982490490711e-07, "loss": 0.0001, "reward": 1.7500001043081284, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 4206 }, { "completion_length": 237.290189743042, "epoch": 0.7054780166813361, "grad_norm": 0.2976173951165174, "kl": 0.074066162109375, "learning_rate": 4.999982038651515e-07, "loss": 0.0001, "reward": 1.7142858058214188, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7142857313156128, "rewards/format_reward_func": 1.0, "step": 4208 }, { "completion_length": 237.55804634094238, "epoch": 0.7058133199212038, "grad_norm": 1.294270106977747, "kl": 0.0904998779296875, "learning_rate": 4.999981581056434e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 4210 }, { "completion_length": 267.2544746398926, "epoch": 0.7061486231610713, "grad_norm": 0.6168138104071692, "kl": 0.399688720703125, "learning_rate": 4.999981117705468e-07, "loss": 0.0004, "reward": 1.526785783469677, "reward_std": 0.33082496002316475, "rewards/equation_reward_func": 0.6741071753203869, "rewards/format_reward_func": 0.8526786118745804, "step": 4212 }, { "completion_length": 260.12500858306885, "epoch": 0.7064839264009388, "grad_norm": 0.31882764649050893, "kl": 0.3432769775390625, "learning_rate": 4.999980648598619e-07, "loss": 0.0003, "reward": 1.548214353621006, "reward_std": 0.154048265889287, "rewards/equation_reward_func": 0.6419643182307482, "rewards/format_reward_func": 0.9062500335276127, "step": 4214 }, { "completion_length": 246.56697750091553, "epoch": 0.7068192296408065, "grad_norm": 0.24098316483255156, "kl": 0.2245941162109375, "learning_rate": 4.999980173735887e-07, "loss": 0.0002, "reward": 1.7535714879631996, "reward_std": 0.09596449136734009, "rewards/equation_reward_func": 0.7714286111295223, "rewards/format_reward_func": 0.9821428656578064, "step": 4216 }, { "completion_length": 234.25893783569336, "epoch": 0.707154532880674, "grad_norm": 0.316137458068654, "kl": 0.201934814453125, "learning_rate": 4.999979693117275e-07, "loss": 0.0002, "reward": 1.7839286178350449, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 4218 }, { "completion_length": 237.37947368621826, "epoch": 0.7074898361205415, "grad_norm": 0.08814150042085736, "kl": 0.1511077880859375, "learning_rate": 4.999979206742782e-07, "loss": 0.0002, "reward": 1.7214286625385284, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7214286010712385, "rewards/format_reward_func": 1.0, "step": 4220 }, { "completion_length": 238.258939743042, "epoch": 0.707825139360409, "grad_norm": 0.18570467030693735, "kl": 0.11859130859375, "learning_rate": 4.99997871461241e-07, "loss": 0.0001, "reward": 1.7178572118282318, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7178571987897158, "rewards/format_reward_func": 1.0, "step": 4222 }, { "completion_length": 231.2009048461914, "epoch": 0.7081604426002767, "grad_norm": 0.2520463039518999, "kl": 0.0881805419921875, "learning_rate": 4.99997821672616e-07, "loss": 0.0001, "reward": 1.755357213318348, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214484751225, "rewards/format_reward_func": 0.9955357164144516, "step": 4224 }, { "completion_length": 221.31250953674316, "epoch": 0.7084957458401442, "grad_norm": 0.40034280305593867, "kl": 0.0886993408203125, "learning_rate": 4.999977713084033e-07, "loss": 0.0001, "reward": 1.7839286178350449, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7883928939700127, "rewards/format_reward_func": 0.9955357164144516, "step": 4226 }, { "completion_length": 232.2857255935669, "epoch": 0.7088310490800117, "grad_norm": 0.18969120135227732, "kl": 0.161529541015625, "learning_rate": 4.99997720368603e-07, "loss": 0.0002, "reward": 1.7821429073810577, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 4228 }, { "completion_length": 224.8750114440918, "epoch": 0.7091663523198793, "grad_norm": 0.17195233399392781, "kl": 0.1042633056640625, "learning_rate": 4.999976688532153e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428962051868, "rewards/format_reward_func": 1.0, "step": 4230 }, { "completion_length": 216.96429538726807, "epoch": 0.7095016555597469, "grad_norm": 0.26720859900169286, "kl": 0.0937652587890625, "learning_rate": 4.999976167622403e-07, "loss": 0.0001, "reward": 1.7892858013510704, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857305705547, "rewards/format_reward_func": 1.0, "step": 4232 }, { "completion_length": 230.7321548461914, "epoch": 0.7098369587996144, "grad_norm": 0.15169793509915727, "kl": 0.101806640625, "learning_rate": 4.99997564095678e-07, "loss": 0.0001, "reward": 1.739285796880722, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857521772385, "rewards/format_reward_func": 1.0, "step": 4234 }, { "completion_length": 224.22768783569336, "epoch": 0.7101722620394819, "grad_norm": 0.0039682440196801665, "kl": 0.0744781494140625, "learning_rate": 4.999975108535288e-07, "loss": 0.0001, "reward": 1.7892857789993286, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 4236 }, { "completion_length": 231.23215293884277, "epoch": 0.7105075652793496, "grad_norm": 0.25855297377161357, "kl": 0.1085968017578125, "learning_rate": 4.999974570357925e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.757142897695303, "rewards/format_reward_func": 1.0, "step": 4238 }, { "completion_length": 221.85715293884277, "epoch": 0.7108428685192171, "grad_norm": 0.33264525079129287, "kl": 0.124664306640625, "learning_rate": 4.999974026424694e-07, "loss": 0.0001, "reward": 1.7665179520845413, "reward_std": 0.037249373737722635, "rewards/equation_reward_func": 0.7678571604192257, "rewards/format_reward_func": 0.9986607171595097, "step": 4240 }, { "completion_length": 230.91072273254395, "epoch": 0.7111781717590846, "grad_norm": 0.30380484394856, "kl": 0.088958740234375, "learning_rate": 4.999973476735596e-07, "loss": 0.0001, "reward": 1.7142858058214188, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7142857499420643, "rewards/format_reward_func": 1.0, "step": 4242 }, { "completion_length": 209.61161518096924, "epoch": 0.7115134749989521, "grad_norm": 0.4488191314283875, "kl": 0.10036468505859375, "learning_rate": 4.999972921290632e-07, "loss": 0.0001, "reward": 1.7589286416769028, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7633928917348385, "rewards/format_reward_func": 0.9955357164144516, "step": 4244 }, { "completion_length": 215.65626049041748, "epoch": 0.7118487782388198, "grad_norm": 0.2875484664537562, "kl": 0.10089874267578125, "learning_rate": 4.999972360089804e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 4246 }, { "completion_length": 224.06697368621826, "epoch": 0.7121840814786873, "grad_norm": 0.25228779296360854, "kl": 0.1021575927734375, "learning_rate": 4.99997179313311e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7857142947614193, "rewards/format_reward_func": 1.0, "step": 4248 }, { "completion_length": 221.2991189956665, "epoch": 0.7125193847185548, "grad_norm": 0.21487976932657415, "kl": 0.101104736328125, "learning_rate": 4.999971220420557e-07, "loss": 0.0001, "reward": 1.7589286267757416, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7633928824216127, "rewards/format_reward_func": 0.9955357164144516, "step": 4250 }, { "completion_length": 221.16518878936768, "epoch": 0.7128546879584224, "grad_norm": 0.17815512314744297, "kl": 0.1637115478515625, "learning_rate": 4.999970641952142e-07, "loss": 0.0002, "reward": 1.7464286163449287, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7464286349713802, "rewards/format_reward_func": 1.0, "step": 4252 }, { "completion_length": 229.12054634094238, "epoch": 0.71318999119829, "grad_norm": 0.28107228216462243, "kl": 0.1016998291015625, "learning_rate": 4.999970057727869e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 4254 }, { "completion_length": 224.0803680419922, "epoch": 0.7135252944381575, "grad_norm": 0.2769542511971505, "kl": 0.09307861328125, "learning_rate": 4.999969467747736e-07, "loss": 0.0001, "reward": 1.6785715073347092, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.678571455180645, "rewards/format_reward_func": 1.0, "step": 4256 }, { "completion_length": 220.14286708831787, "epoch": 0.713860597678025, "grad_norm": 0.3152799908343859, "kl": 0.1051177978515625, "learning_rate": 4.999968872011748e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 4258 }, { "completion_length": 218.1741180419922, "epoch": 0.7141959009178926, "grad_norm": 0.3424092259120418, "kl": 0.09275054931640625, "learning_rate": 4.999968270519905e-07, "loss": 0.0001, "reward": 1.787500061094761, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7919643148779869, "rewards/format_reward_func": 0.9955357164144516, "step": 4260 }, { "completion_length": 217.66965198516846, "epoch": 0.7145312041577602, "grad_norm": 0.24849842953591986, "kl": 0.1216278076171875, "learning_rate": 4.999967663272207e-07, "loss": 0.0001, "reward": 1.817857176065445, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8178571574389935, "rewards/format_reward_func": 1.0, "step": 4262 }, { "completion_length": 216.36161518096924, "epoch": 0.7148665073976277, "grad_norm": 0.1771942860689893, "kl": 0.1179046630859375, "learning_rate": 4.999967050268657e-07, "loss": 0.0001, "reward": 1.7785714715719223, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 1.0, "step": 4264 }, { "completion_length": 227.24108028411865, "epoch": 0.7152018106374953, "grad_norm": 0.23422982109973303, "kl": 0.180267333984375, "learning_rate": 4.999966431509255e-07, "loss": 0.0002, "reward": 1.7892857789993286, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857491970062, "rewards/format_reward_func": 1.0, "step": 4266 }, { "completion_length": 222.05357933044434, "epoch": 0.7155371138773629, "grad_norm": 0.2085439923439692, "kl": 0.128448486328125, "learning_rate": 4.999965806994005e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 4268 }, { "completion_length": 222.42411994934082, "epoch": 0.7158724171172304, "grad_norm": 0.2230259143420745, "kl": 0.12346649169921875, "learning_rate": 4.999965176722905e-07, "loss": 0.0001, "reward": 1.7964286357164383, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 4270 }, { "completion_length": 228.90625858306885, "epoch": 0.7162077203570979, "grad_norm": 0.14893081654121634, "kl": 0.11907958984375, "learning_rate": 4.999964540695959e-07, "loss": 0.0001, "reward": 1.7214286625385284, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7214285954833031, "rewards/format_reward_func": 1.0, "step": 4272 }, { "completion_length": 225.11608123779297, "epoch": 0.7165430235969655, "grad_norm": 0.28803787165074113, "kl": 0.1007537841796875, "learning_rate": 4.999963898913168e-07, "loss": 0.0001, "reward": 1.771428644657135, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 4274 }, { "completion_length": 228.2232255935669, "epoch": 0.7168783268368331, "grad_norm": 0.3384674104032473, "kl": 0.105621337890625, "learning_rate": 4.999963251374533e-07, "loss": 0.0001, "reward": 1.7321429327130318, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428917348385, "rewards/format_reward_func": 1.0, "step": 4276 }, { "completion_length": 226.52679634094238, "epoch": 0.7172136300767006, "grad_norm": 0.2255294947243617, "kl": 0.087432861328125, "learning_rate": 4.999962598080055e-07, "loss": 0.0001, "reward": 1.6875000894069672, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.6919643133878708, "rewards/format_reward_func": 0.9955357164144516, "step": 4278 }, { "completion_length": 229.29911708831787, "epoch": 0.7175489333165682, "grad_norm": 0.17192694908242845, "kl": 0.1046295166015625, "learning_rate": 4.999961939029738e-07, "loss": 0.0001, "reward": 1.753571517765522, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714469850063, "rewards/format_reward_func": 1.0, "step": 4280 }, { "completion_length": 228.3571538925171, "epoch": 0.7178842365564357, "grad_norm": 0.2238045079730679, "kl": 0.1636505126953125, "learning_rate": 4.999961274223581e-07, "loss": 0.0002, "reward": 1.8071428835391998, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428909897804, "rewards/format_reward_func": 1.0, "step": 4282 }, { "completion_length": 229.05358028411865, "epoch": 0.7182195397963033, "grad_norm": 0.1545800715199412, "kl": 0.1165924072265625, "learning_rate": 4.999960603661585e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571812808514, "rewards/format_reward_func": 1.0, "step": 4284 }, { "completion_length": 221.13393878936768, "epoch": 0.7185548430361708, "grad_norm": 0.16588131399199002, "kl": 0.10345458984375, "learning_rate": 4.999959927343754e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714440047741, "rewards/format_reward_func": 1.0, "step": 4286 }, { "completion_length": 227.6250114440918, "epoch": 0.7188901462760384, "grad_norm": 0.23818363949622876, "kl": 0.1177215576171875, "learning_rate": 4.999959245270088e-07, "loss": 0.0001, "reward": 1.7571429461240768, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428921073675, "rewards/format_reward_func": 1.0, "step": 4288 }, { "completion_length": 223.75000762939453, "epoch": 0.719225449515906, "grad_norm": 0.30625169067460517, "kl": 0.1202545166015625, "learning_rate": 4.999958557440589e-07, "loss": 0.0001, "reward": 1.7446429282426834, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7491071783006191, "rewards/format_reward_func": 0.9955357164144516, "step": 4290 }, { "completion_length": 223.0803680419922, "epoch": 0.7195607527557735, "grad_norm": 0.3376410563758561, "kl": 0.11126708984375, "learning_rate": 4.999957863855257e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857525497675, "rewards/format_reward_func": 1.0, "step": 4292 }, { "completion_length": 220.6741180419922, "epoch": 0.7198960559956411, "grad_norm": 0.1431025431630177, "kl": 0.0991058349609375, "learning_rate": 4.999957164514097e-07, "loss": 0.0001, "reward": 1.7571428939700127, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 4294 }, { "completion_length": 218.46429538726807, "epoch": 0.7202313592355086, "grad_norm": 0.20456145213274624, "kl": 0.118072509765625, "learning_rate": 4.999956459417109e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 4296 }, { "completion_length": 220.79911422729492, "epoch": 0.7205666624753762, "grad_norm": 0.34602565657175177, "kl": 0.10986328125, "learning_rate": 4.999955748564293e-07, "loss": 0.0001, "reward": 1.7357143685221672, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.735714316368103, "rewards/format_reward_func": 1.0, "step": 4298 }, { "completion_length": 226.42858219146729, "epoch": 0.7209019657152437, "grad_norm": 0.30351734013178966, "kl": 0.1095733642578125, "learning_rate": 4.999955031955653e-07, "loss": 0.0001, "reward": 1.7607143372297287, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 4300 }, { "completion_length": 224.81697177886963, "epoch": 0.7212372689551113, "grad_norm": 0.22631733827458714, "kl": 0.1202392578125, "learning_rate": 4.999954309591188e-07, "loss": 0.0001, "reward": 1.7482143491506577, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7526786103844643, "rewards/format_reward_func": 0.9955357164144516, "step": 4302 }, { "completion_length": 222.08929538726807, "epoch": 0.7215725721949788, "grad_norm": 0.19482129354219505, "kl": 0.1111297607421875, "learning_rate": 4.999953581470903e-07, "loss": 0.0001, "reward": 1.81428574770689, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857421189547, "rewards/format_reward_func": 1.0, "step": 4304 }, { "completion_length": 228.1116189956665, "epoch": 0.7219078754348464, "grad_norm": 0.15421096995458153, "kl": 0.106353759765625, "learning_rate": 4.999952847594796e-07, "loss": 0.0001, "reward": 1.821428619325161, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8214286006987095, "rewards/format_reward_func": 1.0, "step": 4306 }, { "completion_length": 222.49108028411865, "epoch": 0.722243178674714, "grad_norm": 0.2832644661349313, "kl": 0.1219482421875, "learning_rate": 4.999952107962873e-07, "loss": 0.0001, "reward": 1.760714367032051, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 4308 }, { "completion_length": 228.3526906967163, "epoch": 0.7225784819145815, "grad_norm": 0.2739123316787506, "kl": 0.1038055419921875, "learning_rate": 4.999951362575131e-07, "loss": 0.0001, "reward": 1.7660714834928513, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.770535746589303, "rewards/format_reward_func": 0.9955357164144516, "step": 4310 }, { "completion_length": 224.91965293884277, "epoch": 0.722913785154449, "grad_norm": 0.33445050842398866, "kl": 0.131072998046875, "learning_rate": 4.999950611431576e-07, "loss": 0.0001, "reward": 1.7464286610484123, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 4312 }, { "completion_length": 226.31697463989258, "epoch": 0.7232490883943166, "grad_norm": 0.21341353048079031, "kl": 0.12420654296875, "learning_rate": 4.999949854532206e-07, "loss": 0.0001, "reward": 1.7160715162754059, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7205357402563095, "rewards/format_reward_func": 0.9955357164144516, "step": 4314 }, { "completion_length": 229.93304443359375, "epoch": 0.7235843916341842, "grad_norm": 0.30601848106418333, "kl": 0.1174163818359375, "learning_rate": 4.999949091877026e-07, "loss": 0.0001, "reward": 1.7589286640286446, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7633928842842579, "rewards/format_reward_func": 0.9955357164144516, "step": 4316 }, { "completion_length": 232.65179920196533, "epoch": 0.7239196948740517, "grad_norm": 0.1687238975285787, "kl": 0.10968017578125, "learning_rate": 4.999948323466035e-07, "loss": 0.0001, "reward": 1.6839286535978317, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.6883928887546062, "rewards/format_reward_func": 0.9955357164144516, "step": 4318 }, { "completion_length": 224.08483123779297, "epoch": 0.7242549981139192, "grad_norm": 0.004285123511293161, "kl": 0.11199951171875, "learning_rate": 4.999947549299238e-07, "loss": 0.0001, "reward": 1.782142885029316, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7910714726895094, "rewards/format_reward_func": 0.9910714328289032, "step": 4320 }, { "completion_length": 224.9196538925171, "epoch": 0.7245903013537869, "grad_norm": 0.19576979486681245, "kl": 0.107696533203125, "learning_rate": 4.999946769376633e-07, "loss": 0.0001, "reward": 1.7357143312692642, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 4322 }, { "completion_length": 221.56697177886963, "epoch": 0.7249256045936544, "grad_norm": 0.28962875146992173, "kl": 0.10064697265625, "learning_rate": 4.999945983698224e-07, "loss": 0.0001, "reward": 1.778571479022503, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714715719223, "rewards/format_reward_func": 1.0, "step": 4324 }, { "completion_length": 233.17411708831787, "epoch": 0.7252609078335219, "grad_norm": 0.46336086157682704, "kl": 0.10491943359375, "learning_rate": 4.999945192264013e-07, "loss": 0.0001, "reward": 1.7665179446339607, "reward_std": 0.06755394907668233, "rewards/equation_reward_func": 0.767857164144516, "rewards/format_reward_func": 0.9986607171595097, "step": 4326 }, { "completion_length": 230.23661613464355, "epoch": 0.7255962110733895, "grad_norm": 0.2822536367827795, "kl": 0.1027679443359375, "learning_rate": 4.999944395074001e-07, "loss": 0.0001, "reward": 1.796428643167019, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 4328 }, { "completion_length": 233.15626049041748, "epoch": 0.7259315143132571, "grad_norm": 0.19606233734620573, "kl": 0.0909881591796875, "learning_rate": 4.99994359212819e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 4330 }, { "completion_length": 229.98215293884277, "epoch": 0.7262668175531246, "grad_norm": 0.3140268857004368, "kl": 0.103546142578125, "learning_rate": 4.999942783426583e-07, "loss": 0.0001, "reward": 1.7303572297096252, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7348214648663998, "rewards/format_reward_func": 0.9955357164144516, "step": 4332 }, { "completion_length": 241.1294765472412, "epoch": 0.7266021207929921, "grad_norm": 0.0038540600931844153, "kl": 0.0908050537109375, "learning_rate": 4.999941968969179e-07, "loss": 0.0001, "reward": 1.7535714879631996, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7535714693367481, "rewards/format_reward_func": 1.0, "step": 4334 }, { "completion_length": 237.35268783569336, "epoch": 0.7269374240328598, "grad_norm": 0.28605535444102753, "kl": 0.094390869140625, "learning_rate": 4.999941148755983e-07, "loss": 0.0001, "reward": 1.8165179342031479, "reward_std": 0.0473508988507092, "rewards/equation_reward_func": 0.817857164889574, "rewards/format_reward_func": 0.9986607171595097, "step": 4336 }, { "completion_length": 248.6830472946167, "epoch": 0.7272727272727273, "grad_norm": 0.14850043176738945, "kl": 0.0977783203125, "learning_rate": 4.999940322786994e-07, "loss": 0.0001, "reward": 1.8232143372297287, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8276785910129547, "rewards/format_reward_func": 0.9955357164144516, "step": 4338 }, { "completion_length": 244.63840293884277, "epoch": 0.7276080305125948, "grad_norm": 0.3221783517308769, "kl": 0.1132965087890625, "learning_rate": 4.999939491062217e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 4340 }, { "completion_length": 241.77679634094238, "epoch": 0.7279433337524623, "grad_norm": 0.182633691742018, "kl": 0.111907958984375, "learning_rate": 4.999938653581652e-07, "loss": 0.0001, "reward": 1.7107143625617027, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7107143122702837, "rewards/format_reward_func": 1.0, "step": 4342 }, { "completion_length": 248.05358028411865, "epoch": 0.72827863699233, "grad_norm": 0.18817809176508976, "kl": 0.1070709228515625, "learning_rate": 4.999937810345301e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7928571850061417, "rewards/format_reward_func": 1.0, "step": 4344 }, { "completion_length": 243.54018878936768, "epoch": 0.7286139402321975, "grad_norm": 0.22139500314548327, "kl": 0.121734619140625, "learning_rate": 4.999936961353166e-07, "loss": 0.0001, "reward": 1.714285783469677, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7142857536673546, "rewards/format_reward_func": 1.0, "step": 4346 }, { "completion_length": 238.27233409881592, "epoch": 0.728949243472065, "grad_norm": 0.2784243303823706, "kl": 0.097625732421875, "learning_rate": 4.999936106605251e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7392857484519482, "rewards/format_reward_func": 1.0, "step": 4348 }, { "completion_length": 243.17858219146729, "epoch": 0.7292845467119327, "grad_norm": 0.23200086785651602, "kl": 0.102752685546875, "learning_rate": 4.999935246101554e-07, "loss": 0.0001, "reward": 1.8214286044239998, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8214286081492901, "rewards/format_reward_func": 1.0, "step": 4350 }, { "completion_length": 243.65626049041748, "epoch": 0.7296198499518002, "grad_norm": 0.2619354559845573, "kl": 0.103515625, "learning_rate": 4.99993437984208e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143223285675, "rewards/format_reward_func": 1.0, "step": 4352 }, { "completion_length": 244.821439743042, "epoch": 0.7299551531916677, "grad_norm": 0.38513346945983323, "kl": 0.101348876953125, "learning_rate": 4.99993350782683e-07, "loss": 0.0001, "reward": 1.8232143372297287, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.827678594738245, "rewards/format_reward_func": 0.9955357164144516, "step": 4354 }, { "completion_length": 229.3884038925171, "epoch": 0.7302904564315352, "grad_norm": 0.12321027482845619, "kl": 0.107513427734375, "learning_rate": 4.999932630055807e-07, "loss": 0.0001, "reward": 1.8250000327825546, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8250000327825546, "rewards/format_reward_func": 1.0, "step": 4356 }, { "completion_length": 242.5446548461914, "epoch": 0.7306257596714029, "grad_norm": 0.30145710161379585, "kl": 0.223480224609375, "learning_rate": 4.99993174652901e-07, "loss": 0.0002, "reward": 1.728571504354477, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7375000454485416, "rewards/format_reward_func": 0.9910714328289032, "step": 4358 }, { "completion_length": 236.34822273254395, "epoch": 0.7309610629112704, "grad_norm": 0.3049816173707025, "kl": 0.103057861328125, "learning_rate": 4.999930857246445e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714507102966, "rewards/format_reward_func": 1.0, "step": 4360 }, { "completion_length": 245.19644260406494, "epoch": 0.7312963661511379, "grad_norm": 0.262340460674097, "kl": 0.0929107666015625, "learning_rate": 4.999929962208113e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 4362 }, { "completion_length": 238.6339406967163, "epoch": 0.7316316693910055, "grad_norm": 0.21572310070899314, "kl": 0.097137451171875, "learning_rate": 4.999929061414014e-07, "loss": 0.0001, "reward": 1.7446429058909416, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7491071838885546, "rewards/format_reward_func": 0.9955357164144516, "step": 4364 }, { "completion_length": 236.9866180419922, "epoch": 0.7319669726308731, "grad_norm": 0.20339241801146096, "kl": 0.1023712158203125, "learning_rate": 4.999928154864152e-07, "loss": 0.0001, "reward": 1.7558036297559738, "reward_std": 0.04230013629421592, "rewards/equation_reward_func": 0.7571428827941418, "rewards/format_reward_func": 0.9986607171595097, "step": 4366 }, { "completion_length": 239.3705472946167, "epoch": 0.7323022758707406, "grad_norm": 0.055339300543254144, "kl": 0.09466552734375, "learning_rate": 4.999927242558527e-07, "loss": 0.0001, "reward": 1.719642922282219, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7241071742027998, "rewards/format_reward_func": 0.9955357164144516, "step": 4368 }, { "completion_length": 237.75001049041748, "epoch": 0.7326375791106081, "grad_norm": 0.2988596314576556, "kl": 0.091400146484375, "learning_rate": 4.999926324497145e-07, "loss": 0.0001, "reward": 1.7321429252624512, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7321428954601288, "rewards/format_reward_func": 1.0, "step": 4370 }, { "completion_length": 234.7321538925171, "epoch": 0.7329728823504758, "grad_norm": 0.12317099081226329, "kl": 0.095947265625, "learning_rate": 4.999925400680004e-07, "loss": 0.0001, "reward": 1.7392857819795609, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7392857484519482, "rewards/format_reward_func": 1.0, "step": 4372 }, { "completion_length": 231.85268783569336, "epoch": 0.7333081855903433, "grad_norm": 0.2941982386958749, "kl": 0.09271240234375, "learning_rate": 4.999924471107108e-07, "loss": 0.0001, "reward": 1.807142898440361, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428872644901, "rewards/format_reward_func": 1.0, "step": 4374 }, { "completion_length": 229.54018783569336, "epoch": 0.7336434888302108, "grad_norm": 0.24718352959271161, "kl": 0.0890960693359375, "learning_rate": 4.99992353577846e-07, "loss": 0.0001, "reward": 1.7982143461704254, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.8026785925030708, "rewards/format_reward_func": 0.9955357164144516, "step": 4376 }, { "completion_length": 237.3794755935669, "epoch": 0.7339787920700783, "grad_norm": 0.21844899000323725, "kl": 0.0857391357421875, "learning_rate": 4.999922594694059e-07, "loss": 0.0001, "reward": 1.7982143312692642, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.8026786036789417, "rewards/format_reward_func": 0.9955357164144516, "step": 4378 }, { "completion_length": 228.34822463989258, "epoch": 0.734314095309946, "grad_norm": 0.31139955126338215, "kl": 0.0860137939453125, "learning_rate": 4.999921647853911e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 4380 }, { "completion_length": 236.80804538726807, "epoch": 0.7346493985498135, "grad_norm": 0.25288666916101726, "kl": 0.0904083251953125, "learning_rate": 4.999920695258016e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 4382 }, { "completion_length": 237.9509038925171, "epoch": 0.734984701789681, "grad_norm": 0.15349708899045258, "kl": 0.0903472900390625, "learning_rate": 4.999919736906377e-07, "loss": 0.0001, "reward": 1.7535714879631996, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714507102966, "rewards/format_reward_func": 1.0, "step": 4384 }, { "completion_length": 233.35715198516846, "epoch": 0.7353200050295486, "grad_norm": 0.20999069531459416, "kl": 0.0869598388671875, "learning_rate": 4.999918772798995e-07, "loss": 0.0001, "reward": 1.6910715103149414, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.6955357491970062, "rewards/format_reward_func": 0.9955357164144516, "step": 4386 }, { "completion_length": 225.27679538726807, "epoch": 0.7356553082694162, "grad_norm": 0.21644641902809753, "kl": 0.0892486572265625, "learning_rate": 4.999917802935875e-07, "loss": 0.0001, "reward": 1.7607143744826317, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 4388 }, { "completion_length": 242.883939743042, "epoch": 0.7359906115092837, "grad_norm": 0.2097634616773623, "kl": 0.10491943359375, "learning_rate": 4.999916827317016e-07, "loss": 0.0001, "reward": 1.771428644657135, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 4390 }, { "completion_length": 234.321439743042, "epoch": 0.7363259147491512, "grad_norm": 0.301361903974932, "kl": 0.0983428955078125, "learning_rate": 4.99991584594242e-07, "loss": 0.0001, "reward": 1.7267857491970062, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7312500327825546, "rewards/format_reward_func": 0.9955357164144516, "step": 4392 }, { "completion_length": 240.28572463989258, "epoch": 0.7366612179890188, "grad_norm": 0.2971735765340939, "kl": 0.106536865234375, "learning_rate": 4.999914858812094e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 4394 }, { "completion_length": 236.8035831451416, "epoch": 0.7369965212288864, "grad_norm": 0.24862167834729915, "kl": 0.1034393310546875, "learning_rate": 4.999913865926035e-07, "loss": 0.0001, "reward": 1.7839286252856255, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 4396 }, { "completion_length": 236.5892972946167, "epoch": 0.7373318244687539, "grad_norm": 0.17730660559017838, "kl": 0.1024017333984375, "learning_rate": 4.999912867284247e-07, "loss": 0.0001, "reward": 1.7589286267757416, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7633928842842579, "rewards/format_reward_func": 0.9955357164144516, "step": 4398 }, { "completion_length": 235.58929538726807, "epoch": 0.7376671277086215, "grad_norm": 0.19618760815384542, "kl": 0.0888214111328125, "learning_rate": 4.999911862886735e-07, "loss": 0.0001, "reward": 1.7714286223053932, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714285925030708, "rewards/format_reward_func": 1.0, "step": 4400 }, { "completion_length": 241.71429824829102, "epoch": 0.738002430948489, "grad_norm": 0.23361102730300204, "kl": 0.097503662109375, "learning_rate": 4.999910852733496e-07, "loss": 0.0001, "reward": 1.767857201397419, "reward_std": 0.09596449043601751, "rewards/equation_reward_func": 0.7857143059372902, "rewards/format_reward_func": 0.9821428656578064, "step": 4402 }, { "completion_length": 230.92858123779297, "epoch": 0.7383377341883566, "grad_norm": 0.13161910784321273, "kl": 0.102325439453125, "learning_rate": 4.999909836824538e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7642857544124126, "rewards/format_reward_func": 1.0, "step": 4404 }, { "completion_length": 238.383939743042, "epoch": 0.7386730374282241, "grad_norm": 0.11006102889865772, "kl": 0.088531494140625, "learning_rate": 4.999908815159859e-07, "loss": 0.0001, "reward": 1.741071492433548, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7455357387661934, "rewards/format_reward_func": 0.9955357164144516, "step": 4406 }, { "completion_length": 242.133939743042, "epoch": 0.7390083406680917, "grad_norm": 0.21817612106604028, "kl": 0.0933990478515625, "learning_rate": 4.999907787739464e-07, "loss": 0.0001, "reward": 1.7571429535746574, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571428753435612, "rewards/format_reward_func": 1.0, "step": 4408 }, { "completion_length": 240.34375858306885, "epoch": 0.7393436439079593, "grad_norm": 0.15413740306942578, "kl": 0.0780181884765625, "learning_rate": 4.999906754563354e-07, "loss": 0.0001, "reward": 1.7446429282426834, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7491071745753288, "rewards/format_reward_func": 0.9955357164144516, "step": 4410 }, { "completion_length": 231.25893783569336, "epoch": 0.7396789471478268, "grad_norm": 0.14813305726517229, "kl": 0.089752197265625, "learning_rate": 4.999905715631532e-07, "loss": 0.0001, "reward": 1.7857143357396126, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.785714328289032, "rewards/format_reward_func": 1.0, "step": 4412 }, { "completion_length": 236.66072273254395, "epoch": 0.7400142503876944, "grad_norm": 0.46415342718216857, "kl": 0.08685302734375, "learning_rate": 4.999904670943999e-07, "loss": 0.0001, "reward": 1.7892857640981674, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7892857231199741, "rewards/format_reward_func": 1.0, "step": 4414 }, { "completion_length": 231.31697463989258, "epoch": 0.7403495536275619, "grad_norm": 0.2764112079171355, "kl": 0.081451416015625, "learning_rate": 4.999903620500759e-07, "loss": 0.0001, "reward": 1.805357202887535, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.8098214566707611, "rewards/format_reward_func": 0.9955357164144516, "step": 4416 }, { "completion_length": 230.25447368621826, "epoch": 0.7406848568674295, "grad_norm": 0.11346544928980262, "kl": 0.0750732421875, "learning_rate": 4.999902564301815e-07, "loss": 0.0001, "reward": 1.8035714626312256, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8035714626312256, "rewards/format_reward_func": 1.0, "step": 4418 }, { "completion_length": 227.22768878936768, "epoch": 0.741020160107297, "grad_norm": 0.23948562808508556, "kl": 0.0700836181640625, "learning_rate": 4.999901502347168e-07, "loss": 0.0001, "reward": 1.8107143342494965, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.810714315623045, "rewards/format_reward_func": 1.0, "step": 4420 }, { "completion_length": 230.86608409881592, "epoch": 0.7413554633471646, "grad_norm": 0.2489915856046395, "kl": 0.0712127685546875, "learning_rate": 4.99990043463682e-07, "loss": 0.0001, "reward": 1.6915179416537285, "reward_std": 0.06250318652018905, "rewards/equation_reward_func": 0.6928571667522192, "rewards/format_reward_func": 0.9986607171595097, "step": 4422 }, { "completion_length": 228.29911613464355, "epoch": 0.7416907665870321, "grad_norm": 0.1812571795709786, "kl": 0.077301025390625, "learning_rate": 4.999899361170775e-07, "loss": 0.0001, "reward": 1.7290179431438446, "reward_std": 0.0599778073374182, "rewards/equation_reward_func": 0.7348214685916901, "rewards/format_reward_func": 0.9941964335739613, "step": 4424 }, { "completion_length": 230.34375858306885, "epoch": 0.7420260698268997, "grad_norm": 0.26999021740137746, "kl": 0.0660858154296875, "learning_rate": 4.999898281949035e-07, "loss": 0.0001, "reward": 1.7000000849366188, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.708928631618619, "rewards/format_reward_func": 0.9910714328289032, "step": 4426 }, { "completion_length": 225.80358028411865, "epoch": 0.7423613730667673, "grad_norm": 0.30457239085627186, "kl": 0.0674896240234375, "learning_rate": 4.999897196971602e-07, "loss": 0.0001, "reward": 1.7214286550879478, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7214286103844643, "rewards/format_reward_func": 1.0, "step": 4428 }, { "completion_length": 224.40626049041748, "epoch": 0.7426966763066348, "grad_norm": 0.21223745143930695, "kl": 0.0684814453125, "learning_rate": 4.999896106238479e-07, "loss": 0.0001, "reward": 1.8035714700818062, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714589059353, "rewards/format_reward_func": 1.0, "step": 4430 }, { "completion_length": 227.92411708831787, "epoch": 0.7430319795465024, "grad_norm": 0.26564997925657297, "kl": 0.06092071533203125, "learning_rate": 4.999895009749667e-07, "loss": 0.0001, "reward": 1.7535714879631996, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714693367481, "rewards/format_reward_func": 1.0, "step": 4432 }, { "completion_length": 218.03126049041748, "epoch": 0.7433672827863699, "grad_norm": 0.23386483907548092, "kl": 0.068023681640625, "learning_rate": 4.999893907505172e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571428827941418, "rewards/format_reward_func": 1.0, "step": 4434 }, { "completion_length": 227.23661518096924, "epoch": 0.7437025860262375, "grad_norm": 0.2580465030816322, "kl": 0.065032958984375, "learning_rate": 4.999892799504992e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 4436 }, { "completion_length": 218.92411613464355, "epoch": 0.744037889266105, "grad_norm": 0.3291190988547193, "kl": 0.087890625, "learning_rate": 4.999891685749133e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 4438 }, { "completion_length": 230.83483219146729, "epoch": 0.7443731925059726, "grad_norm": 0.20748276719564904, "kl": 0.0592041015625, "learning_rate": 4.999890566237597e-07, "loss": 0.0001, "reward": 1.8232143446803093, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.827678594738245, "rewards/format_reward_func": 0.9955357164144516, "step": 4440 }, { "completion_length": 223.0892972946167, "epoch": 0.7447084957458402, "grad_norm": 0.3388534696826737, "kl": 0.067169189453125, "learning_rate": 4.999889440970385e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714562982321, "rewards/format_reward_func": 1.0, "step": 4442 }, { "completion_length": 222.49554634094238, "epoch": 0.7450437989857077, "grad_norm": 0.2706600096126691, "kl": 0.0620880126953125, "learning_rate": 4.999888309947501e-07, "loss": 0.0001, "reward": 1.7375000640749931, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419643253087997, "rewards/format_reward_func": 0.9955357164144516, "step": 4444 }, { "completion_length": 221.94197368621826, "epoch": 0.7453791022255752, "grad_norm": 0.24547408044411922, "kl": 0.06488037109375, "learning_rate": 4.999887173168947e-07, "loss": 0.0001, "reward": 1.7285714969038963, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7285714633762836, "rewards/format_reward_func": 1.0, "step": 4446 }, { "completion_length": 214.54018783569336, "epoch": 0.7457144054654428, "grad_norm": 0.1853125626531355, "kl": 0.0753631591796875, "learning_rate": 4.999886030634727e-07, "loss": 0.0001, "reward": 1.7571429163217545, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428790688515, "rewards/format_reward_func": 1.0, "step": 4448 }, { "completion_length": 221.7053680419922, "epoch": 0.7460497087053104, "grad_norm": 0.19931767396673297, "kl": 0.0742950439453125, "learning_rate": 4.999884882344842e-07, "loss": 0.0001, "reward": 1.7535714879631996, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 4450 }, { "completion_length": 217.12947273254395, "epoch": 0.7463850119451779, "grad_norm": 0.1636954950166591, "kl": 0.06884765625, "learning_rate": 4.999883728299294e-07, "loss": 0.0001, "reward": 1.796428620815277, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 4452 }, { "completion_length": 215.13840103149414, "epoch": 0.7467203151850454, "grad_norm": 0.2250329654524265, "kl": 0.07806396484375, "learning_rate": 4.999882568498087e-07, "loss": 0.0001, "reward": 1.7107143476605415, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7107143308967352, "rewards/format_reward_func": 1.0, "step": 4454 }, { "completion_length": 213.21429443359375, "epoch": 0.7470556184249131, "grad_norm": 0.2505331278808723, "kl": 0.0777130126953125, "learning_rate": 4.999881402941225e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.792857164517045, "rewards/format_reward_func": 1.0, "step": 4456 }, { "completion_length": 222.19197463989258, "epoch": 0.7473909216647806, "grad_norm": 0.16466956778998829, "kl": 0.0843505859375, "learning_rate": 4.999880231628707e-07, "loss": 0.0001, "reward": 1.74910718947649, "reward_std": 0.05177031829953194, "rewards/equation_reward_func": 0.7553571872413158, "rewards/format_reward_func": 0.9937500059604645, "step": 4458 }, { "completion_length": 221.11161613464355, "epoch": 0.7477262249046481, "grad_norm": 0.2396358314615966, "kl": 0.0821380615234375, "learning_rate": 4.999879054560539e-07, "loss": 0.0001, "reward": 1.74642863124609, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 4460 }, { "completion_length": 227.1651906967163, "epoch": 0.7480615281445157, "grad_norm": 0.33908675843157704, "kl": 0.0939178466796875, "learning_rate": 4.999877871736723e-07, "loss": 0.0001, "reward": 1.7892857939004898, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.789285734295845, "rewards/format_reward_func": 1.0, "step": 4462 }, { "completion_length": 216.33929538726807, "epoch": 0.7483968313843833, "grad_norm": 0.14003622530992152, "kl": 0.09588623046875, "learning_rate": 4.999876683157261e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 4464 }, { "completion_length": 218.0446538925171, "epoch": 0.7487321346242508, "grad_norm": 0.2560114087268138, "kl": 0.09271240234375, "learning_rate": 4.999875488822155e-07, "loss": 0.0001, "reward": 1.7892857789993286, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 4466 }, { "completion_length": 224.3437614440918, "epoch": 0.7490674378641183, "grad_norm": 0.31758820202141924, "kl": 0.07741546630859375, "learning_rate": 4.999874288731409e-07, "loss": 0.0001, "reward": 1.7232143729925156, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7276786118745804, "rewards/format_reward_func": 0.9955357164144516, "step": 4468 }, { "completion_length": 213.69643878936768, "epoch": 0.749402741103986, "grad_norm": 0.44047391205664077, "kl": 0.089691162109375, "learning_rate": 4.999873082885027e-07, "loss": 0.0001, "reward": 1.7142857983708382, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7142857387661934, "rewards/format_reward_func": 1.0, "step": 4470 }, { "completion_length": 207.86608123779297, "epoch": 0.7497380443438535, "grad_norm": 0.4861281163334542, "kl": 0.09527587890625, "learning_rate": 4.999871871283008e-07, "loss": 0.0001, "reward": 1.7571429386734962, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571428827941418, "rewards/format_reward_func": 1.0, "step": 4472 }, { "completion_length": 209.946439743042, "epoch": 0.750073347583721, "grad_norm": 0.19385061512985016, "kl": 0.1087646484375, "learning_rate": 4.999870653925359e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 4474 }, { "completion_length": 214.26786518096924, "epoch": 0.7504086508235885, "grad_norm": 0.15678270263892813, "kl": 0.098419189453125, "learning_rate": 4.99986943081208e-07, "loss": 0.0001, "reward": 1.737500086426735, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.741964315995574, "rewards/format_reward_func": 0.9955357164144516, "step": 4476 }, { "completion_length": 203.92411613464355, "epoch": 0.7507439540634562, "grad_norm": 0.2732266849605991, "kl": 0.11285400390625, "learning_rate": 4.999868201943175e-07, "loss": 0.0001, "reward": 1.7089286595582962, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7133928798139095, "rewards/format_reward_func": 0.9955357164144516, "step": 4478 }, { "completion_length": 215.11161613464355, "epoch": 0.7510792573033237, "grad_norm": 0.17807125877809504, "kl": 0.111358642578125, "learning_rate": 4.999866967318645e-07, "loss": 0.0001, "reward": 1.7071429342031479, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7071428913623095, "rewards/format_reward_func": 1.0, "step": 4480 }, { "completion_length": 216.14286708831787, "epoch": 0.7514145605431912, "grad_norm": 0.25479605394237403, "kl": 0.1113739013671875, "learning_rate": 4.999865726938497e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 4482 }, { "completion_length": 208.52233028411865, "epoch": 0.7517498637830589, "grad_norm": 0.3540568480128976, "kl": 0.128082275390625, "learning_rate": 4.999864480802729e-07, "loss": 0.0001, "reward": 1.7142858281731606, "reward_std": 0.07071067858487368, "rewards/equation_reward_func": 0.7232143059372902, "rewards/format_reward_func": 0.9910714328289032, "step": 4484 }, { "completion_length": 222.41072273254395, "epoch": 0.7520851670229264, "grad_norm": 0.28369640416103614, "kl": 0.104766845703125, "learning_rate": 4.999863228911347e-07, "loss": 0.0001, "reward": 1.7214286476373672, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7294643130153418, "rewards/format_reward_func": 0.9919642955064774, "step": 4486 }, { "completion_length": 213.75893783569336, "epoch": 0.7524204702627939, "grad_norm": 0.0049135323066809514, "kl": 0.1141357421875, "learning_rate": 4.999861971264353e-07, "loss": 0.0001, "reward": 1.6982143744826317, "reward_std": 0.012626906856894493, "rewards/equation_reward_func": 0.7026786096394062, "rewards/format_reward_func": 0.9955357164144516, "step": 4488 }, { "completion_length": 221.0803680419922, "epoch": 0.7527557735026614, "grad_norm": 0.20714617745583425, "kl": 0.104827880859375, "learning_rate": 4.999860707861751e-07, "loss": 0.0001, "reward": 1.7607143446803093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143223285675, "rewards/format_reward_func": 1.0, "step": 4490 }, { "completion_length": 216.79911708831787, "epoch": 0.7530910767425291, "grad_norm": 0.3164850402145241, "kl": 0.0961151123046875, "learning_rate": 4.999859438703541e-07, "loss": 0.0001, "reward": 1.8017857670783997, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.8062500357627869, "rewards/format_reward_func": 0.9955357164144516, "step": 4492 }, { "completion_length": 216.8303680419922, "epoch": 0.7534263799823966, "grad_norm": 0.41792000510914323, "kl": 0.1044158935546875, "learning_rate": 4.999858163789728e-07, "loss": 0.0001, "reward": 1.7732143551111221, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7776785902678967, "rewards/format_reward_func": 0.9955357164144516, "step": 4494 }, { "completion_length": 220.0446548461914, "epoch": 0.7537616832222641, "grad_norm": 0.3411173294484826, "kl": 0.0836029052734375, "learning_rate": 4.999856883120314e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 4496 }, { "completion_length": 220.37500858306885, "epoch": 0.7540969864621316, "grad_norm": 0.2682306203706297, "kl": 0.0904388427734375, "learning_rate": 4.999855596695304e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000316649675, "rewards/format_reward_func": 1.0, "step": 4498 }, { "completion_length": 219.66072273254395, "epoch": 0.7544322897019993, "grad_norm": 0.25123051432297105, "kl": 0.0814666748046875, "learning_rate": 4.999854304514699e-07, "loss": 0.0001, "reward": 1.7767857760190964, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7812500335276127, "rewards/format_reward_func": 0.9955357164144516, "step": 4500 }, { "completion_length": 221.74554538726807, "epoch": 0.7547675929418668, "grad_norm": 0.25857901554901946, "kl": 0.0801544189453125, "learning_rate": 4.999853006578503e-07, "loss": 0.0001, "reward": 1.771428644657135, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714285925030708, "rewards/format_reward_func": 1.0, "step": 4502 }, { "completion_length": 227.44643878936768, "epoch": 0.7551028961817343, "grad_norm": 0.37851301366513695, "kl": 0.092071533203125, "learning_rate": 4.999851702886719e-07, "loss": 0.0001, "reward": 1.741071492433548, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7455357555299997, "rewards/format_reward_func": 0.9955357164144516, "step": 4504 }, { "completion_length": 225.27233219146729, "epoch": 0.755438199421602, "grad_norm": 0.2513086311013245, "kl": 0.0858306884765625, "learning_rate": 4.999850393439348e-07, "loss": 0.0001, "reward": 1.7035714983940125, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7035714853554964, "rewards/format_reward_func": 1.0, "step": 4506 }, { "completion_length": 226.72768688201904, "epoch": 0.7557735026614695, "grad_norm": 0.3411021833760753, "kl": 0.0855865478515625, "learning_rate": 4.999849078236395e-07, "loss": 0.0001, "reward": 1.7035715132951736, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7035714630037546, "rewards/format_reward_func": 1.0, "step": 4508 }, { "completion_length": 223.00001049041748, "epoch": 0.756108805901337, "grad_norm": 0.2224263673764874, "kl": 0.09161376953125, "learning_rate": 4.999847757277862e-07, "loss": 0.0001, "reward": 1.8500000089406967, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8500000238418579, "rewards/format_reward_func": 1.0, "step": 4510 }, { "completion_length": 233.04911708831787, "epoch": 0.7564441091412045, "grad_norm": 0.003793286746061747, "kl": 0.0831298828125, "learning_rate": 4.999846430563753e-07, "loss": 0.0001, "reward": 1.7571429088711739, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 4512 }, { "completion_length": 226.94197368621826, "epoch": 0.7567794123810722, "grad_norm": 0.27291864041588626, "kl": 0.0889892578125, "learning_rate": 4.999845098094071e-07, "loss": 0.0001, "reward": 1.7250000834465027, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7250000387430191, "rewards/format_reward_func": 1.0, "step": 4514 }, { "completion_length": 224.39733219146729, "epoch": 0.7571147156209397, "grad_norm": 0.22265584848231698, "kl": 0.087921142578125, "learning_rate": 4.999843759868818e-07, "loss": 0.0001, "reward": 1.7589286118745804, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7633928917348385, "rewards/format_reward_func": 0.9955357164144516, "step": 4516 }, { "completion_length": 228.15179538726807, "epoch": 0.7574500188608072, "grad_norm": 0.20859738186600213, "kl": 0.08905029296875, "learning_rate": 4.999842415887999e-07, "loss": 0.0001, "reward": 1.8214286118745804, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8214285969734192, "rewards/format_reward_func": 1.0, "step": 4518 }, { "completion_length": 235.78125858306885, "epoch": 0.7577853221006748, "grad_norm": 0.2778610372869086, "kl": 0.10809326171875, "learning_rate": 4.999841066151615e-07, "loss": 0.0001, "reward": 1.7464286610484123, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 4520 }, { "completion_length": 222.90625858306885, "epoch": 0.7581206253405424, "grad_norm": 0.47212251372452274, "kl": 0.087066650390625, "learning_rate": 4.99983971065967e-07, "loss": 0.0001, "reward": 1.758928619325161, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.763392886146903, "rewards/format_reward_func": 0.9955357164144516, "step": 4522 }, { "completion_length": 226.86161613464355, "epoch": 0.7584559285804099, "grad_norm": 0.17318141957091432, "kl": 0.08984375, "learning_rate": 4.999838349412168e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143357396126, "rewards/format_reward_func": 1.0, "step": 4524 }, { "completion_length": 235.1741180419922, "epoch": 0.7587912318202774, "grad_norm": 0.2131832104092294, "kl": 0.0755767822265625, "learning_rate": 4.99983698240911e-07, "loss": 0.0001, "reward": 1.7290179133415222, "reward_std": 0.04987628059461713, "rewards/equation_reward_func": 0.7348214574158192, "rewards/format_reward_func": 0.9941964335739613, "step": 4526 }, { "completion_length": 236.27233123779297, "epoch": 0.759126535060145, "grad_norm": 0.13201408256364378, "kl": 0.06096649169921875, "learning_rate": 4.999835609650501e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 4528 }, { "completion_length": 240.81697273254395, "epoch": 0.7594618383000126, "grad_norm": 0.21052478496355254, "kl": 0.0613861083984375, "learning_rate": 4.999834231136344e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143074274063, "rewards/format_reward_func": 1.0, "step": 4530 }, { "completion_length": 241.55358219146729, "epoch": 0.7597971415398801, "grad_norm": 0.13556321579045813, "kl": 0.058197021484375, "learning_rate": 4.999832846866641e-07, "loss": 0.0001, "reward": 1.7843750566244125, "reward_std": 0.04230013629421592, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 0.9986607171595097, "step": 4532 }, { "completion_length": 227.63840579986572, "epoch": 0.7601324447797477, "grad_norm": 0.23882174356311658, "kl": 0.069305419921875, "learning_rate": 4.999831456841395e-07, "loss": 0.0001, "reward": 1.8125000670552254, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.8169643171131611, "rewards/format_reward_func": 0.9955357164144516, "step": 4534 }, { "completion_length": 237.0134048461914, "epoch": 0.7604677480196153, "grad_norm": 0.18469807951411582, "kl": 0.059722900390625, "learning_rate": 4.999830061060613e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571428827941418, "rewards/format_reward_func": 1.0, "step": 4536 }, { "completion_length": 233.2991180419922, "epoch": 0.7608030512594828, "grad_norm": 0.23399028758696575, "kl": 0.05745697021484375, "learning_rate": 4.999828659524293e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714562982321, "rewards/format_reward_func": 1.0, "step": 4538 }, { "completion_length": 239.99108219146729, "epoch": 0.7611383544993503, "grad_norm": 0.18787150584745277, "kl": 0.0531463623046875, "learning_rate": 4.999827252232441e-07, "loss": 0.0001, "reward": 1.7571429461240768, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428883820772, "rewards/format_reward_func": 1.0, "step": 4540 }, { "completion_length": 231.78125953674316, "epoch": 0.7614736577392179, "grad_norm": 0.3122036679038995, "kl": 0.05462646484375, "learning_rate": 4.99982583918506e-07, "loss": 0.0001, "reward": 1.7232143506407738, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7276786062866449, "rewards/format_reward_func": 0.9955357164144516, "step": 4542 }, { "completion_length": 237.63393878936768, "epoch": 0.7618089609790855, "grad_norm": 0.3230807616701815, "kl": 0.0553741455078125, "learning_rate": 4.999824420382153e-07, "loss": 0.0001, "reward": 1.7285715267062187, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7285714484751225, "rewards/format_reward_func": 1.0, "step": 4544 }, { "completion_length": 236.14286994934082, "epoch": 0.762144264218953, "grad_norm": 0.27987783469978955, "kl": 0.0552215576171875, "learning_rate": 4.999822995823724e-07, "loss": 0.0001, "reward": 1.7839286550879478, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7883928865194321, "rewards/format_reward_func": 0.9955357164144516, "step": 4546 }, { "completion_length": 235.84822368621826, "epoch": 0.7624795674588206, "grad_norm": 0.297388880050552, "kl": 0.0589752197265625, "learning_rate": 4.999821565509774e-07, "loss": 0.0001, "reward": 1.7392857745289803, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7392857503145933, "rewards/format_reward_func": 1.0, "step": 4548 }, { "completion_length": 227.46429538726807, "epoch": 0.7628148706986881, "grad_norm": 0.18228938587993593, "kl": 0.0537109375, "learning_rate": 4.999820129440309e-07, "loss": 0.0001, "reward": 1.7593750804662704, "reward_std": 0.02714784862473607, "rewards/equation_reward_func": 0.7607143223285675, "rewards/format_reward_func": 0.9986607171595097, "step": 4550 }, { "completion_length": 238.2500114440918, "epoch": 0.7631501739385557, "grad_norm": 0.29408530140054356, "kl": 0.0583953857421875, "learning_rate": 4.999818687615332e-07, "loss": 0.0001, "reward": 1.7379465028643608, "reward_std": 0.037249373737722635, "rewards/equation_reward_func": 0.7392857410013676, "rewards/format_reward_func": 0.9986607171595097, "step": 4552 }, { "completion_length": 237.05358505249023, "epoch": 0.7634854771784232, "grad_norm": 0.21134377845046912, "kl": 0.0565338134765625, "learning_rate": 4.999817240034843e-07, "loss": 0.0001, "reward": 1.7964286506175995, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.796428594738245, "rewards/format_reward_func": 1.0, "step": 4554 }, { "completion_length": 237.1785831451416, "epoch": 0.7638207804182908, "grad_norm": 0.3860091198419233, "kl": 0.0579986572265625, "learning_rate": 4.99981578669885e-07, "loss": 0.0001, "reward": 1.7107143625617027, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7107143253087997, "rewards/format_reward_func": 1.0, "step": 4556 }, { "completion_length": 233.02233219146729, "epoch": 0.7641560836581583, "grad_norm": 0.22527084686531942, "kl": 0.05401611328125, "learning_rate": 4.999814327607354e-07, "loss": 0.0001, "reward": 1.8196429163217545, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.8241071775555611, "rewards/format_reward_func": 0.9955357164144516, "step": 4558 }, { "completion_length": 235.0357255935669, "epoch": 0.7644913868980259, "grad_norm": 0.20327881479355508, "kl": 0.0784149169921875, "learning_rate": 4.999812862760358e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964286059141159, "rewards/format_reward_func": 1.0, "step": 4560 }, { "completion_length": 246.8794765472412, "epoch": 0.7648266901378935, "grad_norm": 0.27998225865935156, "kl": 0.05908203125, "learning_rate": 4.999811392157866e-07, "loss": 0.0001, "reward": 1.7178572118282318, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7267857566475868, "rewards/format_reward_func": 0.9910714328289032, "step": 4562 }, { "completion_length": 230.05804634094238, "epoch": 0.765161993377761, "grad_norm": 0.22216394342281265, "kl": 0.0649871826171875, "learning_rate": 4.999809915799882e-07, "loss": 0.0001, "reward": 1.776785783469677, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7812500335276127, "rewards/format_reward_func": 0.9955357164144516, "step": 4564 }, { "completion_length": 238.883939743042, "epoch": 0.7654972966176286, "grad_norm": 0.31322171637204343, "kl": 0.060089111328125, "learning_rate": 4.999808433686408e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 4566 }, { "completion_length": 233.1607265472412, "epoch": 0.7658325998574961, "grad_norm": 0.2450290221357226, "kl": 0.0533905029296875, "learning_rate": 4.999806945817447e-07, "loss": 0.0001, "reward": 1.7910714820027351, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7955357506871223, "rewards/format_reward_func": 0.9955357164144516, "step": 4568 }, { "completion_length": 241.15626049041748, "epoch": 0.7661679030973637, "grad_norm": 0.189232939152715, "kl": 0.050811767578125, "learning_rate": 4.999805452193006e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 4570 }, { "completion_length": 234.21429634094238, "epoch": 0.7665032063372312, "grad_norm": 0.2246744520966242, "kl": 0.05870819091796875, "learning_rate": 4.999803952813084e-07, "loss": 0.0001, "reward": 1.6964286491274834, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.6964286137372255, "rewards/format_reward_func": 1.0, "step": 4572 }, { "completion_length": 232.60268783569336, "epoch": 0.7668385095770988, "grad_norm": 0.3987041317522369, "kl": 0.05547332763671875, "learning_rate": 4.999802447677688e-07, "loss": 0.0001, "reward": 1.8250000476837158, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8250000216066837, "rewards/format_reward_func": 1.0, "step": 4574 }, { "completion_length": 233.89733123779297, "epoch": 0.7671738128169664, "grad_norm": 0.285725683188748, "kl": 0.0704803466796875, "learning_rate": 4.999800936786818e-07, "loss": 0.0001, "reward": 1.7892857789993286, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 1.0, "step": 4576 }, { "completion_length": 226.71429538726807, "epoch": 0.7675091160568339, "grad_norm": 0.13415930213725472, "kl": 0.0756378173828125, "learning_rate": 4.999799420140481e-07, "loss": 0.0001, "reward": 1.7410714998841286, "reward_std": 0.0328299580141902, "rewards/equation_reward_func": 0.745535746216774, "rewards/format_reward_func": 0.9955357164144516, "step": 4578 }, { "completion_length": 223.12054824829102, "epoch": 0.7678444192967014, "grad_norm": 0.26553482124859545, "kl": 0.0814056396484375, "learning_rate": 4.999797897738679e-07, "loss": 0.0001, "reward": 1.8071429058909416, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428835391998, "rewards/format_reward_func": 1.0, "step": 4580 }, { "completion_length": 227.18750858306885, "epoch": 0.768179722536569, "grad_norm": 0.25025503420441153, "kl": 0.0826568603515625, "learning_rate": 4.999796369581414e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000223517418, "rewards/format_reward_func": 1.0, "step": 4582 }, { "completion_length": 227.69643783569336, "epoch": 0.7685150257764366, "grad_norm": 0.24404022688359478, "kl": 0.0773773193359375, "learning_rate": 4.999794835668692e-07, "loss": 0.0001, "reward": 1.7571429163217545, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 4584 }, { "completion_length": 230.09375953674316, "epoch": 0.7688503290163041, "grad_norm": 0.29632477108847594, "kl": 0.0727691650390625, "learning_rate": 4.999793296000515e-07, "loss": 0.0001, "reward": 1.8071429058909416, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8071428947150707, "rewards/format_reward_func": 1.0, "step": 4586 }, { "completion_length": 227.6205472946167, "epoch": 0.7691856322561716, "grad_norm": 0.13950592949953058, "kl": 0.0792388916015625, "learning_rate": 4.999791750576887e-07, "loss": 0.0001, "reward": 1.7464286461472511, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286126196384, "rewards/format_reward_func": 1.0, "step": 4588 }, { "completion_length": 226.1071548461914, "epoch": 0.7695209354960393, "grad_norm": 0.16795992322820116, "kl": 0.0785980224609375, "learning_rate": 4.999790199397813e-07, "loss": 0.0001, "reward": 1.7892857566475868, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857436090708, "rewards/format_reward_func": 1.0, "step": 4590 }, { "completion_length": 233.9732255935669, "epoch": 0.7698562387359068, "grad_norm": 0.2591469522626041, "kl": 0.0863189697265625, "learning_rate": 4.999788642463293e-07, "loss": 0.0001, "reward": 1.7678571939468384, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 1.0, "step": 4592 }, { "completion_length": 230.99108219146729, "epoch": 0.7701915419757743, "grad_norm": 0.18624617499016574, "kl": 0.083465576171875, "learning_rate": 4.999787079773333e-07, "loss": 0.0001, "reward": 1.710714377462864, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7107143178582191, "rewards/format_reward_func": 1.0, "step": 4594 }, { "completion_length": 226.89733123779297, "epoch": 0.7705268452156419, "grad_norm": 0.32854213372203656, "kl": 0.06622314453125, "learning_rate": 4.999785511327936e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 4596 }, { "completion_length": 228.39733123779297, "epoch": 0.7708621484555095, "grad_norm": 0.27153695010869333, "kl": 0.080352783203125, "learning_rate": 4.999783937127107e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857544124126, "rewards/format_reward_func": 1.0, "step": 4598 }, { "completion_length": 229.54018878936768, "epoch": 0.771197451695377, "grad_norm": 0.20648485712931536, "kl": 0.068572998046875, "learning_rate": 4.999782357170849e-07, "loss": 0.0001, "reward": 1.767857201397419, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571734577417, "rewards/format_reward_func": 1.0, "step": 4600 }, { "completion_length": 221.12947463989258, "epoch": 0.7715327549352445, "grad_norm": 0.2005184739035029, "kl": 0.070556640625, "learning_rate": 4.999780771459164e-07, "loss": 0.0001, "reward": 1.814285770058632, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.814285758882761, "rewards/format_reward_func": 1.0, "step": 4602 }, { "completion_length": 238.4553689956665, "epoch": 0.7718680581751122, "grad_norm": 0.31052239666651466, "kl": 0.065032958984375, "learning_rate": 4.999779179992057e-07, "loss": 0.0001, "reward": 1.7808036282658577, "reward_std": 0.037249373737722635, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 0.9986607171595097, "step": 4604 }, { "completion_length": 238.0089406967163, "epoch": 0.7722033614149797, "grad_norm": 0.1775028297389513, "kl": 0.071075439453125, "learning_rate": 4.999777582769532e-07, "loss": 0.0001, "reward": 1.7107143849134445, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7107143178582191, "rewards/format_reward_func": 1.0, "step": 4606 }, { "completion_length": 240.54465293884277, "epoch": 0.7725386646548472, "grad_norm": 0.4550589791379301, "kl": 0.0648345947265625, "learning_rate": 4.999775979791591e-07, "loss": 0.0001, "reward": 1.7071429565548897, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7071429006755352, "rewards/format_reward_func": 1.0, "step": 4608 }, { "completion_length": 235.96875953674316, "epoch": 0.7728739678947147, "grad_norm": 0.2854396965216469, "kl": 0.067779541015625, "learning_rate": 4.999774371058239e-07, "loss": 0.0001, "reward": 1.7392857819795609, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857596278191, "rewards/format_reward_func": 1.0, "step": 4610 }, { "completion_length": 233.13393878936768, "epoch": 0.7732092711345824, "grad_norm": 0.3207019572513585, "kl": 0.085296630859375, "learning_rate": 4.999772756569482e-07, "loss": 0.0001, "reward": 1.725000075995922, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7250000573694706, "rewards/format_reward_func": 1.0, "step": 4612 }, { "completion_length": 236.45983219146729, "epoch": 0.7735445743744499, "grad_norm": 0.3600593221437672, "kl": 0.0930633544921875, "learning_rate": 4.999771136325318e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 4614 }, { "completion_length": 242.47322463989258, "epoch": 0.7738798776143174, "grad_norm": 0.4511198912935247, "kl": 0.069305419921875, "learning_rate": 4.999769510325756e-07, "loss": 0.0001, "reward": 1.7446429282426834, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7491071838885546, "rewards/format_reward_func": 0.9955357164144516, "step": 4616 }, { "completion_length": 235.93304634094238, "epoch": 0.7742151808541851, "grad_norm": 0.26591008144932465, "kl": 0.0665130615234375, "learning_rate": 4.999767878570797e-07, "loss": 0.0001, "reward": 1.7428572103381157, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7428571693599224, "rewards/format_reward_func": 1.0, "step": 4618 }, { "completion_length": 245.4062614440918, "epoch": 0.7745504840940526, "grad_norm": 0.2758105569697166, "kl": 0.062347412109375, "learning_rate": 4.999766241060446e-07, "loss": 0.0001, "reward": 1.7392858043313026, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7392857540398836, "rewards/format_reward_func": 1.0, "step": 4620 }, { "completion_length": 239.96429443359375, "epoch": 0.7748857873339201, "grad_norm": 0.1850762925480341, "kl": 0.0596771240234375, "learning_rate": 4.999764597794706e-07, "loss": 0.0001, "reward": 1.7464286535978317, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 4622 }, { "completion_length": 233.30358219146729, "epoch": 0.7752210905737876, "grad_norm": 0.2923727962502333, "kl": 0.0526580810546875, "learning_rate": 4.999762948773581e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 1.0, "step": 4624 }, { "completion_length": 237.18750953674316, "epoch": 0.7755563938136553, "grad_norm": 0.29674990766493126, "kl": 0.0712127685546875, "learning_rate": 4.999761293997074e-07, "loss": 0.0001, "reward": 1.741071492433548, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7455357573926449, "rewards/format_reward_func": 0.9955357164144516, "step": 4626 }, { "completion_length": 243.34376335144043, "epoch": 0.7758916970535228, "grad_norm": 0.12933399523501038, "kl": 0.065521240234375, "learning_rate": 4.999759633465191e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857357859612, "rewards/format_reward_func": 1.0, "step": 4628 }, { "completion_length": 243.59375953674316, "epoch": 0.7762270002933903, "grad_norm": 0.3046586914770116, "kl": 0.0637664794921875, "learning_rate": 4.999757967177934e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 4630 }, { "completion_length": 242.4151906967163, "epoch": 0.7765623035332578, "grad_norm": 0.18912232830456813, "kl": 0.056549072265625, "learning_rate": 4.999756295135306e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 4632 }, { "completion_length": 251.13393783569336, "epoch": 0.7768976067731255, "grad_norm": 0.45607946377712877, "kl": 0.0615692138671875, "learning_rate": 4.999754617337315e-07, "loss": 0.0001, "reward": 1.7285715118050575, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7285714633762836, "rewards/format_reward_func": 1.0, "step": 4634 }, { "completion_length": 254.1919765472412, "epoch": 0.777232910012993, "grad_norm": 0.20415688926947373, "kl": 0.0582427978515625, "learning_rate": 4.99975293378396e-07, "loss": 0.0001, "reward": 1.7696429342031479, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7741071656346321, "rewards/format_reward_func": 0.9955357164144516, "step": 4636 }, { "completion_length": 258.8482265472412, "epoch": 0.7775682132528605, "grad_norm": 0.35244348616472587, "kl": 0.06280517578125, "learning_rate": 4.999751244475247e-07, "loss": 0.0001, "reward": 1.7375000715255737, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.741964302957058, "rewards/format_reward_func": 0.9955357164144516, "step": 4638 }, { "completion_length": 242.8080472946167, "epoch": 0.7779035164927282, "grad_norm": 0.1982457603260857, "kl": 0.0550079345703125, "learning_rate": 4.99974954941118e-07, "loss": 0.0001, "reward": 1.7892857864499092, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 4640 }, { "completion_length": 244.2142972946167, "epoch": 0.7782388197325957, "grad_norm": 0.31280826798701716, "kl": 0.0586395263671875, "learning_rate": 4.999747848591763e-07, "loss": 0.0001, "reward": 1.7750000804662704, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 1.0, "step": 4642 }, { "completion_length": 245.86608219146729, "epoch": 0.7785741229724632, "grad_norm": 0.19968901429705566, "kl": 0.08355712890625, "learning_rate": 4.999746142017e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571812808514, "rewards/format_reward_func": 1.0, "step": 4644 }, { "completion_length": 250.78126525878906, "epoch": 0.7789094262123307, "grad_norm": 0.305616407661564, "kl": 0.063568115234375, "learning_rate": 4.999744429686894e-07, "loss": 0.0001, "reward": 1.7875000685453415, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7919643148779869, "rewards/format_reward_func": 0.9955357164144516, "step": 4646 }, { "completion_length": 250.68304824829102, "epoch": 0.7792447294521984, "grad_norm": 0.20280940768286793, "kl": 0.0625, "learning_rate": 4.99974271160145e-07, "loss": 0.0001, "reward": 1.8071429058909416, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428872644901, "rewards/format_reward_func": 1.0, "step": 4648 }, { "completion_length": 249.08929920196533, "epoch": 0.7795800326920659, "grad_norm": 0.26340091091539314, "kl": 0.0640411376953125, "learning_rate": 4.999740987760671e-07, "loss": 0.0001, "reward": 1.7504464983940125, "reward_std": 0.05997780757024884, "rewards/equation_reward_func": 0.7651786133646965, "rewards/format_reward_func": 0.9852678664028645, "step": 4650 }, { "completion_length": 248.95090579986572, "epoch": 0.7799153359319334, "grad_norm": 0.3629256277036554, "kl": 0.0641937255859375, "learning_rate": 4.999739258164561e-07, "loss": 0.0001, "reward": 1.7892857640981674, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 4652 }, { "completion_length": 243.58929538726807, "epoch": 0.780250639171801, "grad_norm": 0.1749645065771544, "kl": 0.0629425048828125, "learning_rate": 4.999737522813124e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571589291096, "rewards/format_reward_func": 1.0, "step": 4654 }, { "completion_length": 250.03125953674316, "epoch": 0.7805859424116686, "grad_norm": 0.5312740210004152, "kl": 0.0813446044921875, "learning_rate": 4.999735781706365e-07, "loss": 0.0001, "reward": 1.7821429073810577, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7910714615136385, "rewards/format_reward_func": 0.9910714328289032, "step": 4656 }, { "completion_length": 248.3616189956665, "epoch": 0.7809212456515361, "grad_norm": 0.1763252818665863, "kl": 0.0592498779296875, "learning_rate": 4.999734034844289e-07, "loss": 0.0001, "reward": 1.7821429297327995, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 4658 }, { "completion_length": 248.52233028411865, "epoch": 0.7812565488914036, "grad_norm": 0.27396564397788564, "kl": 0.080841064453125, "learning_rate": 4.999732282226896e-07, "loss": 0.0001, "reward": 1.7160715237259865, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7205357477068901, "rewards/format_reward_func": 0.9955357164144516, "step": 4660 }, { "completion_length": 249.57590579986572, "epoch": 0.7815918521312712, "grad_norm": 0.29187870403616006, "kl": 0.067626953125, "learning_rate": 4.999730523854194e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7589286006987095, "rewards/format_reward_func": 0.9910714328289032, "step": 4662 }, { "completion_length": 244.07143878936768, "epoch": 0.7819271553711388, "grad_norm": 0.2820194000485553, "kl": 0.0643157958984375, "learning_rate": 4.999728759726185e-07, "loss": 0.0001, "reward": 1.708928644657135, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7133928909897804, "rewards/format_reward_func": 0.9955357164144516, "step": 4664 }, { "completion_length": 246.00001049041748, "epoch": 0.7822624586110063, "grad_norm": 0.21389940795165543, "kl": 0.0571746826171875, "learning_rate": 4.999726989842874e-07, "loss": 0.0001, "reward": 1.8267857655882835, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.8312500230967999, "rewards/format_reward_func": 0.9955357164144516, "step": 4666 }, { "completion_length": 250.70536994934082, "epoch": 0.7825977618508739, "grad_norm": 0.20641362672396715, "kl": 0.0699310302734375, "learning_rate": 4.999725214204263e-07, "loss": 0.0001, "reward": 1.7589286416769028, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7633928842842579, "rewards/format_reward_func": 0.9955357164144516, "step": 4668 }, { "completion_length": 249.31697368621826, "epoch": 0.7829330650907415, "grad_norm": 0.25385673762727295, "kl": 0.059173583984375, "learning_rate": 4.999723432810359e-07, "loss": 0.0001, "reward": 1.764285795390606, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 4670 }, { "completion_length": 246.61608505249023, "epoch": 0.783268368330609, "grad_norm": 0.2611840826725265, "kl": 0.0884552001953125, "learning_rate": 4.999721645661165e-07, "loss": 0.0001, "reward": 1.773214340209961, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7776785977184772, "rewards/format_reward_func": 0.9955357164144516, "step": 4672 }, { "completion_length": 250.42411994934082, "epoch": 0.7836036715704765, "grad_norm": 0.6915758002266459, "kl": 0.102630615234375, "learning_rate": 4.999719852756685e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143022119999, "rewards/format_reward_func": 1.0, "step": 4674 }, { "completion_length": 244.9375114440918, "epoch": 0.7839389748103441, "grad_norm": 0.2730890094350734, "kl": 0.0650482177734375, "learning_rate": 4.999718054096922e-07, "loss": 0.0001, "reward": 1.7580358013510704, "reward_std": 0.059346460737288, "rewards/equation_reward_func": 0.7598214522004128, "rewards/format_reward_func": 0.9982142895460129, "step": 4676 }, { "completion_length": 252.31251430511475, "epoch": 0.7842742780502117, "grad_norm": 0.21485735246991344, "kl": 0.05242919921875, "learning_rate": 4.999716249681883e-07, "loss": 0.0001, "reward": 1.814285770058632, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8142857439815998, "rewards/format_reward_func": 1.0, "step": 4678 }, { "completion_length": 242.25447463989258, "epoch": 0.7846095812900792, "grad_norm": 0.20691538510174362, "kl": 0.0582122802734375, "learning_rate": 4.999714439511568e-07, "loss": 0.0001, "reward": 1.725000075995922, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7250000275671482, "rewards/format_reward_func": 1.0, "step": 4680 }, { "completion_length": 253.27679824829102, "epoch": 0.7849448845299468, "grad_norm": 0.21287063557356575, "kl": 0.05328369140625, "learning_rate": 4.999712623585985e-07, "loss": 0.0001, "reward": 1.7767857760190964, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7812500335276127, "rewards/format_reward_func": 0.9955357164144516, "step": 4682 }, { "completion_length": 261.8303689956665, "epoch": 0.7852801877698143, "grad_norm": 0.3163254069693305, "kl": 0.052947998046875, "learning_rate": 4.999710801905137e-07, "loss": 0.0001, "reward": 1.7553571835160255, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214522004128, "rewards/format_reward_func": 0.9955357164144516, "step": 4684 }, { "completion_length": 255.2634048461914, "epoch": 0.7856154910096819, "grad_norm": 0.13242133409732526, "kl": 0.04907989501953125, "learning_rate": 4.999708974469028e-07, "loss": 0.0, "reward": 1.7357143685221672, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.7446428760886192, "rewards/format_reward_func": 0.9910714328289032, "step": 4686 }, { "completion_length": 261.0491189956665, "epoch": 0.7859507942495494, "grad_norm": 0.30653420990937486, "kl": 0.0510711669921875, "learning_rate": 4.999707141277662e-07, "loss": 0.0001, "reward": 1.7321429327130318, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7321428842842579, "rewards/format_reward_func": 1.0, "step": 4688 }, { "completion_length": 248.20090007781982, "epoch": 0.786286097489417, "grad_norm": 0.1144669042372897, "kl": 0.04682159423828125, "learning_rate": 4.999705302331042e-07, "loss": 0.0, "reward": 1.7392857819795609, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857484519482, "rewards/format_reward_func": 1.0, "step": 4690 }, { "completion_length": 249.48215579986572, "epoch": 0.7866214007292845, "grad_norm": 0.10345151057283293, "kl": 0.0513916015625, "learning_rate": 4.999703457629175e-07, "loss": 0.0001, "reward": 1.832142911851406, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8321428708732128, "rewards/format_reward_func": 1.0, "step": 4692 }, { "completion_length": 250.82590293884277, "epoch": 0.7869567039691521, "grad_norm": 0.3815717918348069, "kl": 0.056793212890625, "learning_rate": 4.999701607172063e-07, "loss": 0.0001, "reward": 1.7357143610715866, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143126428127, "rewards/format_reward_func": 1.0, "step": 4694 }, { "completion_length": 244.66072750091553, "epoch": 0.7872920072090197, "grad_norm": 0.21475385548445075, "kl": 0.06305694580078125, "learning_rate": 4.999699750959712e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 4696 }, { "completion_length": 244.7321548461914, "epoch": 0.7876273104488872, "grad_norm": 0.30145265431663126, "kl": 0.0600738525390625, "learning_rate": 4.999697888992124e-07, "loss": 0.0001, "reward": 1.739732213318348, "reward_std": 0.05745242489501834, "rewards/equation_reward_func": 0.7446428835391998, "rewards/format_reward_func": 0.9950892888009548, "step": 4698 }, { "completion_length": 248.17411708831787, "epoch": 0.7879626136887548, "grad_norm": 0.167633043860595, "kl": 0.04900360107421875, "learning_rate": 4.999696021269305e-07, "loss": 0.0, "reward": 1.7285714969038963, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7285714615136385, "rewards/format_reward_func": 1.0, "step": 4700 }, { "completion_length": 244.21429538726807, "epoch": 0.7882979169286223, "grad_norm": 0.18431015600634554, "kl": 0.069671630859375, "learning_rate": 4.99969414779126e-07, "loss": 0.0001, "reward": 1.7928571850061417, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 4702 }, { "completion_length": 243.78126049041748, "epoch": 0.7886332201684899, "grad_norm": 0.15342521106660034, "kl": 0.0619964599609375, "learning_rate": 4.999692268557992e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 4704 }, { "completion_length": 244.00001335144043, "epoch": 0.7889685234083574, "grad_norm": 0.2383847607881836, "kl": 0.06097412109375, "learning_rate": 4.999690383569505e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 4706 }, { "completion_length": 243.5759038925171, "epoch": 0.789303826648225, "grad_norm": 0.3709918619220796, "kl": 0.05507659912109375, "learning_rate": 4.999688492825803e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.09596449043601751, "rewards/equation_reward_func": 0.7839285992085934, "rewards/format_reward_func": 0.9910714328289032, "step": 4708 }, { "completion_length": 237.4776906967163, "epoch": 0.7896391298880926, "grad_norm": 0.13338077156805117, "kl": 0.04788970947265625, "learning_rate": 4.999686596326894e-07, "loss": 0.0, "reward": 1.7803572118282318, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7848214358091354, "rewards/format_reward_func": 0.9955357164144516, "step": 4710 }, { "completion_length": 250.5625123977661, "epoch": 0.7899744331279601, "grad_norm": 0.34551931468587194, "kl": 0.0578155517578125, "learning_rate": 4.999684694072776e-07, "loss": 0.0001, "reward": 1.7714286521077156, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7714285980910063, "rewards/format_reward_func": 1.0, "step": 4712 }, { "completion_length": 241.7053680419922, "epoch": 0.7903097363678276, "grad_norm": 0.24656622735147873, "kl": 0.0606536865234375, "learning_rate": 4.99968278606346e-07, "loss": 0.0001, "reward": 1.7750000804662704, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 4714 }, { "completion_length": 242.2321548461914, "epoch": 0.7906450396076952, "grad_norm": 0.2908306029397776, "kl": 0.06658172607421875, "learning_rate": 4.999680872298946e-07, "loss": 0.0001, "reward": 1.7267858013510704, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7312500365078449, "rewards/format_reward_func": 0.9955357164144516, "step": 4716 }, { "completion_length": 235.7767972946167, "epoch": 0.7909803428475628, "grad_norm": 0.1948066984295881, "kl": 0.0508880615234375, "learning_rate": 4.999678952779241e-07, "loss": 0.0001, "reward": 1.760714367032051, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 4718 }, { "completion_length": 238.86608123779297, "epoch": 0.7913156460874303, "grad_norm": 0.22120546745892436, "kl": 0.0527191162109375, "learning_rate": 4.999677027504347e-07, "loss": 0.0001, "reward": 1.8267857804894447, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.8312500230967999, "rewards/format_reward_func": 0.9955357164144516, "step": 4720 }, { "completion_length": 245.20983123779297, "epoch": 0.7916509493272978, "grad_norm": 0.1871303555582509, "kl": 0.05560302734375, "learning_rate": 4.99967509647427e-07, "loss": 0.0001, "reward": 1.7178572043776512, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7178571671247482, "rewards/format_reward_func": 1.0, "step": 4722 }, { "completion_length": 238.77233028411865, "epoch": 0.7919862525671655, "grad_norm": 0.4842632365108479, "kl": 0.0672454833984375, "learning_rate": 4.999673159689015e-07, "loss": 0.0001, "reward": 1.7803571745753288, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7848214618861675, "rewards/format_reward_func": 0.9955357164144516, "step": 4724 }, { "completion_length": 243.75000953674316, "epoch": 0.792321555807033, "grad_norm": 0.2471484051407418, "kl": 0.0733184814453125, "learning_rate": 4.999671217148585e-07, "loss": 0.0001, "reward": 1.7428572177886963, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7428571619093418, "rewards/format_reward_func": 1.0, "step": 4726 }, { "completion_length": 239.40180015563965, "epoch": 0.7926568590469005, "grad_norm": 0.21497333312573835, "kl": 0.0543670654296875, "learning_rate": 4.999669268852984e-07, "loss": 0.0001, "reward": 1.7375000715255737, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419643066823483, "rewards/format_reward_func": 0.9955357164144516, "step": 4728 }, { "completion_length": 247.93304634094238, "epoch": 0.792992162286768, "grad_norm": 0.1317931786223707, "kl": 0.05374908447265625, "learning_rate": 4.999667314802218e-07, "loss": 0.0001, "reward": 1.753571517765522, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 4730 }, { "completion_length": 240.290189743042, "epoch": 0.7933274655266357, "grad_norm": 0.2581755180585634, "kl": 0.05332183837890625, "learning_rate": 4.999665354996292e-07, "loss": 0.0001, "reward": 1.796428643167019, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 4732 }, { "completion_length": 239.62054824829102, "epoch": 0.7936627687665032, "grad_norm": 0.20370485342766165, "kl": 0.0496673583984375, "learning_rate": 4.99966338943521e-07, "loss": 0.0, "reward": 1.7785714864730835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 4734 }, { "completion_length": 242.45537090301514, "epoch": 0.7939980720063707, "grad_norm": 0.29268332541299974, "kl": 0.0518035888671875, "learning_rate": 4.999661418118975e-07, "loss": 0.0001, "reward": 1.7714286148548126, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 4736 }, { "completion_length": 244.91518783569336, "epoch": 0.7943333752462384, "grad_norm": 0.16324022350176343, "kl": 0.090301513671875, "learning_rate": 4.999659441047592e-07, "loss": 0.0001, "reward": 1.787500061094761, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7919643074274063, "rewards/format_reward_func": 0.9955357164144516, "step": 4738 }, { "completion_length": 246.04465675354004, "epoch": 0.7946686784861059, "grad_norm": 0.1255347734918534, "kl": 0.04901123046875, "learning_rate": 4.999657458221067e-07, "loss": 0.0, "reward": 1.7642857879400253, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857357859612, "rewards/format_reward_func": 1.0, "step": 4740 }, { "completion_length": 249.23215579986572, "epoch": 0.7950039817259734, "grad_norm": 0.16224730747256555, "kl": 0.0569610595703125, "learning_rate": 4.999655469639404e-07, "loss": 0.0001, "reward": 1.753571480512619, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714767873287, "rewards/format_reward_func": 1.0, "step": 4742 }, { "completion_length": 238.4955472946167, "epoch": 0.7953392849658409, "grad_norm": 0.18577863111781406, "kl": 0.05908203125, "learning_rate": 4.999653475302607e-07, "loss": 0.0001, "reward": 1.8232143223285675, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.8276786021888256, "rewards/format_reward_func": 0.9955357164144516, "step": 4744 }, { "completion_length": 243.20090293884277, "epoch": 0.7956745882057086, "grad_norm": 0.21887712276758306, "kl": 0.047943115234375, "learning_rate": 4.99965147521068e-07, "loss": 0.0, "reward": 1.8214285895228386, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8214286155998707, "rewards/format_reward_func": 1.0, "step": 4746 }, { "completion_length": 247.2455472946167, "epoch": 0.7960098914455761, "grad_norm": 0.3119421742863889, "kl": 0.0528106689453125, "learning_rate": 4.999649469363629e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 4748 }, { "completion_length": 239.9419755935669, "epoch": 0.7963451946854436, "grad_norm": 0.28155752485961605, "kl": 0.05231475830078125, "learning_rate": 4.999647457761459e-07, "loss": 0.0001, "reward": 1.8178571984171867, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8178571611642838, "rewards/format_reward_func": 1.0, "step": 4750 }, { "completion_length": 252.48661708831787, "epoch": 0.7966804979253111, "grad_norm": 0.3151087780616481, "kl": 0.04834747314453125, "learning_rate": 4.999645440404173e-07, "loss": 0.0, "reward": 1.767857201397419, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.767857164144516, "rewards/format_reward_func": 1.0, "step": 4752 }, { "completion_length": 241.24554824829102, "epoch": 0.7970158011651788, "grad_norm": 0.2119678596279154, "kl": 0.057403564453125, "learning_rate": 4.999643417291776e-07, "loss": 0.0001, "reward": 1.8250000551342964, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8250000141561031, "rewards/format_reward_func": 1.0, "step": 4754 }, { "completion_length": 250.9196548461914, "epoch": 0.7973511044050463, "grad_norm": 0.19341988481809425, "kl": 0.0551300048828125, "learning_rate": 4.999641388424274e-07, "loss": 0.0001, "reward": 1.719642922282219, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7241071723401546, "rewards/format_reward_func": 0.9955357164144516, "step": 4756 }, { "completion_length": 244.50447368621826, "epoch": 0.7976864076449138, "grad_norm": 0.18595105296353032, "kl": 0.06219482421875, "learning_rate": 4.99963935380167e-07, "loss": 0.0001, "reward": 1.7482143491506577, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526785936206579, "rewards/format_reward_func": 0.9955357164144516, "step": 4758 }, { "completion_length": 248.3348331451416, "epoch": 0.7980217108847815, "grad_norm": 0.2563888180930464, "kl": 0.053497314453125, "learning_rate": 4.99963731342397e-07, "loss": 0.0001, "reward": 1.821428619325161, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8214285895228386, "rewards/format_reward_func": 1.0, "step": 4760 }, { "completion_length": 254.7500114440918, "epoch": 0.798357014124649, "grad_norm": 0.22967847403831104, "kl": 0.05162811279296875, "learning_rate": 4.999635267291178e-07, "loss": 0.0001, "reward": 1.7428571954369545, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7428571786731482, "rewards/format_reward_func": 1.0, "step": 4762 }, { "completion_length": 251.24554538726807, "epoch": 0.7986923173645165, "grad_norm": 0.18666718271717592, "kl": 0.0740814208984375, "learning_rate": 4.999633215403298e-07, "loss": 0.0001, "reward": 1.7500000521540642, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 4764 }, { "completion_length": 254.54911518096924, "epoch": 0.799027620604384, "grad_norm": 0.20649343618390373, "kl": 0.0608673095703125, "learning_rate": 4.999631157760337e-07, "loss": 0.0001, "reward": 1.717857226729393, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7178571857511997, "rewards/format_reward_func": 1.0, "step": 4766 }, { "completion_length": 258.2901916503906, "epoch": 0.7993629238442517, "grad_norm": 0.21921901486791798, "kl": 0.0643157958984375, "learning_rate": 4.999629094362298e-07, "loss": 0.0001, "reward": 1.735714390873909, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7357143089175224, "rewards/format_reward_func": 1.0, "step": 4768 }, { "completion_length": 249.4464406967163, "epoch": 0.7996982270841192, "grad_norm": 0.2943164089246413, "kl": 0.05615234375, "learning_rate": 4.999627025209186e-07, "loss": 0.0001, "reward": 1.7723214775323868, "reward_std": 0.06944798585027456, "rewards/equation_reward_func": 0.7741071917116642, "rewards/format_reward_func": 0.9982142895460129, "step": 4770 }, { "completion_length": 256.4866180419922, "epoch": 0.8000335303239867, "grad_norm": 0.17801495941107487, "kl": 0.057403564453125, "learning_rate": 4.999624950301005e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 4772 }, { "completion_length": 253.21876430511475, "epoch": 0.8003688335638544, "grad_norm": 0.22709532160911014, "kl": 0.0803985595703125, "learning_rate": 4.999622869637761e-07, "loss": 0.0001, "reward": 1.8285714909434319, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8285714574158192, "rewards/format_reward_func": 1.0, "step": 4774 }, { "completion_length": 255.7321548461914, "epoch": 0.8007041368037219, "grad_norm": 0.43890108257018107, "kl": 0.06640625, "learning_rate": 4.999620783219457e-07, "loss": 0.0001, "reward": 1.817857213318348, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.817857164889574, "rewards/format_reward_func": 1.0, "step": 4776 }, { "completion_length": 249.24108505249023, "epoch": 0.8010394400435894, "grad_norm": 0.21523694406650426, "kl": 0.060211181640625, "learning_rate": 4.999618691046101e-07, "loss": 0.0001, "reward": 1.8160715103149414, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8205357380211353, "rewards/format_reward_func": 0.9955357164144516, "step": 4778 }, { "completion_length": 259.96875953674316, "epoch": 0.8013747432834569, "grad_norm": 0.30152421281682196, "kl": 0.062408447265625, "learning_rate": 4.999616593117696e-07, "loss": 0.0001, "reward": 1.7446429282426834, "reward_std": 0.10859139636158943, "rewards/equation_reward_func": 0.7491071745753288, "rewards/format_reward_func": 0.9955357164144516, "step": 4780 }, { "completion_length": 248.758939743042, "epoch": 0.8017100465233246, "grad_norm": 0.18162921628223075, "kl": 0.05597686767578125, "learning_rate": 4.999614489434246e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 4782 }, { "completion_length": 249.38840198516846, "epoch": 0.8020453497631921, "grad_norm": 0.18843137481806166, "kl": 0.0561370849609375, "learning_rate": 4.999612379995757e-07, "loss": 0.0001, "reward": 1.7928571850061417, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 1.0, "step": 4784 }, { "completion_length": 253.75447750091553, "epoch": 0.8023806530030596, "grad_norm": 0.11907621852180425, "kl": 0.060516357421875, "learning_rate": 4.999610264802233e-07, "loss": 0.0001, "reward": 1.6660715192556381, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.6705357432365417, "rewards/format_reward_func": 0.9955357164144516, "step": 4786 }, { "completion_length": 261.1696586608887, "epoch": 0.8027159562429272, "grad_norm": 0.18805736926606073, "kl": 0.064910888671875, "learning_rate": 4.999608143853679e-07, "loss": 0.0001, "reward": 1.807142898440361, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8071428872644901, "rewards/format_reward_func": 1.0, "step": 4788 }, { "completion_length": 248.7276906967163, "epoch": 0.8030512594827948, "grad_norm": 0.21305119555940016, "kl": 0.05750274658203125, "learning_rate": 4.999606017150102e-07, "loss": 0.0001, "reward": 1.7750000730156898, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 4790 }, { "completion_length": 259.43305015563965, "epoch": 0.8033865627226623, "grad_norm": 0.21317110179286103, "kl": 0.05792236328125, "learning_rate": 4.999603884691504e-07, "loss": 0.0001, "reward": 1.753571517765522, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 4792 }, { "completion_length": 252.7232265472412, "epoch": 0.8037218659625298, "grad_norm": 0.15468881031021706, "kl": 0.0545654296875, "learning_rate": 4.999601746477891e-07, "loss": 0.0001, "reward": 1.74642863124609, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7464285995811224, "rewards/format_reward_func": 1.0, "step": 4794 }, { "completion_length": 270.4776916503906, "epoch": 0.8040571692023974, "grad_norm": 0.18228137072443312, "kl": 0.0901336669921875, "learning_rate": 4.999599602509269e-07, "loss": 0.0001, "reward": 1.7321429252624512, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7321429029107094, "rewards/format_reward_func": 1.0, "step": 4796 }, { "completion_length": 268.3571529388428, "epoch": 0.804392472442265, "grad_norm": 0.21559347957903086, "kl": 0.0575103759765625, "learning_rate": 4.999597452785641e-07, "loss": 0.0001, "reward": 1.7464286461472511, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7553571723401546, "rewards/format_reward_func": 0.9910714328289032, "step": 4798 }, { "completion_length": 265.1384048461914, "epoch": 0.8047277756821325, "grad_norm": 0.2657425103639842, "kl": 0.158416748046875, "learning_rate": 4.999595297307014e-07, "loss": 0.0002, "reward": 1.757142923772335, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 4800 }, { "completion_length": 274.2276916503906, "epoch": 0.8050630789220001, "grad_norm": 0.1548226820890832, "kl": 0.0614166259765625, "learning_rate": 4.99959313607339e-07, "loss": 0.0001, "reward": 1.7482143566012383, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526786103844643, "rewards/format_reward_func": 0.9955357164144516, "step": 4802 }, { "completion_length": 279.4017963409424, "epoch": 0.8053983821618677, "grad_norm": 0.2341381706367825, "kl": 0.0644378662109375, "learning_rate": 4.999590969084777e-07, "loss": 0.0001, "reward": 1.7625000551342964, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643238186836, "rewards/format_reward_func": 0.9955357164144516, "step": 4804 }, { "completion_length": 270.4687604904175, "epoch": 0.8057336854017352, "grad_norm": 0.22770723007849342, "kl": 0.088226318359375, "learning_rate": 4.999588796341178e-07, "loss": 0.0001, "reward": 1.7625000551342964, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.766964316368103, "rewards/format_reward_func": 0.9955357164144516, "step": 4806 }, { "completion_length": 277.86162090301514, "epoch": 0.8060689886416027, "grad_norm": 0.3219488111395403, "kl": 0.069610595703125, "learning_rate": 4.999586617842599e-07, "loss": 0.0001, "reward": 1.7607143446803093, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7696428894996643, "rewards/format_reward_func": 0.9910714328289032, "step": 4808 }, { "completion_length": 267.16965770721436, "epoch": 0.8064042918814703, "grad_norm": 0.21817359942228975, "kl": 0.0725860595703125, "learning_rate": 4.999584433589046e-07, "loss": 0.0001, "reward": 1.7196428924798965, "reward_std": 0.0833375845104456, "rewards/equation_reward_func": 0.7330357562750578, "rewards/format_reward_func": 0.9866071492433548, "step": 4810 }, { "completion_length": 275.1696548461914, "epoch": 0.8067395951213379, "grad_norm": 0.17800857048787785, "kl": 0.075469970703125, "learning_rate": 4.999582243580522e-07, "loss": 0.0001, "reward": 1.7446429431438446, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7491071671247482, "rewards/format_reward_func": 0.9955357164144516, "step": 4812 }, { "completion_length": 274.85268688201904, "epoch": 0.8070748983612054, "grad_norm": 0.24170594435818452, "kl": 0.0898284912109375, "learning_rate": 4.999580047817033e-07, "loss": 0.0001, "reward": 1.7696429342031479, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7830357402563095, "rewards/format_reward_func": 0.9866071492433548, "step": 4814 }, { "completion_length": 270.9776916503906, "epoch": 0.807410201601073, "grad_norm": 0.24363523164547604, "kl": 0.1192169189453125, "learning_rate": 4.999577846298584e-07, "loss": 0.0001, "reward": 1.7589286267757416, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7633928880095482, "rewards/format_reward_func": 0.9955357164144516, "step": 4816 }, { "completion_length": 281.633939743042, "epoch": 0.8077455048409405, "grad_norm": 0.2132089822493617, "kl": 0.0901947021484375, "learning_rate": 4.999575639025179e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 4818 }, { "completion_length": 272.4642972946167, "epoch": 0.8080808080808081, "grad_norm": 0.1495708958499177, "kl": 0.2392578125, "learning_rate": 4.999573425996826e-07, "loss": 0.0002, "reward": 1.7589286342263222, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7633928842842579, "rewards/format_reward_func": 0.9955357164144516, "step": 4820 }, { "completion_length": 281.1071548461914, "epoch": 0.8084161113206756, "grad_norm": 0.18808619582281788, "kl": 0.0755767822265625, "learning_rate": 4.999571207213527e-07, "loss": 0.0001, "reward": 1.819642886519432, "reward_std": 0.0530330091714859, "rewards/equation_reward_func": 0.8330357410013676, "rewards/format_reward_func": 0.9866071492433548, "step": 4822 }, { "completion_length": 276.33930110931396, "epoch": 0.8087514145605432, "grad_norm": 0.25074459221996265, "kl": 0.1247100830078125, "learning_rate": 4.999568982675288e-07, "loss": 0.0001, "reward": 1.7839286178350449, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7883928939700127, "rewards/format_reward_func": 0.9955357164144516, "step": 4824 }, { "completion_length": 268.6116180419922, "epoch": 0.8090867178004107, "grad_norm": 0.2757940434856731, "kl": 0.079986572265625, "learning_rate": 4.999566752382115e-07, "loss": 0.0001, "reward": 1.7714286670088768, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 4826 }, { "completion_length": 271.4196538925171, "epoch": 0.8094220210402783, "grad_norm": 0.37402314358434524, "kl": 0.088958740234375, "learning_rate": 4.999564516334014e-07, "loss": 0.0001, "reward": 1.7375000640749931, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7419643178582191, "rewards/format_reward_func": 0.9955357164144516, "step": 4828 }, { "completion_length": 282.1160821914673, "epoch": 0.8097573242801459, "grad_norm": 0.2814170427276152, "kl": 0.0948486328125, "learning_rate": 4.999562274530986e-07, "loss": 0.0001, "reward": 1.7714286521077156, "reward_std": 0.09091372694820166, "rewards/equation_reward_func": 0.7803571671247482, "rewards/format_reward_func": 0.9910714328289032, "step": 4830 }, { "completion_length": 274.2634038925171, "epoch": 0.8100926275200134, "grad_norm": 0.45118148911042716, "kl": 0.0821533203125, "learning_rate": 4.999560026973041e-07, "loss": 0.0001, "reward": 1.7000000700354576, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7178571671247482, "rewards/format_reward_func": 0.9821428656578064, "step": 4832 }, { "completion_length": 275.20537281036377, "epoch": 0.810427930759881, "grad_norm": 0.18307441024953733, "kl": 0.08013916015625, "learning_rate": 4.999557773660181e-07, "loss": 0.0001, "reward": 1.7339286133646965, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7383928932249546, "rewards/format_reward_func": 0.9955357164144516, "step": 4834 }, { "completion_length": 276.81250953674316, "epoch": 0.8107632339997485, "grad_norm": 0.2023363492109368, "kl": 0.0673065185546875, "learning_rate": 4.999555514592412e-07, "loss": 0.0001, "reward": 1.716071479022503, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7205357365310192, "rewards/format_reward_func": 0.9955357164144516, "step": 4836 }, { "completion_length": 270.35715770721436, "epoch": 0.8110985372396161, "grad_norm": 0.21590173948082855, "kl": 0.079620361328125, "learning_rate": 4.999553249769741e-07, "loss": 0.0001, "reward": 1.807142898440361, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8071428816765547, "rewards/format_reward_func": 1.0, "step": 4838 }, { "completion_length": 276.65179920196533, "epoch": 0.8114338404794836, "grad_norm": 0.18878714466772603, "kl": 0.0695648193359375, "learning_rate": 4.999550979192169e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 4840 }, { "completion_length": 285.02680015563965, "epoch": 0.8117691437193512, "grad_norm": 0.1801698236755662, "kl": 0.0682830810546875, "learning_rate": 4.999548702859706e-07, "loss": 0.0001, "reward": 1.7535714879631996, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7625000178813934, "rewards/format_reward_func": 0.9910714328289032, "step": 4842 }, { "completion_length": 266.7946529388428, "epoch": 0.8121044469592188, "grad_norm": 0.29530471967628835, "kl": 0.06537628173828125, "learning_rate": 4.999546420772355e-07, "loss": 0.0001, "reward": 1.7910714745521545, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7955357357859612, "rewards/format_reward_func": 0.9955357164144516, "step": 4844 }, { "completion_length": 277.7946548461914, "epoch": 0.8124397501990863, "grad_norm": 0.2510459183216338, "kl": 0.0666961669921875, "learning_rate": 4.99954413293012e-07, "loss": 0.0001, "reward": 1.7553572058677673, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7598214745521545, "rewards/format_reward_func": 0.9955357164144516, "step": 4846 }, { "completion_length": 283.46876335144043, "epoch": 0.8127750534389538, "grad_norm": 0.1536660170745747, "kl": 0.0618133544921875, "learning_rate": 4.999541839333009e-07, "loss": 0.0001, "reward": 1.7178572192788124, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7267857454717159, "rewards/format_reward_func": 0.9910714328289032, "step": 4848 }, { "completion_length": 272.60715675354004, "epoch": 0.8131103566788214, "grad_norm": 0.2226667586865844, "kl": 0.070068359375, "learning_rate": 4.999539539981026e-07, "loss": 0.0001, "reward": 1.767857201397419, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571604192257, "rewards/format_reward_func": 1.0, "step": 4850 }, { "completion_length": 261.90179443359375, "epoch": 0.813445659918689, "grad_norm": 0.3051079389978345, "kl": 0.0652008056640625, "learning_rate": 4.999537234874175e-07, "loss": 0.0001, "reward": 1.8178572058677673, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8178571574389935, "rewards/format_reward_func": 1.0, "step": 4852 }, { "completion_length": 270.1562662124634, "epoch": 0.8137809631585565, "grad_norm": 0.1828016501904028, "kl": 0.062744140625, "learning_rate": 4.999534924012463e-07, "loss": 0.0001, "reward": 1.7589286118745804, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7633928805589676, "rewards/format_reward_func": 0.9955357164144516, "step": 4854 }, { "completion_length": 269.808048248291, "epoch": 0.814116266398424, "grad_norm": 0.320133616246475, "kl": 0.075286865234375, "learning_rate": 4.999532607395895e-07, "loss": 0.0001, "reward": 1.742857240140438, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7428571693599224, "rewards/format_reward_func": 1.0, "step": 4856 }, { "completion_length": 275.1205472946167, "epoch": 0.8144515696382917, "grad_norm": 0.16699386169014177, "kl": 0.0585174560546875, "learning_rate": 4.999530285024477e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 4858 }, { "completion_length": 272.2544765472412, "epoch": 0.8147868728781592, "grad_norm": 0.40172270093922924, "kl": 0.0598907470703125, "learning_rate": 4.999527956898213e-07, "loss": 0.0001, "reward": 1.7196429446339607, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.7330357432365417, "rewards/format_reward_func": 0.9866071492433548, "step": 4860 }, { "completion_length": 266.4866189956665, "epoch": 0.8151221761180267, "grad_norm": 0.37233589266703304, "kl": 0.065093994140625, "learning_rate": 4.999525623017109e-07, "loss": 0.0001, "reward": 1.8000000640749931, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.8089285809546709, "rewards/format_reward_func": 0.9910714328289032, "step": 4862 }, { "completion_length": 267.90179347991943, "epoch": 0.8154574793578943, "grad_norm": 0.30506025414719584, "kl": 0.062347412109375, "learning_rate": 4.99952328338117e-07, "loss": 0.0001, "reward": 1.776785783469677, "reward_std": 0.09343910776078701, "rewards/equation_reward_func": 0.7812500298023224, "rewards/format_reward_func": 0.9955357164144516, "step": 4864 }, { "completion_length": 259.2142963409424, "epoch": 0.8157927825977619, "grad_norm": 0.12008669658374085, "kl": 0.062469482421875, "learning_rate": 4.999520937990401e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.792857151478529, "rewards/format_reward_func": 1.0, "step": 4866 }, { "completion_length": 255.54465198516846, "epoch": 0.8161280858376294, "grad_norm": 0.19604347972133174, "kl": 0.0647430419921875, "learning_rate": 4.999518586844809e-07, "loss": 0.0001, "reward": 1.7696429416537285, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7741071693599224, "rewards/format_reward_func": 0.9955357164144516, "step": 4868 }, { "completion_length": 258.7634048461914, "epoch": 0.8164633890774969, "grad_norm": 0.31025431871878384, "kl": 0.06878662109375, "learning_rate": 4.999516229944397e-07, "loss": 0.0001, "reward": 1.7142857983708382, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7142857387661934, "rewards/format_reward_func": 1.0, "step": 4870 }, { "completion_length": 258.1473340988159, "epoch": 0.8167986923173646, "grad_norm": 0.059445911244861434, "kl": 0.0688018798828125, "learning_rate": 4.999513867289173e-07, "loss": 0.0001, "reward": 1.8107143342494965, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8107143212109804, "rewards/format_reward_func": 1.0, "step": 4872 }, { "completion_length": 252.93304920196533, "epoch": 0.8171339955572321, "grad_norm": 0.0881423719531909, "kl": 0.0708465576171875, "learning_rate": 4.999511498879142e-07, "loss": 0.0001, "reward": 1.7736607939004898, "reward_std": 0.013258251827210188, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 0.9986607171595097, "step": 4874 }, { "completion_length": 256.0223321914673, "epoch": 0.8174692987970996, "grad_norm": 0.2124047604525655, "kl": 0.071258544921875, "learning_rate": 4.999509124714308e-07, "loss": 0.0001, "reward": 1.753571480512619, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.753571480512619, "rewards/format_reward_func": 1.0, "step": 4876 }, { "completion_length": 256.4375114440918, "epoch": 0.8178046020369671, "grad_norm": 0.18623706897044853, "kl": 0.0633697509765625, "learning_rate": 4.999506744794677e-07, "loss": 0.0001, "reward": 1.7678571864962578, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.776785746216774, "rewards/format_reward_func": 0.9910714328289032, "step": 4878 }, { "completion_length": 255.67411994934082, "epoch": 0.8181399052768348, "grad_norm": 0.15097665520310913, "kl": 0.0752410888671875, "learning_rate": 4.999504359120255e-07, "loss": 0.0001, "reward": 1.7415179386734962, "reward_std": 0.032198611879721284, "rewards/equation_reward_func": 0.7428571730852127, "rewards/format_reward_func": 0.9986607171595097, "step": 4880 }, { "completion_length": 273.29019260406494, "epoch": 0.8184752085167023, "grad_norm": 0.26742352819239673, "kl": 0.0709228515625, "learning_rate": 4.999501967691048e-07, "loss": 0.0001, "reward": 1.6964286714792252, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7053571790456772, "rewards/format_reward_func": 0.9910714328289032, "step": 4882 }, { "completion_length": 261.84376430511475, "epoch": 0.8188105117565698, "grad_norm": 0.25184865004850154, "kl": 0.0738372802734375, "learning_rate": 4.99949957050706e-07, "loss": 0.0001, "reward": 1.7290179282426834, "reward_std": 0.07007933035492897, "rewards/equation_reward_func": 0.7348214648663998, "rewards/format_reward_func": 0.994196429848671, "step": 4884 }, { "completion_length": 253.09375953674316, "epoch": 0.8191458149964373, "grad_norm": 0.21782260571475182, "kl": 0.079376220703125, "learning_rate": 4.999497167568297e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857372760773, "rewards/format_reward_func": 1.0, "step": 4886 }, { "completion_length": 253.2812614440918, "epoch": 0.819481118236305, "grad_norm": 0.16804666737366383, "kl": 0.0702362060546875, "learning_rate": 4.999494758874765e-07, "loss": 0.0001, "reward": 1.7642857655882835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857413738966, "rewards/format_reward_func": 1.0, "step": 4888 }, { "completion_length": 265.95983600616455, "epoch": 0.8198164214761725, "grad_norm": 0.09307030417656371, "kl": 0.0602874755859375, "learning_rate": 4.999492344426469e-07, "loss": 0.0001, "reward": 1.733928620815277, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7383928932249546, "rewards/format_reward_func": 0.9955357164144516, "step": 4890 }, { "completion_length": 259.1428699493408, "epoch": 0.82015172471604, "grad_norm": 0.25695249055536407, "kl": 0.0602569580078125, "learning_rate": 4.999489924223416e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143055647612, "rewards/format_reward_func": 1.0, "step": 4892 }, { "completion_length": 269.8214387893677, "epoch": 0.8204870279559077, "grad_norm": 0.19842259243217908, "kl": 0.0663299560546875, "learning_rate": 4.999487498265609e-07, "loss": 0.0001, "reward": 1.7410714849829674, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.745535746216774, "rewards/format_reward_func": 0.9955357164144516, "step": 4894 }, { "completion_length": 257.30804443359375, "epoch": 0.8208223311957752, "grad_norm": 0.2123599581687482, "kl": 0.071533203125, "learning_rate": 4.999485066553056e-07, "loss": 0.0001, "reward": 1.764285795390606, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857283353806, "rewards/format_reward_func": 1.0, "step": 4896 }, { "completion_length": 270.36608028411865, "epoch": 0.8211576344356427, "grad_norm": 0.1736771649562477, "kl": 0.060089111328125, "learning_rate": 4.999482629085762e-07, "loss": 0.0001, "reward": 1.7892858013510704, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7982143089175224, "rewards/format_reward_func": 0.9910714328289032, "step": 4898 }, { "completion_length": 257.4955463409424, "epoch": 0.8214929376755102, "grad_norm": 0.20787241332400688, "kl": 0.0594024658203125, "learning_rate": 4.99948018586373e-07, "loss": 0.0001, "reward": 1.7714286223053932, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7803571820259094, "rewards/format_reward_func": 0.9910714328289032, "step": 4900 }, { "completion_length": 259.64287090301514, "epoch": 0.8218282409153779, "grad_norm": 0.23193187383227143, "kl": 0.0570831298828125, "learning_rate": 4.99947773688697e-07, "loss": 0.0001, "reward": 1.7660714834928513, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9955357164144516, "step": 4902 }, { "completion_length": 260.96429920196533, "epoch": 0.8221635441552454, "grad_norm": 0.17487735272584332, "kl": 0.05246734619140625, "learning_rate": 4.999475282155485e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428921073675, "rewards/format_reward_func": 1.0, "step": 4904 }, { "completion_length": 261.5446557998657, "epoch": 0.8224988473951129, "grad_norm": 0.27319235143934706, "kl": 0.0552215576171875, "learning_rate": 4.999472821669281e-07, "loss": 0.0001, "reward": 1.821428619325161, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8214285969734192, "rewards/format_reward_func": 1.0, "step": 4906 }, { "completion_length": 271.3348379135132, "epoch": 0.8228341506349806, "grad_norm": 0.2630989065803111, "kl": 0.0597381591796875, "learning_rate": 4.999470355428364e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7678571939468384, "rewards/format_reward_func": 1.0, "step": 4908 }, { "completion_length": 273.5089406967163, "epoch": 0.8231694538748481, "grad_norm": 0.17175984680368336, "kl": 0.0602264404296875, "learning_rate": 4.999467883432739e-07, "loss": 0.0001, "reward": 1.6875000819563866, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.6919643208384514, "rewards/format_reward_func": 0.9955357164144516, "step": 4910 }, { "completion_length": 261.35715675354004, "epoch": 0.8235047571147156, "grad_norm": 0.2206713915711781, "kl": 0.0681610107421875, "learning_rate": 4.999465405682414e-07, "loss": 0.0001, "reward": 1.733928620815277, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.7473214603960514, "rewards/format_reward_func": 0.9866071492433548, "step": 4912 }, { "completion_length": 277.81697368621826, "epoch": 0.8238400603545831, "grad_norm": 0.2808354461978824, "kl": 0.0636749267578125, "learning_rate": 4.999462922177391e-07, "loss": 0.0001, "reward": 1.7464286759495735, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 4914 }, { "completion_length": 270.495548248291, "epoch": 0.8241753635944508, "grad_norm": 0.37773820919936024, "kl": 0.0789031982421875, "learning_rate": 4.999460432917678e-07, "loss": 0.0001, "reward": 1.7357143685221672, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7446428909897804, "rewards/format_reward_func": 0.9910714328289032, "step": 4916 }, { "completion_length": 269.8794746398926, "epoch": 0.8245106668343183, "grad_norm": 0.26859032997452337, "kl": 0.0596160888671875, "learning_rate": 4.999457937903281e-07, "loss": 0.0001, "reward": 1.6767858043313026, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.6901786029338837, "rewards/format_reward_func": 0.9866071492433548, "step": 4918 }, { "completion_length": 268.6384048461914, "epoch": 0.8248459700741858, "grad_norm": 0.22619369001240797, "kl": 0.0674896240234375, "learning_rate": 4.999455437134205e-07, "loss": 0.0001, "reward": 1.7750000432133675, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 0.9821428656578064, "step": 4920 }, { "completion_length": 273.4375162124634, "epoch": 0.8251812733140534, "grad_norm": 0.14572012351895092, "kl": 0.064910888671875, "learning_rate": 4.999452930610455e-07, "loss": 0.0001, "reward": 1.7107143625617027, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7107143178582191, "rewards/format_reward_func": 1.0, "step": 4922 }, { "completion_length": 273.55804347991943, "epoch": 0.825516576553921, "grad_norm": 0.19885069207259493, "kl": 0.0724029541015625, "learning_rate": 4.999450418332038e-07, "loss": 0.0001, "reward": 1.737500086426735, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419642992317677, "rewards/format_reward_func": 0.9955357164144516, "step": 4924 }, { "completion_length": 267.4598321914673, "epoch": 0.8258518797937885, "grad_norm": 0.20614664708216593, "kl": 0.1008758544921875, "learning_rate": 4.99944790029896e-07, "loss": 0.0001, "reward": 1.7696429193019867, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7741071730852127, "rewards/format_reward_func": 0.9955357164144516, "step": 4926 }, { "completion_length": 270.4241199493408, "epoch": 0.826187183033656, "grad_norm": 0.39868854843655405, "kl": 0.0659942626953125, "learning_rate": 4.999445376511225e-07, "loss": 0.0001, "reward": 1.8125000596046448, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.8169643133878708, "rewards/format_reward_func": 0.9955357164144516, "step": 4928 }, { "completion_length": 273.05804920196533, "epoch": 0.8265224862735236, "grad_norm": 0.44631029880094397, "kl": 0.047637939453125, "learning_rate": 4.999442846968841e-07, "loss": 0.0, "reward": 1.7982143238186836, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8026786036789417, "rewards/format_reward_func": 0.9955357164144516, "step": 4930 }, { "completion_length": 264.0848321914673, "epoch": 0.8268577895133912, "grad_norm": 0.16727886038983347, "kl": 0.0599212646484375, "learning_rate": 4.999440311671812e-07, "loss": 0.0001, "reward": 1.8464286103844643, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8464285917580128, "rewards/format_reward_func": 1.0, "step": 4932 }, { "completion_length": 264.13840675354004, "epoch": 0.8271930927532587, "grad_norm": 0.29495113787224825, "kl": 0.0543212890625, "learning_rate": 4.999437770620146e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7535714469850063, "rewards/format_reward_func": 1.0, "step": 4934 }, { "completion_length": 258.9285831451416, "epoch": 0.8275283959931263, "grad_norm": 0.22670554901695683, "kl": 0.0487060546875, "learning_rate": 4.999435223813847e-07, "loss": 0.0, "reward": 1.8250000476837158, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.8339285813271999, "rewards/format_reward_func": 0.9910714328289032, "step": 4936 }, { "completion_length": 267.41965770721436, "epoch": 0.8278636992329939, "grad_norm": 0.26918688987822686, "kl": 0.05322265625, "learning_rate": 4.999432671252921e-07, "loss": 0.0001, "reward": 1.7375000715255737, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7419643104076385, "rewards/format_reward_func": 0.9955357164144516, "step": 4938 }, { "completion_length": 263.3214416503906, "epoch": 0.8281990024728614, "grad_norm": 0.21402795920358994, "kl": 0.059417724609375, "learning_rate": 4.999430112937374e-07, "loss": 0.0001, "reward": 1.8071429133415222, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8071428835391998, "rewards/format_reward_func": 1.0, "step": 4940 }, { "completion_length": 271.71430110931396, "epoch": 0.8285343057127289, "grad_norm": 0.7274596587412323, "kl": 0.0613861083984375, "learning_rate": 4.999427548867214e-07, "loss": 0.0001, "reward": 1.689285784959793, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.689285758882761, "rewards/format_reward_func": 1.0, "step": 4942 }, { "completion_length": 242.38840103149414, "epoch": 0.8288696089525965, "grad_norm": 0.196112449776157, "kl": 0.0539093017578125, "learning_rate": 4.999424979042443e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428626775742, "rewards/format_reward_func": 1.0, "step": 4944 }, { "completion_length": 268.79465675354004, "epoch": 0.8292049121924641, "grad_norm": 0.14449989796392068, "kl": 0.06292724609375, "learning_rate": 4.99942240346307e-07, "loss": 0.0001, "reward": 1.7392857819795609, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7482143193483353, "rewards/format_reward_func": 0.9910714328289032, "step": 4946 }, { "completion_length": 259.68751430511475, "epoch": 0.8295402154323316, "grad_norm": 0.19207518609316115, "kl": 0.05278778076171875, "learning_rate": 4.999419822129099e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 4948 }, { "completion_length": 269.71429538726807, "epoch": 0.8298755186721992, "grad_norm": 0.19097486547502238, "kl": 0.15569305419921875, "learning_rate": 4.999417235040538e-07, "loss": 0.0002, "reward": 1.7214286401867867, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7392857428640127, "rewards/format_reward_func": 0.9821428619325161, "step": 4950 }, { "completion_length": 260.8214406967163, "epoch": 0.8302108219120667, "grad_norm": 0.15030844373911773, "kl": 0.088470458984375, "learning_rate": 4.99941464219739e-07, "loss": 0.0001, "reward": 1.7446429207921028, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7491071838885546, "rewards/format_reward_func": 0.9955357164144516, "step": 4952 }, { "completion_length": 257.60715770721436, "epoch": 0.8305461251519343, "grad_norm": 0.2758773487892338, "kl": 0.069061279296875, "learning_rate": 4.999412043599665e-07, "loss": 0.0001, "reward": 1.68571437895298, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.6937500294297934, "rewards/format_reward_func": 0.9919642880558968, "step": 4954 }, { "completion_length": 262.9910821914673, "epoch": 0.8308814283918018, "grad_norm": 0.21574039383575827, "kl": 0.05950927734375, "learning_rate": 4.999409439247366e-07, "loss": 0.0001, "reward": 1.7915179207921028, "reward_std": 0.04230013699270785, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 0.9986607171595097, "step": 4956 }, { "completion_length": 257.2946548461914, "epoch": 0.8312167316316694, "grad_norm": 0.19333692041023248, "kl": 0.0700531005859375, "learning_rate": 4.999406829140499e-07, "loss": 0.0001, "reward": 1.7428571954369545, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7517857514321804, "rewards/format_reward_func": 0.9910714328289032, "step": 4958 }, { "completion_length": 258.95983505249023, "epoch": 0.831552034871537, "grad_norm": 0.28450642542702165, "kl": 0.1026763916015625, "learning_rate": 4.999404213279072e-07, "loss": 0.0001, "reward": 1.7107143551111221, "reward_std": 0.09091372601687908, "rewards/equation_reward_func": 0.7196428775787354, "rewards/format_reward_func": 0.9910714328289032, "step": 4960 }, { "completion_length": 258.48662185668945, "epoch": 0.8318873381114045, "grad_norm": 0.11666481478020153, "kl": 0.0623321533203125, "learning_rate": 4.999401591663088e-07, "loss": 0.0001, "reward": 1.7964286357164383, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964285910129547, "rewards/format_reward_func": 1.0, "step": 4962 }, { "completion_length": 261.14287090301514, "epoch": 0.8322226413512721, "grad_norm": 0.2595529695459257, "kl": 0.0776519775390625, "learning_rate": 4.999398964292556e-07, "loss": 0.0001, "reward": 1.7464286535978317, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7464286033064127, "rewards/format_reward_func": 1.0, "step": 4964 }, { "completion_length": 255.86161422729492, "epoch": 0.8325579445911396, "grad_norm": 0.11477647603067251, "kl": 0.05771636962890625, "learning_rate": 4.999396331167481e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 4966 }, { "completion_length": 251.63840866088867, "epoch": 0.8328932478310072, "grad_norm": 0.3354314841872632, "kl": 0.0924072265625, "learning_rate": 4.999393692287868e-07, "loss": 0.0001, "reward": 1.7589286267757416, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7633928880095482, "rewards/format_reward_func": 0.9955357164144516, "step": 4968 }, { "completion_length": 259.92858505249023, "epoch": 0.8332285510708747, "grad_norm": 0.22319074901580088, "kl": 0.05889892578125, "learning_rate": 4.999391047653726e-07, "loss": 0.0001, "reward": 1.73214291036129, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428898721933, "rewards/format_reward_func": 1.0, "step": 4970 }, { "completion_length": 253.16965579986572, "epoch": 0.8335638543107423, "grad_norm": 0.20776365236196978, "kl": 0.0605621337890625, "learning_rate": 4.999388397265057e-07, "loss": 0.0001, "reward": 1.7750000730156898, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 1.0, "step": 4972 }, { "completion_length": 258.81250953674316, "epoch": 0.8338991575506098, "grad_norm": 0.3116271721626666, "kl": 0.0717926025390625, "learning_rate": 4.999385741121871e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 4974 }, { "completion_length": 257.9732246398926, "epoch": 0.8342344607904774, "grad_norm": 0.16783852777353048, "kl": 0.0492401123046875, "learning_rate": 4.999383079224171e-07, "loss": 0.0, "reward": 1.760714367032051, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 4976 }, { "completion_length": 267.3125123977661, "epoch": 0.834569764030345, "grad_norm": 0.2939545154505324, "kl": 0.1884918212890625, "learning_rate": 4.999380411571965e-07, "loss": 0.0002, "reward": 1.7214286178350449, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7303571756929159, "rewards/format_reward_func": 0.9910714328289032, "step": 4978 }, { "completion_length": 271.2857255935669, "epoch": 0.8349050672702125, "grad_norm": 0.13913934639957815, "kl": 0.26397705078125, "learning_rate": 4.999377738165259e-07, "loss": 0.0003, "reward": 1.74642863124609, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7553571723401546, "rewards/format_reward_func": 0.9910714328289032, "step": 4980 }, { "completion_length": 258.9553680419922, "epoch": 0.83524037051008, "grad_norm": 0.2516402147403652, "kl": 0.0686798095703125, "learning_rate": 4.999375059004057e-07, "loss": 0.0001, "reward": 1.7446429207921028, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7491071820259094, "rewards/format_reward_func": 0.9955357164144516, "step": 4982 }, { "completion_length": 268.2321557998657, "epoch": 0.8355756737499476, "grad_norm": 0.2667209451787418, "kl": 0.1626129150390625, "learning_rate": 4.999372374088369e-07, "loss": 0.0002, "reward": 1.7803571969270706, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7848214488476515, "rewards/format_reward_func": 0.9955357164144516, "step": 4984 }, { "completion_length": 260.77233695983887, "epoch": 0.8359109769898152, "grad_norm": 0.26300885880997005, "kl": 0.1019744873046875, "learning_rate": 4.999369683418199e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 4986 }, { "completion_length": 259.6116199493408, "epoch": 0.8362462802296827, "grad_norm": 0.3083753439451682, "kl": 0.171356201171875, "learning_rate": 4.999366986993552e-07, "loss": 0.0002, "reward": 1.8321429267525673, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8321428745985031, "rewards/format_reward_func": 1.0, "step": 4988 }, { "completion_length": 278.30804920196533, "epoch": 0.8365815834695502, "grad_norm": 0.4166251158747486, "kl": 0.43697357177734375, "learning_rate": 4.999364284814435e-07, "loss": 0.0004, "reward": 1.753571480512619, "reward_std": 0.08586296625435352, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 0.9821428656578064, "step": 4990 }, { "completion_length": 273.8884048461914, "epoch": 0.8369168867094179, "grad_norm": 0.22651730612716978, "kl": 0.0768585205078125, "learning_rate": 4.999361576880856e-07, "loss": 0.0001, "reward": 1.7178572416305542, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7267857380211353, "rewards/format_reward_func": 0.9910714328289032, "step": 4992 }, { "completion_length": 265.2321548461914, "epoch": 0.8372521899492854, "grad_norm": 0.19169125415619304, "kl": 0.3283843994140625, "learning_rate": 4.99935886319282e-07, "loss": 0.0003, "reward": 1.8142857551574707, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8142857626080513, "rewards/format_reward_func": 1.0, "step": 4994 }, { "completion_length": 264.4375123977661, "epoch": 0.8375874931891529, "grad_norm": 0.14860434722498145, "kl": 0.04929351806640625, "learning_rate": 4.999356143750332e-07, "loss": 0.0, "reward": 1.7089286297559738, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.713392898440361, "rewards/format_reward_func": 0.9955357164144516, "step": 4996 }, { "completion_length": 262.3259057998657, "epoch": 0.8379227964290205, "grad_norm": 0.3040842393844373, "kl": 0.06220245361328125, "learning_rate": 4.999353418553402e-07, "loss": 0.0001, "reward": 1.7946429178118706, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7991071734577417, "rewards/format_reward_func": 0.9955357164144516, "step": 4998 }, { "completion_length": 256.0982255935669, "epoch": 0.8382580996688881, "grad_norm": 0.9815141329046756, "kl": 0.08681488037109375, "learning_rate": 4.999350687602031e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7928571850061417, "rewards/format_reward_func": 1.0, "step": 5000 }, { "completion_length": 257.6517963409424, "epoch": 0.8385934029087556, "grad_norm": 0.28596476806596116, "kl": 0.1295928955078125, "learning_rate": 4.99934795089623e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571566939354, "rewards/format_reward_func": 1.0, "step": 5002 }, { "completion_length": 254.4642972946167, "epoch": 0.8389287061486231, "grad_norm": 0.2751226439728887, "kl": 0.3716888427734375, "learning_rate": 4.999345208436002e-07, "loss": 0.0004, "reward": 1.7410714998841286, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7455357387661934, "rewards/format_reward_func": 0.9955357164144516, "step": 5004 }, { "completion_length": 264.1964406967163, "epoch": 0.8392640093884907, "grad_norm": 0.17527055408946596, "kl": 0.0655517578125, "learning_rate": 4.999342460221355e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 5006 }, { "completion_length": 268.5848340988159, "epoch": 0.8395993126283583, "grad_norm": 0.30189495346003126, "kl": 0.08837127685546875, "learning_rate": 4.999339706252295e-07, "loss": 0.0001, "reward": 1.741517961025238, "reward_std": 0.0688166399486363, "rewards/equation_reward_func": 0.7428571674972773, "rewards/format_reward_func": 0.9986607171595097, "step": 5008 }, { "completion_length": 271.37947940826416, "epoch": 0.8399346158682258, "grad_norm": 0.2656036792725558, "kl": 0.06268310546875, "learning_rate": 4.999336946528828e-07, "loss": 0.0001, "reward": 1.7696429342031479, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7741071805357933, "rewards/format_reward_func": 0.9955357164144516, "step": 5010 }, { "completion_length": 262.29019260406494, "epoch": 0.8402699191080933, "grad_norm": 0.2827278859733368, "kl": 0.06427001953125, "learning_rate": 4.999334181050961e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 5012 }, { "completion_length": 267.7767972946167, "epoch": 0.840605222347961, "grad_norm": 0.1459763848123801, "kl": 0.2068939208984375, "learning_rate": 4.9993314098187e-07, "loss": 0.0002, "reward": 1.7464286237955093, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 5014 }, { "completion_length": 260.0580472946167, "epoch": 0.8409405255878285, "grad_norm": 0.11177792675682865, "kl": 0.141387939453125, "learning_rate": 4.999328632832052e-07, "loss": 0.0001, "reward": 1.8321429044008255, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8321428783237934, "rewards/format_reward_func": 1.0, "step": 5016 }, { "completion_length": 269.8392963409424, "epoch": 0.841275828827696, "grad_norm": 0.2513538542042273, "kl": 0.0636138916015625, "learning_rate": 4.999325850091022e-07, "loss": 0.0001, "reward": 1.7767857685685158, "reward_std": 0.09343910962343216, "rewards/equation_reward_func": 0.7901785969734192, "rewards/format_reward_func": 0.9866071492433548, "step": 5018 }, { "completion_length": 263.446439743042, "epoch": 0.8416111320675635, "grad_norm": 0.1607530545343153, "kl": 0.3533172607421875, "learning_rate": 4.999323061595617e-07, "loss": 0.0004, "reward": 1.7129465118050575, "reward_std": 0.0625031883828342, "rewards/equation_reward_func": 0.7232143208384514, "rewards/format_reward_func": 0.9897321499884129, "step": 5020 }, { "completion_length": 278.18304538726807, "epoch": 0.8419464353074312, "grad_norm": 0.7651100930666153, "kl": 2.2594146728515625, "learning_rate": 4.999320267345844e-07, "loss": 0.0023, "reward": 1.707589365541935, "reward_std": 0.09028238267637789, "rewards/equation_reward_func": 0.7223214656114578, "rewards/format_reward_func": 0.9852678664028645, "step": 5022 }, { "completion_length": 265.76786708831787, "epoch": 0.8422817385472987, "grad_norm": 0.5229851016677849, "kl": 0.22228240966796875, "learning_rate": 4.999317467341709e-07, "loss": 0.0002, "reward": 1.7272322103381157, "reward_std": 0.07765547605231404, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 0.980803582817316, "step": 5024 }, { "completion_length": 265.40626335144043, "epoch": 0.8426170417871662, "grad_norm": 0.4540552845227721, "kl": 0.666046142578125, "learning_rate": 4.99931466158322e-07, "loss": 0.0007, "reward": 1.7071429193019867, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7250000275671482, "rewards/format_reward_func": 0.9821428656578064, "step": 5026 }, { "completion_length": 256.67411708831787, "epoch": 0.8429523450270339, "grad_norm": 0.2477658548989634, "kl": 0.38092041015625, "learning_rate": 4.99931185007038e-07, "loss": 0.0004, "reward": 1.7392857819795609, "reward_std": 0.07576144114136696, "rewards/equation_reward_func": 0.7482143230736256, "rewards/format_reward_func": 0.9910714328289032, "step": 5028 }, { "completion_length": 261.7321557998657, "epoch": 0.8432876482669014, "grad_norm": 294.4796959677068, "kl": 292.18275451660156, "learning_rate": 4.999309032803199e-07, "loss": 0.2905, "reward": 1.7553571984171867, "reward_std": 0.0833375845104456, "rewards/equation_reward_func": 0.7687500268220901, "rewards/format_reward_func": 0.9866071492433548, "step": 5030 }, { "completion_length": 254.8660831451416, "epoch": 0.8436229515067689, "grad_norm": 0.3487734623831873, "kl": 0.2953338623046875, "learning_rate": 4.99930620978168e-07, "loss": 0.0003, "reward": 1.7625000849366188, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7669643238186836, "rewards/format_reward_func": 0.9955357164144516, "step": 5032 }, { "completion_length": 250.8214406967163, "epoch": 0.8439582547466364, "grad_norm": 0.845100666891351, "kl": 1.3850021362304688, "learning_rate": 4.999303381005833e-07, "loss": 0.0014, "reward": 1.7571429312229156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.757142897695303, "rewards/format_reward_func": 1.0, "step": 5034 }, { "completion_length": 260.23662090301514, "epoch": 0.8442935579865041, "grad_norm": 0.29851671677387165, "kl": 0.46588134765625, "learning_rate": 4.999300546475663e-07, "loss": 0.0005, "reward": 1.7897321954369545, "reward_std": 0.09533314313739538, "rewards/equation_reward_func": 0.8044643066823483, "rewards/format_reward_func": 0.9852678626775742, "step": 5036 }, { "completion_length": 259.9821557998657, "epoch": 0.8446288612263716, "grad_norm": 0.19617775336306847, "kl": 0.188018798828125, "learning_rate": 4.999297706191175e-07, "loss": 0.0002, "reward": 1.739285796880722, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857447266579, "rewards/format_reward_func": 1.0, "step": 5038 }, { "completion_length": 259.6562623977661, "epoch": 0.8449641644662391, "grad_norm": 0.36903900589044275, "kl": 0.31150054931640625, "learning_rate": 4.999294860152378e-07, "loss": 0.0003, "reward": 1.7035715207457542, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7125000394880772, "rewards/format_reward_func": 0.9910714328289032, "step": 5040 }, { "completion_length": 251.31697845458984, "epoch": 0.8452994677061068, "grad_norm": 0.34778111463721867, "kl": 0.58111572265625, "learning_rate": 4.999292008359277e-07, "loss": 0.0006, "reward": 1.7892857864499092, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.789285734295845, "rewards/format_reward_func": 1.0, "step": 5042 }, { "completion_length": 259.6517972946167, "epoch": 0.8456347709459743, "grad_norm": 0.22109059202099945, "kl": 0.05115509033203125, "learning_rate": 4.99928915081188e-07, "loss": 0.0001, "reward": 1.701785758137703, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.706250037997961, "rewards/format_reward_func": 0.9955357164144516, "step": 5044 }, { "completion_length": 255.47322463989258, "epoch": 0.8459700741858418, "grad_norm": 0.767684363928975, "kl": 1.7428512573242188, "learning_rate": 4.999286287510192e-07, "loss": 0.0017, "reward": 1.7303572073578835, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7348214611411095, "rewards/format_reward_func": 0.9955357164144516, "step": 5046 }, { "completion_length": 247.4866180419922, "epoch": 0.8463053774257093, "grad_norm": 0.2620184516888097, "kl": 0.0875244140625, "learning_rate": 4.999283418454221e-07, "loss": 0.0001, "reward": 1.7696429044008255, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7741071842610836, "rewards/format_reward_func": 0.9955357164144516, "step": 5048 }, { "completion_length": 257.3928699493408, "epoch": 0.846640680665577, "grad_norm": 0.2051462165393263, "kl": 0.0811309814453125, "learning_rate": 4.999280543643973e-07, "loss": 0.0001, "reward": 1.7428572252392769, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571693599224, "rewards/format_reward_func": 1.0, "step": 5050 }, { "completion_length": 257.3348340988159, "epoch": 0.8469759839054445, "grad_norm": 0.23980940731317518, "kl": 0.0730438232421875, "learning_rate": 4.999277663079453e-07, "loss": 0.0001, "reward": 1.753571517765522, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 5052 }, { "completion_length": 256.02233505249023, "epoch": 0.847311287145312, "grad_norm": 0.2466820519649015, "kl": 0.0859832763671875, "learning_rate": 4.99927477676067e-07, "loss": 0.0001, "reward": 1.7446429282426834, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7491071708500385, "rewards/format_reward_func": 0.9955357164144516, "step": 5054 }, { "completion_length": 242.81697368621826, "epoch": 0.8476465903851796, "grad_norm": 0.19081753990918476, "kl": 0.04564666748046875, "learning_rate": 4.99927188468763e-07, "loss": 0.0, "reward": 1.7910714894533157, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7955357450991869, "rewards/format_reward_func": 0.9955357164144516, "step": 5056 }, { "completion_length": 250.4375114440918, "epoch": 0.8479818936250472, "grad_norm": 0.27938780172578787, "kl": 0.1655731201171875, "learning_rate": 4.99926898686034e-07, "loss": 0.0002, "reward": 1.7464286163449287, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.746428593993187, "rewards/format_reward_func": 1.0, "step": 5058 }, { "completion_length": 253.05804443359375, "epoch": 0.8483171968649147, "grad_norm": 0.5356787768614212, "kl": 0.06327056884765625, "learning_rate": 4.999266083278806e-07, "loss": 0.0001, "reward": 1.7446429058909416, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7491071820259094, "rewards/format_reward_func": 0.9955357164144516, "step": 5060 }, { "completion_length": 237.91965198516846, "epoch": 0.8486525001047822, "grad_norm": 0.24795477280872208, "kl": 0.11028289794921875, "learning_rate": 4.999263173943034e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285925030708, "rewards/format_reward_func": 1.0, "step": 5062 }, { "completion_length": 246.70983219146729, "epoch": 0.8489878033446498, "grad_norm": 0.2763103500220934, "kl": 0.081085205078125, "learning_rate": 4.999260258853032e-07, "loss": 0.0001, "reward": 1.8089286386966705, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.8133928813040257, "rewards/format_reward_func": 0.9955357164144516, "step": 5064 }, { "completion_length": 252.1830472946167, "epoch": 0.8493231065845174, "grad_norm": 0.10641035138334937, "kl": 0.04872894287109375, "learning_rate": 4.999257338008806e-07, "loss": 0.0, "reward": 1.7517857626080513, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7562500387430191, "rewards/format_reward_func": 0.9955357164144516, "step": 5066 }, { "completion_length": 255.8259038925171, "epoch": 0.8496584098243849, "grad_norm": 0.23307262313618435, "kl": 0.04785919189453125, "learning_rate": 4.999254411410363e-07, "loss": 0.0, "reward": 1.730357214808464, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7348214536905289, "rewards/format_reward_func": 0.9955357164144516, "step": 5068 }, { "completion_length": 244.45983219146729, "epoch": 0.8499937130642525, "grad_norm": 0.23193165760967088, "kl": 0.2115325927734375, "learning_rate": 4.99925147905771e-07, "loss": 0.0002, "reward": 1.785714328289032, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7946428805589676, "rewards/format_reward_func": 0.9910714328289032, "step": 5070 }, { "completion_length": 244.90626049041748, "epoch": 0.8503290163041201, "grad_norm": 0.24690095810292512, "kl": 0.053955078125, "learning_rate": 4.999248540950853e-07, "loss": 0.0001, "reward": 1.7785715162754059, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 5072 }, { "completion_length": 253.98661613464355, "epoch": 0.8506643195439876, "grad_norm": 0.23389187074428028, "kl": 0.1283416748046875, "learning_rate": 4.999245597089799e-07, "loss": 0.0001, "reward": 1.787500061094761, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7919643074274063, "rewards/format_reward_func": 0.9955357164144516, "step": 5074 }, { "completion_length": 247.71875762939453, "epoch": 0.8509996227838551, "grad_norm": 0.8323127056861116, "kl": 0.1199188232421875, "learning_rate": 4.999242647474555e-07, "loss": 0.0001, "reward": 1.7821428999304771, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7910714596509933, "rewards/format_reward_func": 0.9910714328289032, "step": 5076 }, { "completion_length": 236.50000953674316, "epoch": 0.8513349260237227, "grad_norm": 0.18800976984029746, "kl": 0.1182861328125, "learning_rate": 4.99923969210513e-07, "loss": 0.0001, "reward": 1.7535714879631996, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.753571467474103, "rewards/format_reward_func": 1.0, "step": 5078 }, { "completion_length": 239.78126049041748, "epoch": 0.8516702292635903, "grad_norm": 0.2201142403006137, "kl": 0.0549774169921875, "learning_rate": 4.999236730981526e-07, "loss": 0.0001, "reward": 1.714285783469677, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7142857536673546, "rewards/format_reward_func": 1.0, "step": 5080 }, { "completion_length": 247.9062623977661, "epoch": 0.8520055325034578, "grad_norm": 4.761260457058568, "kl": 0.383270263671875, "learning_rate": 4.999233764103753e-07, "loss": 0.0004, "reward": 1.753571480512619, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7625000402331352, "rewards/format_reward_func": 0.9910714328289032, "step": 5082 }, { "completion_length": 240.20983409881592, "epoch": 0.8523408357433254, "grad_norm": 0.126562800778106, "kl": 0.554901123046875, "learning_rate": 4.999230791471818e-07, "loss": 0.0006, "reward": 1.7714286521077156, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 5084 }, { "completion_length": 230.77233409881592, "epoch": 0.8526761389831929, "grad_norm": 0.29796437906708106, "kl": 0.05060577392578125, "learning_rate": 4.999227813085725e-07, "loss": 0.0001, "reward": 1.8285714834928513, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8285714499652386, "rewards/format_reward_func": 1.0, "step": 5086 }, { "completion_length": 239.77679538726807, "epoch": 0.8530114422230605, "grad_norm": 0.30966121200378954, "kl": 0.2183380126953125, "learning_rate": 4.999224828945485e-07, "loss": 0.0002, "reward": 1.7946429029107094, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.799107164144516, "rewards/format_reward_func": 0.9955357164144516, "step": 5088 }, { "completion_length": 244.41519165039062, "epoch": 0.853346745462928, "grad_norm": 0.14491462362379912, "kl": 0.11983489990234375, "learning_rate": 4.999221839051102e-07, "loss": 0.0001, "reward": 1.7375000789761543, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7419643215835094, "rewards/format_reward_func": 0.9955357164144516, "step": 5090 }, { "completion_length": 234.41965293884277, "epoch": 0.8536820487027956, "grad_norm": 0.24775391265290617, "kl": 0.1589813232421875, "learning_rate": 4.999218843402584e-07, "loss": 0.0002, "reward": 1.718303643167019, "reward_std": 0.0650285689625889, "rewards/equation_reward_func": 0.724107164889574, "rewards/format_reward_func": 0.9941964335739613, "step": 5092 }, { "completion_length": 236.27233123779297, "epoch": 0.8540173519426631, "grad_norm": 0.4428321684880137, "kl": 0.21246337890625, "learning_rate": 4.999215841999937e-07, "loss": 0.0002, "reward": 1.6910714730620384, "reward_std": 0.07323605939745903, "rewards/equation_reward_func": 0.7223214618861675, "rewards/format_reward_func": 0.9687500149011612, "step": 5094 }, { "completion_length": 236.79465293884277, "epoch": 0.8543526551825307, "grad_norm": 0.3418168828278188, "kl": 0.14962005615234375, "learning_rate": 4.999212834843169e-07, "loss": 0.0001, "reward": 1.798214353621006, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.8116071708500385, "rewards/format_reward_func": 0.9866071492433548, "step": 5096 }, { "completion_length": 230.79911708831787, "epoch": 0.8546879584223983, "grad_norm": 0.20071501266811895, "kl": 0.1268768310546875, "learning_rate": 4.999209821932287e-07, "loss": 0.0001, "reward": 1.7464286535978317, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 5098 }, { "completion_length": 226.96875858306885, "epoch": 0.8550232616622658, "grad_norm": 0.17248835077943814, "kl": 0.098907470703125, "learning_rate": 4.999206803267296e-07, "loss": 0.0001, "reward": 1.689285822212696, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.6982143241912127, "rewards/format_reward_func": 0.9910714328289032, "step": 5100 }, { "completion_length": 228.93751049041748, "epoch": 0.8553585649021334, "grad_norm": 0.30204625045537503, "kl": 0.187408447265625, "learning_rate": 4.999203778848206e-07, "loss": 0.0002, "reward": 1.7410714775323868, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7455357480794191, "rewards/format_reward_func": 0.9955357164144516, "step": 5102 }, { "completion_length": 223.1071538925171, "epoch": 0.8556938681420009, "grad_norm": 0.783522810256943, "kl": 2.531463623046875, "learning_rate": 4.999200748675021e-07, "loss": 0.0025, "reward": 1.7250000908970833, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7339285984635353, "rewards/format_reward_func": 0.9910714328289032, "step": 5104 }, { "completion_length": 234.47768878936768, "epoch": 0.8560291713818685, "grad_norm": 0.24328087656232975, "kl": 0.229888916015625, "learning_rate": 4.99919771274775e-07, "loss": 0.0002, "reward": 1.7200893685221672, "reward_std": 0.06250318791717291, "rewards/equation_reward_func": 0.721428606659174, "rewards/format_reward_func": 0.9986607171595097, "step": 5106 }, { "completion_length": 244.42858219146729, "epoch": 0.856364474621736, "grad_norm": 0.29232745791069964, "kl": 1.5421829223632812, "learning_rate": 4.999194671066398e-07, "loss": 0.0015, "reward": 1.721428632736206, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7303571812808514, "rewards/format_reward_func": 0.9910714328289032, "step": 5108 }, { "completion_length": 252.26340675354004, "epoch": 0.8566997778616036, "grad_norm": 0.4569750985390259, "kl": 1.1767578125, "learning_rate": 4.999191623630974e-07, "loss": 0.0012, "reward": 1.6625000834465027, "reward_std": 0.11364215891808271, "rewards/equation_reward_func": 0.6848214641213417, "rewards/format_reward_func": 0.977678582072258, "step": 5110 }, { "completion_length": 247.93751430511475, "epoch": 0.8570350811014712, "grad_norm": 0.34339815751092734, "kl": 0.3735198974609375, "learning_rate": 4.999188570441485e-07, "loss": 0.0004, "reward": 1.7035714611411095, "reward_std": 0.08586296532303095, "rewards/equation_reward_func": 0.7214286047965288, "rewards/format_reward_func": 0.9821428656578064, "step": 5112 }, { "completion_length": 228.1562614440918, "epoch": 0.8573703843413387, "grad_norm": 0.24380516996038454, "kl": 0.1014862060546875, "learning_rate": 4.999185511497937e-07, "loss": 0.0001, "reward": 1.8160714954137802, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.8205357380211353, "rewards/format_reward_func": 0.9955357164144516, "step": 5114 }, { "completion_length": 238.89286613464355, "epoch": 0.8577056875812062, "grad_norm": 0.2579426290533824, "kl": 0.203277587890625, "learning_rate": 4.999182446800336e-07, "loss": 0.0002, "reward": 1.732142947614193, "reward_std": 0.08586296532303095, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 0.9821428656578064, "step": 5116 }, { "completion_length": 229.06251049041748, "epoch": 0.8580409908210738, "grad_norm": 0.39017097066306555, "kl": 1.1303558349609375, "learning_rate": 4.999179376348691e-07, "loss": 0.0011, "reward": 1.7343750596046448, "reward_std": 0.08270624093711376, "rewards/equation_reward_func": 0.7535714525729418, "rewards/format_reward_func": 0.980803582817316, "step": 5118 }, { "completion_length": 226.82143878936768, "epoch": 0.8583762940609414, "grad_norm": 0.8587286627098762, "kl": 0.360626220703125, "learning_rate": 4.999176300143009e-07, "loss": 0.0004, "reward": 1.751785784959793, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7562500275671482, "rewards/format_reward_func": 0.9955357164144516, "step": 5120 }, { "completion_length": 229.28126049041748, "epoch": 0.8587115973008089, "grad_norm": 0.19873741773246498, "kl": 0.1075286865234375, "learning_rate": 4.999173218183296e-07, "loss": 0.0001, "reward": 1.776785783469677, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7812500186264515, "rewards/format_reward_func": 0.9955357164144516, "step": 5122 }, { "completion_length": 227.12054538726807, "epoch": 0.8590469005406765, "grad_norm": 0.2766724367610363, "kl": 0.2878875732421875, "learning_rate": 4.99917013046956e-07, "loss": 0.0003, "reward": 1.698214367032051, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7026786021888256, "rewards/format_reward_func": 0.9955357164144516, "step": 5124 }, { "completion_length": 225.43750858306885, "epoch": 0.8593822037805441, "grad_norm": 0.45199430854316097, "kl": 0.27734375, "learning_rate": 4.999167037001807e-07, "loss": 0.0003, "reward": 1.7625000849366188, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.766964316368103, "rewards/format_reward_func": 0.9955357164144516, "step": 5126 }, { "completion_length": 220.16965198516846, "epoch": 0.8597175070204116, "grad_norm": 0.1847114816856559, "kl": 0.318359375, "learning_rate": 4.999163937780046e-07, "loss": 0.0003, "reward": 1.8035714775323868, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.803571455180645, "rewards/format_reward_func": 1.0, "step": 5128 }, { "completion_length": 227.0312623977661, "epoch": 0.8600528102602791, "grad_norm": 0.34900171200182317, "kl": 0.3245086669921875, "learning_rate": 4.999160832804282e-07, "loss": 0.0003, "reward": 1.7464286461472511, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 5130 }, { "completion_length": 218.93304634094238, "epoch": 0.8603881135001467, "grad_norm": 0.2367511358892103, "kl": 0.117279052734375, "learning_rate": 4.999157722074524e-07, "loss": 0.0001, "reward": 1.8022321909666061, "reward_std": 0.027147848159074783, "rewards/equation_reward_func": 0.803571455180645, "rewards/format_reward_func": 0.9986607171595097, "step": 5132 }, { "completion_length": 219.32590198516846, "epoch": 0.8607234167400143, "grad_norm": 0.1688927659968083, "kl": 0.1425018310546875, "learning_rate": 4.999154605590778e-07, "loss": 0.0001, "reward": 1.8178571909666061, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.817857164889574, "rewards/format_reward_func": 1.0, "step": 5134 }, { "completion_length": 227.3794755935669, "epoch": 0.8610587199798818, "grad_norm": 0.19784225720299922, "kl": 0.2835845947265625, "learning_rate": 4.999151483353052e-07, "loss": 0.0003, "reward": 1.7660714834928513, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9955357164144516, "step": 5136 }, { "completion_length": 242.0803680419922, "epoch": 0.8613940232197493, "grad_norm": 0.26270531928407537, "kl": 0.5782623291015625, "learning_rate": 4.999148355361351e-07, "loss": 0.0006, "reward": 1.775000050663948, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7750000394880772, "rewards/format_reward_func": 1.0, "step": 5138 }, { "completion_length": 238.0357265472412, "epoch": 0.8617293264596169, "grad_norm": 0.22026295954848013, "kl": 0.259002685546875, "learning_rate": 4.999145221615685e-07, "loss": 0.0003, "reward": 1.7678572088479996, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7767857350409031, "rewards/format_reward_func": 0.9910714328289032, "step": 5140 }, { "completion_length": 228.8259048461914, "epoch": 0.8620646296994845, "grad_norm": 0.14104751882657987, "kl": 0.0952606201171875, "learning_rate": 4.99914208211606e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000230967999, "rewards/format_reward_func": 1.0, "step": 5142 }, { "completion_length": 236.7812614440918, "epoch": 0.862399932939352, "grad_norm": 0.2925028960594958, "kl": 0.1213836669921875, "learning_rate": 4.999138936862484e-07, "loss": 0.0001, "reward": 1.7714286148548126, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7803571708500385, "rewards/format_reward_func": 0.9910714328289032, "step": 5144 }, { "completion_length": 242.68751049041748, "epoch": 0.8627352361792195, "grad_norm": 0.4819099304816873, "kl": 0.167633056640625, "learning_rate": 4.999135785854962e-07, "loss": 0.0002, "reward": 1.7214286401867867, "reward_std": 0.12121830228716135, "rewards/equation_reward_func": 0.7392857372760773, "rewards/format_reward_func": 0.9821428656578064, "step": 5146 }, { "completion_length": 249.82143878936768, "epoch": 0.8630705394190872, "grad_norm": 0.25499206507829786, "kl": 0.2075958251953125, "learning_rate": 4.999132629093503e-07, "loss": 0.0002, "reward": 1.7035714983940125, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7125000450760126, "rewards/format_reward_func": 0.9910714328289032, "step": 5148 }, { "completion_length": 238.89733409881592, "epoch": 0.8634058426589547, "grad_norm": 0.2327296549492035, "kl": 0.1017913818359375, "learning_rate": 4.999129466578116e-07, "loss": 0.0001, "reward": 1.6696429401636124, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.6741071790456772, "rewards/format_reward_func": 0.9955357164144516, "step": 5150 }, { "completion_length": 247.5759038925171, "epoch": 0.8637411458988222, "grad_norm": 0.20944422452788633, "kl": 0.100006103515625, "learning_rate": 4.999126298308805e-07, "loss": 0.0001, "reward": 1.7375000715255737, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7419643104076385, "rewards/format_reward_func": 0.9955357164144516, "step": 5152 }, { "completion_length": 244.74108409881592, "epoch": 0.8640764491386898, "grad_norm": 0.3227991323663998, "kl": 0.1121673583984375, "learning_rate": 4.999123124285578e-07, "loss": 0.0001, "reward": 1.7714286521077156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 5154 }, { "completion_length": 246.30358219146729, "epoch": 0.8644117523785574, "grad_norm": 0.6971793031048401, "kl": 0.1251983642578125, "learning_rate": 4.999119944508445e-07, "loss": 0.0001, "reward": 1.7071429416537285, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7160714715719223, "rewards/format_reward_func": 0.9910714328289032, "step": 5156 }, { "completion_length": 241.41518688201904, "epoch": 0.8647470556184249, "grad_norm": 0.30123485204649847, "kl": 0.0756683349609375, "learning_rate": 4.999116758977409e-07, "loss": 0.0001, "reward": 1.7232143878936768, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7276785969734192, "rewards/format_reward_func": 0.9955357164144516, "step": 5158 }, { "completion_length": 242.10715293884277, "epoch": 0.8650823588582924, "grad_norm": 0.19740171135735748, "kl": 0.0821380615234375, "learning_rate": 4.999113567692481e-07, "loss": 0.0001, "reward": 1.7571429461240768, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428753435612, "rewards/format_reward_func": 1.0, "step": 5160 }, { "completion_length": 241.02679634094238, "epoch": 0.8654176620981601, "grad_norm": 0.34486952501648477, "kl": 0.049346923828125, "learning_rate": 4.999110370653667e-07, "loss": 0.0, "reward": 1.7839286476373672, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7883928846567869, "rewards/format_reward_func": 0.9955357164144516, "step": 5162 }, { "completion_length": 235.915189743042, "epoch": 0.8657529653380276, "grad_norm": 0.23825360495441034, "kl": 0.05047607421875, "learning_rate": 4.999107167860973e-07, "loss": 0.0001, "reward": 1.7946429327130318, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7991071604192257, "rewards/format_reward_func": 0.9955357164144516, "step": 5164 }, { "completion_length": 237.2946538925171, "epoch": 0.8660882685778951, "grad_norm": 0.23002121289385286, "kl": 0.0514068603515625, "learning_rate": 4.999103959314409e-07, "loss": 0.0001, "reward": 1.7642858028411865, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857357859612, "rewards/format_reward_func": 1.0, "step": 5166 }, { "completion_length": 238.7589406967163, "epoch": 0.8664235718177626, "grad_norm": 0.7932820511428698, "kl": 0.05147552490234375, "learning_rate": 4.999100745013981e-07, "loss": 0.0001, "reward": 1.7375000715255737, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7419643104076385, "rewards/format_reward_func": 0.9955357164144516, "step": 5168 }, { "completion_length": 236.88840675354004, "epoch": 0.8667588750576303, "grad_norm": 0.27955379277920045, "kl": 0.0646514892578125, "learning_rate": 4.999097524959695e-07, "loss": 0.0001, "reward": 1.691071517765522, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.6955357603728771, "rewards/format_reward_func": 0.9955357164144516, "step": 5170 }, { "completion_length": 256.50893783569336, "epoch": 0.8670941782974978, "grad_norm": 0.29478152619315, "kl": 0.074310302734375, "learning_rate": 4.999094299151562e-07, "loss": 0.0001, "reward": 1.7214286625385284, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7214285898953676, "rewards/format_reward_func": 1.0, "step": 5172 }, { "completion_length": 235.45983123779297, "epoch": 0.8674294815373653, "grad_norm": 0.32795899856400945, "kl": 0.0535736083984375, "learning_rate": 4.999091067589585e-07, "loss": 0.0001, "reward": 1.725000061094761, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7250000387430191, "rewards/format_reward_func": 1.0, "step": 5174 }, { "completion_length": 237.6116180419922, "epoch": 0.867764784777233, "grad_norm": 0.19602738115025517, "kl": 0.0596466064453125, "learning_rate": 4.999087830273777e-07, "loss": 0.0001, "reward": 1.7821428999304771, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428999304771, "rewards/format_reward_func": 1.0, "step": 5176 }, { "completion_length": 242.0625123977661, "epoch": 0.8681000880171005, "grad_norm": 0.5267813801735062, "kl": 0.0671844482421875, "learning_rate": 4.99908458720414e-07, "loss": 0.0001, "reward": 1.7285714894533157, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7464285995811224, "rewards/format_reward_func": 0.9821428656578064, "step": 5178 }, { "completion_length": 230.95536994934082, "epoch": 0.868435391256968, "grad_norm": 0.24433297853268698, "kl": 0.064056396484375, "learning_rate": 4.999081338380684e-07, "loss": 0.0001, "reward": 1.7500000968575478, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7580357417464256, "rewards/format_reward_func": 0.9919642880558968, "step": 5180 }, { "completion_length": 246.0982255935669, "epoch": 0.8687706944968355, "grad_norm": 0.8587767902492196, "kl": 0.1986541748046875, "learning_rate": 4.999078083803416e-07, "loss": 0.0002, "reward": 1.7321429252624512, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7410714663565159, "rewards/format_reward_func": 0.9910714328289032, "step": 5182 }, { "completion_length": 231.10715293884277, "epoch": 0.8691059977367032, "grad_norm": 0.22419756108609906, "kl": 0.0513763427734375, "learning_rate": 4.999074823472344e-07, "loss": 0.0001, "reward": 1.7285715192556381, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.728571455925703, "rewards/format_reward_func": 1.0, "step": 5184 }, { "completion_length": 234.24108219146729, "epoch": 0.8694413009765707, "grad_norm": 0.2572663801825886, "kl": 0.12969970703125, "learning_rate": 4.999071557387475e-07, "loss": 0.0001, "reward": 1.7267857939004898, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7312500402331352, "rewards/format_reward_func": 0.9955357164144516, "step": 5186 }, { "completion_length": 237.3482265472412, "epoch": 0.8697766042164382, "grad_norm": 0.2707232720959836, "kl": 0.117584228515625, "learning_rate": 4.999068285548816e-07, "loss": 0.0001, "reward": 1.7732143476605415, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.777678593993187, "rewards/format_reward_func": 0.9955357164144516, "step": 5188 }, { "completion_length": 229.05804538726807, "epoch": 0.8701119074563058, "grad_norm": 0.1857235486891341, "kl": 0.05426025390625, "learning_rate": 4.999065007956377e-07, "loss": 0.0001, "reward": 1.7714286148548126, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714286111295223, "rewards/format_reward_func": 1.0, "step": 5190 }, { "completion_length": 238.2723331451416, "epoch": 0.8704472106961734, "grad_norm": 0.4293622096350722, "kl": 0.111297607421875, "learning_rate": 4.999061724610163e-07, "loss": 0.0001, "reward": 1.7553572207689285, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7598214671015739, "rewards/format_reward_func": 0.9955357164144516, "step": 5192 }, { "completion_length": 236.36161994934082, "epoch": 0.8707825139360409, "grad_norm": 0.2534201839770564, "kl": 0.2327423095703125, "learning_rate": 4.999058435510182e-07, "loss": 0.0002, "reward": 1.7642857730388641, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857655882835, "rewards/format_reward_func": 1.0, "step": 5194 }, { "completion_length": 242.7053689956665, "epoch": 0.8711178171759084, "grad_norm": 0.4089098177812318, "kl": 0.1260986328125, "learning_rate": 4.999055140656442e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 5196 }, { "completion_length": 242.01786708831787, "epoch": 0.871453120415776, "grad_norm": 0.2138423740410924, "kl": 0.1017303466796875, "learning_rate": 4.99905184004895e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143245637417, "rewards/format_reward_func": 1.0, "step": 5198 }, { "completion_length": 246.15179824829102, "epoch": 0.8717884236556436, "grad_norm": 0.25927983778414143, "kl": 0.0787200927734375, "learning_rate": 4.999048533687715e-07, "loss": 0.0001, "reward": 1.6803572252392769, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.6937500238418579, "rewards/format_reward_func": 0.9866071492433548, "step": 5200 }, { "completion_length": 241.31250953674316, "epoch": 0.8721237268955111, "grad_norm": 0.25433932775478646, "kl": 0.0538330078125, "learning_rate": 4.999045221572743e-07, "loss": 0.0001, "reward": 1.705357238650322, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7098214626312256, "rewards/format_reward_func": 0.9955357164144516, "step": 5202 }, { "completion_length": 240.00447368621826, "epoch": 0.8724590301353787, "grad_norm": 0.1208999441426067, "kl": 0.084716796875, "learning_rate": 4.999041903704043e-07, "loss": 0.0001, "reward": 1.7250000685453415, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7250000331550837, "rewards/format_reward_func": 1.0, "step": 5204 }, { "completion_length": 248.1384048461914, "epoch": 0.8727943333752463, "grad_norm": 0.347887490986824, "kl": 0.2652587890625, "learning_rate": 4.999038580081621e-07, "loss": 0.0003, "reward": 1.721428669989109, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7214286029338837, "rewards/format_reward_func": 1.0, "step": 5206 }, { "completion_length": 241.5580472946167, "epoch": 0.8731296366151138, "grad_norm": 0.11248959369685928, "kl": 0.09877777099609375, "learning_rate": 4.999035250705486e-07, "loss": 0.0001, "reward": 1.7428572326898575, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7428571730852127, "rewards/format_reward_func": 1.0, "step": 5208 }, { "completion_length": 246.4553680419922, "epoch": 0.8734649398549813, "grad_norm": 0.20355932834966817, "kl": 0.119964599609375, "learning_rate": 4.999031915575645e-07, "loss": 0.0001, "reward": 1.678571492433548, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.6785714589059353, "rewards/format_reward_func": 1.0, "step": 5210 }, { "completion_length": 240.62501335144043, "epoch": 0.8738002430948489, "grad_norm": 0.3614959079050482, "kl": 0.0868377685546875, "learning_rate": 4.999028574692107e-07, "loss": 0.0001, "reward": 1.7232143729925156, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7276785932481289, "rewards/format_reward_func": 0.9955357164144516, "step": 5212 }, { "completion_length": 232.2321548461914, "epoch": 0.8741355463347165, "grad_norm": 0.27345804390142686, "kl": 0.267913818359375, "learning_rate": 4.999025228054878e-07, "loss": 0.0003, "reward": 1.7857143431901932, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 5214 }, { "completion_length": 243.81250953674316, "epoch": 0.874470849574584, "grad_norm": 0.25686369513436424, "kl": 0.05169677734375, "learning_rate": 4.999021875663967e-07, "loss": 0.0001, "reward": 1.7321429327130318, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321429010480642, "rewards/format_reward_func": 1.0, "step": 5216 }, { "completion_length": 236.5892972946167, "epoch": 0.8748061528144516, "grad_norm": 0.2938128211300729, "kl": 0.0554046630859375, "learning_rate": 4.99901851751938e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 5218 }, { "completion_length": 238.8482255935669, "epoch": 0.8751414560543191, "grad_norm": 0.2940552545555363, "kl": 0.202545166015625, "learning_rate": 4.999015153621126e-07, "loss": 0.0002, "reward": 1.7428572252392769, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7428571805357933, "rewards/format_reward_func": 1.0, "step": 5220 }, { "completion_length": 235.82143783569336, "epoch": 0.8754767592941867, "grad_norm": 0.18830219720647964, "kl": 0.124786376953125, "learning_rate": 4.999011783969213e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 5222 }, { "completion_length": 245.26786994934082, "epoch": 0.8758120625340542, "grad_norm": 0.38066338937574323, "kl": 0.1001739501953125, "learning_rate": 4.999008408563649e-07, "loss": 0.0001, "reward": 1.7321429178118706, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7410714514553547, "rewards/format_reward_func": 0.9910714328289032, "step": 5224 }, { "completion_length": 233.4732255935669, "epoch": 0.8761473657739218, "grad_norm": 0.4616713763436838, "kl": 0.1026153564453125, "learning_rate": 4.999005027404439e-07, "loss": 0.0001, "reward": 1.7821428999304771, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 5226 }, { "completion_length": 239.80358409881592, "epoch": 0.8764826690137894, "grad_norm": 0.2662263067010209, "kl": 0.0876007080078125, "learning_rate": 4.999001640491595e-07, "loss": 0.0001, "reward": 1.751785784959793, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7562500387430191, "rewards/format_reward_func": 0.9955357164144516, "step": 5228 }, { "completion_length": 238.1294755935669, "epoch": 0.8768179722536569, "grad_norm": 0.2532932673862093, "kl": 0.064971923828125, "learning_rate": 4.998998247825121e-07, "loss": 0.0001, "reward": 1.7428572177886963, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7428571805357933, "rewards/format_reward_func": 1.0, "step": 5230 }, { "completion_length": 239.14733219146729, "epoch": 0.8771532754935245, "grad_norm": 0.22009531892447384, "kl": 0.06134033203125, "learning_rate": 4.998994849405027e-07, "loss": 0.0001, "reward": 1.780357226729393, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7848214544355869, "rewards/format_reward_func": 0.9955357164144516, "step": 5232 }, { "completion_length": 242.4062614440918, "epoch": 0.877488578733392, "grad_norm": 0.1460440667442655, "kl": 0.0941925048828125, "learning_rate": 4.99899144523132e-07, "loss": 0.0001, "reward": 1.705357201397419, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7187500353902578, "rewards/format_reward_func": 0.9866071455180645, "step": 5234 }, { "completion_length": 239.28126049041748, "epoch": 0.8778238819732596, "grad_norm": 0.25190830542060527, "kl": 0.0667724609375, "learning_rate": 4.998988035304009e-07, "loss": 0.0001, "reward": 1.7232143729925156, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7276786044239998, "rewards/format_reward_func": 0.9955357164144516, "step": 5236 }, { "completion_length": 244.16518783569336, "epoch": 0.8781591852131271, "grad_norm": 0.17071947606065108, "kl": 0.089874267578125, "learning_rate": 4.998984619623101e-07, "loss": 0.0001, "reward": 1.7339286655187607, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7383928932249546, "rewards/format_reward_func": 0.9955357164144516, "step": 5238 }, { "completion_length": 230.227689743042, "epoch": 0.8784944884529947, "grad_norm": 0.3664008933351735, "kl": 0.0554656982421875, "learning_rate": 4.998981198188603e-07, "loss": 0.0001, "reward": 1.814285784959793, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.8142857439815998, "rewards/format_reward_func": 1.0, "step": 5240 }, { "completion_length": 229.21876049041748, "epoch": 0.8788297916928622, "grad_norm": 0.21221439980005558, "kl": 0.06195068359375, "learning_rate": 4.998977771000525e-07, "loss": 0.0001, "reward": 1.7642857655882835, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.764285733923316, "rewards/format_reward_func": 1.0, "step": 5242 }, { "completion_length": 234.59376049041748, "epoch": 0.8791650949327298, "grad_norm": 0.3746951037284132, "kl": 0.07000732421875, "learning_rate": 4.998974338058872e-07, "loss": 0.0001, "reward": 1.6857143566012383, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.6857143249362707, "rewards/format_reward_func": 1.0, "step": 5244 }, { "completion_length": 242.1696538925171, "epoch": 0.8795003981725974, "grad_norm": 0.18881294484194702, "kl": 0.0675506591796875, "learning_rate": 4.998970899363655e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 5246 }, { "completion_length": 226.1384038925171, "epoch": 0.8798357014124649, "grad_norm": 0.22507211877639, "kl": 0.07757568359375, "learning_rate": 4.998967454914879e-07, "loss": 0.0001, "reward": 1.830357201397419, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.8348214440047741, "rewards/format_reward_func": 0.9955357164144516, "step": 5248 }, { "completion_length": 245.1026906967163, "epoch": 0.8801710046523324, "grad_norm": 0.22393823771667665, "kl": 0.06451416015625, "learning_rate": 4.998964004712555e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 5250 }, { "completion_length": 232.92411708831787, "epoch": 0.8805063078922, "grad_norm": 0.2144873023449417, "kl": 0.0620269775390625, "learning_rate": 4.998960548756689e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7732143141329288, "rewards/format_reward_func": 0.9910714328289032, "step": 5252 }, { "completion_length": 232.87054347991943, "epoch": 0.8808416111320676, "grad_norm": 0.1989102781559067, "kl": 0.05689239501953125, "learning_rate": 4.998957087047288e-07, "loss": 0.0001, "reward": 1.7464286237955093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 5254 }, { "completion_length": 237.9062614440918, "epoch": 0.8811769143719351, "grad_norm": 0.2512641577169618, "kl": 0.058868408203125, "learning_rate": 4.998953619584363e-07, "loss": 0.0001, "reward": 1.7803572118282318, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7848214693367481, "rewards/format_reward_func": 0.9955357164144516, "step": 5256 }, { "completion_length": 244.99108219146729, "epoch": 0.8815122176118027, "grad_norm": 0.25304485784456665, "kl": 0.0676422119140625, "learning_rate": 4.998950146367918e-07, "loss": 0.0001, "reward": 1.7053572162985802, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7098214570432901, "rewards/format_reward_func": 0.9955357164144516, "step": 5258 }, { "completion_length": 236.65179538726807, "epoch": 0.8818475208516702, "grad_norm": 0.2993844993610931, "kl": 0.0611572265625, "learning_rate": 4.998946667397966e-07, "loss": 0.0001, "reward": 1.7357143834233284, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 5260 }, { "completion_length": 235.5044755935669, "epoch": 0.8821828240915378, "grad_norm": 0.30016998238421877, "kl": 0.0649871826171875, "learning_rate": 4.99894318267451e-07, "loss": 0.0001, "reward": 1.785714365541935, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 5262 }, { "completion_length": 239.3973331451416, "epoch": 0.8825181273314053, "grad_norm": 0.3114488879046236, "kl": 0.08050537109375, "learning_rate": 4.99893969219756e-07, "loss": 0.0001, "reward": 1.7892857939004898, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7982143089175224, "rewards/format_reward_func": 0.9910714328289032, "step": 5264 }, { "completion_length": 230.040189743042, "epoch": 0.8828534305712729, "grad_norm": 0.3517487633510106, "kl": 0.0715179443359375, "learning_rate": 4.998936195967126e-07, "loss": 0.0001, "reward": 1.7236607894301414, "reward_std": 0.06755394977517426, "rewards/equation_reward_func": 0.7250000424683094, "rewards/format_reward_func": 0.9986607171595097, "step": 5266 }, { "completion_length": 235.5178680419922, "epoch": 0.8831887338111405, "grad_norm": 0.41806628054869266, "kl": 0.07794189453125, "learning_rate": 4.998932693983213e-07, "loss": 0.0001, "reward": 1.7053572461009026, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7098214700818062, "rewards/format_reward_func": 0.9955357164144516, "step": 5268 }, { "completion_length": 232.10268783569336, "epoch": 0.883524037051008, "grad_norm": 0.2858866580292781, "kl": 0.08392333984375, "learning_rate": 4.998929186245832e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 5270 }, { "completion_length": 233.68751049041748, "epoch": 0.8838593402908755, "grad_norm": 0.25485525873994247, "kl": 0.0716400146484375, "learning_rate": 4.998925672754987e-07, "loss": 0.0001, "reward": 1.7464286610484123, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 5272 }, { "completion_length": 226.62947463989258, "epoch": 0.8841946435307431, "grad_norm": 0.17777709268158565, "kl": 0.0927581787109375, "learning_rate": 4.998922153510691e-07, "loss": 0.0001, "reward": 1.807142935693264, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8071428872644901, "rewards/format_reward_func": 1.0, "step": 5274 }, { "completion_length": 240.352689743042, "epoch": 0.8845299467706107, "grad_norm": 0.18581282838696533, "kl": 0.2295989990234375, "learning_rate": 4.998918628512949e-07, "loss": 0.0002, "reward": 1.7089286372065544, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7133928909897804, "rewards/format_reward_func": 0.9955357164144516, "step": 5276 }, { "completion_length": 233.8303680419922, "epoch": 0.8848652500104782, "grad_norm": 0.3049793148610497, "kl": 0.0879058837890625, "learning_rate": 4.998915097761769e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 5278 }, { "completion_length": 232.40179443359375, "epoch": 0.8852005532503457, "grad_norm": 0.09532660700253795, "kl": 0.06298828125, "learning_rate": 4.998911561257161e-07, "loss": 0.0001, "reward": 1.8071429058909416, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.807142898440361, "rewards/format_reward_func": 1.0, "step": 5280 }, { "completion_length": 226.49554634094238, "epoch": 0.8855358564902134, "grad_norm": 0.2932981971207065, "kl": 0.06439208984375, "learning_rate": 4.998908018999131e-07, "loss": 0.0001, "reward": 1.732142947614193, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7321428786963224, "rewards/format_reward_func": 1.0, "step": 5282 }, { "completion_length": 238.65179634094238, "epoch": 0.8858711597300809, "grad_norm": 0.32438201855121496, "kl": 0.0819091796875, "learning_rate": 4.998904470987689e-07, "loss": 0.0001, "reward": 1.7678571939468384, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.767857177183032, "rewards/format_reward_func": 1.0, "step": 5284 }, { "completion_length": 238.85268878936768, "epoch": 0.8862064629699484, "grad_norm": 0.3012234729171621, "kl": 0.08880615234375, "learning_rate": 4.998900917222842e-07, "loss": 0.0001, "reward": 1.7892857789993286, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 5286 }, { "completion_length": 236.5580472946167, "epoch": 0.886541766209816, "grad_norm": 0.2963282908676942, "kl": 0.07366943359375, "learning_rate": 4.998897357704598e-07, "loss": 0.0001, "reward": 1.8017857819795609, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.8062500208616257, "rewards/format_reward_func": 0.9955357164144516, "step": 5288 }, { "completion_length": 233.37054538726807, "epoch": 0.8868770694496836, "grad_norm": 0.3279870054415193, "kl": 0.06890869140625, "learning_rate": 4.998893792432966e-07, "loss": 0.0001, "reward": 1.7642857506871223, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857413738966, "rewards/format_reward_func": 1.0, "step": 5290 }, { "completion_length": 252.91072463989258, "epoch": 0.8872123726895511, "grad_norm": 0.27756577372857344, "kl": 0.121978759765625, "learning_rate": 4.998890221407956e-07, "loss": 0.0001, "reward": 1.748214341700077, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7526785973459482, "rewards/format_reward_func": 0.9955357164144516, "step": 5292 }, { "completion_length": 236.54018688201904, "epoch": 0.8875476759294186, "grad_norm": 0.2331425793873391, "kl": 0.1655120849609375, "learning_rate": 4.998886644629572e-07, "loss": 0.0002, "reward": 1.7857143580913544, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 5294 }, { "completion_length": 243.5491189956665, "epoch": 0.8878829791692863, "grad_norm": 0.16776429680449167, "kl": 0.079864501953125, "learning_rate": 4.998883062097824e-07, "loss": 0.0001, "reward": 1.7705357745289803, "reward_std": 0.0416687922552228, "rewards/equation_reward_func": 0.7767857387661934, "rewards/format_reward_func": 0.9937500059604645, "step": 5296 }, { "completion_length": 240.41965293884277, "epoch": 0.8882182824091538, "grad_norm": 0.8081502933749403, "kl": 0.257476806640625, "learning_rate": 4.998879473812722e-07, "loss": 0.0003, "reward": 1.8214286267757416, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8214286006987095, "rewards/format_reward_func": 1.0, "step": 5298 }, { "completion_length": 244.4687614440918, "epoch": 0.8885535856490213, "grad_norm": 0.14530388506531866, "kl": 0.0879058837890625, "learning_rate": 4.998875879774273e-07, "loss": 0.0001, "reward": 1.7500000521540642, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 5300 }, { "completion_length": 229.28126335144043, "epoch": 0.8888888888888888, "grad_norm": 0.2260688976115879, "kl": 0.09368896484375, "learning_rate": 4.998872279982485e-07, "loss": 0.0001, "reward": 1.792857214808464, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571775555611, "rewards/format_reward_func": 1.0, "step": 5302 }, { "completion_length": 234.7812614440918, "epoch": 0.8892241921287565, "grad_norm": 0.1600646981940207, "kl": 0.0696563720703125, "learning_rate": 4.998868674437365e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571622818708, "rewards/format_reward_func": 1.0, "step": 5304 }, { "completion_length": 245.08929824829102, "epoch": 0.889559495368624, "grad_norm": 0.20507006335606773, "kl": 0.068084716796875, "learning_rate": 4.998865063138926e-07, "loss": 0.0001, "reward": 1.7482143715023994, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526785954833031, "rewards/format_reward_func": 0.9955357164144516, "step": 5306 }, { "completion_length": 236.44197750091553, "epoch": 0.8898947986084915, "grad_norm": 0.24173203157702566, "kl": 0.0780029296875, "learning_rate": 4.99886144608717e-07, "loss": 0.0001, "reward": 1.7821429371833801, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7901786025613546, "rewards/format_reward_func": 0.9919642880558968, "step": 5308 }, { "completion_length": 243.21876335144043, "epoch": 0.8902301018483592, "grad_norm": 0.2508191564510342, "kl": 0.076171875, "learning_rate": 4.99885782328211e-07, "loss": 0.0001, "reward": 1.7392857745289803, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857596278191, "rewards/format_reward_func": 1.0, "step": 5310 }, { "completion_length": 246.66518688201904, "epoch": 0.8905654050882267, "grad_norm": 0.19502896555021526, "kl": 0.07415771484375, "learning_rate": 4.998854194723752e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143059372902, "rewards/format_reward_func": 1.0, "step": 5312 }, { "completion_length": 239.36608028411865, "epoch": 0.8909007083280942, "grad_norm": 0.2431954955354549, "kl": 0.072662353515625, "learning_rate": 4.998850560412106e-07, "loss": 0.0001, "reward": 1.764285795390606, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 5314 }, { "completion_length": 261.59376335144043, "epoch": 0.8912360115679617, "grad_norm": 0.23926305068072765, "kl": 0.07647705078125, "learning_rate": 4.998846920347178e-07, "loss": 0.0001, "reward": 1.7732143849134445, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7776785865426064, "rewards/format_reward_func": 0.9955357164144516, "step": 5316 }, { "completion_length": 253.9419755935669, "epoch": 0.8915713148078294, "grad_norm": 0.2160253207774375, "kl": 0.0731964111328125, "learning_rate": 4.99884327452898e-07, "loss": 0.0001, "reward": 1.7428571954369545, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571730852127, "rewards/format_reward_func": 1.0, "step": 5318 }, { "completion_length": 250.19197750091553, "epoch": 0.8919066180476969, "grad_norm": 0.20380295428031342, "kl": 0.072662353515625, "learning_rate": 4.998839622957517e-07, "loss": 0.0001, "reward": 1.7285715267062187, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7285714633762836, "rewards/format_reward_func": 1.0, "step": 5320 }, { "completion_length": 243.31697463989258, "epoch": 0.8922419212875644, "grad_norm": 0.22780821035437138, "kl": 0.084014892578125, "learning_rate": 4.998835965632798e-07, "loss": 0.0001, "reward": 1.7125000804662704, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7169643081724644, "rewards/format_reward_func": 0.9955357164144516, "step": 5322 }, { "completion_length": 252.35715293884277, "epoch": 0.892577224527432, "grad_norm": 0.22647910989616787, "kl": 0.07513427734375, "learning_rate": 4.998832302554834e-07, "loss": 0.0001, "reward": 1.739285796880722, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857521772385, "rewards/format_reward_func": 1.0, "step": 5324 }, { "completion_length": 255.96876430511475, "epoch": 0.8929125277672996, "grad_norm": 0.27754938674440754, "kl": 0.08740234375, "learning_rate": 4.99882863372363e-07, "loss": 0.0001, "reward": 1.7375000789761543, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419643141329288, "rewards/format_reward_func": 0.9955357164144516, "step": 5326 }, { "completion_length": 259.9509048461914, "epoch": 0.8932478310071671, "grad_norm": 0.1811040255508447, "kl": 0.084320068359375, "learning_rate": 4.998824959139196e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143152505159, "rewards/format_reward_func": 1.0, "step": 5328 }, { "completion_length": 248.4196548461914, "epoch": 0.8935831342470346, "grad_norm": 0.5155124794678484, "kl": 0.07611083984375, "learning_rate": 4.998821278801542e-07, "loss": 0.0001, "reward": 1.719642959535122, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.724107176065445, "rewards/format_reward_func": 0.9955357164144516, "step": 5330 }, { "completion_length": 250.12947940826416, "epoch": 0.8939184374869023, "grad_norm": 0.24714452364489936, "kl": 0.078399658203125, "learning_rate": 4.998817592710674e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 5332 }, { "completion_length": 246.7901906967163, "epoch": 0.8942537407267698, "grad_norm": 0.26676101095918225, "kl": 0.0773773193359375, "learning_rate": 4.998813900866601e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857544124126, "rewards/format_reward_func": 1.0, "step": 5334 }, { "completion_length": 250.9821538925171, "epoch": 0.8945890439666373, "grad_norm": 0.839157664979466, "kl": 0.082855224609375, "learning_rate": 4.998810203269333e-07, "loss": 0.0001, "reward": 1.7392858117818832, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.748214315623045, "rewards/format_reward_func": 0.9910714328289032, "step": 5336 }, { "completion_length": 252.33483123779297, "epoch": 0.8949243472065049, "grad_norm": 0.12935000666624785, "kl": 0.0791015625, "learning_rate": 4.998806499918876e-07, "loss": 0.0001, "reward": 1.775000050663948, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 5338 }, { "completion_length": 246.33036708831787, "epoch": 0.8952596504463725, "grad_norm": 0.10639964651343883, "kl": 0.09002685546875, "learning_rate": 4.998802790815241e-07, "loss": 0.0001, "reward": 1.8017857670783997, "reward_std": 0.04798224661499262, "rewards/equation_reward_func": 0.8151785917580128, "rewards/format_reward_func": 0.9866071492433548, "step": 5340 }, { "completion_length": 248.7634048461914, "epoch": 0.89559495368624, "grad_norm": 0.33624572209025144, "kl": 0.0796356201171875, "learning_rate": 4.998799075958435e-07, "loss": 0.0001, "reward": 1.7464286610484123, "reward_std": 0.0858629634603858, "rewards/equation_reward_func": 0.7464285995811224, "rewards/format_reward_func": 1.0, "step": 5342 }, { "completion_length": 259.901798248291, "epoch": 0.8959302569261075, "grad_norm": 0.1254125349254734, "kl": 0.082733154296875, "learning_rate": 4.998795355348467e-07, "loss": 0.0001, "reward": 1.7375000566244125, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419643215835094, "rewards/format_reward_func": 0.9955357164144516, "step": 5344 }, { "completion_length": 263.9285840988159, "epoch": 0.8962655601659751, "grad_norm": 0.31235861346966837, "kl": 0.1109619140625, "learning_rate": 4.998791628985346e-07, "loss": 0.0001, "reward": 1.733928643167019, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7383928820490837, "rewards/format_reward_func": 0.9955357164144516, "step": 5346 }, { "completion_length": 242.00447750091553, "epoch": 0.8966008634058427, "grad_norm": 0.1923375555144056, "kl": 0.08599853515625, "learning_rate": 4.99878789686908e-07, "loss": 0.0001, "reward": 1.7517857626080513, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7562500312924385, "rewards/format_reward_func": 0.9955357164144516, "step": 5348 }, { "completion_length": 253.73215293884277, "epoch": 0.8969361666457102, "grad_norm": 0.3144491273579619, "kl": 0.157440185546875, "learning_rate": 4.998784158999677e-07, "loss": 0.0002, "reward": 1.7482143566012383, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7526785880327225, "rewards/format_reward_func": 0.9955357164144516, "step": 5350 }, { "completion_length": 258.86608505249023, "epoch": 0.8972714698855778, "grad_norm": 0.3800286104807005, "kl": 0.0876312255859375, "learning_rate": 4.998780415377148e-07, "loss": 0.0001, "reward": 1.7946428880095482, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7991071678698063, "rewards/format_reward_func": 0.9955357164144516, "step": 5352 }, { "completion_length": 255.0178680419922, "epoch": 0.8976067731254453, "grad_norm": 0.5843896226137154, "kl": 0.1268310546875, "learning_rate": 4.998776666001499e-07, "loss": 0.0001, "reward": 1.6750000715255737, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.6839286163449287, "rewards/format_reward_func": 0.9910714328289032, "step": 5354 }, { "completion_length": 252.58929634094238, "epoch": 0.8979420763653129, "grad_norm": 0.3539484283441876, "kl": 0.2587127685546875, "learning_rate": 4.998772910872739e-07, "loss": 0.0003, "reward": 1.7035714909434319, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7125000301748514, "rewards/format_reward_func": 0.9910714328289032, "step": 5356 }, { "completion_length": 254.33929538726807, "epoch": 0.8982773796051804, "grad_norm": 0.46943661426880745, "kl": 0.193206787109375, "learning_rate": 4.998769149990878e-07, "loss": 0.0002, "reward": 1.7928571999073029, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7928571775555611, "rewards/format_reward_func": 1.0, "step": 5358 }, { "completion_length": 244.76786994934082, "epoch": 0.898612682845048, "grad_norm": 0.12745735511328252, "kl": 0.22625732421875, "learning_rate": 4.998765383355924e-07, "loss": 0.0002, "reward": 1.753571517765522, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 5360 }, { "completion_length": 254.78572750091553, "epoch": 0.8989479860849156, "grad_norm": 0.20917256499861125, "kl": 0.3976593017578125, "learning_rate": 4.998761610967885e-07, "loss": 0.0004, "reward": 1.7982143610715866, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.8026786036789417, "rewards/format_reward_func": 0.9955357164144516, "step": 5362 }, { "completion_length": 242.13393688201904, "epoch": 0.8992832893247831, "grad_norm": 0.4313738380742602, "kl": 0.1184539794921875, "learning_rate": 4.998757832826772e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 5364 }, { "completion_length": 255.21875953674316, "epoch": 0.8996185925646507, "grad_norm": 0.5463979836566241, "kl": 0.6548309326171875, "learning_rate": 4.99875404893259e-07, "loss": 0.0007, "reward": 1.7017857879400253, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.706250037997961, "rewards/format_reward_func": 0.9955357164144516, "step": 5366 }, { "completion_length": 249.6517972946167, "epoch": 0.8999538958045182, "grad_norm": 0.2889413935758667, "kl": 0.3985443115234375, "learning_rate": 4.998750259285351e-07, "loss": 0.0004, "reward": 1.7428572177886963, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7428571656346321, "rewards/format_reward_func": 1.0, "step": 5368 }, { "completion_length": 246.79912090301514, "epoch": 0.9002891990443858, "grad_norm": 0.31076407243427856, "kl": 0.1228179931640625, "learning_rate": 4.998746463885062e-07, "loss": 0.0001, "reward": 1.7982143387198448, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.8026785962283611, "rewards/format_reward_func": 0.9955357164144516, "step": 5370 }, { "completion_length": 247.64733123779297, "epoch": 0.9006245022842533, "grad_norm": 0.4925333749051105, "kl": 1.430389404296875, "learning_rate": 4.998742662731732e-07, "loss": 0.0014, "reward": 1.79464291036129, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7991071566939354, "rewards/format_reward_func": 0.9955357164144516, "step": 5372 }, { "completion_length": 244.7232265472412, "epoch": 0.9009598055241209, "grad_norm": 0.20763244478129034, "kl": 1.0489349365234375, "learning_rate": 4.998738855825371e-07, "loss": 0.001, "reward": 1.76071435213089, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143260538578, "rewards/format_reward_func": 1.0, "step": 5374 }, { "completion_length": 245.29911518096924, "epoch": 0.9012951087639884, "grad_norm": 0.7021375598228516, "kl": 0.698516845703125, "learning_rate": 4.998735043165986e-07, "loss": 0.0007, "reward": 1.7821429073810577, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428962051868, "rewards/format_reward_func": 1.0, "step": 5376 }, { "completion_length": 257.50894260406494, "epoch": 0.901630412003856, "grad_norm": 0.7357447369391495, "kl": 0.44970703125, "learning_rate": 4.998731224753586e-07, "loss": 0.0004, "reward": 1.7321429327130318, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7410714663565159, "rewards/format_reward_func": 0.9910714328289032, "step": 5378 }, { "completion_length": 247.5491189956665, "epoch": 0.9019657152437235, "grad_norm": 0.47121200401910107, "kl": 1.12872314453125, "learning_rate": 4.99872740058818e-07, "loss": 0.0011, "reward": 1.7125000804662704, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7169643118977547, "rewards/format_reward_func": 0.9955357164144516, "step": 5380 }, { "completion_length": 247.50893783569336, "epoch": 0.9023010184835911, "grad_norm": 0.3110160210673541, "kl": 0.1286468505859375, "learning_rate": 4.998723570669778e-07, "loss": 0.0001, "reward": 1.7071429416537285, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7071428969502449, "rewards/format_reward_func": 1.0, "step": 5382 }, { "completion_length": 255.0625123977661, "epoch": 0.9026363217234586, "grad_norm": 0.33694169060014484, "kl": 0.8502655029296875, "learning_rate": 4.998719734998387e-07, "loss": 0.0008, "reward": 1.6785715445876122, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.6875000335276127, "rewards/format_reward_func": 0.9910714328289032, "step": 5384 }, { "completion_length": 247.7053680419922, "epoch": 0.9029716249633262, "grad_norm": 0.17383252788350392, "kl": 0.1181640625, "learning_rate": 4.998715893574018e-07, "loss": 0.0001, "reward": 1.721428669989109, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7214285954833031, "rewards/format_reward_func": 1.0, "step": 5386 }, { "completion_length": 249.31697463989258, "epoch": 0.9033069282031938, "grad_norm": 0.2062389071756386, "kl": 0.0739898681640625, "learning_rate": 4.998712046396677e-07, "loss": 0.0001, "reward": 1.7607143223285675, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 5388 }, { "completion_length": 253.4375114440918, "epoch": 0.9036422314430613, "grad_norm": 0.1814458101267558, "kl": 0.0583648681640625, "learning_rate": 4.998708193466375e-07, "loss": 0.0001, "reward": 1.7267857864499092, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7312500439584255, "rewards/format_reward_func": 0.9955357164144516, "step": 5390 }, { "completion_length": 239.55804824829102, "epoch": 0.9039775346829289, "grad_norm": 0.29584868151822824, "kl": 0.0521392822265625, "learning_rate": 4.998704334783121e-07, "loss": 0.0001, "reward": 1.7892857789993286, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857361584902, "rewards/format_reward_func": 1.0, "step": 5392 }, { "completion_length": 250.7455472946167, "epoch": 0.9043128379227964, "grad_norm": 0.23987515671916398, "kl": 0.0930328369140625, "learning_rate": 4.998700470346923e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000264495611, "rewards/format_reward_func": 1.0, "step": 5394 }, { "completion_length": 236.6741189956665, "epoch": 0.904648141162664, "grad_norm": 0.27100398738808573, "kl": 0.0775299072265625, "learning_rate": 4.998696600157789e-07, "loss": 0.0001, "reward": 1.7142858132719994, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7142857387661934, "rewards/format_reward_func": 1.0, "step": 5396 }, { "completion_length": 249.26340007781982, "epoch": 0.9049834444025315, "grad_norm": 0.19531226401891985, "kl": 0.1512298583984375, "learning_rate": 4.998692724215731e-07, "loss": 0.0002, "reward": 1.7571429535746574, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428753435612, "rewards/format_reward_func": 1.0, "step": 5398 }, { "completion_length": 251.31250953674316, "epoch": 0.9053187476423991, "grad_norm": 0.18254214716236256, "kl": 0.05153656005859375, "learning_rate": 4.998688842520754e-07, "loss": 0.0001, "reward": 1.746428668498993, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 5400 }, { "completion_length": 244.5714406967163, "epoch": 0.9056540508822667, "grad_norm": 0.19710436019518554, "kl": 0.0549468994140625, "learning_rate": 4.998684955072869e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 5402 }, { "completion_length": 248.0982255935669, "epoch": 0.9059893541221342, "grad_norm": 0.2126268791462505, "kl": 0.0541839599609375, "learning_rate": 4.998681061872086e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 5404 }, { "completion_length": 245.34822463989258, "epoch": 0.9063246573620017, "grad_norm": 0.28491657265214865, "kl": 0.1408843994140625, "learning_rate": 4.998677162918411e-07, "loss": 0.0001, "reward": 1.751785784959793, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500461935997, "rewards/format_reward_func": 0.9955357164144516, "step": 5406 }, { "completion_length": 250.87947750091553, "epoch": 0.9066599606018693, "grad_norm": 0.28512756855223254, "kl": 0.187103271484375, "learning_rate": 4.998673258211857e-07, "loss": 0.0002, "reward": 1.7464286386966705, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.755357176065445, "rewards/format_reward_func": 0.9910714328289032, "step": 5408 }, { "completion_length": 250.35269165039062, "epoch": 0.9069952638417369, "grad_norm": 0.1916385409286891, "kl": 0.05348968505859375, "learning_rate": 4.998669347752429e-07, "loss": 0.0001, "reward": 1.8142857551574707, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8142857402563095, "rewards/format_reward_func": 1.0, "step": 5410 }, { "completion_length": 258.12500953674316, "epoch": 0.9073305670816044, "grad_norm": 3.139980507250418, "kl": 0.21661376953125, "learning_rate": 4.998665431540138e-07, "loss": 0.0002, "reward": 1.7142857909202576, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7232143245637417, "rewards/format_reward_func": 0.9910714328289032, "step": 5412 }, { "completion_length": 258.0803689956665, "epoch": 0.907665870321472, "grad_norm": 0.358562731594068, "kl": 0.0805206298828125, "learning_rate": 4.998661509574993e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143223285675, "rewards/format_reward_func": 1.0, "step": 5414 }, { "completion_length": 254.50893783569336, "epoch": 0.9080011735613396, "grad_norm": 0.27085475348892585, "kl": 0.0615692138671875, "learning_rate": 4.998657581857002e-07, "loss": 0.0001, "reward": 1.8107143267989159, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.810714315623045, "rewards/format_reward_func": 1.0, "step": 5416 }, { "completion_length": 239.04911994934082, "epoch": 0.9083364768012071, "grad_norm": 0.278262296727244, "kl": 0.0838623046875, "learning_rate": 4.998653648386175e-07, "loss": 0.0001, "reward": 1.703571505844593, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7035714704543352, "rewards/format_reward_func": 1.0, "step": 5418 }, { "completion_length": 240.16965293884277, "epoch": 0.9086717800410746, "grad_norm": 0.2400452957091018, "kl": 0.051971435546875, "learning_rate": 4.998649709162522e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7660714536905289, "rewards/format_reward_func": 0.9910714328289032, "step": 5420 }, { "completion_length": 236.52233123779297, "epoch": 0.9090070832809422, "grad_norm": 0.00552479084519166, "kl": 0.05666351318359375, "learning_rate": 4.998645764186051e-07, "loss": 0.0001, "reward": 1.7357143461704254, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7357143331319094, "rewards/format_reward_func": 1.0, "step": 5422 }, { "completion_length": 242.33036994934082, "epoch": 0.9093423865208098, "grad_norm": 0.29723655679749067, "kl": 0.0811309814453125, "learning_rate": 4.99864181345677e-07, "loss": 0.0001, "reward": 1.7285714745521545, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7285714633762836, "rewards/format_reward_func": 1.0, "step": 5424 }, { "completion_length": 246.1607255935669, "epoch": 0.9096776897606773, "grad_norm": 0.13343396730306012, "kl": 0.0643768310546875, "learning_rate": 4.99863785697469e-07, "loss": 0.0001, "reward": 1.7785715237259865, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714454948902, "rewards/format_reward_func": 1.0, "step": 5426 }, { "completion_length": 238.2500114440918, "epoch": 0.9100129930005448, "grad_norm": 0.10802697735688958, "kl": 0.0711669921875, "learning_rate": 4.998633894739818e-07, "loss": 0.0001, "reward": 1.7767857536673546, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7812500447034836, "rewards/format_reward_func": 0.9955357164144516, "step": 5428 }, { "completion_length": 255.61161518096924, "epoch": 0.9103482962404125, "grad_norm": 0.15751124690918902, "kl": 0.06842041015625, "learning_rate": 4.998629926752165e-07, "loss": 0.0001, "reward": 1.725000075995922, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7250000294297934, "rewards/format_reward_func": 1.0, "step": 5430 }, { "completion_length": 253.02233409881592, "epoch": 0.91068359948028, "grad_norm": 0.07871958210206086, "kl": 0.106658935546875, "learning_rate": 4.998625953011739e-07, "loss": 0.0001, "reward": 1.7428571954369545, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7517857551574707, "rewards/format_reward_func": 0.9910714328289032, "step": 5432 }, { "completion_length": 239.16965579986572, "epoch": 0.9110189027201475, "grad_norm": 0.17591567198150765, "kl": 0.04869842529296875, "learning_rate": 4.99862197351855e-07, "loss": 0.0, "reward": 1.7821429371833801, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 5434 }, { "completion_length": 241.68304634094238, "epoch": 0.911354205960015, "grad_norm": 0.24591806078737033, "kl": 0.0575103759765625, "learning_rate": 4.998617988272608e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000394880772, "rewards/format_reward_func": 1.0, "step": 5436 }, { "completion_length": 243.97768878936768, "epoch": 0.9116895091998827, "grad_norm": 0.2791332987456658, "kl": 0.064788818359375, "learning_rate": 4.998613997273919e-07, "loss": 0.0001, "reward": 1.742857202887535, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571842610836, "rewards/format_reward_func": 1.0, "step": 5438 }, { "completion_length": 248.4241189956665, "epoch": 0.9120248124397502, "grad_norm": 0.157219240713236, "kl": 0.0563507080078125, "learning_rate": 4.998610000522495e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 5440 }, { "completion_length": 249.42858695983887, "epoch": 0.9123601156796177, "grad_norm": 0.22393359111992003, "kl": 0.17645263671875, "learning_rate": 4.998605998018344e-07, "loss": 0.0002, "reward": 1.7160715237259865, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7205357439815998, "rewards/format_reward_func": 0.9955357164144516, "step": 5442 }, { "completion_length": 246.8526906967163, "epoch": 0.9126954189194854, "grad_norm": 0.22293059464193454, "kl": 0.0731201171875, "learning_rate": 4.998601989761477e-07, "loss": 0.0001, "reward": 1.7410715073347092, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7455357387661934, "rewards/format_reward_func": 0.9955357164144516, "step": 5444 }, { "completion_length": 232.83036708831787, "epoch": 0.9130307221593529, "grad_norm": 0.004513281336610052, "kl": 0.05118560791015625, "learning_rate": 4.9985979757519e-07, "loss": 0.0001, "reward": 1.8250000327825546, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8250000365078449, "rewards/format_reward_func": 1.0, "step": 5446 }, { "completion_length": 246.4642972946167, "epoch": 0.9133660253992204, "grad_norm": 0.28111220060756315, "kl": 0.11065673828125, "learning_rate": 4.998593955989625e-07, "loss": 0.0001, "reward": 1.7803571671247482, "reward_std": 0.06818529684096575, "rewards/equation_reward_func": 0.7937500290572643, "rewards/format_reward_func": 0.9866071492433548, "step": 5448 }, { "completion_length": 248.61161708831787, "epoch": 0.9137013286390879, "grad_norm": 0.26719790440911256, "kl": 0.0706329345703125, "learning_rate": 4.99858993047466e-07, "loss": 0.0001, "reward": 1.8214286118745804, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8214286006987095, "rewards/format_reward_func": 1.0, "step": 5450 }, { "completion_length": 246.4598331451416, "epoch": 0.9140366318789556, "grad_norm": 0.27475256082811733, "kl": 0.2954254150390625, "learning_rate": 4.998585899207015e-07, "loss": 0.0003, "reward": 1.7500000968575478, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 5452 }, { "completion_length": 258.51787185668945, "epoch": 0.9143719351188231, "grad_norm": 0.2363268462501324, "kl": 0.075775146484375, "learning_rate": 4.998581862186698e-07, "loss": 0.0001, "reward": 1.753571480512619, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.753571467474103, "rewards/format_reward_func": 1.0, "step": 5454 }, { "completion_length": 248.1875123977661, "epoch": 0.9147072383586906, "grad_norm": 0.35017889785356593, "kl": 0.1490478515625, "learning_rate": 4.99857781941372e-07, "loss": 0.0001, "reward": 1.7357143685221672, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7357143145054579, "rewards/format_reward_func": 1.0, "step": 5456 }, { "completion_length": 247.55804538726807, "epoch": 0.9150425415985582, "grad_norm": 0.2851086680739538, "kl": 0.08409881591796875, "learning_rate": 4.998573770888089e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 5458 }, { "completion_length": 252.24108123779297, "epoch": 0.9153778448384258, "grad_norm": 0.12642756304879468, "kl": 0.055938720703125, "learning_rate": 4.998569716609815e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571753203869, "rewards/format_reward_func": 1.0, "step": 5460 }, { "completion_length": 248.8571548461914, "epoch": 0.9157131480782933, "grad_norm": 0.11845124265324933, "kl": 0.1025543212890625, "learning_rate": 4.998565656578907e-07, "loss": 0.0001, "reward": 1.7517857626080513, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7562500350177288, "rewards/format_reward_func": 0.9955357164144516, "step": 5462 }, { "completion_length": 245.77679824829102, "epoch": 0.9160484513181608, "grad_norm": 0.2892291151263095, "kl": 0.2335357666015625, "learning_rate": 4.998561590795375e-07, "loss": 0.0002, "reward": 1.8107143640518188, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8107143118977547, "rewards/format_reward_func": 1.0, "step": 5464 }, { "completion_length": 247.86608028411865, "epoch": 0.9163837545580285, "grad_norm": 0.14281412571223723, "kl": 0.1459503173828125, "learning_rate": 4.998557519259227e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 5466 }, { "completion_length": 254.5312623977661, "epoch": 0.916719057797896, "grad_norm": 0.2005936378687353, "kl": 0.145904541015625, "learning_rate": 4.998553441970474e-07, "loss": 0.0001, "reward": 1.7589286342263222, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7633928917348385, "rewards/format_reward_func": 0.9955357164144516, "step": 5468 }, { "completion_length": 239.5625114440918, "epoch": 0.9170543610377635, "grad_norm": 0.0922149462674823, "kl": 0.0469512939453125, "learning_rate": 4.998549358929124e-07, "loss": 0.0, "reward": 1.7178572341799736, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7178571783006191, "rewards/format_reward_func": 1.0, "step": 5470 }, { "completion_length": 242.3080472946167, "epoch": 0.9173896642776311, "grad_norm": 0.28370406486244265, "kl": 0.07550048828125, "learning_rate": 4.998545270135187e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 5472 }, { "completion_length": 255.2678680419922, "epoch": 0.9177249675174987, "grad_norm": 0.3061601835724448, "kl": 0.0880889892578125, "learning_rate": 4.998541175588672e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.09091372601687908, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 5474 }, { "completion_length": 258.0134038925171, "epoch": 0.9180602707573662, "grad_norm": 0.2438152833657782, "kl": 0.05542755126953125, "learning_rate": 4.998537075289589e-07, "loss": 0.0001, "reward": 1.8196429088711739, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.8241071589291096, "rewards/format_reward_func": 0.9955357164144516, "step": 5476 }, { "completion_length": 251.0580472946167, "epoch": 0.9183955739972337, "grad_norm": 0.3451355955311913, "kl": 0.1912841796875, "learning_rate": 4.998532969237948e-07, "loss": 0.0002, "reward": 1.7803572118282318, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7848214544355869, "rewards/format_reward_func": 0.9955357164144516, "step": 5478 }, { "completion_length": 246.39287090301514, "epoch": 0.9187308772371013, "grad_norm": 0.12898136469831076, "kl": 0.3510284423828125, "learning_rate": 4.998528857433758e-07, "loss": 0.0004, "reward": 1.7732143476605415, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7776786033064127, "rewards/format_reward_func": 0.9955357164144516, "step": 5480 }, { "completion_length": 252.08036994934082, "epoch": 0.9190661804769689, "grad_norm": 0.198850014831597, "kl": 0.5845413208007812, "learning_rate": 4.998524739877027e-07, "loss": 0.0006, "reward": 1.7000000551342964, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7089286018162966, "rewards/format_reward_func": 0.9910714328289032, "step": 5482 }, { "completion_length": 246.92411994934082, "epoch": 0.9194014837168364, "grad_norm": 0.20629485243538223, "kl": 0.1067047119140625, "learning_rate": 4.998520616567767e-07, "loss": 0.0001, "reward": 1.8142857775092125, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857365310192, "rewards/format_reward_func": 1.0, "step": 5484 }, { "completion_length": 257.71875858306885, "epoch": 0.919736786956704, "grad_norm": 0.27743791130919937, "kl": 0.06072998046875, "learning_rate": 4.998516487505985e-07, "loss": 0.0001, "reward": 1.7053572162985802, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.709821455180645, "rewards/format_reward_func": 0.9955357164144516, "step": 5486 }, { "completion_length": 251.83929634094238, "epoch": 0.9200720901965715, "grad_norm": 0.26182652616513147, "kl": 0.3108367919921875, "learning_rate": 4.998512352691692e-07, "loss": 0.0003, "reward": 1.762500062584877, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7669643238186836, "rewards/format_reward_func": 0.9955357164144516, "step": 5488 }, { "completion_length": 256.5044755935669, "epoch": 0.9204073934364391, "grad_norm": 0.228511123466136, "kl": 0.05645751953125, "learning_rate": 4.998508212124896e-07, "loss": 0.0001, "reward": 1.7660715132951736, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7705357223749161, "rewards/format_reward_func": 0.9955357164144516, "step": 5490 }, { "completion_length": 249.4866189956665, "epoch": 0.9207426966763066, "grad_norm": 0.328227301695069, "kl": 0.3878936767578125, "learning_rate": 4.99850406580561e-07, "loss": 0.0004, "reward": 1.7857143580913544, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7857143059372902, "rewards/format_reward_func": 1.0, "step": 5492 }, { "completion_length": 251.33483600616455, "epoch": 0.9210779999161742, "grad_norm": 0.22654887015954236, "kl": 0.05487060546875, "learning_rate": 4.99849991373384e-07, "loss": 0.0001, "reward": 1.8392857685685158, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8392857350409031, "rewards/format_reward_func": 1.0, "step": 5494 }, { "completion_length": 259.1205463409424, "epoch": 0.9214133031560418, "grad_norm": 0.22116648315395465, "kl": 0.05206298828125, "learning_rate": 4.998495755909597e-07, "loss": 0.0001, "reward": 1.7732143253087997, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.777678593993187, "rewards/format_reward_func": 0.9955357164144516, "step": 5496 }, { "completion_length": 249.49108123779297, "epoch": 0.9217486063959093, "grad_norm": 0.2728720425990515, "kl": 0.3609466552734375, "learning_rate": 4.998491592332891e-07, "loss": 0.0004, "reward": 1.7607143372297287, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 5498 }, { "completion_length": 253.9375114440918, "epoch": 0.9220839096357769, "grad_norm": 0.22240895206575162, "kl": 0.1055145263671875, "learning_rate": 4.99848742300373e-07, "loss": 0.0001, "reward": 1.7803572043776512, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7848214507102966, "rewards/format_reward_func": 0.9955357164144516, "step": 5500 }, { "completion_length": 255.65626049041748, "epoch": 0.9224192128756444, "grad_norm": 0.24399409381101786, "kl": 0.30101776123046875, "learning_rate": 4.998483247922125e-07, "loss": 0.0003, "reward": 1.7625000700354576, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643051922321, "rewards/format_reward_func": 0.9955357164144516, "step": 5502 }, { "completion_length": 251.9598331451416, "epoch": 0.922754516115512, "grad_norm": 0.22450025929828568, "kl": 0.062225341796875, "learning_rate": 4.998479067088085e-07, "loss": 0.0001, "reward": 1.8321429193019867, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8321428820490837, "rewards/format_reward_func": 1.0, "step": 5504 }, { "completion_length": 248.4509038925171, "epoch": 0.9230898193553795, "grad_norm": 0.3051567211790164, "kl": 0.3009185791015625, "learning_rate": 4.99847488050162e-07, "loss": 0.0003, "reward": 1.771428644657135, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 5506 }, { "completion_length": 258.60269260406494, "epoch": 0.9234251225952471, "grad_norm": 0.14467945045617023, "kl": 0.0584869384765625, "learning_rate": 4.998470688162739e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000447034836, "rewards/format_reward_func": 1.0, "step": 5508 }, { "completion_length": 244.5848331451416, "epoch": 0.9237604258351146, "grad_norm": 0.2562804961215929, "kl": 0.14105224609375, "learning_rate": 4.998466490071452e-07, "loss": 0.0001, "reward": 1.7875000312924385, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7919643111526966, "rewards/format_reward_func": 0.9955357164144516, "step": 5510 }, { "completion_length": 263.23662281036377, "epoch": 0.9240957290749822, "grad_norm": 0.14269246424130785, "kl": 0.2320556640625, "learning_rate": 4.99846228622777e-07, "loss": 0.0002, "reward": 1.7821429148316383, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 5512 }, { "completion_length": 253.27233219146729, "epoch": 0.9244310323148497, "grad_norm": 0.19061742235708712, "kl": 0.2888031005859375, "learning_rate": 4.9984580766317e-07, "loss": 0.0003, "reward": 1.764285795390606, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857581377029, "rewards/format_reward_func": 1.0, "step": 5514 }, { "completion_length": 258.8705463409424, "epoch": 0.9247663355547173, "grad_norm": 0.19888817017665691, "kl": 0.2613983154296875, "learning_rate": 4.998453861283254e-07, "loss": 0.0003, "reward": 1.723214365541935, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.736607164144516, "rewards/format_reward_func": 0.9866071492433548, "step": 5516 }, { "completion_length": 261.30804538726807, "epoch": 0.9251016387945848, "grad_norm": 0.31133274406414835, "kl": 0.060394287109375, "learning_rate": 4.998449640182442e-07, "loss": 0.0001, "reward": 1.7696429416537285, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7741071730852127, "rewards/format_reward_func": 0.9955357164144516, "step": 5518 }, { "completion_length": 263.91519355773926, "epoch": 0.9254369420344524, "grad_norm": 0.21421569618201042, "kl": 0.256683349609375, "learning_rate": 4.998445413329271e-07, "loss": 0.0003, "reward": 1.6821429505944252, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.6821428909897804, "rewards/format_reward_func": 1.0, "step": 5520 }, { "completion_length": 255.39733219146729, "epoch": 0.92577224527432, "grad_norm": 0.2180324591587215, "kl": 0.2082366943359375, "learning_rate": 4.998441180723753e-07, "loss": 0.0002, "reward": 1.782142922282219, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7821428664028645, "rewards/format_reward_func": 1.0, "step": 5522 }, { "completion_length": 248.8437623977661, "epoch": 0.9261075485141875, "grad_norm": 0.12138142361987297, "kl": 0.113922119140625, "learning_rate": 4.998436942365896e-07, "loss": 0.0001, "reward": 1.8142857626080513, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8142857328057289, "rewards/format_reward_func": 1.0, "step": 5524 }, { "completion_length": 258.85715675354004, "epoch": 0.926442851754055, "grad_norm": 0.28283183000256845, "kl": 0.074371337890625, "learning_rate": 4.998432698255712e-07, "loss": 0.0001, "reward": 1.7446429133415222, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7491071783006191, "rewards/format_reward_func": 0.9955357164144516, "step": 5526 }, { "completion_length": 255.95090293884277, "epoch": 0.9267781549939226, "grad_norm": 0.24545344039802608, "kl": 0.1540679931640625, "learning_rate": 4.998428448393209e-07, "loss": 0.0002, "reward": 1.7553572207689285, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7598214522004128, "rewards/format_reward_func": 0.9955357164144516, "step": 5528 }, { "completion_length": 255.63840293884277, "epoch": 0.9271134582337902, "grad_norm": 0.2534769583940854, "kl": 0.1455841064453125, "learning_rate": 4.998424192778396e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 5530 }, { "completion_length": 250.9553680419922, "epoch": 0.9274487614736577, "grad_norm": 0.24871677747576773, "kl": 0.070220947265625, "learning_rate": 4.998419931411286e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143074274063, "rewards/format_reward_func": 1.0, "step": 5532 }, { "completion_length": 259.0491180419922, "epoch": 0.9277840647135253, "grad_norm": 0.1870510929710293, "kl": 0.07379150390625, "learning_rate": 4.998415664291887e-07, "loss": 0.0001, "reward": 1.8000000417232513, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 5534 }, { "completion_length": 258.1071529388428, "epoch": 0.9281193679533929, "grad_norm": 0.25610770723960113, "kl": 0.12774658203125, "learning_rate": 4.998411391420209e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 5536 }, { "completion_length": 245.25893688201904, "epoch": 0.9284546711932604, "grad_norm": 0.30114561591144784, "kl": 0.083526611328125, "learning_rate": 4.998407112796261e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 5538 }, { "completion_length": 241.30804920196533, "epoch": 0.9287899744331279, "grad_norm": 0.2860699239934635, "kl": 0.094268798828125, "learning_rate": 4.998402828420052e-07, "loss": 0.0001, "reward": 1.7553572058677673, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7598214671015739, "rewards/format_reward_func": 0.9955357164144516, "step": 5540 }, { "completion_length": 253.16072750091553, "epoch": 0.9291252776729955, "grad_norm": 0.2555147566584149, "kl": 0.0799713134765625, "learning_rate": 4.998398538291596e-07, "loss": 0.0001, "reward": 1.7071429416537285, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7071428876370192, "rewards/format_reward_func": 1.0, "step": 5542 }, { "completion_length": 244.32590293884277, "epoch": 0.9294605809128631, "grad_norm": 0.1987351795108615, "kl": 0.0678253173828125, "learning_rate": 4.998394242410899e-07, "loss": 0.0001, "reward": 1.8142857626080513, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8142857477068901, "rewards/format_reward_func": 1.0, "step": 5544 }, { "completion_length": 255.3080472946167, "epoch": 0.9297958841527306, "grad_norm": 0.31618605204352274, "kl": 0.121063232421875, "learning_rate": 4.998389940777972e-07, "loss": 0.0001, "reward": 1.7321429327130318, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7410714626312256, "rewards/format_reward_func": 0.9910714328289032, "step": 5546 }, { "completion_length": 268.62501335144043, "epoch": 0.9301311873925981, "grad_norm": 0.3458136837670348, "kl": 0.0794830322265625, "learning_rate": 4.998385633392825e-07, "loss": 0.0001, "reward": 1.7285714894533157, "reward_std": 0.09091372694820166, "rewards/equation_reward_func": 0.7375000230967999, "rewards/format_reward_func": 0.9910714328289032, "step": 5548 }, { "completion_length": 265.83930015563965, "epoch": 0.9304664906324658, "grad_norm": 0.2956670786981416, "kl": 0.2076568603515625, "learning_rate": 4.998381320255468e-07, "loss": 0.0002, "reward": 1.7000000551342964, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7089286111295223, "rewards/format_reward_func": 0.9910714328289032, "step": 5550 }, { "completion_length": 263.7857265472412, "epoch": 0.9308017938723333, "grad_norm": 0.22854230090747885, "kl": 0.12713623046875, "learning_rate": 4.998377001365911e-07, "loss": 0.0001, "reward": 1.6910715252161026, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.6955357491970062, "rewards/format_reward_func": 0.9955357164144516, "step": 5552 }, { "completion_length": 265.04465675354004, "epoch": 0.9311370971122008, "grad_norm": 0.23869813596626355, "kl": 0.3279571533203125, "learning_rate": 4.998372676724164e-07, "loss": 0.0003, "reward": 1.7178572043776512, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7267857491970062, "rewards/format_reward_func": 0.9910714328289032, "step": 5554 }, { "completion_length": 264.2053699493408, "epoch": 0.9314724003520684, "grad_norm": 0.21032106039875323, "kl": 0.152618408203125, "learning_rate": 4.998368346330237e-07, "loss": 0.0002, "reward": 1.7339286729693413, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7473214641213417, "rewards/format_reward_func": 0.9866071492433548, "step": 5556 }, { "completion_length": 266.86608600616455, "epoch": 0.931807703591936, "grad_norm": 0.2956750550588605, "kl": 0.080657958984375, "learning_rate": 4.998364010184139e-07, "loss": 0.0001, "reward": 1.7375000789761543, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7419643122702837, "rewards/format_reward_func": 0.9955357164144516, "step": 5558 }, { "completion_length": 258.6651887893677, "epoch": 0.9321430068318035, "grad_norm": 0.24094408593698557, "kl": 0.07884979248046875, "learning_rate": 4.99835966828588e-07, "loss": 0.0001, "reward": 1.7214286401867867, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7214285992085934, "rewards/format_reward_func": 1.0, "step": 5560 }, { "completion_length": 256.8660821914673, "epoch": 0.932478310071671, "grad_norm": 0.19758210940866042, "kl": 0.1866455078125, "learning_rate": 4.998355320635473e-07, "loss": 0.0002, "reward": 1.7857143431901932, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143245637417, "rewards/format_reward_func": 1.0, "step": 5562 }, { "completion_length": 251.83929920196533, "epoch": 0.9328136133115387, "grad_norm": 0.20512783737454493, "kl": 0.07763671875, "learning_rate": 4.998350967232925e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714730620384, "rewards/format_reward_func": 1.0, "step": 5564 }, { "completion_length": 269.28126430511475, "epoch": 0.9331489165514062, "grad_norm": 0.1919354926759305, "kl": 0.0897216796875, "learning_rate": 4.998346608078245e-07, "loss": 0.0001, "reward": 1.7000000774860382, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7089286148548126, "rewards/format_reward_func": 0.9910714328289032, "step": 5566 }, { "completion_length": 250.1607265472412, "epoch": 0.9334842197912737, "grad_norm": 0.10457544698349448, "kl": 0.0626678466796875, "learning_rate": 4.998342243171447e-07, "loss": 0.0001, "reward": 1.7464286237955093, "reward_std": 0.017677669413387775, "rewards/equation_reward_func": 0.754464328289032, "rewards/format_reward_func": 0.9919642917811871, "step": 5568 }, { "completion_length": 264.3973331451416, "epoch": 0.9338195230311412, "grad_norm": 0.22229885242314276, "kl": 0.0751495361328125, "learning_rate": 4.998337872512538e-07, "loss": 0.0001, "reward": 1.839285783469677, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.8482143133878708, "rewards/format_reward_func": 0.9910714328289032, "step": 5570 }, { "completion_length": 270.1384057998657, "epoch": 0.9341548262710089, "grad_norm": 0.18494009709638623, "kl": 0.115081787109375, "learning_rate": 4.998333496101529e-07, "loss": 0.0001, "reward": 1.6750000789761543, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.6750000324100256, "rewards/format_reward_func": 1.0, "step": 5572 }, { "completion_length": 259.0535840988159, "epoch": 0.9344901295108764, "grad_norm": 0.09959741588528473, "kl": 0.0759735107421875, "learning_rate": 4.998329113938429e-07, "loss": 0.0001, "reward": 1.700000062584877, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7000000271946192, "rewards/format_reward_func": 1.0, "step": 5574 }, { "completion_length": 252.05358505249023, "epoch": 0.9348254327507439, "grad_norm": 0.2517501606709083, "kl": 0.0615692138671875, "learning_rate": 4.998324726023249e-07, "loss": 0.0001, "reward": 1.7892857864499092, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 1.0, "step": 5576 }, { "completion_length": 252.37500953674316, "epoch": 0.9351607359906116, "grad_norm": 0.22949889960028877, "kl": 0.080596923828125, "learning_rate": 4.998320332356001e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428753435612, "rewards/format_reward_func": 1.0, "step": 5578 }, { "completion_length": 259.05805110931396, "epoch": 0.9354960392304791, "grad_norm": 0.3646697098932177, "kl": 0.4967041015625, "learning_rate": 4.998315932936693e-07, "loss": 0.0005, "reward": 1.735714353621006, "reward_std": 0.11111677903681993, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 0.9821428656578064, "step": 5580 }, { "completion_length": 245.58036708831787, "epoch": 0.9358313424703466, "grad_norm": 0.25969544709589587, "kl": 0.085906982421875, "learning_rate": 4.998311527765334e-07, "loss": 0.0001, "reward": 1.7875000536441803, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7919643111526966, "rewards/format_reward_func": 0.9955357164144516, "step": 5582 }, { "completion_length": 250.86161994934082, "epoch": 0.9361666457102141, "grad_norm": 0.17861812568478064, "kl": 0.08895111083984375, "learning_rate": 4.998307116841937e-07, "loss": 0.0001, "reward": 1.7714285999536514, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 5584 }, { "completion_length": 262.96876335144043, "epoch": 0.9365019489500818, "grad_norm": 0.25962272007727116, "kl": 0.0623779296875, "learning_rate": 4.998302700166509e-07, "loss": 0.0001, "reward": 1.7232143580913544, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7267857529222965, "rewards/format_reward_func": 0.9964285716414452, "step": 5586 }, { "completion_length": 267.6384029388428, "epoch": 0.9368372521899493, "grad_norm": 0.2566924669697665, "kl": 0.137725830078125, "learning_rate": 4.998298277739063e-07, "loss": 0.0001, "reward": 1.7821428999304771, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7910714447498322, "rewards/format_reward_func": 0.9910714328289032, "step": 5588 }, { "completion_length": 270.07590198516846, "epoch": 0.9371725554298168, "grad_norm": 0.2571152286005929, "kl": 0.11667633056640625, "learning_rate": 4.998293849559608e-07, "loss": 0.0001, "reward": 1.7696429044008255, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7741071842610836, "rewards/format_reward_func": 0.9955357164144516, "step": 5590 }, { "completion_length": 265.2142972946167, "epoch": 0.9375078586696844, "grad_norm": 0.2686328008496726, "kl": 0.0756072998046875, "learning_rate": 4.998289415628154e-07, "loss": 0.0001, "reward": 1.7500000521540642, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7589286062866449, "rewards/format_reward_func": 0.9910714328289032, "step": 5592 }, { "completion_length": 262.8526916503906, "epoch": 0.937843161909552, "grad_norm": 0.2147661255591824, "kl": 0.1193695068359375, "learning_rate": 4.998284975944712e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7589286044239998, "rewards/format_reward_func": 0.9910714328289032, "step": 5594 }, { "completion_length": 261.5044755935669, "epoch": 0.9381784651494195, "grad_norm": 0.31342327171802786, "kl": 0.064178466796875, "learning_rate": 4.998280530509291e-07, "loss": 0.0001, "reward": 1.7464286759495735, "reward_std": 0.09596448857337236, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 5596 }, { "completion_length": 258.48215198516846, "epoch": 0.938513768389287, "grad_norm": 0.24752753734293131, "kl": 0.1938629150390625, "learning_rate": 4.998276079321903e-07, "loss": 0.0002, "reward": 1.7553572207689285, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7598214596509933, "rewards/format_reward_func": 0.9955357164144516, "step": 5598 }, { "completion_length": 251.5401906967163, "epoch": 0.9388490716291547, "grad_norm": 0.14067875762090168, "kl": 0.108612060546875, "learning_rate": 4.998271622382556e-07, "loss": 0.0001, "reward": 1.814285784959793, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8142857328057289, "rewards/format_reward_func": 1.0, "step": 5600 }, { "completion_length": 243.477689743042, "epoch": 0.9391843748690222, "grad_norm": 0.2324244712555495, "kl": 0.0554046630859375, "learning_rate": 4.998267159691262e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571429014205933, "rewards/format_reward_func": 1.0, "step": 5602 }, { "completion_length": 263.3616189956665, "epoch": 0.9395196781088897, "grad_norm": 0.19419565752990522, "kl": 0.233978271484375, "learning_rate": 4.998262691248031e-07, "loss": 0.0002, "reward": 1.7892857789993286, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857436090708, "rewards/format_reward_func": 1.0, "step": 5604 }, { "completion_length": 247.4642972946167, "epoch": 0.9398549813487573, "grad_norm": 0.1919762132952126, "kl": 0.1018218994140625, "learning_rate": 4.998258217052872e-07, "loss": 0.0001, "reward": 1.7196429148316383, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7241071797907352, "rewards/format_reward_func": 0.9955357164144516, "step": 5606 }, { "completion_length": 252.6919765472412, "epoch": 0.9401902845886249, "grad_norm": 0.1919943550574376, "kl": 0.2514801025390625, "learning_rate": 4.998253737105797e-07, "loss": 0.0003, "reward": 1.7714286223053932, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 5608 }, { "completion_length": 250.2366189956665, "epoch": 0.9405255878284924, "grad_norm": 0.29611682678625656, "kl": 0.1829986572265625, "learning_rate": 4.998249251406815e-07, "loss": 0.0002, "reward": 1.789285771548748, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 1.0, "step": 5610 }, { "completion_length": 263.4553680419922, "epoch": 0.9408608910683599, "grad_norm": 0.23130672291701776, "kl": 0.094757080078125, "learning_rate": 4.998244759955939e-07, "loss": 0.0001, "reward": 1.7357143461704254, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7446428909897804, "rewards/format_reward_func": 0.9910714328289032, "step": 5612 }, { "completion_length": 268.04019355773926, "epoch": 0.9411961943082275, "grad_norm": 0.30392357057779124, "kl": 0.2783203125, "learning_rate": 4.998240262753174e-07, "loss": 0.0003, "reward": 1.7178572490811348, "reward_std": 0.09596449043601751, "rewards/equation_reward_func": 0.7267857454717159, "rewards/format_reward_func": 0.9910714328289032, "step": 5614 }, { "completion_length": 257.6339387893677, "epoch": 0.9415314975480951, "grad_norm": 0.4255414977969514, "kl": 0.2381591796875, "learning_rate": 4.998235759798537e-07, "loss": 0.0002, "reward": 1.7250000536441803, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7339286021888256, "rewards/format_reward_func": 0.9910714328289032, "step": 5616 }, { "completion_length": 257.5401906967163, "epoch": 0.9418668007879626, "grad_norm": 0.292460391135546, "kl": 0.360076904296875, "learning_rate": 4.998231251092033e-07, "loss": 0.0004, "reward": 1.68392863124609, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.6883928906172514, "rewards/format_reward_func": 0.9955357164144516, "step": 5618 }, { "completion_length": 268.0848321914673, "epoch": 0.9422021040278302, "grad_norm": 0.2446423277783523, "kl": 0.21514892578125, "learning_rate": 4.998226736633675e-07, "loss": 0.0002, "reward": 1.726785808801651, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7312500327825546, "rewards/format_reward_func": 0.9955357164144516, "step": 5620 }, { "completion_length": 256.3392972946167, "epoch": 0.9425374072676977, "grad_norm": 0.18823448189637623, "kl": 0.12542724609375, "learning_rate": 4.998222216423472e-07, "loss": 0.0001, "reward": 1.7535714879631996, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7616071663796902, "rewards/format_reward_func": 0.9919642880558968, "step": 5622 }, { "completion_length": 256.7053699493408, "epoch": 0.9428727105075653, "grad_norm": 0.2138723503939617, "kl": 0.1275787353515625, "learning_rate": 4.998217690461435e-07, "loss": 0.0001, "reward": 1.7410715147852898, "reward_std": 0.07323605939745903, "rewards/equation_reward_func": 0.7544643245637417, "rewards/format_reward_func": 0.9866071492433548, "step": 5624 }, { "completion_length": 250.16072750091553, "epoch": 0.9432080137474328, "grad_norm": 0.16148957638978453, "kl": 0.0900421142578125, "learning_rate": 4.998213158747576e-07, "loss": 0.0001, "reward": 1.758928619325161, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7723214477300644, "rewards/format_reward_func": 0.9866071492433548, "step": 5626 }, { "completion_length": 264.17411518096924, "epoch": 0.9435433169873004, "grad_norm": 0.30775280324654475, "kl": 0.090179443359375, "learning_rate": 4.998208621281903e-07, "loss": 0.0001, "reward": 1.7607143744826317, "reward_std": 0.08586296439170837, "rewards/equation_reward_func": 0.7696428783237934, "rewards/format_reward_func": 0.9910714328289032, "step": 5628 }, { "completion_length": 256.93304538726807, "epoch": 0.943878620227168, "grad_norm": 0.4164457824510602, "kl": 0.1088409423828125, "learning_rate": 4.998204078064429e-07, "loss": 0.0001, "reward": 1.737500086426735, "reward_std": 0.0883883461356163, "rewards/equation_reward_func": 0.7508928924798965, "rewards/format_reward_func": 0.9866071492433548, "step": 5630 }, { "completion_length": 257.83483505249023, "epoch": 0.9442139234670355, "grad_norm": 0.18818507517460953, "kl": 0.1334228515625, "learning_rate": 4.998199529095162e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7910714596509933, "rewards/format_reward_func": 0.9910714328289032, "step": 5632 }, { "completion_length": 270.9375114440918, "epoch": 0.944549226706903, "grad_norm": 0.2851531511191286, "kl": 0.18560791015625, "learning_rate": 4.998194974374113e-07, "loss": 0.0002, "reward": 1.7392857745289803, "reward_std": 0.0858629634603858, "rewards/equation_reward_func": 0.7482143193483353, "rewards/format_reward_func": 0.9910714328289032, "step": 5634 }, { "completion_length": 264.75001525878906, "epoch": 0.9448845299467706, "grad_norm": 0.253065033016023, "kl": 0.06976318359375, "learning_rate": 4.998190413901292e-07, "loss": 0.0001, "reward": 1.7696429267525673, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7910714596509933, "rewards/format_reward_func": 0.9785714372992516, "step": 5636 }, { "completion_length": 265.07144260406494, "epoch": 0.9452198331866382, "grad_norm": 0.23839217310733235, "kl": 0.061492919921875, "learning_rate": 4.998185847676712e-07, "loss": 0.0001, "reward": 1.7928571924567223, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571794182062, "rewards/format_reward_func": 1.0, "step": 5638 }, { "completion_length": 261.7678699493408, "epoch": 0.9455551364265057, "grad_norm": 0.19998156568588835, "kl": 0.1107177734375, "learning_rate": 4.998181275700382e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 5640 }, { "completion_length": 270.2634048461914, "epoch": 0.9458904396663733, "grad_norm": 0.25558352437163384, "kl": 0.1804656982421875, "learning_rate": 4.998176697972311e-07, "loss": 0.0002, "reward": 1.7785715013742447, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 5642 }, { "completion_length": 254.98661994934082, "epoch": 0.9462257429062408, "grad_norm": 0.14052163514370772, "kl": 0.221527099609375, "learning_rate": 4.998172114492513e-07, "loss": 0.0002, "reward": 1.7535714954137802, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 5644 }, { "completion_length": 257.1875123977661, "epoch": 0.9465610461461084, "grad_norm": 0.27107773602436486, "kl": 0.11501312255859375, "learning_rate": 4.998167525260994e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428954601288, "rewards/format_reward_func": 1.0, "step": 5646 }, { "completion_length": 266.7857275009155, "epoch": 0.9468963493859759, "grad_norm": 0.34060283731390234, "kl": 0.3077545166015625, "learning_rate": 4.99816293027777e-07, "loss": 0.0003, "reward": 1.7089286595582962, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7133928872644901, "rewards/format_reward_func": 0.9955357164144516, "step": 5648 }, { "completion_length": 259.46429920196533, "epoch": 0.9472316526258435, "grad_norm": 0.32314258962676723, "kl": 0.0685882568359375, "learning_rate": 4.998158329542847e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714715719223, "rewards/format_reward_func": 1.0, "step": 5650 }, { "completion_length": 256.6651945114136, "epoch": 0.947566955865711, "grad_norm": 0.23348653009831258, "kl": 0.140716552734375, "learning_rate": 4.998153723056237e-07, "loss": 0.0001, "reward": 1.7339286506175995, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7383928932249546, "rewards/format_reward_func": 0.9955357164144516, "step": 5652 }, { "completion_length": 261.8705463409424, "epoch": 0.9479022591055786, "grad_norm": 0.5032103429997516, "kl": 0.4618377685546875, "learning_rate": 4.998149110817952e-07, "loss": 0.0005, "reward": 1.7785714641213417, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7875000201165676, "rewards/format_reward_func": 0.9910714328289032, "step": 5654 }, { "completion_length": 248.64287090301514, "epoch": 0.9482375623454462, "grad_norm": 0.26493899916194824, "kl": 0.0601654052734375, "learning_rate": 4.998144492828e-07, "loss": 0.0001, "reward": 1.7357143834233284, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7357143051922321, "rewards/format_reward_func": 1.0, "step": 5656 }, { "completion_length": 277.3750114440918, "epoch": 0.9485728655853137, "grad_norm": 0.1620975210345513, "kl": 0.1508026123046875, "learning_rate": 4.998139869086394e-07, "loss": 0.0002, "reward": 1.7053571864962578, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7276785969734192, "rewards/format_reward_func": 0.977678582072258, "step": 5658 }, { "completion_length": 265.5803699493408, "epoch": 0.9489081688251813, "grad_norm": 0.14163608078600906, "kl": 0.1465911865234375, "learning_rate": 4.998135239593145e-07, "loss": 0.0001, "reward": 1.7517857775092125, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7562500461935997, "rewards/format_reward_func": 0.9955357164144516, "step": 5660 }, { "completion_length": 263.0134086608887, "epoch": 0.9492434720650488, "grad_norm": 0.19857088284391342, "kl": 0.0649871826171875, "learning_rate": 4.998130604348261e-07, "loss": 0.0001, "reward": 1.8357143327593803, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8357143104076385, "rewards/format_reward_func": 1.0, "step": 5662 }, { "completion_length": 264.9285840988159, "epoch": 0.9495787753049164, "grad_norm": 0.29489920317894763, "kl": 0.0663299560546875, "learning_rate": 4.998125963351754e-07, "loss": 0.0001, "reward": 1.744642935693264, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7580357417464256, "rewards/format_reward_func": 0.9866071492433548, "step": 5664 }, { "completion_length": 271.7410840988159, "epoch": 0.9499140785447839, "grad_norm": 0.2585833121153054, "kl": 0.0933837890625, "learning_rate": 4.998121316603635e-07, "loss": 0.0001, "reward": 1.7107143700122833, "reward_std": 0.08586296439170837, "rewards/equation_reward_func": 0.7196428887546062, "rewards/format_reward_func": 0.9910714328289032, "step": 5666 }, { "completion_length": 256.977689743042, "epoch": 0.9502493817846515, "grad_norm": 0.004380553101243798, "kl": 0.069793701171875, "learning_rate": 4.998116664103914e-07, "loss": 0.0001, "reward": 1.8250000327825546, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8250000290572643, "rewards/format_reward_func": 1.0, "step": 5668 }, { "completion_length": 268.883939743042, "epoch": 0.9505846850245191, "grad_norm": 0.405321103068457, "kl": 0.142242431640625, "learning_rate": 4.998112005852603e-07, "loss": 0.0001, "reward": 1.74642863124609, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 0.9821428656578064, "step": 5670 }, { "completion_length": 263.4464416503906, "epoch": 0.9509199882643866, "grad_norm": 0.7514470251323826, "kl": 0.2141571044921875, "learning_rate": 4.998107341849712e-07, "loss": 0.0002, "reward": 1.789285771548748, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.789285734295845, "rewards/format_reward_func": 1.0, "step": 5672 }, { "completion_length": 262.5357255935669, "epoch": 0.9512552915042541, "grad_norm": 0.17197998736240475, "kl": 0.0726470947265625, "learning_rate": 4.998102672095251e-07, "loss": 0.0001, "reward": 1.7428571954369545, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571842610836, "rewards/format_reward_func": 1.0, "step": 5674 }, { "completion_length": 264.42411708831787, "epoch": 0.9515905947441217, "grad_norm": 0.24326647158367354, "kl": 0.080718994140625, "learning_rate": 4.998097996589233e-07, "loss": 0.0001, "reward": 1.7732143625617027, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7776786051690578, "rewards/format_reward_func": 0.9955357164144516, "step": 5676 }, { "completion_length": 266.07143783569336, "epoch": 0.9519258979839893, "grad_norm": 0.4137675130766855, "kl": 0.101409912109375, "learning_rate": 4.998093315331665e-07, "loss": 0.0001, "reward": 1.7950893491506577, "reward_std": 0.057452425360679626, "rewards/equation_reward_func": 0.7964286096394062, "rewards/format_reward_func": 0.9986607171595097, "step": 5678 }, { "completion_length": 256.8973331451416, "epoch": 0.9522612012238568, "grad_norm": 0.23364485897135776, "kl": 0.0838165283203125, "learning_rate": 4.998088628322562e-07, "loss": 0.0001, "reward": 1.8321429342031479, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8321428820490837, "rewards/format_reward_func": 1.0, "step": 5680 }, { "completion_length": 269.2500114440918, "epoch": 0.9525965044637243, "grad_norm": 0.44124717033408517, "kl": 0.0906219482421875, "learning_rate": 4.998083935561932e-07, "loss": 0.0001, "reward": 1.7196429148316383, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7241071835160255, "rewards/format_reward_func": 0.9955357164144516, "step": 5682 }, { "completion_length": 265.31251430511475, "epoch": 0.952931807703592, "grad_norm": 0.1912833560793976, "kl": 0.07427978515625, "learning_rate": 4.998079237049785e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714525729418, "rewards/format_reward_func": 1.0, "step": 5684 }, { "completion_length": 268.4509086608887, "epoch": 0.9532671109434595, "grad_norm": 0.12658711226962915, "kl": 0.06768798828125, "learning_rate": 4.998074532786135e-07, "loss": 0.0001, "reward": 1.7071429416537285, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7071428932249546, "rewards/format_reward_func": 1.0, "step": 5686 }, { "completion_length": 283.089298248291, "epoch": 0.953602414183327, "grad_norm": 0.23460052009656376, "kl": 0.073516845703125, "learning_rate": 4.998069822770992e-07, "loss": 0.0001, "reward": 1.7303571924567223, "reward_std": 0.06313453428447247, "rewards/equation_reward_func": 0.7526785936206579, "rewards/format_reward_func": 0.977678582072258, "step": 5688 }, { "completion_length": 281.93751430511475, "epoch": 0.9539377174231946, "grad_norm": 0.18191125932432262, "kl": 0.0916290283203125, "learning_rate": 4.998065107004365e-07, "loss": 0.0001, "reward": 1.6625000536441803, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.6848214641213417, "rewards/format_reward_func": 0.977678582072258, "step": 5690 }, { "completion_length": 281.11162281036377, "epoch": 0.9542730206630622, "grad_norm": 0.2272203925271211, "kl": 0.1111297607421875, "learning_rate": 4.998060385486265e-07, "loss": 0.0001, "reward": 1.7339286729693413, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7383928932249546, "rewards/format_reward_func": 0.9955357164144516, "step": 5692 }, { "completion_length": 280.6964406967163, "epoch": 0.9546083239029297, "grad_norm": 0.12683642382177307, "kl": 0.086151123046875, "learning_rate": 4.998055658216705e-07, "loss": 0.0001, "reward": 1.7482143267989159, "reward_std": 0.0328299580141902, "rewards/equation_reward_func": 0.7526785954833031, "rewards/format_reward_func": 0.9955357164144516, "step": 5694 }, { "completion_length": 282.22769260406494, "epoch": 0.9549436271427972, "grad_norm": 0.33633389268441777, "kl": 0.1428070068359375, "learning_rate": 4.998050925195694e-07, "loss": 0.0001, "reward": 1.7125000655651093, "reward_std": 0.09343910962343216, "rewards/equation_reward_func": 0.7348214611411095, "rewards/format_reward_func": 0.977678582072258, "step": 5696 }, { "completion_length": 277.4687614440918, "epoch": 0.9552789303826649, "grad_norm": 0.2182738670209157, "kl": 0.1761627197265625, "learning_rate": 4.998046186423243e-07, "loss": 0.0002, "reward": 1.7625000774860382, "reward_std": 0.09343911055475473, "rewards/equation_reward_func": 0.7758928760886192, "rewards/format_reward_func": 0.9866071492433548, "step": 5698 }, { "completion_length": 272.46876430511475, "epoch": 0.9556142336225324, "grad_norm": 0.2382541500873255, "kl": 0.0994110107421875, "learning_rate": 4.998041441899365e-07, "loss": 0.0001, "reward": 1.7589286416769028, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7633928842842579, "rewards/format_reward_func": 0.9955357164144516, "step": 5700 }, { "completion_length": 277.77680110931396, "epoch": 0.9559495368623999, "grad_norm": 0.17786839241166985, "kl": 0.1782073974609375, "learning_rate": 4.998036691624069e-07, "loss": 0.0002, "reward": 1.7571429312229156, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 5702 }, { "completion_length": 280.6607265472412, "epoch": 0.9562848401022674, "grad_norm": 0.44365582048278357, "kl": 0.2714080810546875, "learning_rate": 4.998031935597366e-07, "loss": 0.0003, "reward": 1.698214329779148, "reward_std": 0.09343911055475473, "rewards/equation_reward_func": 0.7116071823984385, "rewards/format_reward_func": 0.9866071492433548, "step": 5704 }, { "completion_length": 276.37501335144043, "epoch": 0.9566201433421351, "grad_norm": 0.15799020884293222, "kl": 0.0831298828125, "learning_rate": 4.998027173819268e-07, "loss": 0.0001, "reward": 1.7767857685685158, "reward_std": 0.05303300637751818, "rewards/equation_reward_func": 0.7901786006987095, "rewards/format_reward_func": 0.9866071492433548, "step": 5706 }, { "completion_length": 274.84822845458984, "epoch": 0.9569554465820026, "grad_norm": 0.22140325800096408, "kl": 0.145477294921875, "learning_rate": 4.998022406289784e-07, "loss": 0.0001, "reward": 1.7053572311997414, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7098214514553547, "rewards/format_reward_func": 0.9955357164144516, "step": 5708 }, { "completion_length": 275.5937623977661, "epoch": 0.9572907498218701, "grad_norm": 0.43758406981647235, "kl": 0.108306884765625, "learning_rate": 4.998017633008928e-07, "loss": 0.0001, "reward": 1.6910715103149414, "reward_std": 0.0883883461356163, "rewards/equation_reward_func": 0.7133928909897804, "rewards/format_reward_func": 0.9776785783469677, "step": 5710 }, { "completion_length": 269.6205520629883, "epoch": 0.9576260530617378, "grad_norm": 0.09696601262183618, "kl": 0.07403564453125, "learning_rate": 4.998012853976707e-07, "loss": 0.0001, "reward": 1.737500086426735, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7419643178582191, "rewards/format_reward_func": 0.9955357164144516, "step": 5712 }, { "completion_length": 271.7009057998657, "epoch": 0.9579613563016053, "grad_norm": 0.3847634166372032, "kl": 0.0723419189453125, "learning_rate": 4.998008069193136e-07, "loss": 0.0001, "reward": 1.775000050663948, "reward_std": 0.11616754438728094, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 0.9821428656578064, "step": 5714 }, { "completion_length": 253.88840293884277, "epoch": 0.9582966595414728, "grad_norm": 0.2766356278864402, "kl": 0.07025146484375, "learning_rate": 4.998003278658222e-07, "loss": 0.0001, "reward": 1.7446429207921028, "reward_std": 0.09091372787952423, "rewards/equation_reward_func": 0.7660714499652386, "rewards/format_reward_func": 0.9785714447498322, "step": 5716 }, { "completion_length": 269.3750114440918, "epoch": 0.9586319627813403, "grad_norm": 0.31884468339624056, "kl": 0.077911376953125, "learning_rate": 4.997998482371981e-07, "loss": 0.0001, "reward": 1.6964286863803864, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.6964285932481289, "rewards/format_reward_func": 1.0, "step": 5718 }, { "completion_length": 267.9509057998657, "epoch": 0.958967266021208, "grad_norm": 0.15281834104888092, "kl": 0.081817626953125, "learning_rate": 4.99799368033442e-07, "loss": 0.0001, "reward": 1.700000062584877, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7089286092668772, "rewards/format_reward_func": 0.9910714328289032, "step": 5720 }, { "completion_length": 265.84376430511475, "epoch": 0.9593025692610755, "grad_norm": 0.13977553928490638, "kl": 0.0774688720703125, "learning_rate": 4.997988872545551e-07, "loss": 0.0001, "reward": 1.7446429207921028, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7491071801632643, "rewards/format_reward_func": 0.9955357164144516, "step": 5722 }, { "completion_length": 258.8437614440918, "epoch": 0.959637872500943, "grad_norm": 0.0732864545892103, "kl": 0.0887908935546875, "learning_rate": 4.997984059005386e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 5724 }, { "completion_length": 256.7321548461914, "epoch": 0.9599731757408106, "grad_norm": 0.1907927548631042, "kl": 0.078094482421875, "learning_rate": 4.997979239713935e-07, "loss": 0.0001, "reward": 1.8214286267757416, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8214286044239998, "rewards/format_reward_func": 1.0, "step": 5726 }, { "completion_length": 252.30804443359375, "epoch": 0.9603084789806782, "grad_norm": 0.5063146778249406, "kl": 0.0906982421875, "learning_rate": 4.997974414671211e-07, "loss": 0.0001, "reward": 1.698214367032051, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.702678607776761, "rewards/format_reward_func": 0.9955357164144516, "step": 5728 }, { "completion_length": 259.276798248291, "epoch": 0.9606437822205457, "grad_norm": 0.3635133661797194, "kl": 0.1004638671875, "learning_rate": 4.997969583877223e-07, "loss": 0.0001, "reward": 1.7357143461704254, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7446428909897804, "rewards/format_reward_func": 0.9910714328289032, "step": 5730 }, { "completion_length": 252.36162090301514, "epoch": 0.9609790854604132, "grad_norm": 0.28713514263695467, "kl": 0.0994110107421875, "learning_rate": 4.997964747331982e-07, "loss": 0.0001, "reward": 1.742857202887535, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7428571749478579, "rewards/format_reward_func": 1.0, "step": 5732 }, { "completion_length": 250.84375953674316, "epoch": 0.9613143887002809, "grad_norm": 0.2510492779487852, "kl": 0.250762939453125, "learning_rate": 4.9979599050355e-07, "loss": 0.0003, "reward": 1.723214365541935, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7276786044239998, "rewards/format_reward_func": 0.9955357164144516, "step": 5734 }, { "completion_length": 249.1830472946167, "epoch": 0.9616496919401484, "grad_norm": 0.22425040779164032, "kl": 0.0988311767578125, "learning_rate": 4.99795505698779e-07, "loss": 0.0001, "reward": 1.7486607730388641, "reward_std": 0.05240166233852506, "rewards/equation_reward_func": 0.7500000223517418, "rewards/format_reward_func": 0.9986607171595097, "step": 5736 }, { "completion_length": 242.62947845458984, "epoch": 0.9619849951800159, "grad_norm": 0.23321105934828876, "kl": 0.093170166015625, "learning_rate": 4.997950203188859e-07, "loss": 0.0001, "reward": 1.825000062584877, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8250000290572643, "rewards/format_reward_func": 1.0, "step": 5738 }, { "completion_length": 252.52233600616455, "epoch": 0.9623202984198835, "grad_norm": 1.1336488889710032, "kl": 0.0902557373046875, "learning_rate": 4.997945343638721e-07, "loss": 0.0001, "reward": 1.7535714879631996, "reward_std": 0.08586296532303095, "rewards/equation_reward_func": 0.771428594365716, "rewards/format_reward_func": 0.9821428656578064, "step": 5740 }, { "completion_length": 238.45983123779297, "epoch": 0.9626556016597511, "grad_norm": 0.08074561960104147, "kl": 0.0883331298828125, "learning_rate": 4.997940478337387e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.03030457627028227, "rewards/equation_reward_func": 0.7660714685916901, "rewards/format_reward_func": 0.9910714328289032, "step": 5742 }, { "completion_length": 241.31251049041748, "epoch": 0.9629909048996186, "grad_norm": 0.16187012849684024, "kl": 0.0847930908203125, "learning_rate": 4.997935607284869e-07, "loss": 0.0001, "reward": 1.810714341700077, "reward_std": 0.03535533882677555, "rewards/equation_reward_func": 0.8196428641676903, "rewards/format_reward_func": 0.9910714328289032, "step": 5744 }, { "completion_length": 243.7232255935669, "epoch": 0.9633262081394861, "grad_norm": 0.7833918591829895, "kl": 0.11737060546875, "learning_rate": 4.997930730481175e-07, "loss": 0.0001, "reward": 1.7571429386734962, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7660714499652386, "rewards/format_reward_func": 0.9910714328289032, "step": 5746 }, { "completion_length": 254.01786994934082, "epoch": 0.9636615113793537, "grad_norm": 0.2893503495802617, "kl": 0.0744171142578125, "learning_rate": 4.99792584792632e-07, "loss": 0.0001, "reward": 1.7214286178350449, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7303571961820126, "rewards/format_reward_func": 0.9910714328289032, "step": 5748 }, { "completion_length": 247.45090579986572, "epoch": 0.9639968146192213, "grad_norm": 0.21552202774201076, "kl": 0.076690673828125, "learning_rate": 4.997920959620312e-07, "loss": 0.0001, "reward": 1.7517857626080513, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7562500312924385, "rewards/format_reward_func": 0.9955357164144516, "step": 5750 }, { "completion_length": 228.21876049041748, "epoch": 0.9643321178590888, "grad_norm": 0.17294579698170476, "kl": 0.0935211181640625, "learning_rate": 4.997916065563164e-07, "loss": 0.0001, "reward": 1.7767857611179352, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7812500298023224, "rewards/format_reward_func": 0.9955357164144516, "step": 5752 }, { "completion_length": 237.68304824829102, "epoch": 0.9646674210989564, "grad_norm": 0.2319434648635915, "kl": 0.148101806640625, "learning_rate": 4.997911165754888e-07, "loss": 0.0001, "reward": 1.7357143685221672, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7357142902910709, "rewards/format_reward_func": 1.0, "step": 5754 }, { "completion_length": 235.92858123779297, "epoch": 0.965002724338824, "grad_norm": 0.25800015070654836, "kl": 0.1013336181640625, "learning_rate": 4.997906260195495e-07, "loss": 0.0001, "reward": 1.7839286401867867, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7883928865194321, "rewards/format_reward_func": 0.9955357164144516, "step": 5756 }, { "completion_length": 227.48661613464355, "epoch": 0.9653380275786915, "grad_norm": 0.5387041435019756, "kl": 0.2335357666015625, "learning_rate": 4.997901348884994e-07, "loss": 0.0002, "reward": 1.8125000670552254, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.8169643171131611, "rewards/format_reward_func": 0.9955357164144516, "step": 5758 }, { "completion_length": 255.6071538925171, "epoch": 0.965673330818559, "grad_norm": 0.9829855590547008, "kl": 0.515045166015625, "learning_rate": 4.997896431823398e-07, "loss": 0.0005, "reward": 1.6928571909666061, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7107143253087997, "rewards/format_reward_func": 0.9821428656578064, "step": 5760 }, { "completion_length": 249.9062623977661, "epoch": 0.9660086340584266, "grad_norm": 0.4302051737915668, "kl": 0.381072998046875, "learning_rate": 4.997891509010719e-07, "loss": 0.0004, "reward": 1.7964286357164383, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.8053571581840515, "rewards/format_reward_func": 0.9910714328289032, "step": 5762 }, { "completion_length": 251.86161708831787, "epoch": 0.9663439372982942, "grad_norm": 0.2003740166787335, "kl": 0.1812744140625, "learning_rate": 4.997886580446968e-07, "loss": 0.0002, "reward": 1.7642857730388641, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7821428701281548, "rewards/format_reward_func": 0.9821428656578064, "step": 5764 }, { "completion_length": 236.9910831451416, "epoch": 0.9666792405381617, "grad_norm": 0.30823609277606523, "kl": 0.79034423828125, "learning_rate": 4.997881646132154e-07, "loss": 0.0008, "reward": 1.7732143551111221, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.777678593993187, "rewards/format_reward_func": 0.9955357164144516, "step": 5766 }, { "completion_length": 258.9464406967163, "epoch": 0.9670145437780292, "grad_norm": 0.4599728212819268, "kl": 4.020721435546875, "learning_rate": 4.997876706066293e-07, "loss": 0.004, "reward": 1.7410714775323868, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7455357424914837, "rewards/format_reward_func": 0.9955357164144516, "step": 5768 }, { "completion_length": 255.64733028411865, "epoch": 0.9673498470178968, "grad_norm": 1.0446450602846626, "kl": 4.951690673828125, "learning_rate": 4.997871760249392e-07, "loss": 0.005, "reward": 1.6772322282195091, "reward_std": 0.1433153918478638, "rewards/equation_reward_func": 0.7142857499420643, "rewards/format_reward_func": 0.9629464447498322, "step": 5770 }, { "completion_length": 256.3928699493408, "epoch": 0.9676851502577644, "grad_norm": 0.652762162620934, "kl": 1.116546630859375, "learning_rate": 4.997866808681464e-07, "loss": 0.0011, "reward": 1.6901786252856255, "reward_std": 0.14773480826988816, "rewards/equation_reward_func": 0.7366071753203869, "rewards/format_reward_func": 0.9535714499652386, "step": 5772 }, { "completion_length": 260.3928680419922, "epoch": 0.9680204534976319, "grad_norm": 0.4629856359890017, "kl": 1.8438873291015625, "learning_rate": 4.997861851362522e-07, "loss": 0.0018, "reward": 1.7361607551574707, "reward_std": 0.12058695964515209, "rewards/equation_reward_func": 0.7598214652389288, "rewards/format_reward_func": 0.9763392992317677, "step": 5774 }, { "completion_length": 258.7410840988159, "epoch": 0.9683557567374995, "grad_norm": 0.2744137856877816, "kl": 0.482940673828125, "learning_rate": 4.997856888292575e-07, "loss": 0.0005, "reward": 1.7428571954369545, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 0.9821428656578064, "step": 5776 }, { "completion_length": 255.73661613464355, "epoch": 0.968691059977367, "grad_norm": 0.1992131837536013, "kl": 0.2537994384765625, "learning_rate": 4.997851919471634e-07, "loss": 0.0003, "reward": 1.6647322177886963, "reward_std": 0.07007933338172734, "rewards/equation_reward_func": 0.6794643215835094, "rewards/format_reward_func": 0.9852678664028645, "step": 5778 }, { "completion_length": 243.08929824829102, "epoch": 0.9690263632172346, "grad_norm": 0.2374438613904606, "kl": 0.1378021240234375, "learning_rate": 4.997846944899713e-07, "loss": 0.0001, "reward": 1.7321429327130318, "reward_std": 0.0858629671856761, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 0.9821428656578064, "step": 5780 }, { "completion_length": 230.6116189956665, "epoch": 0.9693616664571021, "grad_norm": 0.19207024544444537, "kl": 0.1244354248046875, "learning_rate": 4.997841964576822e-07, "loss": 0.0001, "reward": 1.7482143267989159, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7526786141097546, "rewards/format_reward_func": 0.9955357164144516, "step": 5782 }, { "completion_length": 234.8303680419922, "epoch": 0.9696969696969697, "grad_norm": 0.29969401860216555, "kl": 0.1541748046875, "learning_rate": 4.997836978502973e-07, "loss": 0.0002, "reward": 1.7196429297327995, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7330357395112514, "rewards/format_reward_func": 0.9866071492433548, "step": 5784 }, { "completion_length": 235.4687614440918, "epoch": 0.9700322729368372, "grad_norm": 0.0050095061490047785, "kl": 0.1041717529296875, "learning_rate": 4.997831986678177e-07, "loss": 0.0001, "reward": 1.7397321984171867, "reward_std": 0.034723992459475994, "rewards/equation_reward_func": 0.745535746216774, "rewards/format_reward_func": 0.994196429848671, "step": 5786 }, { "completion_length": 233.54465007781982, "epoch": 0.9703675761767048, "grad_norm": 0.4226043906249522, "kl": 0.1141204833984375, "learning_rate": 4.997826989102445e-07, "loss": 0.0001, "reward": 1.7517857775092125, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7562500331550837, "rewards/format_reward_func": 0.9955357164144516, "step": 5788 }, { "completion_length": 235.62501049041748, "epoch": 0.9707028794165724, "grad_norm": 0.14606378420437877, "kl": 0.11199951171875, "learning_rate": 4.997821985775789e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 5790 }, { "completion_length": 224.12054443359375, "epoch": 0.9710381826564399, "grad_norm": 0.4096506818257736, "kl": 0.1255035400390625, "learning_rate": 4.997816976698222e-07, "loss": 0.0001, "reward": 1.8000000417232513, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 5792 }, { "completion_length": 231.3482255935669, "epoch": 0.9713734858963075, "grad_norm": 0.3583328866620858, "kl": 0.137237548828125, "learning_rate": 4.997811961869754e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7589285895228386, "rewards/format_reward_func": 0.9910714328289032, "step": 5794 }, { "completion_length": 228.71429538726807, "epoch": 0.971708789136175, "grad_norm": 0.20583662808869987, "kl": 0.106781005859375, "learning_rate": 4.997806941290396e-07, "loss": 0.0001, "reward": 1.7142857909202576, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7142857406288385, "rewards/format_reward_func": 1.0, "step": 5796 }, { "completion_length": 222.55804634094238, "epoch": 0.9720440923760426, "grad_norm": 0.17532440930536886, "kl": 0.09112548828125, "learning_rate": 4.99780191496016e-07, "loss": 0.0001, "reward": 1.725000061094761, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.725000036880374, "rewards/format_reward_func": 1.0, "step": 5798 }, { "completion_length": 235.87501049041748, "epoch": 0.9723793956159101, "grad_norm": 0.31365165812364126, "kl": 0.09649658203125, "learning_rate": 4.997796882879058e-07, "loss": 0.0001, "reward": 1.764285758137703, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7732143122702837, "rewards/format_reward_func": 0.9910714328289032, "step": 5800 }, { "completion_length": 223.37500953674316, "epoch": 0.9727146988557777, "grad_norm": 0.3106940012934583, "kl": 0.104827880859375, "learning_rate": 4.997791845047102e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 5802 }, { "completion_length": 218.8616180419922, "epoch": 0.9730500020956453, "grad_norm": 0.22578401002909218, "kl": 0.09271240234375, "learning_rate": 4.997786801464303e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714626312256, "rewards/format_reward_func": 1.0, "step": 5804 }, { "completion_length": 220.16518592834473, "epoch": 0.9733853053355128, "grad_norm": 0.331889824862444, "kl": 0.08624267578125, "learning_rate": 4.997781752130673e-07, "loss": 0.0001, "reward": 1.7821429073810577, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428962051868, "rewards/format_reward_func": 1.0, "step": 5806 }, { "completion_length": 224.34822463989258, "epoch": 0.9737206085753803, "grad_norm": 0.21284460310280223, "kl": 0.1049041748046875, "learning_rate": 4.997776697046223e-07, "loss": 0.0001, "reward": 1.8125000521540642, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8169643059372902, "rewards/format_reward_func": 0.9955357164144516, "step": 5808 }, { "completion_length": 221.4553680419922, "epoch": 0.9740559118152479, "grad_norm": 0.20490578484408217, "kl": 0.07861328125, "learning_rate": 4.997771636210965e-07, "loss": 0.0001, "reward": 1.783928632736206, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7883928865194321, "rewards/format_reward_func": 0.9955357164144516, "step": 5810 }, { "completion_length": 225.59822463989258, "epoch": 0.9743912150551155, "grad_norm": 0.33365690223263317, "kl": 0.0732269287109375, "learning_rate": 4.99776656962491e-07, "loss": 0.0001, "reward": 1.7892857566475868, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857436090708, "rewards/format_reward_func": 1.0, "step": 5812 }, { "completion_length": 225.36161422729492, "epoch": 0.974726518294983, "grad_norm": 0.2679487209690672, "kl": 0.0830841064453125, "learning_rate": 4.997761497288071e-07, "loss": 0.0001, "reward": 1.7607143372297287, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7696428969502449, "rewards/format_reward_func": 0.9910714328289032, "step": 5814 }, { "completion_length": 218.46429634094238, "epoch": 0.9750618215348505, "grad_norm": 0.3324314294312522, "kl": 0.0901641845703125, "learning_rate": 4.997756419200458e-07, "loss": 0.0001, "reward": 1.796428643167019, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 5816 }, { "completion_length": 229.84375858306885, "epoch": 0.9753971247747182, "grad_norm": 0.18266863033969682, "kl": 0.075103759765625, "learning_rate": 4.997751335362085e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 5818 }, { "completion_length": 225.60715293884277, "epoch": 0.9757324280145857, "grad_norm": 0.3062540378594742, "kl": 0.079925537109375, "learning_rate": 4.997746245772962e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143074274063, "rewards/format_reward_func": 1.0, "step": 5820 }, { "completion_length": 232.3035831451416, "epoch": 0.9760677312544532, "grad_norm": 0.26651813698518534, "kl": 0.07159423828125, "learning_rate": 4.9977411504331e-07, "loss": 0.0001, "reward": 1.6892857998609543, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.6892857421189547, "rewards/format_reward_func": 1.0, "step": 5822 }, { "completion_length": 228.92411708831787, "epoch": 0.9764030344943208, "grad_norm": 0.2276887505675821, "kl": 0.0889892578125, "learning_rate": 4.997736049342512e-07, "loss": 0.0001, "reward": 1.7642857655882835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 5824 }, { "completion_length": 233.46875953674316, "epoch": 0.9767383377341884, "grad_norm": 0.07011180696246816, "kl": 0.0858306884765625, "learning_rate": 4.997730942501211e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 5826 }, { "completion_length": 234.75893878936768, "epoch": 0.9770736409740559, "grad_norm": 0.5499560677708243, "kl": 0.0884246826171875, "learning_rate": 4.997725829909205e-07, "loss": 0.0001, "reward": 1.755357213318348, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.759821455925703, "rewards/format_reward_func": 0.9955357164144516, "step": 5828 }, { "completion_length": 228.28125953674316, "epoch": 0.9774089442139234, "grad_norm": 0.2858952293699924, "kl": 0.0711822509765625, "learning_rate": 4.99772071156651e-07, "loss": 0.0001, "reward": 1.760714367032051, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7607143204659224, "rewards/format_reward_func": 1.0, "step": 5830 }, { "completion_length": 233.47768878936768, "epoch": 0.9777442474537911, "grad_norm": 0.252328997483877, "kl": 0.0720367431640625, "learning_rate": 4.997715587473135e-07, "loss": 0.0001, "reward": 1.7267857864499092, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.731250025331974, "rewards/format_reward_func": 0.9955357164144516, "step": 5832 }, { "completion_length": 234.97322273254395, "epoch": 0.9780795506936586, "grad_norm": 0.2657323747803113, "kl": 0.0772247314453125, "learning_rate": 4.997710457629092e-07, "loss": 0.0001, "reward": 1.7446429133415222, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7491071857511997, "rewards/format_reward_func": 0.9955357164144516, "step": 5834 }, { "completion_length": 228.12054443359375, "epoch": 0.9784148539335261, "grad_norm": 0.00524528276891892, "kl": 0.0808563232421875, "learning_rate": 4.997705322034394e-07, "loss": 0.0001, "reward": 1.7732143625617027, "reward_std": 0.027779195457696915, "rewards/equation_reward_func": 0.7776786014437675, "rewards/format_reward_func": 0.9955357164144516, "step": 5836 }, { "completion_length": 238.71876049041748, "epoch": 0.9787501571733936, "grad_norm": 0.28480684344520696, "kl": 0.076904296875, "learning_rate": 4.997700180689053e-07, "loss": 0.0001, "reward": 1.7964286357164383, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7964285872876644, "rewards/format_reward_func": 1.0, "step": 5838 }, { "completion_length": 227.2366180419922, "epoch": 0.9790854604132613, "grad_norm": 0.3515029048482449, "kl": 0.078582763671875, "learning_rate": 4.99769503359308e-07, "loss": 0.0001, "reward": 1.7357143759727478, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 5840 }, { "completion_length": 234.6830472946167, "epoch": 0.9794207636531288, "grad_norm": 0.1499024663881233, "kl": 0.073211669921875, "learning_rate": 4.997689880746486e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714730620384, "rewards/format_reward_func": 1.0, "step": 5842 }, { "completion_length": 229.09376049041748, "epoch": 0.9797560668929963, "grad_norm": 0.22930478631284873, "kl": 0.05889892578125, "learning_rate": 4.997684722149284e-07, "loss": 0.0001, "reward": 1.753571480512619, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714600235224, "rewards/format_reward_func": 1.0, "step": 5844 }, { "completion_length": 228.95983219146729, "epoch": 0.980091370132864, "grad_norm": 0.20276460790670736, "kl": 0.09027099609375, "learning_rate": 4.997679557801487e-07, "loss": 0.0001, "reward": 1.814285784959793, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857365310192, "rewards/format_reward_func": 1.0, "step": 5846 }, { "completion_length": 223.62947463989258, "epoch": 0.9804266733727315, "grad_norm": 0.23756297896683548, "kl": 0.0641937255859375, "learning_rate": 4.997674387703104e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 5848 }, { "completion_length": 214.40626049041748, "epoch": 0.980761976612599, "grad_norm": 0.24790898194269112, "kl": 0.0617218017578125, "learning_rate": 4.997669211854148e-07, "loss": 0.0001, "reward": 1.8214286491274834, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8214285895228386, "rewards/format_reward_func": 1.0, "step": 5850 }, { "completion_length": 229.6919755935669, "epoch": 0.9810972798524665, "grad_norm": 0.45248769291328733, "kl": 0.089996337890625, "learning_rate": 4.997664030254634e-07, "loss": 0.0001, "reward": 1.7446429505944252, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7482143118977547, "rewards/format_reward_func": 0.9964285790920258, "step": 5852 }, { "completion_length": 233.15625858306885, "epoch": 0.9814325830923342, "grad_norm": 0.17702506587319505, "kl": 0.0703277587890625, "learning_rate": 4.99765884290457e-07, "loss": 0.0001, "reward": 1.7517857775092125, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7651786021888256, "rewards/format_reward_func": 0.9866071455180645, "step": 5854 }, { "completion_length": 231.97322463989258, "epoch": 0.9817678863322017, "grad_norm": 0.25418976598809995, "kl": 0.06585693359375, "learning_rate": 4.997653649803968e-07, "loss": 0.0001, "reward": 1.74642863124609, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.746428593993187, "rewards/format_reward_func": 1.0, "step": 5856 }, { "completion_length": 234.37054634094238, "epoch": 0.9821031895720692, "grad_norm": 0.18552869148200038, "kl": 0.09344482421875, "learning_rate": 4.997648450952842e-07, "loss": 0.0001, "reward": 1.792857214808464, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571589291096, "rewards/format_reward_func": 1.0, "step": 5858 }, { "completion_length": 245.8839406967163, "epoch": 0.9824384928119368, "grad_norm": 0.2988328236976148, "kl": 0.0713958740234375, "learning_rate": 4.997643246351204e-07, "loss": 0.0001, "reward": 1.7303571999073029, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7348214760422707, "rewards/format_reward_func": 0.9955357164144516, "step": 5860 }, { "completion_length": 236.5625123977661, "epoch": 0.9827737960518044, "grad_norm": 0.09087717881506024, "kl": 0.06475830078125, "learning_rate": 4.997638035999065e-07, "loss": 0.0001, "reward": 1.7285715341567993, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7285714671015739, "rewards/format_reward_func": 1.0, "step": 5862 }, { "completion_length": 240.37501335144043, "epoch": 0.9831090992916719, "grad_norm": 0.2330972837811373, "kl": 0.073760986328125, "learning_rate": 4.997632819896437e-07, "loss": 0.0001, "reward": 1.7928572446107864, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7928571626543999, "rewards/format_reward_func": 1.0, "step": 5864 }, { "completion_length": 242.43750953674316, "epoch": 0.9834444025315394, "grad_norm": 0.3031159260506758, "kl": 0.0743408203125, "learning_rate": 4.997627598043331e-07, "loss": 0.0001, "reward": 1.7392858043313026, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857447266579, "rewards/format_reward_func": 1.0, "step": 5866 }, { "completion_length": 228.8616180419922, "epoch": 0.983779705771407, "grad_norm": 0.21641965125787252, "kl": 0.082244873046875, "learning_rate": 4.997622370439762e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 5868 }, { "completion_length": 243.7634048461914, "epoch": 0.9841150090112746, "grad_norm": 0.17336499937615538, "kl": 0.068206787109375, "learning_rate": 4.99761713708574e-07, "loss": 0.0001, "reward": 1.742857202887535, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7428571656346321, "rewards/format_reward_func": 1.0, "step": 5870 }, { "completion_length": 241.12947463989258, "epoch": 0.9844503122511421, "grad_norm": 0.2895336684874953, "kl": 0.0819244384765625, "learning_rate": 4.997611897981277e-07, "loss": 0.0001, "reward": 1.7821428999304771, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7910714596509933, "rewards/format_reward_func": 0.9910714328289032, "step": 5872 }, { "completion_length": 241.39733123779297, "epoch": 0.9847856154910097, "grad_norm": 0.2139574550323431, "kl": 0.0740966796875, "learning_rate": 4.997606653126385e-07, "loss": 0.0001, "reward": 1.7928572222590446, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 5874 }, { "completion_length": 245.07143783569336, "epoch": 0.9851209187308773, "grad_norm": 0.24078819208661117, "kl": 0.0866851806640625, "learning_rate": 4.997601402521077e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 5876 }, { "completion_length": 227.65179634094238, "epoch": 0.9854562219707448, "grad_norm": 0.3018082502223864, "kl": 0.0716400146484375, "learning_rate": 4.997596146165363e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571827709675, "rewards/format_reward_func": 1.0, "step": 5878 }, { "completion_length": 232.21429824829102, "epoch": 0.9857915252106123, "grad_norm": 0.20631451236813453, "kl": 0.0742034912109375, "learning_rate": 4.997590884059259e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.803571455180645, "rewards/format_reward_func": 1.0, "step": 5880 }, { "completion_length": 245.1116180419922, "epoch": 0.9861268284504799, "grad_norm": 0.14566184117270678, "kl": 0.078857421875, "learning_rate": 4.997585616202773e-07, "loss": 0.0001, "reward": 1.714285783469677, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7142857499420643, "rewards/format_reward_func": 1.0, "step": 5882 }, { "completion_length": 245.8571538925171, "epoch": 0.9864621316903475, "grad_norm": 0.2400514185909936, "kl": 0.0743560791015625, "learning_rate": 4.99758034259592e-07, "loss": 0.0001, "reward": 1.7589286416769028, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7633928805589676, "rewards/format_reward_func": 0.9955357164144516, "step": 5884 }, { "completion_length": 241.3348331451416, "epoch": 0.986797434930215, "grad_norm": 0.10029168277200791, "kl": 0.0720062255859375, "learning_rate": 4.997575063238711e-07, "loss": 0.0001, "reward": 1.7214286252856255, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7214286047965288, "rewards/format_reward_func": 1.0, "step": 5886 }, { "completion_length": 239.9732255935669, "epoch": 0.9871327381700825, "grad_norm": 0.12171211237035984, "kl": 0.077362060546875, "learning_rate": 4.997569778131157e-07, "loss": 0.0001, "reward": 1.7839286178350449, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 5888 }, { "completion_length": 251.11608600616455, "epoch": 0.9874680414099501, "grad_norm": 0.24615144405117734, "kl": 0.1072845458984375, "learning_rate": 4.997564487273272e-07, "loss": 0.0001, "reward": 1.7125000730156898, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7258929051458836, "rewards/format_reward_func": 0.9866071455180645, "step": 5890 }, { "completion_length": 248.27233409881592, "epoch": 0.9878033446498177, "grad_norm": 0.10878122748316436, "kl": 0.0915069580078125, "learning_rate": 4.997559190665067e-07, "loss": 0.0001, "reward": 1.7714286595582962, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285887777805, "rewards/format_reward_func": 1.0, "step": 5892 }, { "completion_length": 260.90626335144043, "epoch": 0.9881386478896852, "grad_norm": 0.24469718671854498, "kl": 0.10359954833984375, "learning_rate": 4.997553888306556e-07, "loss": 0.0001, "reward": 1.733928643167019, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7383928894996643, "rewards/format_reward_func": 0.9955357164144516, "step": 5894 }, { "completion_length": 263.89287090301514, "epoch": 0.9884739511295528, "grad_norm": 0.19164770454787758, "kl": 0.0981903076171875, "learning_rate": 4.997548580197749e-07, "loss": 0.0001, "reward": 1.7553572058677673, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7598214522004128, "rewards/format_reward_func": 0.9955357164144516, "step": 5896 }, { "completion_length": 262.14733028411865, "epoch": 0.9888092543694204, "grad_norm": 0.11700179725728932, "kl": 0.09405517578125, "learning_rate": 4.99754326633866e-07, "loss": 0.0001, "reward": 1.8035714700818062, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.8125000223517418, "rewards/format_reward_func": 0.9910714328289032, "step": 5898 }, { "completion_length": 249.94197463989258, "epoch": 0.9891445576092879, "grad_norm": 0.1817638086218183, "kl": 0.14984130859375, "learning_rate": 4.997537946729298e-07, "loss": 0.0001, "reward": 1.7732143625617027, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7776785977184772, "rewards/format_reward_func": 0.9955357164144516, "step": 5900 }, { "completion_length": 266.526798248291, "epoch": 0.9894798608491554, "grad_norm": 0.3553538368445335, "kl": 0.1319122314453125, "learning_rate": 4.99753262136968e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.767857164144516, "rewards/format_reward_func": 1.0, "step": 5902 }, { "completion_length": 255.9375123977661, "epoch": 0.989815164089023, "grad_norm": 0.26831031710499303, "kl": 0.145782470703125, "learning_rate": 4.997527290259816e-07, "loss": 0.0001, "reward": 1.8071429058909416, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8160714507102966, "rewards/format_reward_func": 0.9910714328289032, "step": 5904 }, { "completion_length": 268.3571548461914, "epoch": 0.9901504673288906, "grad_norm": 0.2864201561733845, "kl": 0.1101837158203125, "learning_rate": 4.997521953399717e-07, "loss": 0.0001, "reward": 1.766071505844593, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7794643118977547, "rewards/format_reward_func": 0.9866071492433548, "step": 5906 }, { "completion_length": 266.1250114440918, "epoch": 0.9904857705687581, "grad_norm": 0.3488360998148485, "kl": 0.1395721435546875, "learning_rate": 4.997516610789397e-07, "loss": 0.0001, "reward": 1.773214340209961, "reward_std": 0.06818529684096575, "rewards/equation_reward_func": 0.7866071723401546, "rewards/format_reward_func": 0.9866071492433548, "step": 5908 }, { "completion_length": 260.60715103149414, "epoch": 0.9908210738086257, "grad_norm": 0.13988328840005468, "kl": 0.1138763427734375, "learning_rate": 4.997511262428867e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 5910 }, { "completion_length": 273.7991189956665, "epoch": 0.9911563770484932, "grad_norm": 0.18774606542998765, "kl": 0.15557861328125, "learning_rate": 4.997505908318142e-07, "loss": 0.0002, "reward": 1.7803571969270706, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7848214656114578, "rewards/format_reward_func": 0.9955357164144516, "step": 5912 }, { "completion_length": 261.88840770721436, "epoch": 0.9914916802883608, "grad_norm": 0.20507424623893872, "kl": 0.2316436767578125, "learning_rate": 4.997500548457231e-07, "loss": 0.0002, "reward": 1.7325893491506577, "reward_std": 0.044825518038123846, "rewards/equation_reward_func": 0.7383928745985031, "rewards/format_reward_func": 0.9941964335739613, "step": 5914 }, { "completion_length": 256.0669755935669, "epoch": 0.9918269835282283, "grad_norm": 0.3678386165226889, "kl": 0.1391754150390625, "learning_rate": 4.997495182846147e-07, "loss": 0.0001, "reward": 1.7348214909434319, "reward_std": 0.08207489270716906, "rewards/equation_reward_func": 0.741071455180645, "rewards/format_reward_func": 0.9937500059604645, "step": 5916 }, { "completion_length": 256.1160840988159, "epoch": 0.9921622867680959, "grad_norm": 0.1756867333928644, "kl": 0.1789093017578125, "learning_rate": 4.997489811484903e-07, "loss": 0.0002, "reward": 1.7803572043776512, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7848214581608772, "rewards/format_reward_func": 0.9955357164144516, "step": 5918 }, { "completion_length": 269.5178689956665, "epoch": 0.9924975900079634, "grad_norm": 0.11046519379219497, "kl": 0.1328277587890625, "learning_rate": 4.997484434373513e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.789285734295845, "rewards/format_reward_func": 0.9821428656578064, "step": 5920 }, { "completion_length": 269.83929538726807, "epoch": 0.992832893247831, "grad_norm": 0.2843856797083342, "kl": 0.4182891845703125, "learning_rate": 4.997479051511988e-07, "loss": 0.0004, "reward": 1.696428656578064, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7053571809083223, "rewards/format_reward_func": 0.9910714328289032, "step": 5922 }, { "completion_length": 266.794659614563, "epoch": 0.9931681964876986, "grad_norm": 0.25353859287113506, "kl": 0.2885894775390625, "learning_rate": 4.997473662900339e-07, "loss": 0.0003, "reward": 1.6517857983708382, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.6562500335276127, "rewards/format_reward_func": 0.9955357164144516, "step": 5924 }, { "completion_length": 260.80358505249023, "epoch": 0.9935034997275661, "grad_norm": 0.17313230378824523, "kl": 0.297637939453125, "learning_rate": 4.997468268538579e-07, "loss": 0.0003, "reward": 1.7571429312229156, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7660714574158192, "rewards/format_reward_func": 0.9910714328289032, "step": 5926 }, { "completion_length": 249.4107265472412, "epoch": 0.9938388029674337, "grad_norm": 0.1547458708021722, "kl": 0.3666229248046875, "learning_rate": 4.997462868426722e-07, "loss": 0.0004, "reward": 1.826785758137703, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.8312500193715096, "rewards/format_reward_func": 0.9955357164144516, "step": 5928 }, { "completion_length": 255.52233219146729, "epoch": 0.9941741062073012, "grad_norm": 0.39843122540179543, "kl": 0.4351806640625, "learning_rate": 4.997457462564781e-07, "loss": 0.0004, "reward": 1.7040179446339607, "reward_std": 0.06124049751088023, "rewards/equation_reward_func": 0.7098214589059353, "rewards/format_reward_func": 0.9941964335739613, "step": 5930 }, { "completion_length": 259.84376430511475, "epoch": 0.9945094094471688, "grad_norm": 0.3779794869916766, "kl": 0.6574249267578125, "learning_rate": 4.997452050952765e-07, "loss": 0.0007, "reward": 1.76071435213089, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7696428894996643, "rewards/format_reward_func": 0.9910714328289032, "step": 5932 }, { "completion_length": 251.8794765472412, "epoch": 0.9948447126870363, "grad_norm": 0.8563269925502356, "kl": 1.45794677734375, "learning_rate": 4.997446633590689e-07, "loss": 0.0015, "reward": 1.7553571909666061, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214708268642, "rewards/format_reward_func": 0.9955357164144516, "step": 5934 }, { "completion_length": 251.55805110931396, "epoch": 0.9951800159269039, "grad_norm": 0.2936187191356089, "kl": 1.031280517578125, "learning_rate": 4.997441210478564e-07, "loss": 0.001, "reward": 1.7142857611179352, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7232143320143223, "rewards/format_reward_func": 0.9910714328289032, "step": 5936 }, { "completion_length": 260.50447368621826, "epoch": 0.9955153191667715, "grad_norm": 0.28716438779501635, "kl": 0.818145751953125, "learning_rate": 4.997435781616405e-07, "loss": 0.0008, "reward": 1.7267857939004898, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7312500290572643, "rewards/format_reward_func": 0.9955357164144516, "step": 5938 }, { "completion_length": 256.3616180419922, "epoch": 0.995850622406639, "grad_norm": 0.23298759856279558, "kl": 1.7165985107421875, "learning_rate": 4.997430347004221e-07, "loss": 0.0017, "reward": 1.7642857655882835, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7732143178582191, "rewards/format_reward_func": 0.9910714328289032, "step": 5940 }, { "completion_length": 256.57143783569336, "epoch": 0.9961859256465065, "grad_norm": 0.2095096831056827, "kl": 0.1193695068359375, "learning_rate": 4.997424906642028e-07, "loss": 0.0001, "reward": 1.8200893253087997, "reward_std": 0.06250318605452776, "rewards/equation_reward_func": 0.8214285895228386, "rewards/format_reward_func": 0.9986607171595097, "step": 5942 }, { "completion_length": 253.92858219146729, "epoch": 0.9965212288863741, "grad_norm": 0.22191173301560937, "kl": 0.5313568115234375, "learning_rate": 4.997419460529836e-07, "loss": 0.0005, "reward": 1.7160715237259865, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7205357477068901, "rewards/format_reward_func": 0.9955357164144516, "step": 5944 }, { "completion_length": 252.2946548461914, "epoch": 0.9968565321262417, "grad_norm": 0.2167018970654993, "kl": 1.0382080078125, "learning_rate": 4.997414008667658e-07, "loss": 0.001, "reward": 1.7982143461704254, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8026785887777805, "rewards/format_reward_func": 0.9955357164144516, "step": 5946 }, { "completion_length": 262.90626525878906, "epoch": 0.9971918353661092, "grad_norm": 0.25514814703256206, "kl": 0.0827789306640625, "learning_rate": 4.997408551055508e-07, "loss": 0.0001, "reward": 1.7839286476373672, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7875000238418579, "rewards/format_reward_func": 0.9964285716414452, "step": 5948 }, { "completion_length": 261.5714406967163, "epoch": 0.9975271386059767, "grad_norm": 0.30817589784765403, "kl": 0.498565673828125, "learning_rate": 4.997403087693398e-07, "loss": 0.0005, "reward": 1.7821429297327995, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 5950 }, { "completion_length": 264.19197845458984, "epoch": 0.9978624418458444, "grad_norm": 0.21263202583302027, "kl": 0.1584625244140625, "learning_rate": 4.997397618581339e-07, "loss": 0.0002, "reward": 1.7767857611179352, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7812500204890966, "rewards/format_reward_func": 0.9955357164144516, "step": 5952 }, { "completion_length": 259.008939743042, "epoch": 0.9981977450857119, "grad_norm": 0.2297520903477404, "kl": 0.1841583251953125, "learning_rate": 4.997392143719344e-07, "loss": 0.0002, "reward": 1.7107143625617027, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7107143327593803, "rewards/format_reward_func": 1.0, "step": 5954 }, { "completion_length": 255.6785831451416, "epoch": 0.9985330483255794, "grad_norm": 0.23026073378599074, "kl": 0.18896484375, "learning_rate": 4.997386663107428e-07, "loss": 0.0002, "reward": 1.7428572177886963, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7428571823984385, "rewards/format_reward_func": 1.0, "step": 5956 }, { "completion_length": 266.88394260406494, "epoch": 0.998868351565447, "grad_norm": 0.2934772523750861, "kl": 0.1302490234375, "learning_rate": 4.997381176745602e-07, "loss": 0.0001, "reward": 1.7839286178350449, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 5958 }, { "completion_length": 262.446439743042, "epoch": 0.9992036548053146, "grad_norm": 0.26250342499542473, "kl": 0.257049560546875, "learning_rate": 4.997375684633878e-07, "loss": 0.0003, "reward": 1.782142922282219, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7821428682655096, "rewards/format_reward_func": 1.0, "step": 5960 }, { "completion_length": 253.3616180419922, "epoch": 0.9995389580451821, "grad_norm": 0.35762220838323594, "kl": 0.0915374755859375, "learning_rate": 4.997370186772269e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 0.9821428656578064, "step": 5962 }, { "completion_length": 263.8259029388428, "epoch": 0.9998742612850496, "grad_norm": 1.4570787798699594, "kl": 0.1656646728515625, "learning_rate": 4.997364683160787e-07, "loss": 0.0002, "reward": 1.7321429327130318, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428880095482, "rewards/format_reward_func": 1.0, "step": 5964 }, { "completion_length": 265.9805304787376, "epoch": 1.0003353032398676, "grad_norm": 0.9384003688196095, "kl": 0.10651189630681818, "learning_rate": 4.997359173799447e-07, "loss": 0.0001, "reward": 1.6987013925205579, "reward_std": 0.08815876665440472, "rewards/equation_reward_func": 0.7116883397102356, "rewards/format_reward_func": 0.9870129932056774, "step": 5966 }, { "completion_length": 259.83037090301514, "epoch": 1.000670606479735, "grad_norm": 1.2250441001337247, "kl": 0.1012420654296875, "learning_rate": 4.99735365868826e-07, "loss": 0.0001, "reward": 1.769642949104309, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7741071656346321, "rewards/format_reward_func": 0.9955357164144516, "step": 5968 }, { "completion_length": 265.8125104904175, "epoch": 1.0010059097196027, "grad_norm": 0.21174169445063962, "kl": 0.10089111328125, "learning_rate": 4.997348137827238e-07, "loss": 0.0001, "reward": 1.714285783469677, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7142857573926449, "rewards/format_reward_func": 1.0, "step": 5970 }, { "completion_length": 264.96876430511475, "epoch": 1.0013412129594703, "grad_norm": 0.309308577606721, "kl": 0.06874847412109375, "learning_rate": 4.997342611216395e-07, "loss": 0.0001, "reward": 1.7767857760190964, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7812500335276127, "rewards/format_reward_func": 0.9955357164144516, "step": 5972 }, { "completion_length": 258.415189743042, "epoch": 1.0016765161993377, "grad_norm": 0.9905876695648793, "kl": 0.0795135498046875, "learning_rate": 4.997337078855744e-07, "loss": 0.0001, "reward": 1.792857214808464, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 5974 }, { "completion_length": 266.07144355773926, "epoch": 1.0020118194392054, "grad_norm": 0.2839674817964923, "kl": 0.1131744384765625, "learning_rate": 4.997331540745296e-07, "loss": 0.0001, "reward": 1.7303571924567223, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7437500394880772, "rewards/format_reward_func": 0.9866071492433548, "step": 5976 }, { "completion_length": 276.91519355773926, "epoch": 1.0023471226790728, "grad_norm": 0.24495313830148246, "kl": 0.123016357421875, "learning_rate": 4.997325996885066e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 5978 }, { "completion_length": 273.0178699493408, "epoch": 1.0026824259189404, "grad_norm": 1.3292883047967456, "kl": 0.136383056640625, "learning_rate": 4.997320447275065e-07, "loss": 0.0001, "reward": 1.71830365806818, "reward_std": 0.05492704384960234, "rewards/equation_reward_func": 0.7241071686148643, "rewards/format_reward_func": 0.9941964335739613, "step": 5980 }, { "completion_length": 271.33929538726807, "epoch": 1.003017729158808, "grad_norm": 1.1772382051916397, "kl": 0.2193603515625, "learning_rate": 4.997314891915307e-07, "loss": 0.0002, "reward": 1.7607143595814705, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7785714454948902, "rewards/format_reward_func": 0.9821428656578064, "step": 5982 }, { "completion_length": 260.65179347991943, "epoch": 1.0033530323986755, "grad_norm": 4.171729150636543, "kl": 0.216400146484375, "learning_rate": 4.997309330805803e-07, "loss": 0.0002, "reward": 1.7625000551342964, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643200933933, "rewards/format_reward_func": 0.9955357164144516, "step": 5984 }, { "completion_length": 264.4196529388428, "epoch": 1.003688335638543, "grad_norm": 0.38616155970757654, "kl": 0.297698974609375, "learning_rate": 4.997303763946568e-07, "loss": 0.0003, "reward": 1.7160715013742447, "reward_std": 0.06818529684096575, "rewards/equation_reward_func": 0.7294643148779869, "rewards/format_reward_func": 0.9866071492433548, "step": 5986 }, { "completion_length": 261.3169755935669, "epoch": 1.0040236388784107, "grad_norm": 0.21821546768331762, "kl": 0.239715576171875, "learning_rate": 4.997298191337613e-07, "loss": 0.0002, "reward": 1.7611608058214188, "reward_std": 0.05492704384960234, "rewards/equation_reward_func": 0.7669643238186836, "rewards/format_reward_func": 0.9941964335739613, "step": 5988 }, { "completion_length": 272.94197845458984, "epoch": 1.0043589421182781, "grad_norm": 0.8574313762374961, "kl": 0.2301483154296875, "learning_rate": 4.997292612978954e-07, "loss": 0.0002, "reward": 1.7214286103844643, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7303571626543999, "rewards/format_reward_func": 0.9910714328289032, "step": 5990 }, { "completion_length": 282.87947273254395, "epoch": 1.0046942453581458, "grad_norm": 0.41062082585236326, "kl": 0.18109130859375, "learning_rate": 4.997287028870599e-07, "loss": 0.0002, "reward": 1.739285796880722, "reward_std": 0.09596449136734009, "rewards/equation_reward_func": 0.7571428753435612, "rewards/format_reward_func": 0.9821428656578064, "step": 5992 }, { "completion_length": 264.7321548461914, "epoch": 1.0050295485980134, "grad_norm": 0.22685254042735117, "kl": 0.5179901123046875, "learning_rate": 4.997281439012564e-07, "loss": 0.0005, "reward": 1.8035714849829674, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.8125000335276127, "rewards/format_reward_func": 0.9910714328289032, "step": 5994 }, { "completion_length": 269.5223331451416, "epoch": 1.0053648518378808, "grad_norm": 0.22963070499995214, "kl": 0.550018310546875, "learning_rate": 4.997275843404861e-07, "loss": 0.0006, "reward": 1.7142857760190964, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7321428991854191, "rewards/format_reward_func": 0.9821428656578064, "step": 5996 }, { "completion_length": 265.7946557998657, "epoch": 1.0057001550777485, "grad_norm": 0.4467582667821596, "kl": 0.439666748046875, "learning_rate": 4.997270242047504e-07, "loss": 0.0004, "reward": 1.7343750521540642, "reward_std": 0.0675539500080049, "rewards/equation_reward_func": 0.7535714637488127, "rewards/format_reward_func": 0.980803582817316, "step": 5998 }, { "completion_length": 262.388409614563, "epoch": 1.006035458317616, "grad_norm": 0.21615161312001333, "kl": 0.2450103759765625, "learning_rate": 4.997264634940503e-07, "loss": 0.0002, "reward": 1.773214340209961, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.777678620070219, "rewards/format_reward_func": 0.9955357164144516, "step": 6000 }, { "completion_length": 250.29911994934082, "epoch": 1.0063707615574835, "grad_norm": 0.34505426050760507, "kl": 1.3549957275390625, "learning_rate": 4.997259022083875e-07, "loss": 0.0014, "reward": 1.76071435213089, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 6002 }, { "completion_length": 256.5000104904175, "epoch": 1.0067060647973511, "grad_norm": 0.23308189879398827, "kl": 0.4465484619140625, "learning_rate": 4.99725340347763e-07, "loss": 0.0004, "reward": 1.7714286372065544, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 6004 }, { "completion_length": 250.5669765472412, "epoch": 1.0070413680372186, "grad_norm": 0.23353505975057562, "kl": 0.1022491455078125, "learning_rate": 4.997247779121782e-07, "loss": 0.0001, "reward": 1.8357143178582191, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.8446428813040257, "rewards/format_reward_func": 0.9910714328289032, "step": 6006 }, { "completion_length": 260.1562614440918, "epoch": 1.0073766712770862, "grad_norm": 0.34613575609921376, "kl": 0.1776885986328125, "learning_rate": 4.997242149016343e-07, "loss": 0.0002, "reward": 1.7477679252624512, "reward_std": 0.08396892924793065, "rewards/equation_reward_func": 0.7553571686148643, "rewards/format_reward_func": 0.9924107231199741, "step": 6008 }, { "completion_length": 260.95983505249023, "epoch": 1.0077119745169538, "grad_norm": 0.209932605339094, "kl": 0.2499847412109375, "learning_rate": 4.997236513161327e-07, "loss": 0.0003, "reward": 1.741071492433548, "reward_std": 0.08333758544176817, "rewards/equation_reward_func": 0.7633928693830967, "rewards/format_reward_func": 0.977678582072258, "step": 6010 }, { "completion_length": 268.901798248291, "epoch": 1.0080472777568212, "grad_norm": 0.23702142547618296, "kl": 0.272735595703125, "learning_rate": 4.997230871556747e-07, "loss": 0.0003, "reward": 1.751785770058632, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7562500387430191, "rewards/format_reward_func": 0.9955357164144516, "step": 6012 }, { "completion_length": 258.6875123977661, "epoch": 1.0083825809966889, "grad_norm": 0.2647906060959816, "kl": 0.30157470703125, "learning_rate": 4.997225224202616e-07, "loss": 0.0003, "reward": 1.7517857924103737, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500294297934, "rewards/format_reward_func": 0.9955357164144516, "step": 6014 }, { "completion_length": 260.34822368621826, "epoch": 1.0087178842365565, "grad_norm": 0.3469783106868168, "kl": 0.2371063232421875, "learning_rate": 4.997219571098945e-07, "loss": 0.0002, "reward": 1.7589286267757416, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7633928842842579, "rewards/format_reward_func": 0.9955357164144516, "step": 6016 }, { "completion_length": 251.0714406967163, "epoch": 1.009053187476424, "grad_norm": 0.2878495464486915, "kl": 0.1557464599609375, "learning_rate": 4.997213912245751e-07, "loss": 0.0002, "reward": 1.7678572461009026, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7678571753203869, "rewards/format_reward_func": 1.0, "step": 6018 }, { "completion_length": 253.09376049041748, "epoch": 1.0093884907162916, "grad_norm": 0.31678984513529884, "kl": 0.5877227783203125, "learning_rate": 4.997208247643042e-07, "loss": 0.0006, "reward": 1.7678571864962578, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571790456772, "rewards/format_reward_func": 1.0, "step": 6020 }, { "completion_length": 251.00001335144043, "epoch": 1.0097237939561592, "grad_norm": 0.2262559946347966, "kl": 0.2086334228515625, "learning_rate": 4.997202577290836e-07, "loss": 0.0002, "reward": 1.7607143595814705, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7696428932249546, "rewards/format_reward_func": 0.9910714328289032, "step": 6022 }, { "completion_length": 259.9598340988159, "epoch": 1.0100590971960266, "grad_norm": 0.3477392927745284, "kl": 0.3122711181640625, "learning_rate": 4.997196901189142e-07, "loss": 0.0003, "reward": 1.7696429342031479, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7741071730852127, "rewards/format_reward_func": 0.9955357164144516, "step": 6024 }, { "completion_length": 251.7098331451416, "epoch": 1.0103944004358942, "grad_norm": 0.20619517269581278, "kl": 0.207794189453125, "learning_rate": 4.997191219337976e-07, "loss": 0.0002, "reward": 1.7339286506175995, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7383928876370192, "rewards/format_reward_func": 0.9955357164144516, "step": 6026 }, { "completion_length": 267.39733600616455, "epoch": 1.0107297036757619, "grad_norm": 0.28115504056281226, "kl": 0.31494140625, "learning_rate": 4.99718553173735e-07, "loss": 0.0003, "reward": 1.7214286401867867, "reward_std": 0.0909137288108468, "rewards/equation_reward_func": 0.739285746589303, "rewards/format_reward_func": 0.9821428656578064, "step": 6028 }, { "completion_length": 250.46876049041748, "epoch": 1.0110650069156293, "grad_norm": 0.08975104121727806, "kl": 0.3542633056640625, "learning_rate": 4.997179838387276e-07, "loss": 0.0004, "reward": 1.7642857730388641, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 6030 }, { "completion_length": 249.0714406967163, "epoch": 1.011400310155497, "grad_norm": 0.2578463046422513, "kl": 0.090484619140625, "learning_rate": 4.997174139287768e-07, "loss": 0.0001, "reward": 1.7964286357164383, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.796428594738245, "rewards/format_reward_func": 1.0, "step": 6032 }, { "completion_length": 248.33036994934082, "epoch": 1.0117356133953643, "grad_norm": 0.30858155230260276, "kl": 0.078338623046875, "learning_rate": 4.997168434438841e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.0656599160283804, "rewards/equation_reward_func": 0.7410714663565159, "rewards/format_reward_func": 0.9910714328289032, "step": 6034 }, { "completion_length": 252.12054538726807, "epoch": 1.012070916635232, "grad_norm": 0.26137515161726865, "kl": 0.0858306884765625, "learning_rate": 4.997162723840505e-07, "loss": 0.0001, "reward": 1.7910714596509933, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7955357432365417, "rewards/format_reward_func": 0.9955357164144516, "step": 6036 }, { "completion_length": 246.0669765472412, "epoch": 1.0124062198750996, "grad_norm": 0.30419280294145973, "kl": 0.07515716552734375, "learning_rate": 4.997157007492775e-07, "loss": 0.0001, "reward": 1.7660714909434319, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7705357372760773, "rewards/format_reward_func": 0.9955357164144516, "step": 6038 }, { "completion_length": 250.9241189956665, "epoch": 1.012741523114967, "grad_norm": 0.2789904085399853, "kl": 0.1070098876953125, "learning_rate": 4.997151285395663e-07, "loss": 0.0001, "reward": 1.739285796880722, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857521772385, "rewards/format_reward_func": 1.0, "step": 6040 }, { "completion_length": 257.3526906967163, "epoch": 1.0130768263548346, "grad_norm": 0.29540488870553583, "kl": 0.211273193359375, "learning_rate": 4.997145557549184e-07, "loss": 0.0002, "reward": 1.6821429580450058, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.6821429021656513, "rewards/format_reward_func": 1.0, "step": 6042 }, { "completion_length": 263.98661613464355, "epoch": 1.0134121295947023, "grad_norm": 0.3503033270367516, "kl": 0.429779052734375, "learning_rate": 4.99713982395335e-07, "loss": 0.0004, "reward": 1.7642857506871223, "reward_std": 0.04040610231459141, "rewards/equation_reward_func": 0.77321432903409, "rewards/format_reward_func": 0.9910714328289032, "step": 6044 }, { "completion_length": 251.9509048461914, "epoch": 1.0137474328345697, "grad_norm": 0.2594043398085691, "kl": 0.1715240478515625, "learning_rate": 4.997134084608174e-07, "loss": 0.0002, "reward": 1.85714291036129, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8571428693830967, "rewards/format_reward_func": 1.0, "step": 6046 }, { "completion_length": 255.3482265472412, "epoch": 1.0140827360744373, "grad_norm": 0.3322364759657405, "kl": 0.13507080078125, "learning_rate": 4.997128339513669e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 6048 }, { "completion_length": 253.30805110931396, "epoch": 1.014418039314305, "grad_norm": 0.5803981729663658, "kl": 0.480499267578125, "learning_rate": 4.997122588669849e-07, "loss": 0.0005, "reward": 1.7535715028643608, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714693367481, "rewards/format_reward_func": 1.0, "step": 6050 }, { "completion_length": 252.18751049041748, "epoch": 1.0147533425541724, "grad_norm": 0.40183967799838594, "kl": 0.190338134765625, "learning_rate": 4.997116832076727e-07, "loss": 0.0002, "reward": 1.7571429535746574, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 6052 }, { "completion_length": 246.7321548461914, "epoch": 1.01508864579404, "grad_norm": 0.26403663378360054, "kl": 0.138153076171875, "learning_rate": 4.997111069734316e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.8053571693599224, "rewards/format_reward_func": 0.9910714328289032, "step": 6054 }, { "completion_length": 250.02679920196533, "epoch": 1.0154239490339076, "grad_norm": 0.18947696729731334, "kl": 0.109039306640625, "learning_rate": 4.997105301642629e-07, "loss": 0.0001, "reward": 1.7660714983940125, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7705357372760773, "rewards/format_reward_func": 0.9955357164144516, "step": 6056 }, { "completion_length": 257.2767972946167, "epoch": 1.015759252273775, "grad_norm": 0.43595958723248723, "kl": 0.13599395751953125, "learning_rate": 4.99709952780168e-07, "loss": 0.0001, "reward": 1.7392857745289803, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857447266579, "rewards/format_reward_func": 1.0, "step": 6058 }, { "completion_length": 258.1741189956665, "epoch": 1.0160945555136427, "grad_norm": 0.261448358162978, "kl": 0.150146484375, "learning_rate": 4.997093748211482e-07, "loss": 0.0002, "reward": 1.73214291036129, "reward_std": 0.04545686487108469, "rewards/equation_reward_func": 0.7410714700818062, "rewards/format_reward_func": 0.9910714328289032, "step": 6060 }, { "completion_length": 256.78126430511475, "epoch": 1.01642985875351, "grad_norm": 0.18200663054584304, "kl": 0.103240966796875, "learning_rate": 4.997087962872049e-07, "loss": 0.0001, "reward": 1.7857143357396126, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 6062 }, { "completion_length": 258.3571557998657, "epoch": 1.0167651619933777, "grad_norm": 0.1819383632703182, "kl": 0.17974853515625, "learning_rate": 4.997082171783393e-07, "loss": 0.0002, "reward": 1.7732143625617027, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.777678593993187, "rewards/format_reward_func": 0.9955357164144516, "step": 6064 }, { "completion_length": 259.5134057998657, "epoch": 1.0171004652332454, "grad_norm": 0.3211241713421729, "kl": 0.13037109375, "learning_rate": 4.997076374945529e-07, "loss": 0.0001, "reward": 1.7500000521540642, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 0.9821428656578064, "step": 6066 }, { "completion_length": 255.58036994934082, "epoch": 1.0174357684731128, "grad_norm": 0.2425993917665553, "kl": 0.2032470703125, "learning_rate": 4.997070572358468e-07, "loss": 0.0002, "reward": 1.814285770058632, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8142857439815998, "rewards/format_reward_func": 1.0, "step": 6068 }, { "completion_length": 264.77680110931396, "epoch": 1.0177710717129804, "grad_norm": 0.15083247763046945, "kl": 0.17120361328125, "learning_rate": 4.997064764022225e-07, "loss": 0.0002, "reward": 1.780357226729393, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7848214544355869, "rewards/format_reward_func": 0.9955357164144516, "step": 6070 }, { "completion_length": 259.620548248291, "epoch": 1.018106374952848, "grad_norm": 0.5226944898181175, "kl": 0.14019775390625, "learning_rate": 4.997058949936814e-07, "loss": 0.0001, "reward": 1.7482143342494965, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.7616071663796902, "rewards/format_reward_func": 0.9866071492433548, "step": 6072 }, { "completion_length": 267.4955520629883, "epoch": 1.0184416781927155, "grad_norm": 0.29505621677078775, "kl": 0.1959228515625, "learning_rate": 4.997053130102247e-07, "loss": 0.0002, "reward": 1.7071429267525673, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7160714529454708, "rewards/format_reward_func": 0.9910714328289032, "step": 6074 }, { "completion_length": 270.7053699493408, "epoch": 1.018776981432583, "grad_norm": 0.5437437458100716, "kl": 0.2197265625, "learning_rate": 4.997047304518538e-07, "loss": 0.0002, "reward": 1.7482143566012383, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7526785954833031, "rewards/format_reward_func": 0.9955357164144516, "step": 6076 }, { "completion_length": 268.59822368621826, "epoch": 1.0191122846724507, "grad_norm": 0.24741324747983043, "kl": 0.14093017578125, "learning_rate": 4.997041473185699e-07, "loss": 0.0001, "reward": 1.6857143640518188, "reward_std": 0.09091372694820166, "rewards/equation_reward_func": 0.6946428865194321, "rewards/format_reward_func": 0.9910714328289032, "step": 6078 }, { "completion_length": 279.3928680419922, "epoch": 1.0194475879123182, "grad_norm": 0.16270285483118307, "kl": 0.12310791015625, "learning_rate": 4.997035636103746e-07, "loss": 0.0001, "reward": 1.7808035910129547, "reward_std": 0.0776554774492979, "rewards/equation_reward_func": 0.800000037997961, "rewards/format_reward_func": 0.980803582817316, "step": 6080 }, { "completion_length": 270.41518783569336, "epoch": 1.0197828911521858, "grad_norm": 0.35956905028353797, "kl": 0.286376953125, "learning_rate": 4.997029793272691e-07, "loss": 0.0003, "reward": 1.703571505844593, "reward_std": 0.08586296625435352, "rewards/equation_reward_func": 0.7125000394880772, "rewards/format_reward_func": 0.9910714328289032, "step": 6082 }, { "completion_length": 270.30358505249023, "epoch": 1.0201181943920532, "grad_norm": 0.47292226180050445, "kl": 0.266021728515625, "learning_rate": 4.997023944692547e-07, "loss": 0.0003, "reward": 1.7482143491506577, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7616071775555611, "rewards/format_reward_func": 0.9866071492433548, "step": 6084 }, { "completion_length": 251.9107265472412, "epoch": 1.0204534976319208, "grad_norm": 0.2832647131892392, "kl": 0.1192626953125, "learning_rate": 4.997018090363329e-07, "loss": 0.0001, "reward": 1.7732143327593803, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7866071555763483, "rewards/format_reward_func": 0.9866071492433548, "step": 6086 }, { "completion_length": 264.56697845458984, "epoch": 1.0207888008717885, "grad_norm": 0.31822589986037786, "kl": 0.26104736328125, "learning_rate": 4.997012230285048e-07, "loss": 0.0003, "reward": 1.6875000894069672, "reward_std": 0.06818529684096575, "rewards/equation_reward_func": 0.7098214626312256, "rewards/format_reward_func": 0.977678582072258, "step": 6088 }, { "completion_length": 265.49108695983887, "epoch": 1.0211241041116559, "grad_norm": 0.45547720897693517, "kl": 0.40484619140625, "learning_rate": 4.997006364457721e-07, "loss": 0.0004, "reward": 1.7750000655651093, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 6090 }, { "completion_length": 268.34376430511475, "epoch": 1.0214594073515235, "grad_norm": 0.2786369185101486, "kl": 0.428558349609375, "learning_rate": 4.997000492881358e-07, "loss": 0.0004, "reward": 1.7053572237491608, "reward_std": 0.09848987311124802, "rewards/equation_reward_func": 0.7366071715950966, "rewards/format_reward_func": 0.9687500149011612, "step": 6092 }, { "completion_length": 277.3214406967163, "epoch": 1.0217947105913912, "grad_norm": 0.18411353534994465, "kl": 0.46099853515625, "learning_rate": 4.996994615555975e-07, "loss": 0.0005, "reward": 1.7071429193019867, "reward_std": 0.10101525392383337, "rewards/equation_reward_func": 0.7250000387430191, "rewards/format_reward_func": 0.9821428656578064, "step": 6094 }, { "completion_length": 267.19197845458984, "epoch": 1.0221300138312586, "grad_norm": 0.3466971216842296, "kl": 0.677581787109375, "learning_rate": 4.996988732481584e-07, "loss": 0.0007, "reward": 1.694642923772335, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.716964315623045, "rewards/format_reward_func": 0.977678582072258, "step": 6096 }, { "completion_length": 267.0223379135132, "epoch": 1.0224653170711262, "grad_norm": 0.3621796109181974, "kl": 0.21337890625, "learning_rate": 4.9969828436582e-07, "loss": 0.0002, "reward": 1.7410714849829674, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7544643245637417, "rewards/format_reward_func": 0.9866071492433548, "step": 6098 }, { "completion_length": 273.089298248291, "epoch": 1.0228006203109938, "grad_norm": 0.7042681314100642, "kl": 0.7377166748046875, "learning_rate": 4.996976949085834e-07, "loss": 0.0007, "reward": 1.6803572326898575, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.6937500275671482, "rewards/format_reward_func": 0.9866071492433548, "step": 6100 }, { "completion_length": 267.9687614440918, "epoch": 1.0231359235508612, "grad_norm": 0.2904338249092064, "kl": 0.147735595703125, "learning_rate": 4.996971048764502e-07, "loss": 0.0001, "reward": 1.7142857909202576, "reward_std": 0.1060660183429718, "rewards/equation_reward_func": 0.7410714644938707, "rewards/format_reward_func": 0.9732142947614193, "step": 6102 }, { "completion_length": 254.40179443359375, "epoch": 1.0234712267907289, "grad_norm": 0.1715708017841068, "kl": 0.1373291015625, "learning_rate": 4.996965142694217e-07, "loss": 0.0001, "reward": 1.8035714849829674, "reward_std": 0.025253813713788986, "rewards/equation_reward_func": 0.8125000223517418, "rewards/format_reward_func": 0.9910714328289032, "step": 6104 }, { "completion_length": 260.66518783569336, "epoch": 1.0238065300305965, "grad_norm": 0.5298721752119621, "kl": 0.3925323486328125, "learning_rate": 4.996959230874993e-07, "loss": 0.0004, "reward": 1.7607143446803093, "reward_std": 0.09596449043601751, "rewards/equation_reward_func": 0.769642885774374, "rewards/format_reward_func": 0.9910714328289032, "step": 6106 }, { "completion_length": 261.8437614440918, "epoch": 1.024141833270464, "grad_norm": 0.2597657463578196, "kl": 0.9878387451171875, "learning_rate": 4.996953313306842e-07, "loss": 0.001, "reward": 1.7339286357164383, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7473214529454708, "rewards/format_reward_func": 0.9866071492433548, "step": 6108 }, { "completion_length": 258.2544765472412, "epoch": 1.0244771365103316, "grad_norm": 0.34126144567118943, "kl": 0.373443603515625, "learning_rate": 4.99694738998978e-07, "loss": 0.0004, "reward": 1.7642857804894447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 6110 }, { "completion_length": 248.68304443359375, "epoch": 1.024812439750199, "grad_norm": 0.191507371496448, "kl": 0.4516754150390625, "learning_rate": 4.996941460923818e-07, "loss": 0.0005, "reward": 1.7660714760422707, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7794643137603998, "rewards/format_reward_func": 0.9866071492433548, "step": 6112 }, { "completion_length": 252.77233409881592, "epoch": 1.0251477429900666, "grad_norm": 0.38964610451237514, "kl": 0.4473114013671875, "learning_rate": 4.996935526108972e-07, "loss": 0.0004, "reward": 1.7482143267989159, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7526786085218191, "rewards/format_reward_func": 0.9955357164144516, "step": 6114 }, { "completion_length": 251.73215293884277, "epoch": 1.0254830462299342, "grad_norm": 0.2627876383548723, "kl": 0.3182525634765625, "learning_rate": 4.996929585545254e-07, "loss": 0.0003, "reward": 1.7696429193019867, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7741071693599224, "rewards/format_reward_func": 0.9955357164144516, "step": 6116 }, { "completion_length": 239.15625953674316, "epoch": 1.0258183494698017, "grad_norm": 0.11734356018251145, "kl": 0.720947265625, "learning_rate": 4.996923639232678e-07, "loss": 0.0007, "reward": 1.7642857730388641, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 6118 }, { "completion_length": 232.41518878936768, "epoch": 1.0261536527096693, "grad_norm": 0.3020438778673639, "kl": 0.3007354736328125, "learning_rate": 4.996917687171258e-07, "loss": 0.0003, "reward": 1.7839286476373672, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 6120 }, { "completion_length": 234.4821548461914, "epoch": 1.026488955949537, "grad_norm": 0.19764513890719873, "kl": 0.581573486328125, "learning_rate": 4.996911729361009e-07, "loss": 0.0006, "reward": 1.8357143551111221, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8357143141329288, "rewards/format_reward_func": 1.0, "step": 6122 }, { "completion_length": 230.95983219146729, "epoch": 1.0268242591894043, "grad_norm": 0.1907246795303604, "kl": 0.1136474609375, "learning_rate": 4.996905765801942e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 6124 }, { "completion_length": 233.60715198516846, "epoch": 1.027159562429272, "grad_norm": 0.22052651145584928, "kl": 0.1941375732421875, "learning_rate": 4.996899796494073e-07, "loss": 0.0002, "reward": 1.7928571999073029, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571775555611, "rewards/format_reward_func": 1.0, "step": 6126 }, { "completion_length": 233.07590198516846, "epoch": 1.0274948656691396, "grad_norm": 0.28840565132762075, "kl": 0.187591552734375, "learning_rate": 4.996893821437414e-07, "loss": 0.0002, "reward": 1.6821429654955864, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.6821428909897804, "rewards/format_reward_func": 1.0, "step": 6128 }, { "completion_length": 222.7991189956665, "epoch": 1.027830168909007, "grad_norm": 0.22193940299452425, "kl": 0.144378662109375, "learning_rate": 4.99688784063198e-07, "loss": 0.0001, "reward": 1.7821429297327995, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428999304771, "rewards/format_reward_func": 1.0, "step": 6130 }, { "completion_length": 230.36608219146729, "epoch": 1.0281654721488747, "grad_norm": 0.5075373073783743, "kl": 0.3329925537109375, "learning_rate": 4.996881854077785e-07, "loss": 0.0003, "reward": 1.7571429088711739, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.757142897695303, "rewards/format_reward_func": 1.0, "step": 6132 }, { "completion_length": 231.11608219146729, "epoch": 1.0285007753887423, "grad_norm": 0.20879696471969295, "kl": 0.253082275390625, "learning_rate": 4.996875861774842e-07, "loss": 0.0003, "reward": 1.7517857924103737, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500312924385, "rewards/format_reward_func": 0.9955357164144516, "step": 6134 }, { "completion_length": 230.20090579986572, "epoch": 1.0288360786286097, "grad_norm": 0.38774526912973895, "kl": 0.451629638671875, "learning_rate": 4.996869863723165e-07, "loss": 0.0005, "reward": 1.7267857789993286, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7401785980910063, "rewards/format_reward_func": 0.9866071492433548, "step": 6136 }, { "completion_length": 220.5357265472412, "epoch": 1.0291713818684773, "grad_norm": 0.27093543124287117, "kl": 0.1378173828125, "learning_rate": 4.996863859922767e-07, "loss": 0.0001, "reward": 1.7464286610484123, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.746428593993187, "rewards/format_reward_func": 1.0, "step": 6138 }, { "completion_length": 231.71876049041748, "epoch": 1.0295066851083448, "grad_norm": 0.6042941302054783, "kl": 0.576324462890625, "learning_rate": 4.996857850373663e-07, "loss": 0.0006, "reward": 1.7147322073578835, "reward_std": 0.04987628059461713, "rewards/equation_reward_func": 0.7205357514321804, "rewards/format_reward_func": 0.9941964335739613, "step": 6140 }, { "completion_length": 229.93751049041748, "epoch": 1.0298419883482124, "grad_norm": 0.40842733005443116, "kl": 0.823272705078125, "learning_rate": 4.996851835075867e-07, "loss": 0.0008, "reward": 1.6964286342263222, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7142857573926449, "rewards/format_reward_func": 0.9821428656578064, "step": 6142 }, { "completion_length": 221.98215198516846, "epoch": 1.03017729158808, "grad_norm": 0.24154587050830953, "kl": 0.40399169921875, "learning_rate": 4.996845814029392e-07, "loss": 0.0004, "reward": 1.7535714879631996, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 6144 }, { "completion_length": 218.56697750091553, "epoch": 1.0305125948279474, "grad_norm": 0.2808529909155721, "kl": 1.314208984375, "learning_rate": 4.996839787234252e-07, "loss": 0.0013, "reward": 1.7839286476373672, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 6146 }, { "completion_length": 231.55804824829102, "epoch": 1.030847898067815, "grad_norm": 0.20719697823983504, "kl": 0.54595947265625, "learning_rate": 4.996833754690461e-07, "loss": 0.0005, "reward": 1.7214286625385284, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7303571794182062, "rewards/format_reward_func": 0.9910714328289032, "step": 6148 }, { "completion_length": 228.35715293884277, "epoch": 1.0311832013076827, "grad_norm": 0.32793316359403357, "kl": 0.9630889892578125, "learning_rate": 4.996827716398033e-07, "loss": 0.001, "reward": 1.7482143566012383, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526786029338837, "rewards/format_reward_func": 0.9955357164144516, "step": 6150 }, { "completion_length": 223.99554538726807, "epoch": 1.0315185045475501, "grad_norm": 0.13363658578744522, "kl": 0.293060302734375, "learning_rate": 4.996821672356982e-07, "loss": 0.0003, "reward": 1.7303571999073029, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7348214760422707, "rewards/format_reward_func": 0.9955357164144516, "step": 6152 }, { "completion_length": 219.50000858306885, "epoch": 1.0318538077874178, "grad_norm": 0.233703696734875, "kl": 0.246307373046875, "learning_rate": 4.996815622567322e-07, "loss": 0.0002, "reward": 1.7785715013742447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 6154 }, { "completion_length": 222.09822368621826, "epoch": 1.0321891110272854, "grad_norm": 0.16851135813365517, "kl": 1.2260894775390625, "learning_rate": 4.996809567029067e-07, "loss": 0.0012, "reward": 1.766071505844593, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9955357164144516, "step": 6156 }, { "completion_length": 222.87947463989258, "epoch": 1.0325244142671528, "grad_norm": 0.3325081112548761, "kl": 0.226348876953125, "learning_rate": 4.996803505742229e-07, "loss": 0.0002, "reward": 1.7589286491274834, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7633929029107094, "rewards/format_reward_func": 0.9955357164144516, "step": 6158 }, { "completion_length": 215.20090293884277, "epoch": 1.0328597175070204, "grad_norm": 0.3898566757276845, "kl": 0.326934814453125, "learning_rate": 4.996797438706825e-07, "loss": 0.0003, "reward": 1.742857202887535, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.7517857439815998, "rewards/format_reward_func": 0.9910714328289032, "step": 6160 }, { "completion_length": 216.13840293884277, "epoch": 1.033195020746888, "grad_norm": 0.29160838237888526, "kl": 0.46624755859375, "learning_rate": 4.996791365922867e-07, "loss": 0.0005, "reward": 1.6821429431438446, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.6821428947150707, "rewards/format_reward_func": 1.0, "step": 6162 }, { "completion_length": 207.17411708831787, "epoch": 1.0335303239867555, "grad_norm": 0.29139948351310624, "kl": 0.110260009765625, "learning_rate": 4.99678528739037e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714715719223, "rewards/format_reward_func": 1.0, "step": 6164 }, { "completion_length": 207.4062614440918, "epoch": 1.0338656272266231, "grad_norm": 0.4263314352373076, "kl": 0.1416778564453125, "learning_rate": 4.996779203109347e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 6166 }, { "completion_length": 208.258939743042, "epoch": 1.0342009304664905, "grad_norm": 0.20562322052764345, "kl": 0.4632568359375, "learning_rate": 4.996773113079813e-07, "loss": 0.0005, "reward": 1.8000000566244125, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000156462193, "rewards/format_reward_func": 1.0, "step": 6168 }, { "completion_length": 214.71875953674316, "epoch": 1.0345362337063582, "grad_norm": 0.4857726664048818, "kl": 1.15155029296875, "learning_rate": 4.996767017301781e-07, "loss": 0.0012, "reward": 1.7892857566475868, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7982143051922321, "rewards/format_reward_func": 0.9910714328289032, "step": 6170 }, { "completion_length": 208.66965198516846, "epoch": 1.0348715369462258, "grad_norm": 0.8186370164924367, "kl": 0.657196044921875, "learning_rate": 4.996760915775266e-07, "loss": 0.0007, "reward": 1.7321429327130318, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.732142873108387, "rewards/format_reward_func": 1.0, "step": 6172 }, { "completion_length": 201.98661613464355, "epoch": 1.0352068401860932, "grad_norm": 0.09891019549043235, "kl": 0.2893218994140625, "learning_rate": 4.996754808500282e-07, "loss": 0.0003, "reward": 1.7357143387198448, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7357143331319094, "rewards/format_reward_func": 1.0, "step": 6174 }, { "completion_length": 206.6696538925171, "epoch": 1.0355421434259608, "grad_norm": 0.3504160565048967, "kl": 0.156005859375, "learning_rate": 4.996748695476843e-07, "loss": 0.0002, "reward": 1.7892857864499092, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857529222965, "rewards/format_reward_func": 1.0, "step": 6176 }, { "completion_length": 211.74554443359375, "epoch": 1.0358774466658285, "grad_norm": 0.280817911357908, "kl": 0.1046600341796875, "learning_rate": 4.996742576704961e-07, "loss": 0.0001, "reward": 1.7964286506175995, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 6178 }, { "completion_length": 216.66072273254395, "epoch": 1.036212749905696, "grad_norm": 0.15394694928327, "kl": 0.117523193359375, "learning_rate": 4.996736452184653e-07, "loss": 0.0001, "reward": 1.7178572118282318, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7178571820259094, "rewards/format_reward_func": 1.0, "step": 6180 }, { "completion_length": 215.63393783569336, "epoch": 1.0365480531455635, "grad_norm": 0.1798703112423341, "kl": 0.0845489501953125, "learning_rate": 4.996730321915932e-07, "loss": 0.0001, "reward": 1.735714353621006, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143145054579, "rewards/format_reward_func": 1.0, "step": 6182 }, { "completion_length": 211.61161518096924, "epoch": 1.0368833563854312, "grad_norm": 0.26618095980539785, "kl": 0.10028076171875, "learning_rate": 4.996724185898812e-07, "loss": 0.0001, "reward": 1.8160714879631996, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.820535734295845, "rewards/format_reward_func": 0.9955357164144516, "step": 6184 }, { "completion_length": 218.41072368621826, "epoch": 1.0372186596252986, "grad_norm": 0.23902131457280135, "kl": 0.090423583984375, "learning_rate": 4.996718044133306e-07, "loss": 0.0001, "reward": 1.8017857447266579, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.8062500357627869, "rewards/format_reward_func": 0.9955357164144516, "step": 6186 }, { "completion_length": 216.80357933044434, "epoch": 1.0375539628651662, "grad_norm": 0.6333183597495273, "kl": 0.0997314453125, "learning_rate": 4.996711896619432e-07, "loss": 0.0001, "reward": 1.7928571924567223, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 1.0, "step": 6188 }, { "completion_length": 218.7901906967163, "epoch": 1.0378892661050338, "grad_norm": 0.24219539600599282, "kl": 0.0910186767578125, "learning_rate": 4.9967057433572e-07, "loss": 0.0001, "reward": 1.7821429073810577, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 6190 }, { "completion_length": 225.9419765472412, "epoch": 1.0382245693449013, "grad_norm": 0.38545151734950256, "kl": 0.108154296875, "learning_rate": 4.996699584346625e-07, "loss": 0.0001, "reward": 1.762500062584877, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7669643238186836, "rewards/format_reward_func": 0.9955357164144516, "step": 6192 }, { "completion_length": 221.77679634094238, "epoch": 1.038559872584769, "grad_norm": 0.004252728264381877, "kl": 0.094970703125, "learning_rate": 4.996693419587723e-07, "loss": 0.0001, "reward": 1.7178572118282318, "reward_std": 0.005050762556493282, "rewards/equation_reward_func": 0.7178571671247482, "rewards/format_reward_func": 1.0, "step": 6194 }, { "completion_length": 222.7991180419922, "epoch": 1.0388951758246363, "grad_norm": 0.20910697584743013, "kl": 0.100341796875, "learning_rate": 4.996687249080507e-07, "loss": 0.0001, "reward": 1.8464286178350449, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8464286029338837, "rewards/format_reward_func": 1.0, "step": 6196 }, { "completion_length": 236.7991180419922, "epoch": 1.039230479064504, "grad_norm": 0.4548612314796721, "kl": 0.1113128662109375, "learning_rate": 4.99668107282499e-07, "loss": 0.0001, "reward": 1.7321429327130318, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428880095482, "rewards/format_reward_func": 1.0, "step": 6198 }, { "completion_length": 221.1875114440918, "epoch": 1.0395657823043716, "grad_norm": 0.21093026878166982, "kl": 0.09210205078125, "learning_rate": 4.996674890821189e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000279396772, "rewards/format_reward_func": 1.0, "step": 6200 }, { "completion_length": 222.7991180419922, "epoch": 1.039901085544239, "grad_norm": 0.04538039287639752, "kl": 0.0979766845703125, "learning_rate": 4.996668703069115e-07, "loss": 0.0001, "reward": 1.7464286088943481, "reward_std": 0.03535533882677555, "rewards/equation_reward_func": 0.7553571797907352, "rewards/format_reward_func": 0.9910714328289032, "step": 6202 }, { "completion_length": 226.7991180419922, "epoch": 1.0402363887841066, "grad_norm": 0.2985622998817586, "kl": 0.099334716796875, "learning_rate": 4.996662509568785e-07, "loss": 0.0001, "reward": 1.764285795390606, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 6204 }, { "completion_length": 224.3705472946167, "epoch": 1.0405716920239743, "grad_norm": 0.3199587341752373, "kl": 0.118927001953125, "learning_rate": 4.996656310320213e-07, "loss": 0.0001, "reward": 1.7321429550647736, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7321428768336773, "rewards/format_reward_func": 1.0, "step": 6206 }, { "completion_length": 240.6160831451416, "epoch": 1.0409069952638417, "grad_norm": 0.1554368722368833, "kl": 0.094329833984375, "learning_rate": 4.996650105323411e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 6208 }, { "completion_length": 233.73661518096924, "epoch": 1.0412422985037093, "grad_norm": 0.2616894712735538, "kl": 0.0997772216796875, "learning_rate": 4.996643894578396e-07, "loss": 0.0001, "reward": 1.7821429073810577, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 6210 }, { "completion_length": 233.00893783569336, "epoch": 1.041577601743577, "grad_norm": 0.32797188169474895, "kl": 0.0866241455078125, "learning_rate": 4.996637678085181e-07, "loss": 0.0001, "reward": 1.725000075995922, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7250000461935997, "rewards/format_reward_func": 1.0, "step": 6212 }, { "completion_length": 213.83929538726807, "epoch": 1.0419129049834444, "grad_norm": 0.27628963395743616, "kl": 0.085784912109375, "learning_rate": 4.99663145584378e-07, "loss": 0.0001, "reward": 1.742857202887535, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571693599224, "rewards/format_reward_func": 1.0, "step": 6214 }, { "completion_length": 221.89733123779297, "epoch": 1.042248208223312, "grad_norm": 0.08516641647525623, "kl": 0.090362548828125, "learning_rate": 4.996625227854207e-07, "loss": 0.0001, "reward": 1.7428572103381157, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7428571693599224, "rewards/format_reward_func": 1.0, "step": 6216 }, { "completion_length": 232.78126049041748, "epoch": 1.0425835114631794, "grad_norm": 0.2787046385004818, "kl": 0.0954742431640625, "learning_rate": 4.996618994116478e-07, "loss": 0.0001, "reward": 1.7267857864499092, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7312500346451998, "rewards/format_reward_func": 0.9955357164144516, "step": 6218 }, { "completion_length": 230.34376049041748, "epoch": 1.042918814703047, "grad_norm": 0.1534465803849126, "kl": 0.095611572265625, "learning_rate": 4.996612754630605e-07, "loss": 0.0001, "reward": 1.7125000730156898, "reward_std": 0.0328299580141902, "rewards/equation_reward_func": 0.7169643342494965, "rewards/format_reward_func": 0.9955357164144516, "step": 6220 }, { "completion_length": 241.18751049041748, "epoch": 1.0432541179429147, "grad_norm": 0.2836709467862103, "kl": 0.087158203125, "learning_rate": 4.996606509396605e-07, "loss": 0.0001, "reward": 1.7107143923640251, "reward_std": 0.0858629634603858, "rewards/equation_reward_func": 0.7107143141329288, "rewards/format_reward_func": 1.0, "step": 6222 }, { "completion_length": 237.9196548461914, "epoch": 1.043589421182782, "grad_norm": 0.5012585855206936, "kl": 0.09515380859375, "learning_rate": 4.996600258414491e-07, "loss": 0.0001, "reward": 1.7464286535978317, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286126196384, "rewards/format_reward_func": 1.0, "step": 6224 }, { "completion_length": 234.18751049041748, "epoch": 1.0439247244226497, "grad_norm": 0.2459476407984449, "kl": 0.0809173583984375, "learning_rate": 4.996594001684278e-07, "loss": 0.0001, "reward": 1.7714286595582962, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 6226 }, { "completion_length": 238.39733028411865, "epoch": 1.0442600276625174, "grad_norm": 0.14081129195328745, "kl": 0.07952880859375, "learning_rate": 4.996587739205979e-07, "loss": 0.0001, "reward": 1.751785770058632, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7562500182539225, "rewards/format_reward_func": 0.9955357164144516, "step": 6228 }, { "completion_length": 237.11608219146729, "epoch": 1.0445953309023848, "grad_norm": 0.1969990280030385, "kl": 0.078887939453125, "learning_rate": 4.996581470979609e-07, "loss": 0.0001, "reward": 1.6928572207689285, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.6928571742027998, "rewards/format_reward_func": 1.0, "step": 6230 }, { "completion_length": 235.27679824829102, "epoch": 1.0449306341422524, "grad_norm": 0.10241176001437838, "kl": 0.077056884765625, "learning_rate": 4.996575197005184e-07, "loss": 0.0001, "reward": 1.8053571805357933, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.8098214603960514, "rewards/format_reward_func": 0.9955357164144516, "step": 6232 }, { "completion_length": 233.43304634094238, "epoch": 1.04526593738212, "grad_norm": 0.43947870911254105, "kl": 0.0736846923828125, "learning_rate": 4.996568917282718e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 6234 }, { "completion_length": 244.62947750091553, "epoch": 1.0456012406219874, "grad_norm": 0.22402884796565656, "kl": 0.0749969482421875, "learning_rate": 4.996562631812222e-07, "loss": 0.0001, "reward": 1.7178572192788124, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7178571857511997, "rewards/format_reward_func": 1.0, "step": 6236 }, { "completion_length": 244.3259038925171, "epoch": 1.045936543861855, "grad_norm": 0.275003424803172, "kl": 0.0874786376953125, "learning_rate": 4.996556340593715e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7678571753203869, "rewards/format_reward_func": 1.0, "step": 6238 }, { "completion_length": 235.7321538925171, "epoch": 1.0462718471017227, "grad_norm": 0.22154834415764207, "kl": 0.0706024169921875, "learning_rate": 4.99655004362721e-07, "loss": 0.0001, "reward": 1.703571505844593, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.703571455553174, "rewards/format_reward_func": 1.0, "step": 6240 }, { "completion_length": 236.3437623977661, "epoch": 1.0466071503415901, "grad_norm": 0.39231322722768996, "kl": 0.07466888427734375, "learning_rate": 4.99654374091272e-07, "loss": 0.0001, "reward": 1.7522322162985802, "reward_std": 0.0473508988507092, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 0.9986607171595097, "step": 6242 }, { "completion_length": 235.2857265472412, "epoch": 1.0469424535814578, "grad_norm": 0.23871235936534596, "kl": 0.067901611328125, "learning_rate": 4.996537432450261e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 6244 }, { "completion_length": 245.44197463989258, "epoch": 1.0472777568213252, "grad_norm": 0.13381440765275113, "kl": 0.06072998046875, "learning_rate": 4.996531118239848e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 6246 }, { "completion_length": 249.54911518096924, "epoch": 1.0476130600611928, "grad_norm": 0.17442154972673185, "kl": 0.071014404296875, "learning_rate": 4.996524798281494e-07, "loss": 0.0001, "reward": 1.6750000640749931, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.6750000417232513, "rewards/format_reward_func": 1.0, "step": 6248 }, { "completion_length": 236.6116180419922, "epoch": 1.0479483633010604, "grad_norm": 0.1054946432492003, "kl": 0.06951904296875, "learning_rate": 4.996518472575214e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 6250 }, { "completion_length": 242.00001049041748, "epoch": 1.0482836665409279, "grad_norm": 0.21478339945524438, "kl": 0.0652008056640625, "learning_rate": 4.996512141121023e-07, "loss": 0.0001, "reward": 1.7571429386734962, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 6252 }, { "completion_length": 236.27679538726807, "epoch": 1.0486189697807955, "grad_norm": 0.2525516533339607, "kl": 0.079071044921875, "learning_rate": 4.996505803918936e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 6254 }, { "completion_length": 242.00894260406494, "epoch": 1.0489542730206631, "grad_norm": 0.19916546296729642, "kl": 0.071990966796875, "learning_rate": 4.996499460968966e-07, "loss": 0.0001, "reward": 1.816071480512619, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8205357417464256, "rewards/format_reward_func": 0.9955357164144516, "step": 6256 }, { "completion_length": 247.7187614440918, "epoch": 1.0492895762605305, "grad_norm": 0.5188461088710983, "kl": 0.518463134765625, "learning_rate": 4.996493112271129e-07, "loss": 0.0005, "reward": 1.751785784959793, "reward_std": 0.10859139822423458, "rewards/equation_reward_func": 0.765178594738245, "rewards/format_reward_func": 0.9866071492433548, "step": 6258 }, { "completion_length": 250.15626049041748, "epoch": 1.0496248795003982, "grad_norm": 0.2216511665362915, "kl": 0.06902313232421875, "learning_rate": 4.996486757825439e-07, "loss": 0.0001, "reward": 1.7160715013742447, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7205357663333416, "rewards/format_reward_func": 0.9955357164144516, "step": 6260 }, { "completion_length": 255.28572463989258, "epoch": 1.0499601827402658, "grad_norm": 0.3203314600754042, "kl": 0.109161376953125, "learning_rate": 4.996480397631911e-07, "loss": 0.0001, "reward": 1.7214286476373672, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7303571775555611, "rewards/format_reward_func": 0.9910714328289032, "step": 6262 }, { "completion_length": 242.7857265472412, "epoch": 1.0502954859801332, "grad_norm": 0.302882520820471, "kl": 0.0699310302734375, "learning_rate": 4.996474031690561e-07, "loss": 0.0001, "reward": 1.8053572103381157, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8098214529454708, "rewards/format_reward_func": 0.9955357164144516, "step": 6264 }, { "completion_length": 254.93304538726807, "epoch": 1.0506307892200009, "grad_norm": 0.24384354682916257, "kl": 0.0830535888671875, "learning_rate": 4.996467660001399e-07, "loss": 0.0001, "reward": 1.744642935693264, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7491071745753288, "rewards/format_reward_func": 0.9955357164144516, "step": 6266 }, { "completion_length": 249.7187623977661, "epoch": 1.0509660924598685, "grad_norm": 0.3250066916248128, "kl": 0.0789337158203125, "learning_rate": 4.996461282564446e-07, "loss": 0.0001, "reward": 1.708928644657135, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7223214507102966, "rewards/format_reward_func": 0.9866071492433548, "step": 6268 }, { "completion_length": 246.92411994934082, "epoch": 1.051301395699736, "grad_norm": 0.2190682285191929, "kl": 0.084503173828125, "learning_rate": 4.996454899379711e-07, "loss": 0.0001, "reward": 1.7339286506175995, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7383928820490837, "rewards/format_reward_func": 0.9955357164144516, "step": 6270 }, { "completion_length": 259.10715198516846, "epoch": 1.0516366989396035, "grad_norm": 0.27884125227950174, "kl": 0.0814361572265625, "learning_rate": 4.996448510447213e-07, "loss": 0.0001, "reward": 1.705357238650322, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.709821455180645, "rewards/format_reward_func": 0.9955357164144516, "step": 6272 }, { "completion_length": 242.6116180419922, "epoch": 1.051972002179471, "grad_norm": 0.26117811635112054, "kl": 0.0708770751953125, "learning_rate": 4.996442115766965e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7946428880095482, "rewards/format_reward_func": 0.9910714328289032, "step": 6274 }, { "completion_length": 253.24555110931396, "epoch": 1.0523073054193386, "grad_norm": 0.20232160553432557, "kl": 0.0785369873046875, "learning_rate": 4.996435715338981e-07, "loss": 0.0001, "reward": 1.7803571969270706, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7937500290572643, "rewards/format_reward_func": 0.9866071492433548, "step": 6276 }, { "completion_length": 250.5491180419922, "epoch": 1.0526426086592062, "grad_norm": 0.28832289892932167, "kl": 0.06787109375, "learning_rate": 4.996429309163276e-07, "loss": 0.0001, "reward": 1.7392858043313026, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7660714611411095, "rewards/format_reward_func": 0.9732142984867096, "step": 6278 }, { "completion_length": 247.56697940826416, "epoch": 1.0529779118990736, "grad_norm": 0.23480186692840185, "kl": 0.0709991455078125, "learning_rate": 4.996422897239866e-07, "loss": 0.0001, "reward": 1.7321429252624512, "reward_std": 0.10606601741164923, "rewards/equation_reward_func": 0.7589286006987095, "rewards/format_reward_func": 0.9732142984867096, "step": 6280 }, { "completion_length": 257.071439743042, "epoch": 1.0533132151389413, "grad_norm": 0.3109790026339082, "kl": 0.0759124755859375, "learning_rate": 4.996416479568763e-07, "loss": 0.0001, "reward": 1.7428572177886963, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7517857290804386, "rewards/format_reward_func": 0.9910714328289032, "step": 6282 }, { "completion_length": 243.58483219146729, "epoch": 1.053648518378809, "grad_norm": 0.21916995248392268, "kl": 0.06939697265625, "learning_rate": 4.996410056149986e-07, "loss": 0.0001, "reward": 1.7589286640286446, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7633928842842579, "rewards/format_reward_func": 0.9955357164144516, "step": 6284 }, { "completion_length": 251.3303680419922, "epoch": 1.0539838216186763, "grad_norm": 0.1085527793568352, "kl": 0.07000732421875, "learning_rate": 4.996403626983547e-07, "loss": 0.0001, "reward": 1.7107143476605415, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7196428924798965, "rewards/format_reward_func": 0.9910714328289032, "step": 6286 }, { "completion_length": 257.58483695983887, "epoch": 1.054319124858544, "grad_norm": 0.27881947506588706, "kl": 0.0749053955078125, "learning_rate": 4.996397192069461e-07, "loss": 0.0001, "reward": 1.6857143566012383, "reward_std": 0.11111677996814251, "rewards/equation_reward_func": 0.7125000320374966, "rewards/format_reward_func": 0.9732142984867096, "step": 6288 }, { "completion_length": 251.14733219146729, "epoch": 1.0546544280984116, "grad_norm": 0.27651003259180323, "kl": 0.0708160400390625, "learning_rate": 4.996390751407741e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 6290 }, { "completion_length": 243.77679538726807, "epoch": 1.054989731338279, "grad_norm": 0.21381289177800553, "kl": 0.0681915283203125, "learning_rate": 4.996384304998406e-07, "loss": 0.0001, "reward": 1.7803571969270706, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7848214525729418, "rewards/format_reward_func": 0.9955357164144516, "step": 6292 }, { "completion_length": 246.5089406967163, "epoch": 1.0553250345781466, "grad_norm": 0.22910722937156328, "kl": 0.0751953125, "learning_rate": 4.99637785284147e-07, "loss": 0.0001, "reward": 1.775000050663948, "reward_std": 0.04545686487108469, "rewards/equation_reward_func": 0.7839286141097546, "rewards/format_reward_func": 0.9910714328289032, "step": 6294 }, { "completion_length": 259.4241180419922, "epoch": 1.0556603378180143, "grad_norm": 0.7797186367045612, "kl": 0.1149749755859375, "learning_rate": 4.996371394936944e-07, "loss": 0.0001, "reward": 1.7732143551111221, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7776786014437675, "rewards/format_reward_func": 0.9955357164144516, "step": 6296 }, { "completion_length": 250.4285831451416, "epoch": 1.0559956410578817, "grad_norm": 1.2601252607687894, "kl": 0.0680389404296875, "learning_rate": 4.996364931284847e-07, "loss": 0.0001, "reward": 1.773214340209961, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7776785995811224, "rewards/format_reward_func": 0.9955357164144516, "step": 6298 }, { "completion_length": 238.5803680419922, "epoch": 1.0563309442977493, "grad_norm": 0.2717918614938203, "kl": 0.078369140625, "learning_rate": 4.996358461885192e-07, "loss": 0.0001, "reward": 1.7910714969038963, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7955357395112514, "rewards/format_reward_func": 0.9955357164144516, "step": 6300 }, { "completion_length": 242.43304634094238, "epoch": 1.0566662475376167, "grad_norm": 0.09746477369521411, "kl": 0.079925537109375, "learning_rate": 4.996351986737993e-07, "loss": 0.0001, "reward": 1.7303572073578835, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7348214611411095, "rewards/format_reward_func": 0.9955357164144516, "step": 6302 }, { "completion_length": 245.0178680419922, "epoch": 1.0570015507774844, "grad_norm": 0.13468086713952232, "kl": 0.076995849609375, "learning_rate": 4.996345505843268e-07, "loss": 0.0001, "reward": 1.7589286342263222, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7633928842842579, "rewards/format_reward_func": 0.9955357164144516, "step": 6304 }, { "completion_length": 250.1875114440918, "epoch": 1.057336854017352, "grad_norm": 0.18329931190892818, "kl": 0.0720367431640625, "learning_rate": 4.996339019201029e-07, "loss": 0.0001, "reward": 1.7446429207921028, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7491071745753288, "rewards/format_reward_func": 0.9955357164144516, "step": 6306 }, { "completion_length": 238.9419765472412, "epoch": 1.0576721572572194, "grad_norm": 0.20137418059888398, "kl": 0.0643157958984375, "learning_rate": 4.996332526811292e-07, "loss": 0.0001, "reward": 1.773214340209961, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7776786163449287, "rewards/format_reward_func": 0.9955357164144516, "step": 6308 }, { "completion_length": 246.35715103149414, "epoch": 1.058007460497087, "grad_norm": 0.16433502622545273, "kl": 0.0754547119140625, "learning_rate": 4.996326028674073e-07, "loss": 0.0001, "reward": 1.716071493923664, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7205357421189547, "rewards/format_reward_func": 0.9955357164144516, "step": 6310 }, { "completion_length": 247.68751335144043, "epoch": 1.0583427637369547, "grad_norm": 0.19212147064966212, "kl": 0.0781707763671875, "learning_rate": 4.996319524789385e-07, "loss": 0.0001, "reward": 1.7589286267757416, "reward_std": 0.017677669413387775, "rewards/equation_reward_func": 0.7633928880095482, "rewards/format_reward_func": 0.9955357164144516, "step": 6312 }, { "completion_length": 242.81697273254395, "epoch": 1.058678066976822, "grad_norm": 0.09429431247365397, "kl": 0.0767822265625, "learning_rate": 4.996313015157244e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 6314 }, { "completion_length": 245.39733123779297, "epoch": 1.0590133702166897, "grad_norm": 0.18421668642270214, "kl": 0.09881591796875, "learning_rate": 4.996306499777665e-07, "loss": 0.0001, "reward": 1.762500062584877, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7758928686380386, "rewards/format_reward_func": 0.9866071492433548, "step": 6316 }, { "completion_length": 245.1116189956665, "epoch": 1.0593486734565574, "grad_norm": 0.3807004269832346, "kl": 0.0757904052734375, "learning_rate": 4.996299978650662e-07, "loss": 0.0001, "reward": 1.760714367032051, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 6318 }, { "completion_length": 251.22322463989258, "epoch": 1.0596839766964248, "grad_norm": 0.2114666808769925, "kl": 0.078765869140625, "learning_rate": 4.996293451776252e-07, "loss": 0.0001, "reward": 1.7500000298023224, "reward_std": 0.09091372787952423, "rewards/equation_reward_func": 0.7678571939468384, "rewards/format_reward_func": 0.9821428656578064, "step": 6320 }, { "completion_length": 246.852689743042, "epoch": 1.0600192799362924, "grad_norm": 0.16649081805675256, "kl": 0.0686492919921875, "learning_rate": 4.996286919154449e-07, "loss": 0.0001, "reward": 1.7625000700354576, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7669643238186836, "rewards/format_reward_func": 0.9955357164144516, "step": 6322 }, { "completion_length": 247.61608123779297, "epoch": 1.06035458317616, "grad_norm": 0.30420478683403074, "kl": 0.0810699462890625, "learning_rate": 4.996280380785267e-07, "loss": 0.0001, "reward": 1.7410715073347092, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.745535746216774, "rewards/format_reward_func": 0.9955357164144516, "step": 6324 }, { "completion_length": 251.58483219146729, "epoch": 1.0606898864160275, "grad_norm": 0.23160704711198593, "kl": 0.0971832275390625, "learning_rate": 4.996273836668723e-07, "loss": 0.0001, "reward": 1.7571429163217545, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7660714648663998, "rewards/format_reward_func": 0.9910714328289032, "step": 6326 }, { "completion_length": 244.9642972946167, "epoch": 1.061025189655895, "grad_norm": 0.3936892033816459, "kl": 0.0736236572265625, "learning_rate": 4.996267286804831e-07, "loss": 0.0001, "reward": 1.7446429282426834, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7491071652621031, "rewards/format_reward_func": 0.9955357164144516, "step": 6328 }, { "completion_length": 241.1116180419922, "epoch": 1.0613604928957625, "grad_norm": 0.1383559369834235, "kl": 0.0700225830078125, "learning_rate": 4.996260731193606e-07, "loss": 0.0001, "reward": 1.778571479022503, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 6330 }, { "completion_length": 245.1473331451416, "epoch": 1.0616957961356301, "grad_norm": 0.3744539358250854, "kl": 0.0670928955078125, "learning_rate": 4.996254169835063e-07, "loss": 0.0001, "reward": 1.8160715028643608, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.8205357491970062, "rewards/format_reward_func": 0.9955357164144516, "step": 6332 }, { "completion_length": 247.540189743042, "epoch": 1.0620310993754978, "grad_norm": 0.4886645560494357, "kl": 0.0726165771484375, "learning_rate": 4.996247602729216e-07, "loss": 0.0001, "reward": 1.7267857640981674, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.731250025331974, "rewards/format_reward_func": 0.9955357164144516, "step": 6334 }, { "completion_length": 251.43304538726807, "epoch": 1.0623664026153652, "grad_norm": 0.1689562542398881, "kl": 0.0782623291015625, "learning_rate": 4.996241029876084e-07, "loss": 0.0001, "reward": 1.7910714820027351, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7955357432365417, "rewards/format_reward_func": 0.9955357164144516, "step": 6336 }, { "completion_length": 263.352689743042, "epoch": 1.0627017058552328, "grad_norm": 0.26788844139713214, "kl": 0.13543701171875, "learning_rate": 4.996234451275678e-07, "loss": 0.0001, "reward": 1.728571504354477, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 0.9821428656578064, "step": 6338 }, { "completion_length": 247.23661994934082, "epoch": 1.0630370090951005, "grad_norm": 0.19132039665083433, "kl": 0.1452789306640625, "learning_rate": 4.996227866928016e-07, "loss": 0.0001, "reward": 1.7285715267062187, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7375000230967999, "rewards/format_reward_func": 0.9910714328289032, "step": 6340 }, { "completion_length": 258.4419746398926, "epoch": 1.0633723123349679, "grad_norm": 0.19874075640970054, "kl": 0.1968841552734375, "learning_rate": 4.996221276833111e-07, "loss": 0.0002, "reward": 1.7607143446803093, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7696428820490837, "rewards/format_reward_func": 0.9910714328289032, "step": 6342 }, { "completion_length": 251.9464406967163, "epoch": 1.0637076155748355, "grad_norm": 0.23396514743791882, "kl": 0.1824951171875, "learning_rate": 4.996214680990979e-07, "loss": 0.0002, "reward": 1.7767857983708382, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7812500111758709, "rewards/format_reward_func": 0.9955357164144516, "step": 6344 }, { "completion_length": 238.31251049041748, "epoch": 1.0640429188147031, "grad_norm": 0.43415076467434777, "kl": 0.073333740234375, "learning_rate": 4.996208079401635e-07, "loss": 0.0001, "reward": 1.7839286550879478, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7883928865194321, "rewards/format_reward_func": 0.9955357164144516, "step": 6346 }, { "completion_length": 245.0089406967163, "epoch": 1.0643782220545706, "grad_norm": 0.4964206120947802, "kl": 0.758087158203125, "learning_rate": 4.996201472065096e-07, "loss": 0.0008, "reward": 1.7321429401636124, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7410714626312256, "rewards/format_reward_func": 0.9910714328289032, "step": 6348 }, { "completion_length": 258.4107275009155, "epoch": 1.0647135252944382, "grad_norm": 0.2686395974838121, "kl": 0.7075042724609375, "learning_rate": 4.996194858981374e-07, "loss": 0.0007, "reward": 1.7535714954137802, "reward_std": 0.0656599160283804, "rewards/equation_reward_func": 0.7625000365078449, "rewards/format_reward_func": 0.9910714328289032, "step": 6350 }, { "completion_length": 241.78572463989258, "epoch": 1.0650488285343056, "grad_norm": 0.20670819187440403, "kl": 0.2355194091796875, "learning_rate": 4.996188240150486e-07, "loss": 0.0002, "reward": 1.8000000566244125, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 6352 }, { "completion_length": 258.058048248291, "epoch": 1.0653841317741732, "grad_norm": 0.24306248609979544, "kl": 0.6288604736328125, "learning_rate": 4.996181615572449e-07, "loss": 0.0006, "reward": 1.7375000938773155, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7419643141329288, "rewards/format_reward_func": 0.9955357164144516, "step": 6354 }, { "completion_length": 239.61608028411865, "epoch": 1.0657194350140409, "grad_norm": 0.28623101171516113, "kl": 0.140655517578125, "learning_rate": 4.996174985247275e-07, "loss": 0.0001, "reward": 1.7178572192788124, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7267857454717159, "rewards/format_reward_func": 0.9910714328289032, "step": 6356 }, { "completion_length": 244.5759048461914, "epoch": 1.0660547382539083, "grad_norm": 0.18153326342052306, "kl": 0.0954132080078125, "learning_rate": 4.996168349174981e-07, "loss": 0.0001, "reward": 1.7196429297327995, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7241071779280901, "rewards/format_reward_func": 0.9955357164144516, "step": 6358 }, { "completion_length": 247.27679634094238, "epoch": 1.066390041493776, "grad_norm": 0.35221345372591045, "kl": 0.691741943359375, "learning_rate": 4.996161707355582e-07, "loss": 0.0007, "reward": 1.7500000670552254, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7589286081492901, "rewards/format_reward_func": 0.9910714328289032, "step": 6360 }, { "completion_length": 255.2366189956665, "epoch": 1.0667253447336436, "grad_norm": 0.5279583818578518, "kl": 0.78302001953125, "learning_rate": 4.996155059789093e-07, "loss": 0.0008, "reward": 1.6785714775323868, "reward_std": 0.11111677903681993, "rewards/equation_reward_func": 0.7053571660071611, "rewards/format_reward_func": 0.9732142984867096, "step": 6362 }, { "completion_length": 256.92858123779297, "epoch": 1.067060647973511, "grad_norm": 0.2476382497789468, "kl": 0.3397369384765625, "learning_rate": 4.996148406475529e-07, "loss": 0.0003, "reward": 1.7464286461472511, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 6364 }, { "completion_length": 251.4017972946167, "epoch": 1.0673959512133786, "grad_norm": 0.22177943064580702, "kl": 1.80938720703125, "learning_rate": 4.996141747414906e-07, "loss": 0.0018, "reward": 1.7928571924567223, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.8017857410013676, "rewards/format_reward_func": 0.9910714328289032, "step": 6366 }, { "completion_length": 249.25447750091553, "epoch": 1.0677312544532462, "grad_norm": 0.2940764790633873, "kl": 0.3301849365234375, "learning_rate": 4.99613508260724e-07, "loss": 0.0003, "reward": 1.7321429029107094, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7410714533179998, "rewards/format_reward_func": 0.9910714328289032, "step": 6368 }, { "completion_length": 262.54465770721436, "epoch": 1.0680665576931136, "grad_norm": 0.36158647929313403, "kl": 0.313018798828125, "learning_rate": 4.996128412052546e-07, "loss": 0.0003, "reward": 1.7517857626080513, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7651786096394062, "rewards/format_reward_func": 0.9866071492433548, "step": 6370 }, { "completion_length": 265.00894260406494, "epoch": 1.0684018609329813, "grad_norm": 0.2477564882494541, "kl": 0.3001556396484375, "learning_rate": 4.996121735750838e-07, "loss": 0.0003, "reward": 1.7071429267525673, "reward_std": 0.1010152529925108, "rewards/equation_reward_func": 0.7250000275671482, "rewards/format_reward_func": 0.9821428656578064, "step": 6372 }, { "completion_length": 253.47322463989258, "epoch": 1.068737164172849, "grad_norm": 0.21786370922386733, "kl": 0.0959014892578125, "learning_rate": 4.996115053702132e-07, "loss": 0.0001, "reward": 1.7375000640749931, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419643197208643, "rewards/format_reward_func": 0.9955357164144516, "step": 6374 }, { "completion_length": 252.44644165039062, "epoch": 1.0690724674127163, "grad_norm": 0.16300399710153712, "kl": 0.1212158203125, "learning_rate": 4.996108365906444e-07, "loss": 0.0001, "reward": 1.7821429073810577, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7910714522004128, "rewards/format_reward_func": 0.9910714328289032, "step": 6376 }, { "completion_length": 255.8482255935669, "epoch": 1.069407770652584, "grad_norm": 0.4202332630677491, "kl": 0.266632080078125, "learning_rate": 4.996101672363789e-07, "loss": 0.0003, "reward": 1.7464286461472511, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7464286163449287, "rewards/format_reward_func": 1.0, "step": 6378 }, { "completion_length": 250.80358219146729, "epoch": 1.0697430738924516, "grad_norm": 0.21634007380193282, "kl": 0.1393280029296875, "learning_rate": 4.996094973074182e-07, "loss": 0.0001, "reward": 1.7910715118050575, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7955357432365417, "rewards/format_reward_func": 0.9955357164144516, "step": 6380 }, { "completion_length": 240.21429634094238, "epoch": 1.070078377132319, "grad_norm": 0.2353217723544025, "kl": 0.1108551025390625, "learning_rate": 4.996088268037641e-07, "loss": 0.0001, "reward": 1.7517857924103737, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500312924385, "rewards/format_reward_func": 0.9955357164144516, "step": 6382 }, { "completion_length": 254.6250114440918, "epoch": 1.0704136803721866, "grad_norm": 0.22977953063269516, "kl": 0.0945281982421875, "learning_rate": 4.996081557254177e-07, "loss": 0.0001, "reward": 1.7696429193019867, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7741071600466967, "rewards/format_reward_func": 0.9955357164144516, "step": 6384 }, { "completion_length": 244.19643592834473, "epoch": 1.070748983612054, "grad_norm": 0.22856891401563256, "kl": 0.212188720703125, "learning_rate": 4.996074840723809e-07, "loss": 0.0002, "reward": 1.7642858251929283, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 6386 }, { "completion_length": 236.9062623977661, "epoch": 1.0710842868519217, "grad_norm": 0.20213076895736504, "kl": 0.105621337890625, "learning_rate": 4.99606811844655e-07, "loss": 0.0001, "reward": 1.7785715162754059, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714380443096, "rewards/format_reward_func": 1.0, "step": 6388 }, { "completion_length": 248.64733409881592, "epoch": 1.0714195900917893, "grad_norm": 0.16060784507836098, "kl": 0.07293701171875, "learning_rate": 4.996061390422418e-07, "loss": 0.0001, "reward": 1.8035714998841286, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8125000335276127, "rewards/format_reward_func": 0.9910714328289032, "step": 6390 }, { "completion_length": 237.47322273254395, "epoch": 1.0717548933316567, "grad_norm": 0.3744916426993643, "kl": 0.1375274658203125, "learning_rate": 4.996054656651427e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7732143066823483, "rewards/format_reward_func": 0.9910714328289032, "step": 6392 }, { "completion_length": 240.9419765472412, "epoch": 1.0720901965715244, "grad_norm": 0.5524514166335937, "kl": 0.1202850341796875, "learning_rate": 4.996047917133591e-07, "loss": 0.0001, "reward": 1.7482143566012383, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7616071775555611, "rewards/format_reward_func": 0.9866071492433548, "step": 6394 }, { "completion_length": 247.79465293884277, "epoch": 1.072425499811392, "grad_norm": 0.1931412468701336, "kl": 0.1047821044921875, "learning_rate": 4.996041171868929e-07, "loss": 0.0001, "reward": 1.7214286476373672, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7214286103844643, "rewards/format_reward_func": 1.0, "step": 6396 }, { "completion_length": 231.8973331451416, "epoch": 1.0727608030512594, "grad_norm": 0.16039896848106713, "kl": 0.07208251953125, "learning_rate": 4.996034420857454e-07, "loss": 0.0001, "reward": 1.7839286401867867, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 6398 }, { "completion_length": 247.7634038925171, "epoch": 1.073096106291127, "grad_norm": 0.40971345021003636, "kl": 0.078338623046875, "learning_rate": 4.996027664099183e-07, "loss": 0.0001, "reward": 1.7785715162754059, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 6400 }, { "completion_length": 248.1741189956665, "epoch": 1.0734314095309947, "grad_norm": 0.1190091501059919, "kl": 0.104644775390625, "learning_rate": 4.996020901594131e-07, "loss": 0.0001, "reward": 1.748214341700077, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7526785992085934, "rewards/format_reward_func": 0.9955357164144516, "step": 6402 }, { "completion_length": 244.5491180419922, "epoch": 1.073766712770862, "grad_norm": 0.20764049742946922, "kl": 0.074798583984375, "learning_rate": 4.996014133342314e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964285798370838, "rewards/format_reward_func": 1.0, "step": 6404 }, { "completion_length": 246.4017972946167, "epoch": 1.0741020160107297, "grad_norm": 0.21780243516392667, "kl": 0.067718505859375, "learning_rate": 4.996007359343745e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 6406 }, { "completion_length": 246.27679920196533, "epoch": 1.0744373192505972, "grad_norm": 0.48129432871493416, "kl": 0.08587646484375, "learning_rate": 4.996000579598442e-07, "loss": 0.0001, "reward": 1.7071429565548897, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7160714659839869, "rewards/format_reward_func": 0.9910714328289032, "step": 6408 }, { "completion_length": 245.6562614440918, "epoch": 1.0747726224904648, "grad_norm": 0.24065879335741383, "kl": 0.0702972412109375, "learning_rate": 4.995993794106421e-07, "loss": 0.0001, "reward": 1.8000000715255737, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 6410 }, { "completion_length": 247.36608123779297, "epoch": 1.0751079257303324, "grad_norm": 0.26497871394053646, "kl": 0.0687713623046875, "learning_rate": 4.995987002867698e-07, "loss": 0.0001, "reward": 1.7553572207689285, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7598214596509933, "rewards/format_reward_func": 0.9955357164144516, "step": 6412 }, { "completion_length": 248.27679824829102, "epoch": 1.0754432289701998, "grad_norm": 0.41136003473797084, "kl": 0.0745391845703125, "learning_rate": 4.995980205882286e-07, "loss": 0.0001, "reward": 1.7392858043313026, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7392857410013676, "rewards/format_reward_func": 1.0, "step": 6414 }, { "completion_length": 247.2678689956665, "epoch": 1.0757785322100675, "grad_norm": 0.21398005997336908, "kl": 0.064544677734375, "learning_rate": 4.995973403150202e-07, "loss": 0.0001, "reward": 1.719642959535122, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.724107176065445, "rewards/format_reward_func": 0.9955357164144516, "step": 6416 }, { "completion_length": 234.8259038925171, "epoch": 1.076113835449935, "grad_norm": 0.25680062865685876, "kl": 0.0615081787109375, "learning_rate": 4.995966594671462e-07, "loss": 0.0001, "reward": 1.8071429207921028, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.8071428835391998, "rewards/format_reward_func": 1.0, "step": 6418 }, { "completion_length": 239.17858409881592, "epoch": 1.0764491386898025, "grad_norm": 0.3022971347656876, "kl": 0.05865478515625, "learning_rate": 4.995959780446081e-07, "loss": 0.0001, "reward": 1.7687500417232513, "reward_std": 0.04419417306780815, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9982142895460129, "step": 6420 }, { "completion_length": 240.0357255935669, "epoch": 1.0767844419296702, "grad_norm": 0.3100097102368012, "kl": 0.065399169921875, "learning_rate": 4.995952960474076e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.800000037997961, "rewards/format_reward_func": 1.0, "step": 6422 }, { "completion_length": 250.1116189956665, "epoch": 1.0771197451695378, "grad_norm": 0.12420128973486144, "kl": 0.0625457763671875, "learning_rate": 4.995946134755462e-07, "loss": 0.0001, "reward": 1.7500000521540642, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 6424 }, { "completion_length": 252.18751335144043, "epoch": 1.0774550484094052, "grad_norm": 0.16978968685993892, "kl": 0.067596435546875, "learning_rate": 4.995939303290253e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857410013676, "rewards/format_reward_func": 1.0, "step": 6426 }, { "completion_length": 254.5044755935669, "epoch": 1.0777903516492728, "grad_norm": 0.33574965100851467, "kl": 0.064117431640625, "learning_rate": 4.995932466078468e-07, "loss": 0.0001, "reward": 1.762500062584877, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7669643275439739, "rewards/format_reward_func": 0.9955357164144516, "step": 6428 }, { "completion_length": 249.9241180419922, "epoch": 1.0781256548891403, "grad_norm": 0.22794782342785525, "kl": 0.06793212890625, "learning_rate": 4.995925623120121e-07, "loss": 0.0001, "reward": 1.6964286491274834, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7053571809083223, "rewards/format_reward_func": 0.9910714328289032, "step": 6430 }, { "completion_length": 251.34376335144043, "epoch": 1.0784609581290079, "grad_norm": 0.21343348359286007, "kl": 0.064117431640625, "learning_rate": 4.995918774415226e-07, "loss": 0.0001, "reward": 1.7964286357164383, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 6432 }, { "completion_length": 250.5178680419922, "epoch": 1.0787962613688755, "grad_norm": 0.2154335777824777, "kl": 0.073577880859375, "learning_rate": 4.995911919963802e-07, "loss": 0.0001, "reward": 1.7428572103381157, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 6434 }, { "completion_length": 256.2455520629883, "epoch": 1.079131564608743, "grad_norm": 0.22964136065354926, "kl": 0.0595703125, "learning_rate": 4.995905059765863e-07, "loss": 0.0001, "reward": 1.79464291036129, "reward_std": 0.06818529684096575, "rewards/equation_reward_func": 0.8080357350409031, "rewards/format_reward_func": 0.9866071492433548, "step": 6436 }, { "completion_length": 255.1294765472412, "epoch": 1.0794668678486106, "grad_norm": 0.30185869514240066, "kl": 0.069580078125, "learning_rate": 4.995898193821426e-07, "loss": 0.0001, "reward": 1.753571517765522, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 6438 }, { "completion_length": 254.9241180419922, "epoch": 1.0798021710884782, "grad_norm": 0.22874675324827642, "kl": 0.060516357421875, "learning_rate": 4.995891322130505e-07, "loss": 0.0001, "reward": 1.7625000774860382, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643014669418, "rewards/format_reward_func": 0.9955357164144516, "step": 6440 }, { "completion_length": 245.75447845458984, "epoch": 1.0801374743283456, "grad_norm": 0.15348513336418879, "kl": 0.058837890625, "learning_rate": 4.995884444693117e-07, "loss": 0.0001, "reward": 1.835714340209961, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8357143104076385, "rewards/format_reward_func": 1.0, "step": 6442 }, { "completion_length": 247.2366189956665, "epoch": 1.0804727775682132, "grad_norm": 0.17123096708578153, "kl": 0.0656585693359375, "learning_rate": 4.995877561509278e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7964286170899868, "rewards/format_reward_func": 1.0, "step": 6444 }, { "completion_length": 257.4598340988159, "epoch": 1.0808080808080809, "grad_norm": 0.28229072762750085, "kl": 0.069366455078125, "learning_rate": 4.995870672579003e-07, "loss": 0.0001, "reward": 1.7361607998609543, "reward_std": 0.10038390709087253, "rewards/equation_reward_func": 0.7419643122702837, "rewards/format_reward_func": 0.9941964335739613, "step": 6446 }, { "completion_length": 249.0089406967163, "epoch": 1.0811433840479483, "grad_norm": 0.2814686950997999, "kl": 0.08123779296875, "learning_rate": 4.995863777902308e-07, "loss": 0.0001, "reward": 1.8196429088711739, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.8241071663796902, "rewards/format_reward_func": 0.9955357164144516, "step": 6448 }, { "completion_length": 251.23215293884277, "epoch": 1.081478687287816, "grad_norm": 0.18263971276236252, "kl": 0.0647430419921875, "learning_rate": 4.99585687747921e-07, "loss": 0.0001, "reward": 1.7839286401867867, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 6450 }, { "completion_length": 246.1071548461914, "epoch": 1.0818139905276836, "grad_norm": 0.3340344630893179, "kl": 0.07000732421875, "learning_rate": 4.995849971309723e-07, "loss": 0.0001, "reward": 1.7785715088248253, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 1.0, "step": 6452 }, { "completion_length": 245.1875114440918, "epoch": 1.082149293767551, "grad_norm": 0.23800132416609462, "kl": 0.0767669677734375, "learning_rate": 4.995843059393865e-07, "loss": 0.0001, "reward": 1.7410714775323868, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7455357499420643, "rewards/format_reward_func": 0.9955357164144516, "step": 6454 }, { "completion_length": 247.5000114440918, "epoch": 1.0824845970074186, "grad_norm": 0.19794952988984094, "kl": 0.07769775390625, "learning_rate": 4.995836141731651e-07, "loss": 0.0001, "reward": 1.7714286297559738, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285887777805, "rewards/format_reward_func": 1.0, "step": 6456 }, { "completion_length": 244.90179634094238, "epoch": 1.0828199002472862, "grad_norm": 0.2173504470847555, "kl": 0.0712127685546875, "learning_rate": 4.995829218323096e-07, "loss": 0.0001, "reward": 1.7053572162985802, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7098214700818062, "rewards/format_reward_func": 0.9955357164144516, "step": 6458 }, { "completion_length": 258.9375123977661, "epoch": 1.0831552034871537, "grad_norm": 0.24258856009385474, "kl": 0.0692596435546875, "learning_rate": 4.995822289168216e-07, "loss": 0.0001, "reward": 1.7035715207457542, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.703571442514658, "rewards/format_reward_func": 1.0, "step": 6460 }, { "completion_length": 254.15626525878906, "epoch": 1.0834905067270213, "grad_norm": 0.21434048158714838, "kl": 0.0685882568359375, "learning_rate": 4.99581535426703e-07, "loss": 0.0001, "reward": 1.7196429297327995, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7241071909666061, "rewards/format_reward_func": 0.9955357164144516, "step": 6462 }, { "completion_length": 255.51786708831787, "epoch": 1.0838258099668887, "grad_norm": 0.322625348089809, "kl": 0.066741943359375, "learning_rate": 4.99580841361955e-07, "loss": 0.0001, "reward": 1.7303572371602058, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7348214536905289, "rewards/format_reward_func": 0.9955357164144516, "step": 6464 }, { "completion_length": 262.401798248291, "epoch": 1.0841611132067563, "grad_norm": 0.18824960966371346, "kl": 0.0705108642578125, "learning_rate": 4.995801467225794e-07, "loss": 0.0001, "reward": 1.7375000640749931, "reward_std": 0.06818529684096575, "rewards/equation_reward_func": 0.7508928924798965, "rewards/format_reward_func": 0.9866071492433548, "step": 6466 }, { "completion_length": 253.3259048461914, "epoch": 1.084496416446624, "grad_norm": 0.32009667898891864, "kl": 0.074462890625, "learning_rate": 4.995794515085778e-07, "loss": 0.0001, "reward": 1.7517857775092125, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500238418579, "rewards/format_reward_func": 0.9955357164144516, "step": 6468 }, { "completion_length": 252.25001525878906, "epoch": 1.0848317196864914, "grad_norm": 0.23876316643511034, "kl": 0.069305419921875, "learning_rate": 4.995787557199518e-07, "loss": 0.0001, "reward": 1.764285795390606, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7732143066823483, "rewards/format_reward_func": 0.9910714328289032, "step": 6470 }, { "completion_length": 255.30804920196533, "epoch": 1.085167022926359, "grad_norm": 0.26642096787601005, "kl": 0.069610595703125, "learning_rate": 4.995780593567028e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 6472 }, { "completion_length": 252.88394260406494, "epoch": 1.0855023261662267, "grad_norm": 0.24414306302956112, "kl": 0.0724029541015625, "learning_rate": 4.995773624188328e-07, "loss": 0.0001, "reward": 1.7267858013510704, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7312500216066837, "rewards/format_reward_func": 0.9955357164144516, "step": 6474 }, { "completion_length": 255.62054538726807, "epoch": 1.085837629406094, "grad_norm": 0.250032728392339, "kl": 0.0707550048828125, "learning_rate": 4.995766649063429e-07, "loss": 0.0001, "reward": 1.7964286133646965, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 6476 }, { "completion_length": 256.508939743042, "epoch": 1.0861729326459617, "grad_norm": 0.1228238932830406, "kl": 0.0816497802734375, "learning_rate": 4.995759668192352e-07, "loss": 0.0001, "reward": 1.7357143610715866, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143107801676, "rewards/format_reward_func": 1.0, "step": 6478 }, { "completion_length": 253.83483409881592, "epoch": 1.0865082358858293, "grad_norm": 0.2071801861333888, "kl": 0.0679779052734375, "learning_rate": 4.995752681575111e-07, "loss": 0.0001, "reward": 1.8160714730620384, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.8205357268452644, "rewards/format_reward_func": 0.9955357164144516, "step": 6480 }, { "completion_length": 241.84822368621826, "epoch": 1.0868435391256968, "grad_norm": 0.17132538782288873, "kl": 0.0664825439453125, "learning_rate": 4.995745689211721e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000134110451, "rewards/format_reward_func": 1.0, "step": 6482 }, { "completion_length": 251.61608123779297, "epoch": 1.0871788423655644, "grad_norm": 0.33335513174173054, "kl": 0.077301025390625, "learning_rate": 4.9957386911022e-07, "loss": 0.0001, "reward": 1.7107143849134445, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7196428813040257, "rewards/format_reward_func": 0.9910714328289032, "step": 6484 }, { "completion_length": 249.34375953674316, "epoch": 1.0875141456054318, "grad_norm": 0.1509868126855817, "kl": 0.0727386474609375, "learning_rate": 4.995731687246562e-07, "loss": 0.0001, "reward": 1.814285770058632, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857439815998, "rewards/format_reward_func": 1.0, "step": 6486 }, { "completion_length": 263.23662090301514, "epoch": 1.0878494488452994, "grad_norm": 0.14210317501246147, "kl": 0.07989501953125, "learning_rate": 4.995724677644824e-07, "loss": 0.0001, "reward": 1.7107143625617027, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7196428924798965, "rewards/format_reward_func": 0.9910714328289032, "step": 6488 }, { "completion_length": 258.1696548461914, "epoch": 1.088184752085167, "grad_norm": 0.22951210067149835, "kl": 0.075286865234375, "learning_rate": 4.995717662297003e-07, "loss": 0.0001, "reward": 1.7178572565317154, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7178571857511997, "rewards/format_reward_func": 1.0, "step": 6490 }, { "completion_length": 256.8303689956665, "epoch": 1.0885200553250345, "grad_norm": 0.21521536442832762, "kl": 0.0674591064453125, "learning_rate": 4.995710641203115e-07, "loss": 0.0001, "reward": 1.7410714849829674, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7455357424914837, "rewards/format_reward_func": 0.9955357164144516, "step": 6492 }, { "completion_length": 252.008939743042, "epoch": 1.0888553585649021, "grad_norm": 0.17696140360889953, "kl": 0.073944091796875, "learning_rate": 4.995703614363176e-07, "loss": 0.0001, "reward": 1.7410714998841286, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7455357406288385, "rewards/format_reward_func": 0.9955357164144516, "step": 6494 }, { "completion_length": 276.0759048461914, "epoch": 1.0891906618047698, "grad_norm": 0.29492164732292375, "kl": 0.0858001708984375, "learning_rate": 4.995696581777201e-07, "loss": 0.0001, "reward": 1.7196429371833801, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7241071723401546, "rewards/format_reward_func": 0.9955357164144516, "step": 6496 }, { "completion_length": 265.07144260406494, "epoch": 1.0895259650446372, "grad_norm": 0.20700246572065972, "kl": 0.098724365234375, "learning_rate": 4.995689543445209e-07, "loss": 0.0001, "reward": 1.7785714641213417, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7875000238418579, "rewards/format_reward_func": 0.9910714328289032, "step": 6498 }, { "completion_length": 267.6875104904175, "epoch": 1.0898612682845048, "grad_norm": 0.1563277942876201, "kl": 0.0779876708984375, "learning_rate": 4.995682499367212e-07, "loss": 0.0001, "reward": 1.7892857491970062, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.807142885401845, "rewards/format_reward_func": 0.9821428656578064, "step": 6500 }, { "completion_length": 260.4017972946167, "epoch": 1.0901965715243724, "grad_norm": 0.16573974603677033, "kl": 0.0820465087890625, "learning_rate": 4.99567544954323e-07, "loss": 0.0001, "reward": 1.812500037252903, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8169643133878708, "rewards/format_reward_func": 0.9955357164144516, "step": 6502 }, { "completion_length": 270.37054920196533, "epoch": 1.0905318747642399, "grad_norm": 0.20563901893599482, "kl": 0.08184814453125, "learning_rate": 4.995668393973277e-07, "loss": 0.0001, "reward": 1.7089286521077156, "reward_std": 0.10859140008687973, "rewards/equation_reward_func": 0.7401786111295223, "rewards/format_reward_func": 0.9687500111758709, "step": 6504 }, { "completion_length": 276.2589387893677, "epoch": 1.0908671780041075, "grad_norm": 0.7781279067739908, "kl": 0.1058349609375, "learning_rate": 4.99566133265737e-07, "loss": 0.0001, "reward": 1.712500050663948, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7348214518278837, "rewards/format_reward_func": 0.977678582072258, "step": 6506 }, { "completion_length": 284.8482255935669, "epoch": 1.0912024812439751, "grad_norm": 0.30134766414761066, "kl": 0.1042633056640625, "learning_rate": 4.995654265595526e-07, "loss": 0.0001, "reward": 1.767857201397419, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7767857424914837, "rewards/format_reward_func": 0.9910714328289032, "step": 6508 }, { "completion_length": 289.0535879135132, "epoch": 1.0915377844838425, "grad_norm": 0.25376529695446304, "kl": 0.114471435546875, "learning_rate": 4.99564719278776e-07, "loss": 0.0001, "reward": 1.7160714715719223, "reward_std": 0.07828682288527489, "rewards/equation_reward_func": 0.738392885774374, "rewards/format_reward_func": 0.977678582072258, "step": 6510 }, { "completion_length": 284.9821557998657, "epoch": 1.0918730877237102, "grad_norm": 0.42895058427759786, "kl": 0.13214111328125, "learning_rate": 4.99564011423409e-07, "loss": 0.0001, "reward": 1.6714286506175995, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.6892857495695353, "rewards/format_reward_func": 0.9821428656578064, "step": 6512 }, { "completion_length": 283.6562614440918, "epoch": 1.0922083909635778, "grad_norm": 0.6980135663023137, "kl": 0.156036376953125, "learning_rate": 4.99563302993453e-07, "loss": 0.0002, "reward": 1.6982143893837929, "reward_std": 0.12374368775635958, "rewards/equation_reward_func": 0.7294643074274063, "rewards/format_reward_func": 0.9687500149011612, "step": 6514 }, { "completion_length": 284.52233600616455, "epoch": 1.0925436942034452, "grad_norm": 0.44354741258299424, "kl": 0.1544189453125, "learning_rate": 4.995625939889097e-07, "loss": 0.0002, "reward": 1.7008929252624512, "reward_std": 0.059346460737288, "rewards/equation_reward_func": 0.7116071954369545, "rewards/format_reward_func": 0.9892857223749161, "step": 6516 }, { "completion_length": 278.3214406967163, "epoch": 1.0928789974433128, "grad_norm": 0.3003540847071273, "kl": 0.144378662109375, "learning_rate": 4.995618844097808e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.10354063473641872, "rewards/equation_reward_func": 0.7901785857975483, "rewards/format_reward_func": 0.9866071492433548, "step": 6518 }, { "completion_length": 300.0803680419922, "epoch": 1.0932143006831803, "grad_norm": 0.6405706414392106, "kl": 0.28515625, "learning_rate": 4.99561174256068e-07, "loss": 0.0003, "reward": 1.687500074505806, "reward_std": 0.08838834799826145, "rewards/equation_reward_func": 0.7098214626312256, "rewards/format_reward_func": 0.977678582072258, "step": 6520 }, { "completion_length": 290.151798248291, "epoch": 1.093549603923048, "grad_norm": 0.21560716139636138, "kl": 0.2230987548828125, "learning_rate": 4.995604635277728e-07, "loss": 0.0002, "reward": 1.707142949104309, "reward_std": 0.11616754159331322, "rewards/equation_reward_func": 0.7428571730852127, "rewards/format_reward_func": 0.9642857313156128, "step": 6522 }, { "completion_length": 288.7901945114136, "epoch": 1.0938849071629155, "grad_norm": 0.6662672959830765, "kl": 0.8878631591796875, "learning_rate": 4.995597522248968e-07, "loss": 0.0009, "reward": 1.7214286401867867, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7392857372760773, "rewards/format_reward_func": 0.9821428656578064, "step": 6524 }, { "completion_length": 289.4553689956665, "epoch": 1.094220210402783, "grad_norm": 0.1884437199547336, "kl": 0.4522247314453125, "learning_rate": 4.995590403474419e-07, "loss": 0.0005, "reward": 1.707142896950245, "reward_std": 0.08081220462918282, "rewards/equation_reward_func": 0.7339286208152771, "rewards/format_reward_func": 0.9732142984867096, "step": 6526 }, { "completion_length": 268.06251335144043, "epoch": 1.0945555136426506, "grad_norm": 0.29136324362216554, "kl": 0.8277587890625, "learning_rate": 4.995583278954095e-07, "loss": 0.0008, "reward": 1.782142922282219, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7910714484751225, "rewards/format_reward_func": 0.9910714328289032, "step": 6528 }, { "completion_length": 282.5134105682373, "epoch": 1.0948908168825182, "grad_norm": 0.13064029811812, "kl": 1.827606201171875, "learning_rate": 4.995576148688012e-07, "loss": 0.0018, "reward": 1.771428607404232, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7982143238186836, "rewards/format_reward_func": 0.9732142984867096, "step": 6530 }, { "completion_length": 274.4553699493408, "epoch": 1.0952261201223856, "grad_norm": 0.2776119596095246, "kl": 0.695587158203125, "learning_rate": 4.995569012676189e-07, "loss": 0.0007, "reward": 1.7375000640749931, "reward_std": 0.08838834799826145, "rewards/equation_reward_func": 0.7508928775787354, "rewards/format_reward_func": 0.9866071492433548, "step": 6532 }, { "completion_length": 283.30358695983887, "epoch": 1.0955614233622533, "grad_norm": 0.2657775533041802, "kl": 0.5340576171875, "learning_rate": 4.995561870918639e-07, "loss": 0.0005, "reward": 1.7482143715023994, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7526785992085934, "rewards/format_reward_func": 0.9955357164144516, "step": 6534 }, { "completion_length": 290.11162281036377, "epoch": 1.095896726602121, "grad_norm": 0.3284617239203021, "kl": 0.455780029296875, "learning_rate": 4.995554723415382e-07, "loss": 0.0005, "reward": 1.7071429342031479, "reward_std": 0.10606601741164923, "rewards/equation_reward_func": 0.733928594738245, "rewards/format_reward_func": 0.9732142984867096, "step": 6536 }, { "completion_length": 285.56251525878906, "epoch": 1.0962320298419883, "grad_norm": 0.4737519246446728, "kl": 0.477142333984375, "learning_rate": 4.995547570166432e-07, "loss": 0.0005, "reward": 1.7482143715023994, "reward_std": 0.10859139822423458, "rewards/equation_reward_func": 0.779464315623045, "rewards/format_reward_func": 0.9687500149011612, "step": 6538 }, { "completion_length": 281.23662281036377, "epoch": 1.096567333081856, "grad_norm": 0.28880568251512756, "kl": 0.977142333984375, "learning_rate": 4.995540411171805e-07, "loss": 0.001, "reward": 1.767857201397419, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.776785746216774, "rewards/format_reward_func": 0.9910714328289032, "step": 6540 }, { "completion_length": 289.55358505249023, "epoch": 1.0969026363217234, "grad_norm": 0.022334262039546134, "kl": 0.261566162109375, "learning_rate": 4.99553324643152e-07, "loss": 0.0003, "reward": 1.757142923772335, "reward_std": 0.04040610231459141, "rewards/equation_reward_func": 0.7660714536905289, "rewards/format_reward_func": 0.9910714328289032, "step": 6542 }, { "completion_length": 285.9241199493408, "epoch": 1.097237939561591, "grad_norm": 0.30625968284332616, "kl": 0.1597900390625, "learning_rate": 4.995526075945593e-07, "loss": 0.0002, "reward": 1.7553571835160255, "reward_std": 0.09343910869210958, "rewards/equation_reward_func": 0.7776786051690578, "rewards/format_reward_func": 0.977678582072258, "step": 6544 }, { "completion_length": 279.4241199493408, "epoch": 1.0975732428014586, "grad_norm": 0.4419334505325685, "kl": 0.6954498291015625, "learning_rate": 4.995518899714039e-07, "loss": 0.0007, "reward": 1.685714341700077, "reward_std": 0.07071067858487368, "rewards/equation_reward_func": 0.7035714499652386, "rewards/format_reward_func": 0.9821428656578064, "step": 6546 }, { "completion_length": 286.0134086608887, "epoch": 1.097908546041326, "grad_norm": 0.33027778382925066, "kl": 0.388275146484375, "learning_rate": 4.995511717736875e-07, "loss": 0.0004, "reward": 1.73214291036129, "reward_std": 0.11616754066199064, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 0.9821428656578064, "step": 6548 }, { "completion_length": 283.43750953674316, "epoch": 1.0982438492811937, "grad_norm": 0.21767882315718848, "kl": 0.23004150390625, "learning_rate": 4.995504530014117e-07, "loss": 0.0002, "reward": 1.7500000670552254, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7589286006987095, "rewards/format_reward_func": 0.9910714328289032, "step": 6550 }, { "completion_length": 281.70983123779297, "epoch": 1.0985791525210613, "grad_norm": 0.3029158393739477, "kl": 0.1859130859375, "learning_rate": 4.995497336545783e-07, "loss": 0.0002, "reward": 1.7410715073347092, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7455357611179352, "rewards/format_reward_func": 0.9955357164144516, "step": 6552 }, { "completion_length": 270.4107246398926, "epoch": 1.0989144557609287, "grad_norm": 0.37787958558581103, "kl": 0.186492919921875, "learning_rate": 4.995490137331889e-07, "loss": 0.0002, "reward": 1.7946429178118706, "reward_std": 0.08838834706693888, "rewards/equation_reward_func": 0.8080357275903225, "rewards/format_reward_func": 0.9866071492433548, "step": 6554 }, { "completion_length": 288.4419765472412, "epoch": 1.0992497590007964, "grad_norm": 0.26048348704828717, "kl": 0.204254150390625, "learning_rate": 4.995482932372451e-07, "loss": 0.0002, "reward": 1.698214367032051, "reward_std": 0.12879444938153028, "rewards/equation_reward_func": 0.7294643111526966, "rewards/format_reward_func": 0.9687500111758709, "step": 6556 }, { "completion_length": 261.23662090301514, "epoch": 1.099585062240664, "grad_norm": 0.551212163479299, "kl": 0.3712005615234375, "learning_rate": 4.995475721667486e-07, "loss": 0.0004, "reward": 1.7089286148548126, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7223214562982321, "rewards/format_reward_func": 0.9866071492433548, "step": 6558 }, { "completion_length": 249.2544755935669, "epoch": 1.0999203654805314, "grad_norm": 0.20253441295934366, "kl": 0.2565765380859375, "learning_rate": 4.995468505217011e-07, "loss": 0.0003, "reward": 1.769642911851406, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7741071656346321, "rewards/format_reward_func": 0.9955357164144516, "step": 6560 }, { "completion_length": 258.9598340988159, "epoch": 1.100255668720399, "grad_norm": 0.21418411741040597, "kl": 0.1841278076171875, "learning_rate": 4.995461283021043e-07, "loss": 0.0002, "reward": 1.6607143729925156, "reward_std": 0.0656599160283804, "rewards/equation_reward_func": 0.6785714607685804, "rewards/format_reward_func": 0.9821428656578064, "step": 6562 }, { "completion_length": 253.91519165039062, "epoch": 1.1005909719602665, "grad_norm": 0.2070990208135219, "kl": 0.144287109375, "learning_rate": 4.995454055079597e-07, "loss": 0.0001, "reward": 1.721428632736206, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7214286178350449, "rewards/format_reward_func": 1.0, "step": 6564 }, { "completion_length": 246.5803689956665, "epoch": 1.100926275200134, "grad_norm": 0.13879373930903302, "kl": 0.237457275390625, "learning_rate": 4.995446821392691e-07, "loss": 0.0002, "reward": 1.7857143431901932, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.785714328289032, "rewards/format_reward_func": 1.0, "step": 6566 }, { "completion_length": 242.7142972946167, "epoch": 1.1012615784400017, "grad_norm": 0.26167857128278654, "kl": 0.146392822265625, "learning_rate": 4.99543958196034e-07, "loss": 0.0001, "reward": 1.7285715118050575, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.728571455925703, "rewards/format_reward_func": 1.0, "step": 6568 }, { "completion_length": 248.7500123977661, "epoch": 1.1015968816798691, "grad_norm": 0.15531658303917262, "kl": 0.11407470703125, "learning_rate": 4.995432336782563e-07, "loss": 0.0001, "reward": 1.7660714983940125, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9955357164144516, "step": 6570 }, { "completion_length": 235.06251049041748, "epoch": 1.1019321849197368, "grad_norm": 0.8186170595241872, "kl": 0.1658935546875, "learning_rate": 4.995425085859375e-07, "loss": 0.0002, "reward": 1.8142857775092125, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8142857439815998, "rewards/format_reward_func": 1.0, "step": 6572 }, { "completion_length": 239.82590293884277, "epoch": 1.1022674881596044, "grad_norm": 0.06946096262257632, "kl": 0.1508331298828125, "learning_rate": 4.995417829190793e-07, "loss": 0.0002, "reward": 1.8464286103844643, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8464285880327225, "rewards/format_reward_func": 1.0, "step": 6574 }, { "completion_length": 241.59375953674316, "epoch": 1.1026027913994718, "grad_norm": 0.3313378435357141, "kl": 0.213287353515625, "learning_rate": 4.995410566776835e-07, "loss": 0.0002, "reward": 1.703571505844593, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7035714667290449, "rewards/format_reward_func": 1.0, "step": 6576 }, { "completion_length": 233.88394165039062, "epoch": 1.1029380946393395, "grad_norm": 0.16843392243853206, "kl": 0.14697265625, "learning_rate": 4.995403298617516e-07, "loss": 0.0001, "reward": 1.767857201397419, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.767857164144516, "rewards/format_reward_func": 1.0, "step": 6578 }, { "completion_length": 239.76786994934082, "epoch": 1.103273397879207, "grad_norm": 0.3046427178111775, "kl": 0.140228271484375, "learning_rate": 4.995396024712853e-07, "loss": 0.0001, "reward": 1.7250000834465027, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7250000331550837, "rewards/format_reward_func": 1.0, "step": 6580 }, { "completion_length": 243.0803680419922, "epoch": 1.1036087011190745, "grad_norm": 0.21699272384070514, "kl": 0.19512939453125, "learning_rate": 4.995388745062864e-07, "loss": 0.0002, "reward": 1.7678572162985802, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571604192257, "rewards/format_reward_func": 1.0, "step": 6582 }, { "completion_length": 245.22769165039062, "epoch": 1.1039440043589421, "grad_norm": 0.1718854405192041, "kl": 0.1443328857421875, "learning_rate": 4.995381459667564e-07, "loss": 0.0001, "reward": 1.7428572326898575, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571730852127, "rewards/format_reward_func": 1.0, "step": 6584 }, { "completion_length": 234.7098331451416, "epoch": 1.1042793075988098, "grad_norm": 0.22912566502299336, "kl": 0.168731689453125, "learning_rate": 4.995374168526971e-07, "loss": 0.0002, "reward": 1.7571429163217545, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 6586 }, { "completion_length": 235.71429634094238, "epoch": 1.1046146108386772, "grad_norm": 0.1733917636861512, "kl": 0.141448974609375, "learning_rate": 4.995366871641101e-07, "loss": 0.0001, "reward": 1.7982143312692642, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8026785962283611, "rewards/format_reward_func": 0.9955357164144516, "step": 6588 }, { "completion_length": 242.3392972946167, "epoch": 1.1049499140785448, "grad_norm": 0.3394077171058364, "kl": 0.211273193359375, "learning_rate": 4.995359569009972e-07, "loss": 0.0002, "reward": 1.7125000953674316, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7169643230736256, "rewards/format_reward_func": 0.9955357164144516, "step": 6590 }, { "completion_length": 252.6875123977661, "epoch": 1.1052852173184124, "grad_norm": 0.43081293111981867, "kl": 0.11590576171875, "learning_rate": 4.995352260633601e-07, "loss": 0.0001, "reward": 1.7285715192556381, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7285714484751225, "rewards/format_reward_func": 1.0, "step": 6592 }, { "completion_length": 258.4330472946167, "epoch": 1.1056205205582799, "grad_norm": 0.21628723638974706, "kl": 0.13763427734375, "learning_rate": 4.995344946512002e-07, "loss": 0.0001, "reward": 1.7392858043313026, "reward_std": 0.05555839091539383, "rewards/equation_reward_func": 0.7482143007218838, "rewards/format_reward_func": 0.9910714328289032, "step": 6594 }, { "completion_length": 248.11608123779297, "epoch": 1.1059558237981475, "grad_norm": 0.5966538770183953, "kl": 0.261688232421875, "learning_rate": 4.995337626645195e-07, "loss": 0.0003, "reward": 1.7875000685453415, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.791964303702116, "rewards/format_reward_func": 0.9955357164144516, "step": 6596 }, { "completion_length": 245.98215103149414, "epoch": 1.106291127038015, "grad_norm": 0.157717130031815, "kl": 0.097625732421875, "learning_rate": 4.995330301033195e-07, "loss": 0.0001, "reward": 1.7625000700354576, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7669643107801676, "rewards/format_reward_func": 0.9955357164144516, "step": 6598 }, { "completion_length": 248.59376335144043, "epoch": 1.1066264302778825, "grad_norm": 0.13235538653295403, "kl": 0.130096435546875, "learning_rate": 4.995322969676019e-07, "loss": 0.0001, "reward": 1.719642922282219, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7241071723401546, "rewards/format_reward_func": 0.9955357164144516, "step": 6600 }, { "completion_length": 255.5089406967163, "epoch": 1.1069617335177502, "grad_norm": 0.3280297518641213, "kl": 0.145782470703125, "learning_rate": 4.995315632573684e-07, "loss": 0.0001, "reward": 1.6982143893837929, "reward_std": 0.11364216171205044, "rewards/equation_reward_func": 0.7205357421189547, "rewards/format_reward_func": 0.977678582072258, "step": 6602 }, { "completion_length": 270.5000104904175, "epoch": 1.1072970367576176, "grad_norm": 0.9859893448344605, "kl": 0.204925537109375, "learning_rate": 4.995308289726209e-07, "loss": 0.0002, "reward": 1.7214286476373672, "reward_std": 0.09091373067349195, "rewards/equation_reward_func": 0.7392857521772385, "rewards/format_reward_func": 0.9821428656578064, "step": 6604 }, { "completion_length": 254.0848331451416, "epoch": 1.1076323399974852, "grad_norm": 0.6059061751603121, "kl": 0.307647705078125, "learning_rate": 4.995300941133608e-07, "loss": 0.0003, "reward": 1.7660714983940125, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7705357372760773, "rewards/format_reward_func": 0.9955357164144516, "step": 6606 }, { "completion_length": 274.86608600616455, "epoch": 1.1079676432373529, "grad_norm": 0.2292386723821532, "kl": 0.272247314453125, "learning_rate": 4.995293586795899e-07, "loss": 0.0003, "reward": 1.7553572282195091, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.759821455925703, "rewards/format_reward_func": 0.9955357164144516, "step": 6608 }, { "completion_length": 261.2946557998657, "epoch": 1.1083029464772203, "grad_norm": 0.2272666943279464, "kl": 0.20709228515625, "learning_rate": 4.995286226713099e-07, "loss": 0.0002, "reward": 1.7071429565548897, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7160714566707611, "rewards/format_reward_func": 0.9910714328289032, "step": 6610 }, { "completion_length": 253.86608123779297, "epoch": 1.108638249717088, "grad_norm": 0.4846628000520213, "kl": 0.3332977294921875, "learning_rate": 4.995278860885225e-07, "loss": 0.0003, "reward": 1.6946429386734962, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.6991071794182062, "rewards/format_reward_func": 0.9955357164144516, "step": 6612 }, { "completion_length": 254.2321538925171, "epoch": 1.1089735529569555, "grad_norm": 0.16874754553226004, "kl": 0.1689453125, "learning_rate": 4.995271489312294e-07, "loss": 0.0002, "reward": 1.7933036237955093, "reward_std": 0.05997780850157142, "rewards/equation_reward_func": 0.808035746216774, "rewards/format_reward_func": 0.9852678664028645, "step": 6614 }, { "completion_length": 246.46429538726807, "epoch": 1.109308856196823, "grad_norm": 0.2300480690557061, "kl": 0.7288818359375, "learning_rate": 4.995264111994322e-07, "loss": 0.0007, "reward": 1.7285714969038963, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7375000342726707, "rewards/format_reward_func": 0.9910714328289032, "step": 6616 }, { "completion_length": 255.94197463989258, "epoch": 1.1096441594366906, "grad_norm": 0.2096849116943943, "kl": 1.5742645263671875, "learning_rate": 4.995256728931328e-07, "loss": 0.0016, "reward": 1.7254464998841286, "reward_std": 0.054927044780924916, "rewards/equation_reward_func": 0.7401785962283611, "rewards/format_reward_func": 0.9852678664028645, "step": 6618 }, { "completion_length": 253.44197273254395, "epoch": 1.109979462676558, "grad_norm": 0.24599453467498117, "kl": 0.540283203125, "learning_rate": 4.995249340123328e-07, "loss": 0.0005, "reward": 1.801785759627819, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8062500283122063, "rewards/format_reward_func": 0.9955357164144516, "step": 6620 }, { "completion_length": 254.8750114440918, "epoch": 1.1103147659164256, "grad_norm": 0.3624082535916845, "kl": 0.9485015869140625, "learning_rate": 4.995241945570339e-07, "loss": 0.001, "reward": 1.7517857775092125, "reward_std": 0.1287944484502077, "rewards/equation_reward_func": 0.7830357439815998, "rewards/format_reward_func": 0.9687500149011612, "step": 6622 }, { "completion_length": 249.16072463989258, "epoch": 1.1106500691562933, "grad_norm": 0.22190804563509325, "kl": 0.233489990234375, "learning_rate": 4.995234545272377e-07, "loss": 0.0002, "reward": 1.7892858013510704, "reward_std": 0.08586296439170837, "rewards/equation_reward_func": 0.7982143126428127, "rewards/format_reward_func": 0.9910714328289032, "step": 6624 }, { "completion_length": 266.75894355773926, "epoch": 1.1109853723961607, "grad_norm": 0.2184835683267264, "kl": 0.2601318359375, "learning_rate": 4.99522713922946e-07, "loss": 0.0003, "reward": 1.6803572326898575, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.684821480885148, "rewards/format_reward_func": 0.9955357164144516, "step": 6626 }, { "completion_length": 258.70536518096924, "epoch": 1.1113206756360283, "grad_norm": 0.23099437396258055, "kl": 0.1270751953125, "learning_rate": 4.995219727441605e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.07576144114136696, "rewards/equation_reward_func": 0.7910714522004128, "rewards/format_reward_func": 0.9910714328289032, "step": 6628 }, { "completion_length": 249.26340579986572, "epoch": 1.111655978875896, "grad_norm": 0.15913812225492693, "kl": 0.1038665771484375, "learning_rate": 4.995212309908829e-07, "loss": 0.0001, "reward": 1.7321429029107094, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7500000242143869, "rewards/format_reward_func": 0.9821428619325161, "step": 6630 }, { "completion_length": 255.02233505249023, "epoch": 1.1119912821157634, "grad_norm": 0.1314757051272552, "kl": 0.1993408203125, "learning_rate": 4.995204886631149e-07, "loss": 0.0002, "reward": 1.753571517765522, "reward_std": 0.05555839091539383, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 0.9821428656578064, "step": 6632 }, { "completion_length": 251.0848331451416, "epoch": 1.112326585355631, "grad_norm": 0.32806559142892566, "kl": 0.133941650390625, "learning_rate": 4.995197457608582e-07, "loss": 0.0001, "reward": 1.7714286297559738, "reward_std": 0.10101525485515594, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 0.9821428656578064, "step": 6634 }, { "completion_length": 255.01787281036377, "epoch": 1.1126618885954986, "grad_norm": 0.16793208241146898, "kl": 0.108489990234375, "learning_rate": 4.995190022841146e-07, "loss": 0.0001, "reward": 1.7500000447034836, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7589286044239998, "rewards/format_reward_func": 0.9910714328289032, "step": 6636 }, { "completion_length": 262.2142972946167, "epoch": 1.112997191835366, "grad_norm": 0.08044673225781292, "kl": 0.131439208984375, "learning_rate": 4.995182582328858e-07, "loss": 0.0001, "reward": 1.7125000804662704, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7169643100351095, "rewards/format_reward_func": 0.9955357164144516, "step": 6638 }, { "completion_length": 256.00001335144043, "epoch": 1.1133324950752337, "grad_norm": 0.20721738332140918, "kl": 0.11676025390625, "learning_rate": 4.995175136071733e-07, "loss": 0.0001, "reward": 1.7482143491506577, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.752678606659174, "rewards/format_reward_func": 0.9955357164144516, "step": 6640 }, { "completion_length": 246.9419765472412, "epoch": 1.1136677983151013, "grad_norm": 0.24787722237396217, "kl": 0.095733642578125, "learning_rate": 4.99516768406979e-07, "loss": 0.0001, "reward": 1.7982143387198448, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8026786036789417, "rewards/format_reward_func": 0.9955357164144516, "step": 6642 }, { "completion_length": 241.3437614440918, "epoch": 1.1140031015549687, "grad_norm": 0.4045079891732805, "kl": 0.08056640625, "learning_rate": 4.995160226323046e-07, "loss": 0.0001, "reward": 1.7803572192788124, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7848214581608772, "rewards/format_reward_func": 0.9955357164144516, "step": 6644 }, { "completion_length": 245.88840198516846, "epoch": 1.1143384047948364, "grad_norm": 0.28886896563636494, "kl": 0.080047607421875, "learning_rate": 4.99515276283152e-07, "loss": 0.0001, "reward": 1.7580358013510704, "reward_std": 0.08965103607624769, "rewards/equation_reward_func": 0.7598214596509933, "rewards/format_reward_func": 0.9982142895460129, "step": 6646 }, { "completion_length": 260.1384038925171, "epoch": 1.114673708034704, "grad_norm": 0.18604455411456783, "kl": 0.103424072265625, "learning_rate": 4.995145293595224e-07, "loss": 0.0001, "reward": 1.7375000938773155, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7419643178582191, "rewards/format_reward_func": 0.9955357164144516, "step": 6648 }, { "completion_length": 244.8169765472412, "epoch": 1.1150090112745714, "grad_norm": 0.13081924970963807, "kl": 0.073028564453125, "learning_rate": 4.995137818614181e-07, "loss": 0.0001, "reward": 1.8642857670783997, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8642857372760773, "rewards/format_reward_func": 1.0, "step": 6650 }, { "completion_length": 256.6294765472412, "epoch": 1.115344314514439, "grad_norm": 0.2613255092581983, "kl": 0.088592529296875, "learning_rate": 4.995130337888404e-07, "loss": 0.0001, "reward": 1.7678571864962578, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.776785746216774, "rewards/format_reward_func": 0.9910714328289032, "step": 6652 }, { "completion_length": 253.9241189956665, "epoch": 1.1156796177543065, "grad_norm": 0.29768289237812334, "kl": 0.07568359375, "learning_rate": 4.995122851417912e-07, "loss": 0.0001, "reward": 1.708928681910038, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7223214525729418, "rewards/format_reward_func": 0.9866071492433548, "step": 6654 }, { "completion_length": 242.65625858306885, "epoch": 1.116014920994174, "grad_norm": 0.2506214712263386, "kl": 0.1123504638671875, "learning_rate": 4.995115359202722e-07, "loss": 0.0001, "reward": 1.7214286476373672, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.72142861969769, "rewards/format_reward_func": 1.0, "step": 6656 }, { "completion_length": 253.31251049041748, "epoch": 1.1163502242340417, "grad_norm": 0.2593847018803885, "kl": 0.1129302978515625, "learning_rate": 4.995107861242852e-07, "loss": 0.0001, "reward": 1.7660714983940125, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7705357447266579, "rewards/format_reward_func": 0.9955357164144516, "step": 6658 }, { "completion_length": 255.15179824829102, "epoch": 1.1166855274739091, "grad_norm": 0.27640892602323935, "kl": 0.07427978515625, "learning_rate": 4.995100357538319e-07, "loss": 0.0001, "reward": 1.7214286625385284, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7214285992085934, "rewards/format_reward_func": 1.0, "step": 6660 }, { "completion_length": 258.34376335144043, "epoch": 1.1170208307137768, "grad_norm": 0.2795999240184237, "kl": 0.152252197265625, "learning_rate": 4.995092848089139e-07, "loss": 0.0002, "reward": 1.7357143759727478, "reward_std": 0.1010152529925108, "rewards/equation_reward_func": 0.7446428909897804, "rewards/format_reward_func": 0.9910714328289032, "step": 6662 }, { "completion_length": 248.71876430511475, "epoch": 1.1173561339536444, "grad_norm": 0.2088474635895226, "kl": 0.067962646484375, "learning_rate": 4.99508533289533e-07, "loss": 0.0001, "reward": 1.6928572282195091, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7017857655882835, "rewards/format_reward_func": 0.9910714328289032, "step": 6664 }, { "completion_length": 260.60715675354004, "epoch": 1.1176914371935118, "grad_norm": 0.23495409011200794, "kl": 0.107940673828125, "learning_rate": 4.99507781195691e-07, "loss": 0.0001, "reward": 1.7410714775323868, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7544643208384514, "rewards/format_reward_func": 0.9866071492433548, "step": 6666 }, { "completion_length": 258.20537185668945, "epoch": 1.1180267404333795, "grad_norm": 0.3012901568012722, "kl": 0.18408203125, "learning_rate": 4.995070285273895e-07, "loss": 0.0002, "reward": 1.7660714834928513, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.779464315623045, "rewards/format_reward_func": 0.9866071492433548, "step": 6668 }, { "completion_length": 267.14287090301514, "epoch": 1.118362043673247, "grad_norm": 0.12807288181741017, "kl": 0.08056640625, "learning_rate": 4.995062752846304e-07, "loss": 0.0001, "reward": 1.7303572297096252, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.743750024586916, "rewards/format_reward_func": 0.9866071492433548, "step": 6670 }, { "completion_length": 254.95983219146729, "epoch": 1.1186973469131145, "grad_norm": 0.27471962269330896, "kl": 0.365966796875, "learning_rate": 4.995055214674153e-07, "loss": 0.0004, "reward": 1.7339286282658577, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7473214566707611, "rewards/format_reward_func": 0.9866071492433548, "step": 6672 }, { "completion_length": 245.0714406967163, "epoch": 1.1190326501529821, "grad_norm": 0.445086464766825, "kl": 0.2374267578125, "learning_rate": 4.99504767075746e-07, "loss": 0.0002, "reward": 1.7821429297327995, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428701281548, "rewards/format_reward_func": 1.0, "step": 6674 }, { "completion_length": 258.39286613464355, "epoch": 1.1193679533928496, "grad_norm": 0.41685696367688935, "kl": 0.120513916015625, "learning_rate": 4.995040121096243e-07, "loss": 0.0001, "reward": 1.7589286342263222, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7633928917348385, "rewards/format_reward_func": 0.9955357164144516, "step": 6676 }, { "completion_length": 242.99554538726807, "epoch": 1.1197032566327172, "grad_norm": 0.40514511717107204, "kl": 0.22406005859375, "learning_rate": 4.995032565690517e-07, "loss": 0.0002, "reward": 1.7160715088248253, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7205357495695353, "rewards/format_reward_func": 0.9955357164144516, "step": 6678 }, { "completion_length": 240.9821538925171, "epoch": 1.1200385598725848, "grad_norm": 0.46798781534682443, "kl": 0.207855224609375, "learning_rate": 4.995025004540301e-07, "loss": 0.0002, "reward": 1.7892857640981674, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857491970062, "rewards/format_reward_func": 1.0, "step": 6680 }, { "completion_length": 256.11161708831787, "epoch": 1.1203738631124522, "grad_norm": 0.6230530844540287, "kl": 0.28582763671875, "learning_rate": 4.995017437645613e-07, "loss": 0.0003, "reward": 1.770089365541935, "reward_std": 0.09280776605010033, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 0.9808035790920258, "step": 6682 }, { "completion_length": 236.52679634094238, "epoch": 1.1207091663523199, "grad_norm": 2.168458827730386, "kl": 0.2948150634765625, "learning_rate": 4.99500986500647e-07, "loss": 0.0003, "reward": 1.7080358043313026, "reward_std": 0.04924493608996272, "rewards/equation_reward_func": 0.7107143364846706, "rewards/format_reward_func": 0.9973214343190193, "step": 6684 }, { "completion_length": 235.4553680419922, "epoch": 1.1210444695921875, "grad_norm": 0.5230753506181978, "kl": 0.3092041015625, "learning_rate": 4.995002286622888e-07, "loss": 0.0003, "reward": 1.7428572252392769, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7517857402563095, "rewards/format_reward_func": 0.9910714328289032, "step": 6686 }, { "completion_length": 263.477689743042, "epoch": 1.121379772832055, "grad_norm": 2.2929950926167426, "kl": 1.7777099609375, "learning_rate": 4.994994702494887e-07, "loss": 0.0018, "reward": 1.7089285999536514, "reward_std": 0.1439467379823327, "rewards/equation_reward_func": 0.7580357454717159, "rewards/format_reward_func": 0.9508928805589676, "step": 6688 }, { "completion_length": 332.70983505249023, "epoch": 1.1217150760719226, "grad_norm": 2.489929363328956, "kl": 2.840087890625, "learning_rate": 4.994987112622483e-07, "loss": 0.0028, "reward": 1.1937500424683094, "reward_std": 0.4482551934197545, "rewards/equation_reward_func": 0.535714304074645, "rewards/format_reward_func": 0.6580357477068901, "step": 6690 }, { "completion_length": 297.50001430511475, "epoch": 1.1220503793117902, "grad_norm": 2.5227119840513015, "kl": 1.386444091796875, "learning_rate": 4.994979517005693e-07, "loss": 0.0014, "reward": 1.365625061094761, "reward_std": 0.3276682370342314, "rewards/equation_reward_func": 0.6062500178813934, "rewards/format_reward_func": 0.7593750320374966, "step": 6692 }, { "completion_length": 292.82590770721436, "epoch": 1.1223856825516576, "grad_norm": 1.663920193770367, "kl": 1.7261810302734375, "learning_rate": 4.994971915644536e-07, "loss": 0.0017, "reward": 1.4058036357164383, "reward_std": 0.3415578296408057, "rewards/equation_reward_func": 0.6482142992317677, "rewards/format_reward_func": 0.7575893234461546, "step": 6694 }, { "completion_length": 314.8214454650879, "epoch": 1.1227209857915252, "grad_norm": 1.2900474738091785, "kl": 1.621734619140625, "learning_rate": 4.994964308539027e-07, "loss": 0.0016, "reward": 1.278571479022503, "reward_std": 0.3838579710572958, "rewards/equation_reward_func": 0.5732143018394709, "rewards/format_reward_func": 0.7053571790456772, "step": 6696 }, { "completion_length": 272.94197368621826, "epoch": 1.1230562890313927, "grad_norm": 1.9977503024608492, "kl": 10.36822509765625, "learning_rate": 4.994956695689187e-07, "loss": 0.0104, "reward": 1.4410714656114578, "reward_std": 0.3055711481720209, "rewards/equation_reward_func": 0.6151786055415869, "rewards/format_reward_func": 0.8258929029107094, "step": 6698 }, { "completion_length": 259.0178699493408, "epoch": 1.1233915922712603, "grad_norm": 1.0655966422208405, "kl": 8.722640991210938, "learning_rate": 4.99494907709503e-07, "loss": 0.0087, "reward": 1.5750000774860382, "reward_std": 0.2676904257386923, "rewards/equation_reward_func": 0.7178571633994579, "rewards/format_reward_func": 0.8571428880095482, "step": 6700 }, { "completion_length": 249.94197750091553, "epoch": 1.123726895511128, "grad_norm": 1.1186796278129825, "kl": 7.51885986328125, "learning_rate": 4.994941452756575e-07, "loss": 0.0075, "reward": 1.6553571745753288, "reward_std": 0.18940360005944967, "rewards/equation_reward_func": 0.7491071708500385, "rewards/format_reward_func": 0.906250037252903, "step": 6702 }, { "completion_length": 241.90626335144043, "epoch": 1.1240621987509953, "grad_norm": 0.26440510902701225, "kl": 0.41455078125, "learning_rate": 4.994933822673841e-07, "loss": 0.0004, "reward": 1.6441965103149414, "reward_std": 0.0839689327403903, "rewards/equation_reward_func": 0.7053571715950966, "rewards/format_reward_func": 0.9388393126428127, "step": 6704 }, { "completion_length": 233.44197463989258, "epoch": 1.124397501990863, "grad_norm": 0.40772070413726497, "kl": 0.08148193359375, "learning_rate": 4.994926186846844e-07, "loss": 0.0001, "reward": 1.6924107670783997, "reward_std": 0.08270624093711376, "rewards/equation_reward_func": 0.7321428768336773, "rewards/format_reward_func": 0.9602678790688515, "step": 6706 }, { "completion_length": 224.9196548461914, "epoch": 1.1247328052307306, "grad_norm": 0.2948453419333735, "kl": 0.0688323974609375, "learning_rate": 4.994918545275601e-07, "loss": 0.0001, "reward": 1.7857143357396126, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.812500037252903, "rewards/format_reward_func": 0.9732142984867096, "step": 6708 }, { "completion_length": 232.6696538925171, "epoch": 1.125068108470598, "grad_norm": 0.23710215383380753, "kl": 0.06982421875, "learning_rate": 4.994910897960132e-07, "loss": 0.0001, "reward": 1.7214286401867867, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7571428790688515, "rewards/format_reward_func": 0.9642857238650322, "step": 6710 }, { "completion_length": 229.37947463989258, "epoch": 1.1254034117104657, "grad_norm": 0.28880507537327116, "kl": 0.083526611328125, "learning_rate": 4.994903244900453e-07, "loss": 0.0001, "reward": 1.7000000774860382, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7357143275439739, "rewards/format_reward_func": 0.9642857313156128, "step": 6712 }, { "completion_length": 234.78572368621826, "epoch": 1.1257387149503333, "grad_norm": 0.49477930676249415, "kl": 0.129913330078125, "learning_rate": 4.994895586096581e-07, "loss": 0.0001, "reward": 1.6857143267989159, "reward_std": 0.12626906856894493, "rewards/equation_reward_func": 0.7392857354134321, "rewards/format_reward_func": 0.9464285969734192, "step": 6714 }, { "completion_length": 234.7366180419922, "epoch": 1.1260740181902007, "grad_norm": 0.2093629175985944, "kl": 0.0768280029296875, "learning_rate": 4.994887921548533e-07, "loss": 0.0001, "reward": 1.7267857566475868, "reward_std": 0.10354063659906387, "rewards/equation_reward_func": 0.7758928947150707, "rewards/format_reward_func": 0.950892873108387, "step": 6716 }, { "completion_length": 226.94643878936768, "epoch": 1.1264093214300683, "grad_norm": 0.27764295574443226, "kl": 0.0676422119140625, "learning_rate": 4.994880251256329e-07, "loss": 0.0001, "reward": 1.6910715103149414, "reward_std": 0.07828682195395231, "rewards/equation_reward_func": 0.713392885401845, "rewards/format_reward_func": 0.9776785783469677, "step": 6718 }, { "completion_length": 252.6562614440918, "epoch": 1.126744624669936, "grad_norm": 0.7163660640986659, "kl": 0.150604248046875, "learning_rate": 4.994872575219986e-07, "loss": 0.0002, "reward": 1.6642857491970062, "reward_std": 0.10606601648032665, "rewards/equation_reward_func": 0.7267857491970062, "rewards/format_reward_func": 0.9375000186264515, "step": 6720 }, { "completion_length": 240.53572368621826, "epoch": 1.1270799279098034, "grad_norm": 0.23950711514360806, "kl": 0.1202392578125, "learning_rate": 4.994864893439521e-07, "loss": 0.0001, "reward": 1.7178572043776512, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7446428909897804, "rewards/format_reward_func": 0.9732142984867096, "step": 6722 }, { "completion_length": 229.9866189956665, "epoch": 1.127415231149671, "grad_norm": 0.7546838926343592, "kl": 0.2407379150390625, "learning_rate": 4.994857205914952e-07, "loss": 0.0002, "reward": 1.6946429088711739, "reward_std": 0.13384521193802357, "rewards/equation_reward_func": 0.7437500320374966, "rewards/format_reward_func": 0.9508928768336773, "step": 6724 }, { "completion_length": 234.6026906967163, "epoch": 1.1277505343895387, "grad_norm": 0.2525886081067382, "kl": 0.0670013427734375, "learning_rate": 4.994849512646297e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.1060660183429718, "rewards/equation_reward_func": 0.7875000350177288, "rewards/format_reward_func": 0.9732142984867096, "step": 6726 }, { "completion_length": 227.79911708831787, "epoch": 1.128085837629406, "grad_norm": 0.3302151167549251, "kl": 0.068359375, "learning_rate": 4.994841813633574e-07, "loss": 0.0001, "reward": 1.73214291036129, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7410714589059353, "rewards/format_reward_func": 0.9910714328289032, "step": 6728 }, { "completion_length": 232.83036708831787, "epoch": 1.1284211408692737, "grad_norm": 0.27840527237917856, "kl": 0.071990966796875, "learning_rate": 4.994834108876798e-07, "loss": 0.0001, "reward": 1.734375037252903, "reward_std": 0.0738674052990973, "rewards/equation_reward_func": 0.7625000439584255, "rewards/format_reward_func": 0.9718750081956387, "step": 6730 }, { "completion_length": 229.79911708831787, "epoch": 1.128756444109141, "grad_norm": 0.37449317940935545, "kl": 0.0745849609375, "learning_rate": 4.994826398375991e-07, "loss": 0.0001, "reward": 1.786160796880722, "reward_std": 0.09028238197788596, "rewards/equation_reward_func": 0.7919643148779869, "rewards/format_reward_func": 0.9941964335739613, "step": 6732 }, { "completion_length": 231.52233123779297, "epoch": 1.1290917473490087, "grad_norm": 0.4020724354337358, "kl": 0.074310302734375, "learning_rate": 4.994818682131168e-07, "loss": 0.0001, "reward": 1.8138393461704254, "reward_std": 0.061240497045218945, "rewards/equation_reward_func": 0.8178571611642838, "rewards/format_reward_func": 0.995982151478529, "step": 6734 }, { "completion_length": 226.665189743042, "epoch": 1.1294270505888764, "grad_norm": 0.21480784890133833, "kl": 0.0719451904296875, "learning_rate": 4.994810960142347e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7625000327825546, "rewards/format_reward_func": 0.9910714328289032, "step": 6736 }, { "completion_length": 237.665189743042, "epoch": 1.1297623538287438, "grad_norm": 0.42019404209191996, "kl": 0.0841827392578125, "learning_rate": 4.994803232409546e-07, "loss": 0.0001, "reward": 1.6464286521077156, "reward_std": 0.1414213553071022, "rewards/equation_reward_func": 0.6821428798139095, "rewards/format_reward_func": 0.9642857313156128, "step": 6738 }, { "completion_length": 235.20536708831787, "epoch": 1.1300976570686114, "grad_norm": 0.22061191284016354, "kl": 0.0779876708984375, "learning_rate": 4.994795498932784e-07, "loss": 0.0001, "reward": 1.7214286476373672, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7392857298254967, "rewards/format_reward_func": 0.9821428656578064, "step": 6740 }, { "completion_length": 236.31251049041748, "epoch": 1.130432960308479, "grad_norm": 0.26718414308599536, "kl": 1.054351806640625, "learning_rate": 4.994787759712077e-07, "loss": 0.0011, "reward": 1.7089286297559738, "reward_std": 0.108591397292912, "rewards/equation_reward_func": 0.7312500327825546, "rewards/format_reward_func": 0.977678582072258, "step": 6742 }, { "completion_length": 230.5357255935669, "epoch": 1.1307682635483465, "grad_norm": 0.30451959283162633, "kl": 0.08184814453125, "learning_rate": 4.994780014747444e-07, "loss": 0.0001, "reward": 1.764285758137703, "reward_std": 0.08586296439170837, "rewards/equation_reward_func": 0.7821429036557674, "rewards/format_reward_func": 0.9821428656578064, "step": 6744 }, { "completion_length": 240.34822845458984, "epoch": 1.131103566788214, "grad_norm": 0.2806823903013915, "kl": 0.736236572265625, "learning_rate": 4.994772264038901e-07, "loss": 0.0007, "reward": 1.7303572371602058, "reward_std": 0.0833375845104456, "rewards/equation_reward_func": 0.7616071701049805, "rewards/format_reward_func": 0.9687500149011612, "step": 6746 }, { "completion_length": 242.71429538726807, "epoch": 1.1314388700280817, "grad_norm": 0.30587482030970475, "kl": 0.078155517578125, "learning_rate": 4.994764507586468e-07, "loss": 0.0001, "reward": 1.7696429267525673, "reward_std": 0.09343911148607731, "rewards/equation_reward_func": 0.7919643111526966, "rewards/format_reward_func": 0.9776785783469677, "step": 6748 }, { "completion_length": 220.5759038925171, "epoch": 1.1317741732679492, "grad_norm": 0.12420241190982315, "kl": 0.0816192626953125, "learning_rate": 4.994756745390163e-07, "loss": 0.0001, "reward": 1.7285714894533157, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7464285995811224, "rewards/format_reward_func": 0.9821428656578064, "step": 6750 }, { "completion_length": 229.2544755935669, "epoch": 1.1321094765078168, "grad_norm": 0.24521204153039233, "kl": 0.078582763671875, "learning_rate": 4.994748977450002e-07, "loss": 0.0001, "reward": 1.7053572088479996, "reward_std": 0.10354063659906387, "rewards/equation_reward_func": 0.7455357387661934, "rewards/format_reward_func": 0.9598214440047741, "step": 6752 }, { "completion_length": 236.79465293884277, "epoch": 1.1324447797476842, "grad_norm": 0.2621989103061339, "kl": 0.086334228515625, "learning_rate": 4.994741203766006e-07, "loss": 0.0001, "reward": 1.7267857417464256, "reward_std": 0.11364216171205044, "rewards/equation_reward_func": 0.7580357305705547, "rewards/format_reward_func": 0.9687500149011612, "step": 6754 }, { "completion_length": 230.37054538726807, "epoch": 1.1327800829875518, "grad_norm": 0.35612925150187297, "kl": 0.126983642578125, "learning_rate": 4.994733424338187e-07, "loss": 0.0001, "reward": 1.7160715237259865, "reward_std": 0.11869292240589857, "rewards/equation_reward_func": 0.7294643297791481, "rewards/format_reward_func": 0.9866071492433548, "step": 6756 }, { "completion_length": 232.8303680419922, "epoch": 1.1331153862274195, "grad_norm": 0.24637191100536926, "kl": 0.08123779296875, "learning_rate": 4.99472563916657e-07, "loss": 0.0001, "reward": 1.7357143461704254, "reward_std": 0.0909137288108468, "rewards/equation_reward_func": 0.753571467474103, "rewards/format_reward_func": 0.9821428656578064, "step": 6758 }, { "completion_length": 218.29911708831787, "epoch": 1.1334506894672869, "grad_norm": 0.1348445612235299, "kl": 0.06597900390625, "learning_rate": 4.994717848251168e-07, "loss": 0.0001, "reward": 1.8089286237955093, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.813392885029316, "rewards/format_reward_func": 0.9955357164144516, "step": 6760 }, { "completion_length": 235.64286708831787, "epoch": 1.1337859927071545, "grad_norm": 0.25379818975244123, "kl": 0.0818634033203125, "learning_rate": 4.994710051592001e-07, "loss": 0.0001, "reward": 1.7446429133415222, "reward_std": 0.10354063473641872, "rewards/equation_reward_func": 0.7758928909897804, "rewards/format_reward_func": 0.9687500149011612, "step": 6762 }, { "completion_length": 225.30804824829102, "epoch": 1.1341212959470222, "grad_norm": 0.29357611013427315, "kl": 0.6669158935546875, "learning_rate": 4.994702249189087e-07, "loss": 0.0007, "reward": 1.7607143595814705, "reward_std": 0.10606601554900408, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 0.9821428656578064, "step": 6764 }, { "completion_length": 226.20982933044434, "epoch": 1.1344565991868896, "grad_norm": 0.22086866513243852, "kl": 0.0677032470703125, "learning_rate": 4.994694441042442e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7482143230736256, "rewards/format_reward_func": 0.9910714328289032, "step": 6766 }, { "completion_length": 231.54018783569336, "epoch": 1.1347919024267572, "grad_norm": 0.2603920909323302, "kl": 0.0783538818359375, "learning_rate": 4.994686627152085e-07, "loss": 0.0001, "reward": 1.7500000447034836, "reward_std": 0.09596449043601751, "rewards/equation_reward_func": 0.7767857424914837, "rewards/format_reward_func": 0.9732142984867096, "step": 6768 }, { "completion_length": 233.05804634094238, "epoch": 1.1351272056666248, "grad_norm": 0.15957707276356545, "kl": 0.07037353515625, "learning_rate": 4.994678807518037e-07, "loss": 0.0001, "reward": 1.787500061094761, "reward_std": 0.08838834706693888, "rewards/equation_reward_func": 0.8008928820490837, "rewards/format_reward_func": 0.9866071492433548, "step": 6770 }, { "completion_length": 226.86161708831787, "epoch": 1.1354625089064923, "grad_norm": 0.2632334048762584, "kl": 0.074432373046875, "learning_rate": 4.994670982140312e-07, "loss": 0.0001, "reward": 1.7714286521077156, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7803571633994579, "rewards/format_reward_func": 0.9910714328289032, "step": 6772 }, { "completion_length": 232.81250953674316, "epoch": 1.1357978121463599, "grad_norm": 0.16245051836075658, "kl": 0.0787353515625, "learning_rate": 4.994663151018929e-07, "loss": 0.0001, "reward": 1.7589286118745804, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7812500260770321, "rewards/format_reward_func": 0.977678582072258, "step": 6774 }, { "completion_length": 237.65179920196533, "epoch": 1.1361331153862273, "grad_norm": 0.3432961730562157, "kl": 1.7099456787109375, "learning_rate": 4.994655314153907e-07, "loss": 0.0017, "reward": 1.6964286267757416, "reward_std": 0.12121830601245165, "rewards/equation_reward_func": 0.7321428842842579, "rewards/format_reward_func": 0.9642857313156128, "step": 6776 }, { "completion_length": 229.12947273254395, "epoch": 1.136468418626095, "grad_norm": 0.19895687920441474, "kl": 0.0804595947265625, "learning_rate": 4.994647471545265e-07, "loss": 0.0001, "reward": 1.7357143461704254, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143238186836, "rewards/format_reward_func": 1.0, "step": 6778 }, { "completion_length": 226.87947463989258, "epoch": 1.1368037218659626, "grad_norm": 0.20275366249162494, "kl": 0.0865631103515625, "learning_rate": 4.994639623193017e-07, "loss": 0.0001, "reward": 1.78035718947649, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7848214618861675, "rewards/format_reward_func": 0.9955357164144516, "step": 6780 }, { "completion_length": 229.3080472946167, "epoch": 1.1371390251058302, "grad_norm": 0.25465394761532567, "kl": 0.08404541015625, "learning_rate": 4.994631769097186e-07, "loss": 0.0001, "reward": 1.6696429252624512, "reward_std": 0.09343910869210958, "rewards/equation_reward_func": 0.6830357611179352, "rewards/format_reward_func": 0.9866071492433548, "step": 6782 }, { "completion_length": 235.73215198516846, "epoch": 1.1374743283456976, "grad_norm": 0.2737771638741082, "kl": 1.16455078125, "learning_rate": 4.994623909257786e-07, "loss": 0.0012, "reward": 1.7500000447034836, "reward_std": 0.08586296811699867, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 0.9642857313156128, "step": 6784 }, { "completion_length": 220.9509048461914, "epoch": 1.1378096315855653, "grad_norm": 0.19918002938971385, "kl": 0.0744476318359375, "learning_rate": 4.994616043674837e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7803571745753288, "rewards/format_reward_func": 0.9910714328289032, "step": 6786 }, { "completion_length": 217.20536518096924, "epoch": 1.1381449348254327, "grad_norm": 0.17679503465918528, "kl": 0.079254150390625, "learning_rate": 4.994608172348356e-07, "loss": 0.0001, "reward": 1.798214353621006, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.8026786036789417, "rewards/format_reward_func": 0.9955357164144516, "step": 6788 }, { "completion_length": 221.59822368621826, "epoch": 1.1384802380653003, "grad_norm": 0.33131053910015773, "kl": 0.076934814453125, "learning_rate": 4.994600295278364e-07, "loss": 0.0001, "reward": 1.7750000730156898, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7750000227242708, "rewards/format_reward_func": 1.0, "step": 6790 }, { "completion_length": 213.90626049041748, "epoch": 1.138815541305168, "grad_norm": 0.3653836980839601, "kl": 1.16046142578125, "learning_rate": 4.994592412464876e-07, "loss": 0.0012, "reward": 1.7500000670552254, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7589286155998707, "rewards/format_reward_func": 0.9910714328289032, "step": 6792 }, { "completion_length": 233.4017972946167, "epoch": 1.1391508445450353, "grad_norm": 0.27747762072907645, "kl": 0.084259033203125, "learning_rate": 4.994584523907914e-07, "loss": 0.0001, "reward": 1.655357226729393, "reward_std": 0.09848987404257059, "rewards/equation_reward_func": 0.6866071820259094, "rewards/format_reward_func": 0.9687500111758709, "step": 6794 }, { "completion_length": 226.19643783569336, "epoch": 1.139486147784903, "grad_norm": 1.1883478179099713, "kl": 3.068939208984375, "learning_rate": 4.99457662960749e-07, "loss": 0.0031, "reward": 1.7125000581145287, "reward_std": 0.1035406356677413, "rewards/equation_reward_func": 0.7526785898953676, "rewards/format_reward_func": 0.9598214440047741, "step": 6796 }, { "completion_length": 221.78572750091553, "epoch": 1.1398214510247706, "grad_norm": 0.2720848237059936, "kl": 0.0783538818359375, "learning_rate": 4.994568729563628e-07, "loss": 0.0001, "reward": 1.7803572043776512, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7937500402331352, "rewards/format_reward_func": 0.9866071492433548, "step": 6798 }, { "completion_length": 215.40625953674316, "epoch": 1.140156754264638, "grad_norm": 0.17525951997006495, "kl": 0.084381103515625, "learning_rate": 4.994560823776342e-07, "loss": 0.0001, "reward": 1.7750000730156898, "reward_std": 0.08586296625435352, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 0.9821428656578064, "step": 6800 }, { "completion_length": 231.19643783569336, "epoch": 1.1404920575045057, "grad_norm": 0.5392629404213098, "kl": 0.669952392578125, "learning_rate": 4.994552912245653e-07, "loss": 0.0007, "reward": 1.7107143625617027, "reward_std": 0.11616754252463579, "rewards/equation_reward_func": 0.737500037997961, "rewards/format_reward_func": 0.9732142984867096, "step": 6802 }, { "completion_length": 220.7991180419922, "epoch": 1.1408273607443733, "grad_norm": 0.38399115209840423, "kl": 0.09869384765625, "learning_rate": 4.994544994971579e-07, "loss": 0.0001, "reward": 1.7017858177423477, "reward_std": 0.11869292240589857, "rewards/equation_reward_func": 0.7151786051690578, "rewards/format_reward_func": 0.9866071492433548, "step": 6804 }, { "completion_length": 210.77679538726807, "epoch": 1.1411626639842407, "grad_norm": 0.3414302520044539, "kl": 0.08026123046875, "learning_rate": 4.994537071954136e-07, "loss": 0.0001, "reward": 1.7946428954601288, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7991071697324514, "rewards/format_reward_func": 0.9955357164144516, "step": 6806 }, { "completion_length": 216.79018688201904, "epoch": 1.1414979672241083, "grad_norm": 0.35112093270640127, "kl": 0.0818939208984375, "learning_rate": 4.994529143193344e-07, "loss": 0.0001, "reward": 1.7053571790456772, "reward_std": 0.09343910869210958, "rewards/equation_reward_func": 0.7187500260770321, "rewards/format_reward_func": 0.9866071492433548, "step": 6808 }, { "completion_length": 216.69197368621826, "epoch": 1.1418332704639758, "grad_norm": 0.27308493165841274, "kl": 0.084320068359375, "learning_rate": 4.994521208689222e-07, "loss": 0.0001, "reward": 1.7107143551111221, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7196428962051868, "rewards/format_reward_func": 0.9910714328289032, "step": 6810 }, { "completion_length": 229.5982255935669, "epoch": 1.1421685737038434, "grad_norm": 0.33048177298301895, "kl": 0.0850372314453125, "learning_rate": 4.994513268441786e-07, "loss": 0.0001, "reward": 1.7267857864499092, "reward_std": 0.09343911055475473, "rewards/equation_reward_func": 0.7491071745753288, "rewards/format_reward_func": 0.977678582072258, "step": 6812 }, { "completion_length": 216.61608219146729, "epoch": 1.142503876943711, "grad_norm": 0.16062072058112123, "kl": 0.0807952880859375, "learning_rate": 4.994505322451057e-07, "loss": 0.0001, "reward": 1.7446429282426834, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.74910718947649, "rewards/format_reward_func": 0.9955357164144516, "step": 6814 }, { "completion_length": 212.56697273254395, "epoch": 1.1428391801835784, "grad_norm": 0.2508972626651724, "kl": 0.085968017578125, "learning_rate": 4.994497370717051e-07, "loss": 0.0001, "reward": 1.7517857924103737, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500387430191, "rewards/format_reward_func": 0.9955357164144516, "step": 6816 }, { "completion_length": 221.79465293884277, "epoch": 1.143174483423446, "grad_norm": 0.28485351061728537, "kl": 0.0821075439453125, "learning_rate": 4.994489413239788e-07, "loss": 0.0001, "reward": 1.7732143476605415, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7776786051690578, "rewards/format_reward_func": 0.9955357164144516, "step": 6818 }, { "completion_length": 217.49554538726807, "epoch": 1.1435097866633137, "grad_norm": 0.16398489059121946, "kl": 0.0821380615234375, "learning_rate": 4.994481450019285e-07, "loss": 0.0001, "reward": 1.817857176065445, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8178571611642838, "rewards/format_reward_func": 1.0, "step": 6820 }, { "completion_length": 217.56250858306885, "epoch": 1.1438450899031811, "grad_norm": 0.3405546953923567, "kl": 0.085357666015625, "learning_rate": 4.99447348105556e-07, "loss": 0.0001, "reward": 1.7325893640518188, "reward_std": 0.08018085593357682, "rewards/equation_reward_func": 0.7473214641213417, "rewards/format_reward_func": 0.9852678664028645, "step": 6822 }, { "completion_length": 224.90179634094238, "epoch": 1.1441803931430488, "grad_norm": 0.2212434884477253, "kl": 0.072967529296875, "learning_rate": 4.994465506348633e-07, "loss": 0.0001, "reward": 1.717857226729393, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7267857324331999, "rewards/format_reward_func": 0.9910714328289032, "step": 6824 }, { "completion_length": 216.48661613464355, "epoch": 1.1445156963829164, "grad_norm": 0.26409679570024003, "kl": 0.088470458984375, "learning_rate": 4.994457525898521e-07, "loss": 0.0001, "reward": 1.7482143640518188, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526786010712385, "rewards/format_reward_func": 0.9955357164144516, "step": 6826 }, { "completion_length": 223.00000858306885, "epoch": 1.1448509996227838, "grad_norm": 0.1694539692321961, "kl": 0.0811614990234375, "learning_rate": 4.994449539705244e-07, "loss": 0.0001, "reward": 1.8147321939468384, "reward_std": 0.03977475711144507, "rewards/equation_reward_func": 0.8205357473343611, "rewards/format_reward_func": 0.9941964335739613, "step": 6828 }, { "completion_length": 217.86608123779297, "epoch": 1.1451863028626514, "grad_norm": 0.2320165700751102, "kl": 0.0828704833984375, "learning_rate": 4.994441547768818e-07, "loss": 0.0001, "reward": 1.801785759627819, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.8062500283122063, "rewards/format_reward_func": 0.9955357164144516, "step": 6830 }, { "completion_length": 219.41965103149414, "epoch": 1.1455216061025189, "grad_norm": 0.3201717426420188, "kl": 0.08636474609375, "learning_rate": 4.994433550089263e-07, "loss": 0.0001, "reward": 1.7446429431438446, "reward_std": 0.08838834799826145, "rewards/equation_reward_func": 0.7580357529222965, "rewards/format_reward_func": 0.9866071492433548, "step": 6832 }, { "completion_length": 215.90179538726807, "epoch": 1.1458569093423865, "grad_norm": 0.1761618860877667, "kl": 0.084991455078125, "learning_rate": 4.994425546666597e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8125000260770321, "rewards/format_reward_func": 0.9910714328289032, "step": 6834 }, { "completion_length": 222.77679538726807, "epoch": 1.1461922125822541, "grad_norm": 0.23635481786912677, "kl": 0.087860107421875, "learning_rate": 4.99441753750084e-07, "loss": 0.0001, "reward": 1.7879464998841286, "reward_std": 0.06755395047366619, "rewards/equation_reward_func": 0.7982143200933933, "rewards/format_reward_func": 0.9897321462631226, "step": 6836 }, { "completion_length": 225.16518878936768, "epoch": 1.1465275158221215, "grad_norm": 0.21894307591906984, "kl": 0.09490966796875, "learning_rate": 4.994409522592008e-07, "loss": 0.0001, "reward": 1.7214286401867867, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7303571663796902, "rewards/format_reward_func": 0.9910714328289032, "step": 6838 }, { "completion_length": 222.02679538726807, "epoch": 1.1468628190619892, "grad_norm": 0.24222570084093759, "kl": 0.08447265625, "learning_rate": 4.994401501940119e-07, "loss": 0.0001, "reward": 1.7000000849366188, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7089286111295223, "rewards/format_reward_func": 0.9910714328289032, "step": 6840 }, { "completion_length": 233.45090579986572, "epoch": 1.1471981223018568, "grad_norm": 0.3205876539023307, "kl": 0.08941650390625, "learning_rate": 4.994393475545195e-07, "loss": 0.0001, "reward": 1.6843750774860382, "reward_std": 0.08775700209662318, "rewards/equation_reward_func": 0.7035714648663998, "rewards/format_reward_func": 0.980803582817316, "step": 6842 }, { "completion_length": 225.56250858306885, "epoch": 1.1475334255417242, "grad_norm": 0.23515632938836778, "kl": 0.0845794677734375, "learning_rate": 4.994385443407252e-07, "loss": 0.0001, "reward": 1.7696428894996643, "reward_std": 0.09343910962343216, "rewards/equation_reward_func": 0.7830357439815998, "rewards/format_reward_func": 0.9866071492433548, "step": 6844 }, { "completion_length": 227.61161708831787, "epoch": 1.1478687287815919, "grad_norm": 0.2374903179827161, "kl": 0.1004638671875, "learning_rate": 4.994377405526308e-07, "loss": 0.0001, "reward": 1.7410715073347092, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7544643059372902, "rewards/format_reward_func": 0.9866071492433548, "step": 6846 }, { "completion_length": 222.64733123779297, "epoch": 1.1482040320214595, "grad_norm": 0.40396699261744706, "kl": 0.49755859375, "learning_rate": 4.994369361902383e-07, "loss": 0.0005, "reward": 1.7196429371833801, "reward_std": 0.10354063380509615, "rewards/equation_reward_func": 0.733035746961832, "rewards/format_reward_func": 0.9866071492433548, "step": 6848 }, { "completion_length": 222.08036613464355, "epoch": 1.148539335261327, "grad_norm": 0.31380228580436215, "kl": 0.088623046875, "learning_rate": 4.994361312535495e-07, "loss": 0.0001, "reward": 1.6897322162985802, "reward_std": 0.09154507238417864, "rewards/equation_reward_func": 0.7223214562982321, "rewards/format_reward_func": 0.9674107320606709, "step": 6850 }, { "completion_length": 211.91965293884277, "epoch": 1.1488746385011945, "grad_norm": 0.2694868807371034, "kl": 0.081146240234375, "learning_rate": 4.994353257425661e-07, "loss": 0.0001, "reward": 1.7839286103844643, "reward_std": 0.0833375845104456, "rewards/equation_reward_func": 0.7973214536905289, "rewards/format_reward_func": 0.9866071492433548, "step": 6852 }, { "completion_length": 223.18304538726807, "epoch": 1.1492099417410622, "grad_norm": 0.24600340041907806, "kl": 0.798309326171875, "learning_rate": 4.994345196572903e-07, "loss": 0.0008, "reward": 1.7232143580913544, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7366071790456772, "rewards/format_reward_func": 0.9866071492433548, "step": 6854 }, { "completion_length": 230.9419755935669, "epoch": 1.1495452449809296, "grad_norm": 0.15237309305910776, "kl": 0.0922698974609375, "learning_rate": 4.994337129977235e-07, "loss": 0.0001, "reward": 1.7254465147852898, "reward_std": 0.09154507424682379, "rewards/equation_reward_func": 0.7401786036789417, "rewards/format_reward_func": 0.9852678626775742, "step": 6856 }, { "completion_length": 228.14733219146729, "epoch": 1.1498805482207972, "grad_norm": 0.22325111909912654, "kl": 0.09490966796875, "learning_rate": 4.99432905763868e-07, "loss": 0.0001, "reward": 1.767857201397419, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7767857350409031, "rewards/format_reward_func": 0.9910714328289032, "step": 6858 }, { "completion_length": 227.51340293884277, "epoch": 1.1502158514606649, "grad_norm": 0.2710958581186891, "kl": 0.7169189453125, "learning_rate": 4.994320979557255e-07, "loss": 0.0007, "reward": 1.7540179193019867, "reward_std": 0.11553619476035237, "rewards/equation_reward_func": 0.7687500305473804, "rewards/format_reward_func": 0.9852678664028645, "step": 6860 }, { "completion_length": 222.80804634094238, "epoch": 1.1505511547005323, "grad_norm": 0.26345604424103974, "kl": 0.079986572265625, "learning_rate": 4.994312895732978e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7839285917580128, "rewards/format_reward_func": 0.9910714328289032, "step": 6862 }, { "completion_length": 227.67858123779297, "epoch": 1.1508864579404, "grad_norm": 0.26812911857919164, "kl": 0.0786590576171875, "learning_rate": 4.994304806165867e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.8017857484519482, "rewards/format_reward_func": 0.9910714328289032, "step": 6864 }, { "completion_length": 235.8616189956665, "epoch": 1.1512217611802673, "grad_norm": 0.2522975419476787, "kl": 0.08685302734375, "learning_rate": 4.994296710855942e-07, "loss": 0.0001, "reward": 1.6357143446803093, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.6803571656346321, "rewards/format_reward_func": 0.955357164144516, "step": 6866 }, { "completion_length": 234.99554538726807, "epoch": 1.151557064420135, "grad_norm": 0.25997920012377895, "kl": 0.07647705078125, "learning_rate": 4.994288609803221e-07, "loss": 0.0001, "reward": 1.7142858058214188, "reward_std": 0.09091372787952423, "rewards/equation_reward_func": 0.7321428842842579, "rewards/format_reward_func": 0.9821428656578064, "step": 6868 }, { "completion_length": 234.01340579986572, "epoch": 1.1518923676600026, "grad_norm": 0.547764225858209, "kl": 0.92034912109375, "learning_rate": 4.994280503007723e-07, "loss": 0.0009, "reward": 1.7598215192556381, "reward_std": 0.11743023293092847, "rewards/equation_reward_func": 0.7848214469850063, "rewards/format_reward_func": 0.975000012665987, "step": 6870 }, { "completion_length": 223.48215293884277, "epoch": 1.15222767089987, "grad_norm": 0.004337909642485559, "kl": 0.0759124755859375, "learning_rate": 4.994272390469467e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 6872 }, { "completion_length": 227.67411994934082, "epoch": 1.1525629741397376, "grad_norm": 0.26786762702478173, "kl": 0.093414306640625, "learning_rate": 4.99426427218847e-07, "loss": 0.0001, "reward": 1.745089367032051, "reward_std": 0.0675539500080049, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 0.9986607171595097, "step": 6874 }, { "completion_length": 223.37947368621826, "epoch": 1.1528982773796053, "grad_norm": 0.14117511452087392, "kl": 0.0843505859375, "learning_rate": 4.994256148164752e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.7803571801632643, "rewards/format_reward_func": 0.9910714328289032, "step": 6876 }, { "completion_length": 227.93304634094238, "epoch": 1.1532335806194727, "grad_norm": 0.1684037263529093, "kl": 0.08544921875, "learning_rate": 4.994248018398332e-07, "loss": 0.0001, "reward": 1.8107143342494965, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.810714315623045, "rewards/format_reward_func": 1.0, "step": 6878 }, { "completion_length": 223.60268783569336, "epoch": 1.1535688838593403, "grad_norm": 0.23348290820565865, "kl": 0.0734100341796875, "learning_rate": 4.994239882889229e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 6880 }, { "completion_length": 229.4553680419922, "epoch": 1.153904187099208, "grad_norm": 0.21225142595897906, "kl": 0.092926025390625, "learning_rate": 4.99423174163746e-07, "loss": 0.0001, "reward": 1.8196429163217545, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.8241071589291096, "rewards/format_reward_func": 0.9955357164144516, "step": 6882 }, { "completion_length": 226.08036994934082, "epoch": 1.1542394903390754, "grad_norm": 0.3118696688418131, "kl": 0.1126251220703125, "learning_rate": 4.994223594643044e-07, "loss": 0.0001, "reward": 1.756250061094761, "reward_std": 0.06818529823794961, "rewards/equation_reward_func": 0.7723214607685804, "rewards/format_reward_func": 0.9839285798370838, "step": 6884 }, { "completion_length": 220.00000762939453, "epoch": 1.154574793578943, "grad_norm": 0.20714372614678384, "kl": 0.1049652099609375, "learning_rate": 4.994215441906003e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 6886 }, { "completion_length": 227.08036994934082, "epoch": 1.1549100968188104, "grad_norm": 0.26264781214443184, "kl": 0.074127197265625, "learning_rate": 4.99420728342635e-07, "loss": 0.0001, "reward": 1.6964286342263222, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.6964285932481289, "rewards/format_reward_func": 1.0, "step": 6888 }, { "completion_length": 221.78572273254395, "epoch": 1.155245400058678, "grad_norm": 0.18060283117654174, "kl": 0.096527099609375, "learning_rate": 4.994199119204109e-07, "loss": 0.0001, "reward": 1.6803572326898575, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.6848214734345675, "rewards/format_reward_func": 0.9955357164144516, "step": 6890 }, { "completion_length": 224.6384048461914, "epoch": 1.1555807032985457, "grad_norm": 0.31072415977608814, "kl": 0.2027130126953125, "learning_rate": 4.994190949239297e-07, "loss": 0.0002, "reward": 1.7375000640749931, "reward_std": 0.06818529404699802, "rewards/equation_reward_func": 0.74196432903409, "rewards/format_reward_func": 0.9955357164144516, "step": 6892 }, { "completion_length": 223.92411708831787, "epoch": 1.155916006538413, "grad_norm": 0.42115750864099694, "kl": 0.1059722900390625, "learning_rate": 4.994182773531932e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7732143141329288, "rewards/format_reward_func": 0.9910714328289032, "step": 6894 }, { "completion_length": 223.92411613464355, "epoch": 1.1562513097782807, "grad_norm": 0.1894086306269926, "kl": 0.0964202880859375, "learning_rate": 4.994174592082034e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.0909137288108468, "rewards/equation_reward_func": 0.7946428842842579, "rewards/format_reward_func": 0.9910714328289032, "step": 6896 }, { "completion_length": 226.8035831451416, "epoch": 1.1565866130181484, "grad_norm": 0.19003955983787937, "kl": 0.0852813720703125, "learning_rate": 4.994166404889621e-07, "loss": 0.0001, "reward": 1.7178572043776512, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7267857417464256, "rewards/format_reward_func": 0.9910714328289032, "step": 6898 }, { "completion_length": 226.31697368621826, "epoch": 1.1569219162580158, "grad_norm": 0.23749390714239918, "kl": 0.1407928466796875, "learning_rate": 4.994158211954712e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7910714484751225, "rewards/format_reward_func": 0.9910714328289032, "step": 6900 }, { "completion_length": 213.43750762939453, "epoch": 1.1572572194978834, "grad_norm": 0.16679608761011747, "kl": 0.100677490234375, "learning_rate": 4.994150013277326e-07, "loss": 0.0001, "reward": 1.8392857536673546, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8392857387661934, "rewards/format_reward_func": 1.0, "step": 6902 }, { "completion_length": 216.508939743042, "epoch": 1.157592522737751, "grad_norm": 0.22082425907055644, "kl": 0.10552978515625, "learning_rate": 4.994141808857483e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 6904 }, { "completion_length": 220.37054538726807, "epoch": 1.1579278259776185, "grad_norm": 0.5021244800347835, "kl": 0.0793609619140625, "learning_rate": 4.9941335986952e-07, "loss": 0.0001, "reward": 1.7160715013742447, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7205357514321804, "rewards/format_reward_func": 0.9955357164144516, "step": 6906 }, { "completion_length": 217.09822368621826, "epoch": 1.158263129217486, "grad_norm": 0.20157959080101762, "kl": 0.08599853515625, "learning_rate": 4.994125382790496e-07, "loss": 0.0001, "reward": 1.7482143864035606, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7526786029338837, "rewards/format_reward_func": 0.9955357164144516, "step": 6908 }, { "completion_length": 214.39733028411865, "epoch": 1.1585984324573535, "grad_norm": 0.17372353002627805, "kl": 0.075927734375, "learning_rate": 4.994117161143392e-07, "loss": 0.0001, "reward": 1.7250000685453415, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7250000424683094, "rewards/format_reward_func": 1.0, "step": 6910 }, { "completion_length": 217.69197177886963, "epoch": 1.1589337356972211, "grad_norm": 0.18357542006548058, "kl": 0.0815887451171875, "learning_rate": 4.994108933753905e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 6912 }, { "completion_length": 216.56250953674316, "epoch": 1.1592690389370888, "grad_norm": 0.14700847457999847, "kl": 0.0750274658203125, "learning_rate": 4.994100700622054e-07, "loss": 0.0001, "reward": 1.753571480512619, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714656114578, "rewards/format_reward_func": 1.0, "step": 6914 }, { "completion_length": 227.56251049041748, "epoch": 1.1596043421769564, "grad_norm": 0.24436108533912068, "kl": 0.0693511962890625, "learning_rate": 4.994092461747859e-07, "loss": 0.0001, "reward": 1.7303572371602058, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7348214611411095, "rewards/format_reward_func": 0.9955357164144516, "step": 6916 }, { "completion_length": 212.84375762939453, "epoch": 1.1599396454168238, "grad_norm": 0.24133830306304005, "kl": 0.0664825439453125, "learning_rate": 4.994084217131338e-07, "loss": 0.0001, "reward": 1.7464286535978317, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 6918 }, { "completion_length": 212.49554443359375, "epoch": 1.1602749486566915, "grad_norm": 0.2615600831239175, "kl": 0.07427978515625, "learning_rate": 4.994075966772511e-07, "loss": 0.0001, "reward": 1.707142911851406, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7071428894996643, "rewards/format_reward_func": 1.0, "step": 6920 }, { "completion_length": 224.66965293884277, "epoch": 1.1606102518965589, "grad_norm": 0.15686356558490766, "kl": 0.4253997802734375, "learning_rate": 4.994067710671396e-07, "loss": 0.0004, "reward": 1.7267857640981674, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7312500439584255, "rewards/format_reward_func": 0.9955357164144516, "step": 6922 }, { "completion_length": 215.59822273254395, "epoch": 1.1609455551364265, "grad_norm": 0.22075119949291844, "kl": 0.071746826171875, "learning_rate": 4.994059448828011e-07, "loss": 0.0001, "reward": 1.7678571939468384, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571790456772, "rewards/format_reward_func": 1.0, "step": 6924 }, { "completion_length": 224.16518878936768, "epoch": 1.1612808583762941, "grad_norm": 0.2866122016574234, "kl": 0.0772247314453125, "learning_rate": 4.994051181242379e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 6926 }, { "completion_length": 199.56250858306885, "epoch": 1.1616161616161615, "grad_norm": 0.2835153067499442, "kl": 0.07659912109375, "learning_rate": 4.994042907914514e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 6928 }, { "completion_length": 220.72768878936768, "epoch": 1.1619514648560292, "grad_norm": 0.22777164421377802, "kl": 0.6842498779296875, "learning_rate": 4.99403462884444e-07, "loss": 0.0007, "reward": 1.7928571999073029, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571589291096, "rewards/format_reward_func": 1.0, "step": 6930 }, { "completion_length": 219.58482933044434, "epoch": 1.1622867680958968, "grad_norm": 0.17939756568933712, "kl": 0.073150634765625, "learning_rate": 4.994026344032172e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7696428932249546, "rewards/format_reward_func": 0.9910714328289032, "step": 6932 }, { "completion_length": 210.61608028411865, "epoch": 1.1626220713357642, "grad_norm": 0.3478303221101954, "kl": 0.0870361328125, "learning_rate": 4.994018053477731e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.760714303702116, "rewards/format_reward_func": 1.0, "step": 6934 }, { "completion_length": 210.92858219146729, "epoch": 1.1629573745756319, "grad_norm": 0.1651791347237627, "kl": 0.0748748779296875, "learning_rate": 4.994009757181136e-07, "loss": 0.0001, "reward": 1.758928656578064, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7633928805589676, "rewards/format_reward_func": 0.9955357164144516, "step": 6936 }, { "completion_length": 215.88393783569336, "epoch": 1.1632926778154995, "grad_norm": 0.3029528530372732, "kl": 0.0755157470703125, "learning_rate": 4.994001455142405e-07, "loss": 0.0001, "reward": 1.7017857730388641, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7062500566244125, "rewards/format_reward_func": 0.9955357164144516, "step": 6938 }, { "completion_length": 211.4419755935669, "epoch": 1.163627981055367, "grad_norm": 0.26122991877460666, "kl": 0.087005615234375, "learning_rate": 4.993993147361558e-07, "loss": 0.0001, "reward": 1.7000000700354576, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7089286148548126, "rewards/format_reward_func": 0.9910714328289032, "step": 6940 }, { "completion_length": 208.23661613464355, "epoch": 1.1639632842952345, "grad_norm": 0.2747568610792723, "kl": 0.0793609619140625, "learning_rate": 4.993984833838615e-07, "loss": 0.0001, "reward": 1.785714328289032, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143245637417, "rewards/format_reward_func": 1.0, "step": 6942 }, { "completion_length": 213.88393688201904, "epoch": 1.164298587535102, "grad_norm": 0.2901435145294046, "kl": 0.07421875, "learning_rate": 4.993976514573593e-07, "loss": 0.0001, "reward": 1.692857213318348, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.692857176065445, "rewards/format_reward_func": 1.0, "step": 6944 }, { "completion_length": 214.64286708831787, "epoch": 1.1646338907749696, "grad_norm": 0.2537096914951688, "kl": 0.07794189453125, "learning_rate": 4.993968189566513e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 6946 }, { "completion_length": 223.53572273254395, "epoch": 1.1649691940148372, "grad_norm": 0.2668001365446884, "kl": 0.0701446533203125, "learning_rate": 4.993959858817395e-07, "loss": 0.0001, "reward": 1.7732143253087997, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7776786088943481, "rewards/format_reward_func": 0.9955357164144516, "step": 6948 }, { "completion_length": 218.05358123779297, "epoch": 1.1653044972547046, "grad_norm": 0.21744701461798605, "kl": 0.0718994140625, "learning_rate": 4.993951522326255e-07, "loss": 0.0001, "reward": 1.769642911851406, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.774107176810503, "rewards/format_reward_func": 0.9955357164144516, "step": 6950 }, { "completion_length": 229.17858123779297, "epoch": 1.1656398004945723, "grad_norm": 0.21091420914175668, "kl": 0.073822021484375, "learning_rate": 4.993943180093114e-07, "loss": 0.0001, "reward": 1.7321429252624512, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7321428880095482, "rewards/format_reward_func": 1.0, "step": 6952 }, { "completion_length": 229.37500953674316, "epoch": 1.16597510373444, "grad_norm": 0.28567924027709884, "kl": 0.0760650634765625, "learning_rate": 4.993934832117991e-07, "loss": 0.0001, "reward": 1.7125000730156898, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.725892897695303, "rewards/format_reward_func": 0.9866071492433548, "step": 6954 }, { "completion_length": 226.1384038925171, "epoch": 1.1663104069743073, "grad_norm": 0.30841935931803693, "kl": 0.0717315673828125, "learning_rate": 4.993926478400906e-07, "loss": 0.0001, "reward": 1.810714341700077, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.8107143193483353, "rewards/format_reward_func": 1.0, "step": 6956 }, { "completion_length": 235.94643783569336, "epoch": 1.166645710214175, "grad_norm": 0.23208670864060132, "kl": 0.3082733154296875, "learning_rate": 4.993918118941877e-07, "loss": 0.0003, "reward": 1.7410715222358704, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7455357424914837, "rewards/format_reward_func": 0.9955357164144516, "step": 6958 }, { "completion_length": 224.87054443359375, "epoch": 1.1669810134540426, "grad_norm": 0.14985084192465364, "kl": 0.0709381103515625, "learning_rate": 4.993909753740924e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 6960 }, { "completion_length": 235.30358123779297, "epoch": 1.16731631669391, "grad_norm": 0.24284178348934188, "kl": 0.543304443359375, "learning_rate": 4.993901382798067e-07, "loss": 0.0005, "reward": 1.766071505844593, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7705357372760773, "rewards/format_reward_func": 0.9955357164144516, "step": 6962 }, { "completion_length": 234.59822463989258, "epoch": 1.1676516199337776, "grad_norm": 0.15701413152998836, "kl": 0.3557891845703125, "learning_rate": 4.993893006113323e-07, "loss": 0.0004, "reward": 1.7339285984635353, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7562500312924385, "rewards/format_reward_func": 0.977678582072258, "step": 6964 }, { "completion_length": 234.5134048461914, "epoch": 1.167986923173645, "grad_norm": 0.22042202431841593, "kl": 0.0718994140625, "learning_rate": 4.993884623686713e-07, "loss": 0.0001, "reward": 1.7642858028411865, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857320606709, "rewards/format_reward_func": 1.0, "step": 6966 }, { "completion_length": 234.33929443359375, "epoch": 1.1683222264135127, "grad_norm": 0.2321463814591959, "kl": 0.083343505859375, "learning_rate": 4.993876235518256e-07, "loss": 0.0001, "reward": 1.7160714715719223, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7205357551574707, "rewards/format_reward_func": 0.9955357164144516, "step": 6968 }, { "completion_length": 237.26340198516846, "epoch": 1.1686575296533803, "grad_norm": 0.23375655605151902, "kl": 0.0672149658203125, "learning_rate": 4.993867841607972e-07, "loss": 0.0001, "reward": 1.7535714656114578, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714842379093, "rewards/format_reward_func": 1.0, "step": 6970 }, { "completion_length": 236.7321538925171, "epoch": 1.1689928328932477, "grad_norm": 0.16916676139529008, "kl": 0.067230224609375, "learning_rate": 4.993859441955879e-07, "loss": 0.0001, "reward": 1.7303572073578835, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7348214723169804, "rewards/format_reward_func": 0.9955357164144516, "step": 6972 }, { "completion_length": 227.88393878936768, "epoch": 1.1693281361331154, "grad_norm": 0.21767153944629555, "kl": 0.068359375, "learning_rate": 4.993851036561996e-07, "loss": 0.0001, "reward": 1.7982143461704254, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.8026785887777805, "rewards/format_reward_func": 0.9955357164144516, "step": 6974 }, { "completion_length": 238.67858219146729, "epoch": 1.169663439372983, "grad_norm": 0.2871662999160085, "kl": 0.0770721435546875, "learning_rate": 4.993842625426344e-07, "loss": 0.0001, "reward": 1.755357213318348, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7598214577883482, "rewards/format_reward_func": 0.9955357164144516, "step": 6976 }, { "completion_length": 243.97322750091553, "epoch": 1.1699987426128504, "grad_norm": 0.2003109959117877, "kl": 0.0980682373046875, "learning_rate": 4.993834208548942e-07, "loss": 0.0001, "reward": 1.7178572341799736, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7267857436090708, "rewards/format_reward_func": 0.9910714328289032, "step": 6978 }, { "completion_length": 243.05358409881592, "epoch": 1.170334045852718, "grad_norm": 0.4412118535614136, "kl": 0.1660919189453125, "learning_rate": 4.993825785929808e-07, "loss": 0.0002, "reward": 1.7375000640749931, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7419643215835094, "rewards/format_reward_func": 0.9955357164144516, "step": 6980 }, { "completion_length": 238.2009038925171, "epoch": 1.1706693490925857, "grad_norm": 0.2531765682553049, "kl": 0.070648193359375, "learning_rate": 4.993817357568963e-07, "loss": 0.0001, "reward": 1.751785784959793, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7562500387430191, "rewards/format_reward_func": 0.9955357164144516, "step": 6982 }, { "completion_length": 237.69197463989258, "epoch": 1.171004652332453, "grad_norm": 0.1307315755554436, "kl": 0.07025146484375, "learning_rate": 4.993808923466427e-07, "loss": 0.0001, "reward": 1.8071429207921028, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8071429021656513, "rewards/format_reward_func": 1.0, "step": 6984 }, { "completion_length": 229.9241180419922, "epoch": 1.1713399555723207, "grad_norm": 0.16282934602909968, "kl": 0.0719757080078125, "learning_rate": 4.993800483622217e-07, "loss": 0.0001, "reward": 1.7964286357164383, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964285872876644, "rewards/format_reward_func": 1.0, "step": 6986 }, { "completion_length": 235.06250858306885, "epoch": 1.1716752588121884, "grad_norm": 0.15396331685512085, "kl": 0.061370849609375, "learning_rate": 4.993792038036353e-07, "loss": 0.0001, "reward": 1.8785714581608772, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8785714544355869, "rewards/format_reward_func": 1.0, "step": 6988 }, { "completion_length": 230.7321538925171, "epoch": 1.1720105620520558, "grad_norm": 0.20821104135962984, "kl": 0.1810455322265625, "learning_rate": 4.993783586708857e-07, "loss": 0.0002, "reward": 1.7785714864730835, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 6990 }, { "completion_length": 239.67411994934082, "epoch": 1.1723458652919234, "grad_norm": 0.13161959537442824, "kl": 0.0686798095703125, "learning_rate": 4.993775129639744e-07, "loss": 0.0001, "reward": 1.8071429133415222, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8071428798139095, "rewards/format_reward_func": 1.0, "step": 6992 }, { "completion_length": 240.89286994934082, "epoch": 1.172681168531791, "grad_norm": 0.20917233948066855, "kl": 0.2086334228515625, "learning_rate": 4.993766666829038e-07, "loss": 0.0002, "reward": 1.7125000655651093, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7169643230736256, "rewards/format_reward_func": 0.9955357164144516, "step": 6994 }, { "completion_length": 242.56697273254395, "epoch": 1.1730164717716585, "grad_norm": 0.14468665687786536, "kl": 0.064697265625, "learning_rate": 4.993758198276756e-07, "loss": 0.0001, "reward": 1.7678571939468384, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571529686451, "rewards/format_reward_func": 1.0, "step": 6996 }, { "completion_length": 246.53126049041748, "epoch": 1.173351775011526, "grad_norm": 0.24789853835463235, "kl": 0.076202392578125, "learning_rate": 4.993749723982918e-07, "loss": 0.0001, "reward": 1.8178571835160255, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.817857176065445, "rewards/format_reward_func": 1.0, "step": 6998 }, { "completion_length": 237.70983219146729, "epoch": 1.1736870782513935, "grad_norm": 0.22220896735230375, "kl": 0.0637969970703125, "learning_rate": 4.993741243947544e-07, "loss": 0.0001, "reward": 1.8357143327593803, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8357143066823483, "rewards/format_reward_func": 1.0, "step": 7000 }, { "completion_length": 243.8616180419922, "epoch": 1.1740223814912611, "grad_norm": 0.24457270120564578, "kl": 0.060516357421875, "learning_rate": 4.993732758170652e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 7002 }, { "completion_length": 234.6116189956665, "epoch": 1.1743576847311288, "grad_norm": 0.25798999402321, "kl": 0.06378173828125, "learning_rate": 4.993724266652263e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 7004 }, { "completion_length": 243.3660831451416, "epoch": 1.1746929879709962, "grad_norm": 0.12745495764158657, "kl": 0.0615386962890625, "learning_rate": 4.993715769392397e-07, "loss": 0.0001, "reward": 1.7053572088479996, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.709821468219161, "rewards/format_reward_func": 0.9955357164144516, "step": 7006 }, { "completion_length": 241.02679443359375, "epoch": 1.1750282912108638, "grad_norm": 0.2753649872008471, "kl": 0.11676025390625, "learning_rate": 4.993707266391072e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7767857611179352, "rewards/format_reward_func": 0.9910714328289032, "step": 7008 }, { "completion_length": 236.1473331451416, "epoch": 1.1753635944507315, "grad_norm": 0.21138871590711736, "kl": 0.42535400390625, "learning_rate": 4.993698757648308e-07, "loss": 0.0004, "reward": 1.8035714700818062, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 7010 }, { "completion_length": 236.2946538925171, "epoch": 1.1756988976905989, "grad_norm": 0.20702311310531854, "kl": 0.150177001953125, "learning_rate": 4.993690243164125e-07, "loss": 0.0002, "reward": 1.7178572192788124, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7178571633994579, "rewards/format_reward_func": 1.0, "step": 7012 }, { "completion_length": 246.5000123977661, "epoch": 1.1760342009304665, "grad_norm": 0.24995814190921872, "kl": 0.3637237548828125, "learning_rate": 4.993681722938542e-07, "loss": 0.0004, "reward": 1.7696429342031479, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.774107176810503, "rewards/format_reward_func": 0.9955357164144516, "step": 7014 }, { "completion_length": 241.29912090301514, "epoch": 1.1763695041703341, "grad_norm": 0.1647114277394096, "kl": 0.104705810546875, "learning_rate": 4.993673196971581e-07, "loss": 0.0001, "reward": 1.739285796880722, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857521772385, "rewards/format_reward_func": 1.0, "step": 7016 }, { "completion_length": 244.20983409881592, "epoch": 1.1767048074102016, "grad_norm": 0.2431811099491955, "kl": 0.0694427490234375, "learning_rate": 4.993664665263258e-07, "loss": 0.0001, "reward": 1.796428620815277, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964285928755999, "rewards/format_reward_func": 1.0, "step": 7018 }, { "completion_length": 250.6562623977661, "epoch": 1.1770401106500692, "grad_norm": 0.2823400285980501, "kl": 0.07000732421875, "learning_rate": 4.993656127813594e-07, "loss": 0.0001, "reward": 1.691071517765522, "reward_std": 0.09343910776078701, "rewards/equation_reward_func": 0.6955357547849417, "rewards/format_reward_func": 0.9955357164144516, "step": 7020 }, { "completion_length": 243.43304634094238, "epoch": 1.1773754138899366, "grad_norm": 0.2006690204806412, "kl": 0.0662994384765625, "learning_rate": 4.99364758462261e-07, "loss": 0.0001, "reward": 1.7571429386734962, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7571428827941418, "rewards/format_reward_func": 1.0, "step": 7022 }, { "completion_length": 239.68304538726807, "epoch": 1.1777107171298042, "grad_norm": 0.25763110318507637, "kl": 0.735565185546875, "learning_rate": 4.993639035690325e-07, "loss": 0.0007, "reward": 1.7750000804662704, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 7024 }, { "completion_length": 243.7634038925171, "epoch": 1.1780460203696719, "grad_norm": 0.2525095936361976, "kl": 0.0626220703125, "learning_rate": 4.993630481016758e-07, "loss": 0.0001, "reward": 1.821428619325161, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.8214286044239998, "rewards/format_reward_func": 1.0, "step": 7026 }, { "completion_length": 238.9330472946167, "epoch": 1.1783813236095393, "grad_norm": 0.32584782130502027, "kl": 0.08795166015625, "learning_rate": 4.993621920601928e-07, "loss": 0.0001, "reward": 1.7750001102685928, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 7028 }, { "completion_length": 238.77233219146729, "epoch": 1.178716626849407, "grad_norm": 0.18152868420415122, "kl": 0.1728973388671875, "learning_rate": 4.993613354445857e-07, "loss": 0.0002, "reward": 1.7464286237955093, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.755357176065445, "rewards/format_reward_func": 0.9910714328289032, "step": 7030 }, { "completion_length": 234.59822463989258, "epoch": 1.1790519300892746, "grad_norm": 0.21606357065813922, "kl": 0.0672607421875, "learning_rate": 4.993604782548563e-07, "loss": 0.0001, "reward": 1.7410714998841286, "reward_std": 0.06313453521579504, "rewards/equation_reward_func": 0.7544643208384514, "rewards/format_reward_func": 0.9866071492433548, "step": 7032 }, { "completion_length": 245.071439743042, "epoch": 1.179387233329142, "grad_norm": 0.11731433901397256, "kl": 0.0707244873046875, "learning_rate": 4.993596204910067e-07, "loss": 0.0001, "reward": 1.7125000804662704, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.716964315623045, "rewards/format_reward_func": 0.9955357164144516, "step": 7034 }, { "completion_length": 234.6741189956665, "epoch": 1.1797225365690096, "grad_norm": 0.15904667048164609, "kl": 0.066925048828125, "learning_rate": 4.993587621530386e-07, "loss": 0.0001, "reward": 1.755357213318348, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7598214522004128, "rewards/format_reward_func": 0.9955357164144516, "step": 7036 }, { "completion_length": 244.46875953674316, "epoch": 1.1800578398088772, "grad_norm": 0.21918559336727544, "kl": 0.068328857421875, "learning_rate": 4.993579032409544e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 7038 }, { "completion_length": 241.32144165039062, "epoch": 1.1803931430487447, "grad_norm": 0.28183054748017666, "kl": 0.1956024169921875, "learning_rate": 4.993570437547558e-07, "loss": 0.0002, "reward": 1.8089286535978317, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.813392885029316, "rewards/format_reward_func": 0.9955357164144516, "step": 7040 }, { "completion_length": 240.8571538925171, "epoch": 1.1807284462886123, "grad_norm": 0.17991330132898947, "kl": 0.0831451416015625, "learning_rate": 4.993561836944447e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 7042 }, { "completion_length": 244.83929920196533, "epoch": 1.1810637495284797, "grad_norm": 0.15519989350192553, "kl": 0.0687103271484375, "learning_rate": 4.993553230600233e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 7044 }, { "completion_length": 244.7812614440918, "epoch": 1.1813990527683473, "grad_norm": 0.2493715816928291, "kl": 0.0808258056640625, "learning_rate": 4.993544618514935e-07, "loss": 0.0001, "reward": 1.7040179297327995, "reward_std": 0.06502856919541955, "rewards/equation_reward_func": 0.7098214626312256, "rewards/format_reward_func": 0.9941964335739613, "step": 7046 }, { "completion_length": 250.14287090301514, "epoch": 1.181734356008215, "grad_norm": 0.22729720223898298, "kl": 0.0684661865234375, "learning_rate": 4.993536000688573e-07, "loss": 0.0001, "reward": 1.6785715147852898, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.6785714477300644, "rewards/format_reward_func": 1.0, "step": 7048 }, { "completion_length": 248.52679538726807, "epoch": 1.1820696592480826, "grad_norm": 0.2324995386469855, "kl": 0.06976318359375, "learning_rate": 4.993527377121166e-07, "loss": 0.0001, "reward": 1.7517857924103737, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7562500275671482, "rewards/format_reward_func": 0.9955357164144516, "step": 7050 }, { "completion_length": 248.4419765472412, "epoch": 1.18240496248795, "grad_norm": 0.18319454840783975, "kl": 0.0713653564453125, "learning_rate": 4.993518747812735e-07, "loss": 0.0001, "reward": 1.7107143551111221, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7276786006987095, "rewards/format_reward_func": 0.9830357283353806, "step": 7052 }, { "completion_length": 238.18751049041748, "epoch": 1.1827402657278177, "grad_norm": 0.23620609642510929, "kl": 0.100982666015625, "learning_rate": 4.993510112763299e-07, "loss": 0.0001, "reward": 1.751785784959793, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500201165676, "rewards/format_reward_func": 0.9955357164144516, "step": 7054 }, { "completion_length": 253.98661994934082, "epoch": 1.183075568967685, "grad_norm": 0.2541844366706011, "kl": 0.0661773681640625, "learning_rate": 4.993501471972879e-07, "loss": 0.0001, "reward": 1.7250000983476639, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7339286096394062, "rewards/format_reward_func": 0.9910714328289032, "step": 7056 }, { "completion_length": 243.8214406967163, "epoch": 1.1834108722075527, "grad_norm": 0.19942797787478084, "kl": 0.07354736328125, "learning_rate": 4.993492825441493e-07, "loss": 0.0001, "reward": 1.7642858028411865, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857320606709, "rewards/format_reward_func": 1.0, "step": 7058 }, { "completion_length": 236.7232265472412, "epoch": 1.1837461754474203, "grad_norm": 0.21271859218155764, "kl": 0.065704345703125, "learning_rate": 4.993484173169162e-07, "loss": 0.0001, "reward": 1.7285715118050575, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7285714596509933, "rewards/format_reward_func": 1.0, "step": 7060 }, { "completion_length": 239.19197368621826, "epoch": 1.1840814786872877, "grad_norm": 0.28199565232331336, "kl": 0.12799072265625, "learning_rate": 4.993475515155906e-07, "loss": 0.0001, "reward": 1.758928656578064, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7633928842842579, "rewards/format_reward_func": 0.9955357164144516, "step": 7062 }, { "completion_length": 246.8928680419922, "epoch": 1.1844167819271554, "grad_norm": 0.25315393129614683, "kl": 0.5302581787109375, "learning_rate": 4.993466851401745e-07, "loss": 0.0005, "reward": 1.7571429163217545, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7660714574158192, "rewards/format_reward_func": 0.9910714328289032, "step": 7064 }, { "completion_length": 235.8125123977661, "epoch": 1.184752085167023, "grad_norm": 0.2698316473301265, "kl": 0.0690765380859375, "learning_rate": 4.993458181906699e-07, "loss": 0.0001, "reward": 1.8089286237955093, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.813392885029316, "rewards/format_reward_func": 0.9955357164144516, "step": 7066 }, { "completion_length": 244.77679538726807, "epoch": 1.1850873884068904, "grad_norm": 0.2320733825817402, "kl": 0.0703582763671875, "learning_rate": 4.993449506670788e-07, "loss": 0.0001, "reward": 1.7482143566012383, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526786103844643, "rewards/format_reward_func": 0.9955357164144516, "step": 7068 }, { "completion_length": 259.99554920196533, "epoch": 1.185422691646758, "grad_norm": 0.24226402232737604, "kl": 0.0719451904296875, "learning_rate": 4.99344082569403e-07, "loss": 0.0001, "reward": 1.7392857819795609, "reward_std": 0.08081220369786024, "rewards/equation_reward_func": 0.7571428790688515, "rewards/format_reward_func": 0.9821428656578064, "step": 7070 }, { "completion_length": 247.50447940826416, "epoch": 1.1857579948866257, "grad_norm": 0.18774264494588802, "kl": 0.0692596435546875, "learning_rate": 4.993432138976448e-07, "loss": 0.0001, "reward": 1.8000000417232513, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8000000324100256, "rewards/format_reward_func": 1.0, "step": 7072 }, { "completion_length": 254.71876049041748, "epoch": 1.1860932981264931, "grad_norm": 0.2447953619963093, "kl": 0.2768707275390625, "learning_rate": 4.993423446518063e-07, "loss": 0.0003, "reward": 1.7392857894301414, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.748214315623045, "rewards/format_reward_func": 0.9910714328289032, "step": 7074 }, { "completion_length": 252.3259038925171, "epoch": 1.1864286013663607, "grad_norm": 0.2295580947248307, "kl": 0.15301513671875, "learning_rate": 4.99341474831889e-07, "loss": 0.0002, "reward": 1.721428632736206, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7303571682423353, "rewards/format_reward_func": 0.9910714328289032, "step": 7076 }, { "completion_length": 255.95536422729492, "epoch": 1.1867639046062282, "grad_norm": 0.2032927882840729, "kl": 0.0631103515625, "learning_rate": 4.993406044378951e-07, "loss": 0.0001, "reward": 1.7196429371833801, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7241071779280901, "rewards/format_reward_func": 0.9955357164144516, "step": 7078 }, { "completion_length": 253.1696548461914, "epoch": 1.1870992078460958, "grad_norm": 0.2855426504789579, "kl": 0.1221771240234375, "learning_rate": 4.993397334698269e-07, "loss": 0.0001, "reward": 1.7464286461472511, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.746428593993187, "rewards/format_reward_func": 1.0, "step": 7080 }, { "completion_length": 259.46429538726807, "epoch": 1.1874345110859634, "grad_norm": 0.3224579946182638, "kl": 0.1277923583984375, "learning_rate": 4.993388619276861e-07, "loss": 0.0001, "reward": 1.7053572162985802, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7098214700818062, "rewards/format_reward_func": 0.9955357164144516, "step": 7082 }, { "completion_length": 247.4151906967163, "epoch": 1.1877698143258308, "grad_norm": 0.18945706403060317, "kl": 0.0734100341796875, "learning_rate": 4.993379898114748e-07, "loss": 0.0001, "reward": 1.7250000536441803, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7339286059141159, "rewards/format_reward_func": 0.9910714328289032, "step": 7084 }, { "completion_length": 253.5312614440918, "epoch": 1.1881051175656985, "grad_norm": 0.2508063849897521, "kl": 0.0714569091796875, "learning_rate": 4.99337117121195e-07, "loss": 0.0001, "reward": 1.7589286416769028, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7633928768336773, "rewards/format_reward_func": 0.9955357164144516, "step": 7086 }, { "completion_length": 242.6830472946167, "epoch": 1.1884404208055661, "grad_norm": 0.21219012626209383, "kl": 0.0686187744140625, "learning_rate": 4.993362438568487e-07, "loss": 0.0001, "reward": 1.8035714998841286, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8035714402794838, "rewards/format_reward_func": 1.0, "step": 7088 }, { "completion_length": 237.0357265472412, "epoch": 1.1887757240454335, "grad_norm": 0.19441894647892927, "kl": 0.092987060546875, "learning_rate": 4.993353700184379e-07, "loss": 0.0001, "reward": 1.7517857775092125, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7562500275671482, "rewards/format_reward_func": 0.9955357164144516, "step": 7090 }, { "completion_length": 257.4017972946167, "epoch": 1.1891110272853012, "grad_norm": 0.2658164508523097, "kl": 0.063018798828125, "learning_rate": 4.993344956059646e-07, "loss": 0.0001, "reward": 1.7125001028180122, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7258928902447224, "rewards/format_reward_func": 0.9866071455180645, "step": 7092 }, { "completion_length": 254.77679920196533, "epoch": 1.1894463305251688, "grad_norm": 0.13371165163675044, "kl": 0.062957763671875, "learning_rate": 4.99333620619431e-07, "loss": 0.0001, "reward": 1.769642896950245, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7741071879863739, "rewards/format_reward_func": 0.9955357164144516, "step": 7094 }, { "completion_length": 250.5982265472412, "epoch": 1.1897816337650362, "grad_norm": 0.24803401131551336, "kl": 0.0692291259765625, "learning_rate": 4.993327450588388e-07, "loss": 0.0001, "reward": 1.7214286252856255, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7214286141097546, "rewards/format_reward_func": 1.0, "step": 7096 }, { "completion_length": 250.93304538726807, "epoch": 1.1901169370049038, "grad_norm": 0.2041804890215718, "kl": 0.0629425048828125, "learning_rate": 4.993318689241902e-07, "loss": 0.0001, "reward": 1.8410714864730835, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.8455357365310192, "rewards/format_reward_func": 0.9955357164144516, "step": 7098 }, { "completion_length": 253.7589406967163, "epoch": 1.1904522402447713, "grad_norm": 0.27051242913768825, "kl": 0.101287841796875, "learning_rate": 4.993309922154872e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7732143141329288, "rewards/format_reward_func": 0.9910714328289032, "step": 7100 }, { "completion_length": 264.7232275009155, "epoch": 1.1907875434846389, "grad_norm": 0.22080903137693467, "kl": 0.06524658203125, "learning_rate": 4.993301149327319e-07, "loss": 0.0001, "reward": 1.6964286640286446, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.705357164144516, "rewards/format_reward_func": 0.9910714328289032, "step": 7102 }, { "completion_length": 256.1785831451416, "epoch": 1.1911228467245065, "grad_norm": 0.125236473476325, "kl": 0.0610809326171875, "learning_rate": 4.993292370759261e-07, "loss": 0.0001, "reward": 1.7660714983940125, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7705357298254967, "rewards/format_reward_func": 0.9955357164144516, "step": 7104 }, { "completion_length": 245.0491180419922, "epoch": 1.191458149964374, "grad_norm": 0.1692324014060084, "kl": 0.0619354248046875, "learning_rate": 4.99328358645072e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 7106 }, { "completion_length": 261.42858600616455, "epoch": 1.1917934532042416, "grad_norm": 0.3721622893705267, "kl": 0.1228485107421875, "learning_rate": 4.993274796401716e-07, "loss": 0.0001, "reward": 1.6660715118050575, "reward_std": 0.07828682195395231, "rewards/equation_reward_func": 0.6794643178582191, "rewards/format_reward_func": 0.9866071492433548, "step": 7108 }, { "completion_length": 259.6696548461914, "epoch": 1.1921287564441092, "grad_norm": 0.0981046329887637, "kl": 0.069854736328125, "learning_rate": 4.99326600061227e-07, "loss": 0.0001, "reward": 1.778571479022503, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 7110 }, { "completion_length": 264.2544775009155, "epoch": 1.1924640596839766, "grad_norm": 0.17182040481196187, "kl": 0.0637969970703125, "learning_rate": 4.993257199082399e-07, "loss": 0.0001, "reward": 1.7232143729925156, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7276785876601934, "rewards/format_reward_func": 0.9955357164144516, "step": 7112 }, { "completion_length": 263.9687614440918, "epoch": 1.1927993629238443, "grad_norm": 0.20245448175755204, "kl": 0.0615386962890625, "learning_rate": 4.993248391812127e-07, "loss": 0.0001, "reward": 1.7946429029107094, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7991071715950966, "rewards/format_reward_func": 0.9955357164144516, "step": 7114 }, { "completion_length": 245.4375114440918, "epoch": 1.1931346661637119, "grad_norm": 0.18806544126057884, "kl": 0.0662994384765625, "learning_rate": 4.993239578801473e-07, "loss": 0.0001, "reward": 1.7428572252392769, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571805357933, "rewards/format_reward_func": 1.0, "step": 7116 }, { "completion_length": 253.40179634094238, "epoch": 1.1934699694035793, "grad_norm": 0.16657846626696407, "kl": 0.1014556884765625, "learning_rate": 4.993230760050456e-07, "loss": 0.0001, "reward": 1.7571429088711739, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 7118 }, { "completion_length": 257.8794765472412, "epoch": 1.193805272643447, "grad_norm": 0.2076503664785633, "kl": 0.1319122314453125, "learning_rate": 4.993221935559098e-07, "loss": 0.0001, "reward": 1.800000049173832, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 7120 }, { "completion_length": 267.1250123977661, "epoch": 1.1941405758833143, "grad_norm": 0.27828751029224, "kl": 0.0604400634765625, "learning_rate": 4.993213105327418e-07, "loss": 0.0001, "reward": 1.7160715088248253, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7294643111526966, "rewards/format_reward_func": 0.9866071492433548, "step": 7122 }, { "completion_length": 252.13840579986572, "epoch": 1.194475879123182, "grad_norm": 0.25794649504968276, "kl": 0.0631561279296875, "learning_rate": 4.993204269355438e-07, "loss": 0.0001, "reward": 1.7071429342031479, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7160714827477932, "rewards/format_reward_func": 0.9910714328289032, "step": 7124 }, { "completion_length": 252.7142972946167, "epoch": 1.1948111823630496, "grad_norm": 0.13134844037671142, "kl": 0.0653228759765625, "learning_rate": 4.993195427643176e-07, "loss": 0.0001, "reward": 1.7392857819795609, "reward_std": 0.08586296532303095, "rewards/equation_reward_func": 0.7482143137603998, "rewards/format_reward_func": 0.9910714328289032, "step": 7126 }, { "completion_length": 248.602689743042, "epoch": 1.1951464856029173, "grad_norm": 0.2285108171615533, "kl": 0.06787109375, "learning_rate": 4.993186580190655e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7392857521772385, "rewards/format_reward_func": 1.0, "step": 7128 }, { "completion_length": 250.45537090301514, "epoch": 1.1954817888427847, "grad_norm": 0.1980039645739993, "kl": 0.06549072265625, "learning_rate": 4.993177726997894e-07, "loss": 0.0001, "reward": 1.7321429252624512, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7321429066359997, "rewards/format_reward_func": 1.0, "step": 7130 }, { "completion_length": 260.33036518096924, "epoch": 1.1958170920826523, "grad_norm": 0.22274091348978325, "kl": 0.0615234375, "learning_rate": 4.993168868064913e-07, "loss": 0.0001, "reward": 1.751785784959793, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7562500238418579, "rewards/format_reward_func": 0.9955357164144516, "step": 7132 }, { "completion_length": 245.57144165039062, "epoch": 1.1961523953225197, "grad_norm": 0.19250171428754337, "kl": 0.0677642822265625, "learning_rate": 4.993160003391733e-07, "loss": 0.0001, "reward": 1.7553571909666061, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7687500268220901, "rewards/format_reward_func": 0.9866071492433548, "step": 7134 }, { "completion_length": 248.99554538726807, "epoch": 1.1964876985623873, "grad_norm": 0.29356235790356494, "kl": 0.069305419921875, "learning_rate": 4.993151132978374e-07, "loss": 0.0001, "reward": 1.7285715118050575, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7285714633762836, "rewards/format_reward_func": 1.0, "step": 7136 }, { "completion_length": 253.4062614440918, "epoch": 1.196823001802255, "grad_norm": 0.19904524202169394, "kl": 0.114044189453125, "learning_rate": 4.993142256824857e-07, "loss": 0.0001, "reward": 1.719642922282219, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7241071835160255, "rewards/format_reward_func": 0.9955357164144516, "step": 7138 }, { "completion_length": 251.6071538925171, "epoch": 1.1971583050421224, "grad_norm": 0.19771127345401926, "kl": 0.06500244140625, "learning_rate": 4.993133374931203e-07, "loss": 0.0001, "reward": 1.7714286223053932, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714286185801029, "rewards/format_reward_func": 1.0, "step": 7140 }, { "completion_length": 246.4955472946167, "epoch": 1.19749360828199, "grad_norm": 0.29051666126122133, "kl": 0.1295928955078125, "learning_rate": 4.99312448729743e-07, "loss": 0.0001, "reward": 1.7625000849366188, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.766964316368103, "rewards/format_reward_func": 0.9955357164144516, "step": 7142 }, { "completion_length": 242.80358505249023, "epoch": 1.1978289115218577, "grad_norm": 0.2147715894666388, "kl": 0.0679779052734375, "learning_rate": 4.993115593923561e-07, "loss": 0.0001, "reward": 1.7660714909434319, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9955357164144516, "step": 7144 }, { "completion_length": 245.52679920196533, "epoch": 1.198164214761725, "grad_norm": 0.2806676529569814, "kl": 0.0613861083984375, "learning_rate": 4.993106694809615e-07, "loss": 0.0001, "reward": 1.7357143685221672, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7357143126428127, "rewards/format_reward_func": 1.0, "step": 7146 }, { "completion_length": 243.39733219146729, "epoch": 1.1984995180015927, "grad_norm": 0.1163241116677545, "kl": 0.06005859375, "learning_rate": 4.993097789955614e-07, "loss": 0.0001, "reward": 1.7750000730156898, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 7148 }, { "completion_length": 242.4241189956665, "epoch": 1.1988348212414603, "grad_norm": 0.2207702446446991, "kl": 0.064971923828125, "learning_rate": 4.993088879361576e-07, "loss": 0.0001, "reward": 1.7232143506407738, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7366071753203869, "rewards/format_reward_func": 0.9866071492433548, "step": 7150 }, { "completion_length": 243.4285831451416, "epoch": 1.1991701244813278, "grad_norm": 0.17856647941258746, "kl": 0.06219482421875, "learning_rate": 4.993079963027525e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.782142873853445, "rewards/format_reward_func": 1.0, "step": 7152 }, { "completion_length": 231.9821538925171, "epoch": 1.1995054277211954, "grad_norm": 0.31929378908916767, "kl": 0.077178955078125, "learning_rate": 4.993071040953477e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143223285675, "rewards/format_reward_func": 1.0, "step": 7154 }, { "completion_length": 245.165189743042, "epoch": 1.1998407309610628, "grad_norm": 0.23651149317372253, "kl": 0.1234130859375, "learning_rate": 4.993062113139457e-07, "loss": 0.0001, "reward": 1.7482143715023994, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7526786103844643, "rewards/format_reward_func": 0.9955357164144516, "step": 7156 }, { "completion_length": 241.07590293884277, "epoch": 1.2001760342009304, "grad_norm": 0.18797884413854196, "kl": 0.0643157958984375, "learning_rate": 4.993053179585484e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714600235224, "rewards/format_reward_func": 1.0, "step": 7158 }, { "completion_length": 245.80804347991943, "epoch": 1.200511337440798, "grad_norm": 0.19977985178626378, "kl": 0.0759735107421875, "learning_rate": 4.993044240291576e-07, "loss": 0.0001, "reward": 1.7750000804662704, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 1.0, "step": 7160 }, { "completion_length": 244.26786994934082, "epoch": 1.2008466406806655, "grad_norm": 0.27002438185556893, "kl": 0.070404052734375, "learning_rate": 4.993035295257758e-07, "loss": 0.0001, "reward": 1.7232143506407738, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.727678619325161, "rewards/format_reward_func": 0.9955357164144516, "step": 7162 }, { "completion_length": 246.47769165039062, "epoch": 1.2011819439205331, "grad_norm": 0.14532020676846388, "kl": 0.063720703125, "learning_rate": 4.993026344484047e-07, "loss": 0.0001, "reward": 1.7553572207689285, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.759821455925703, "rewards/format_reward_func": 0.9955357164144516, "step": 7164 }, { "completion_length": 243.21429824829102, "epoch": 1.2015172471604008, "grad_norm": 0.32398441312856185, "kl": 0.1230926513671875, "learning_rate": 4.993017387970467e-07, "loss": 0.0001, "reward": 1.7071429565548897, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7071429006755352, "rewards/format_reward_func": 1.0, "step": 7166 }, { "completion_length": 249.39733600616455, "epoch": 1.2018525504002682, "grad_norm": 0.16493126908134675, "kl": 0.0857696533203125, "learning_rate": 4.993008425717034e-07, "loss": 0.0001, "reward": 1.7375000640749931, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.750892885029316, "rewards/format_reward_func": 0.9866071492433548, "step": 7168 }, { "completion_length": 246.2009048461914, "epoch": 1.2021878536401358, "grad_norm": 0.17881583185915126, "kl": 0.06036376953125, "learning_rate": 4.992999457723773e-07, "loss": 0.0001, "reward": 1.8107143566012383, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8196428716182709, "rewards/format_reward_func": 0.9910714328289032, "step": 7170 }, { "completion_length": 234.98661708831787, "epoch": 1.2025231568800034, "grad_norm": 0.20858932423195833, "kl": 0.067535400390625, "learning_rate": 4.992990483990702e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7500000447034836, "rewards/format_reward_func": 1.0, "step": 7172 }, { "completion_length": 254.27233123779297, "epoch": 1.2028584601198709, "grad_norm": 0.3299755816245421, "kl": 0.110748291015625, "learning_rate": 4.992981504517843e-07, "loss": 0.0001, "reward": 1.7821429297327995, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 7174 }, { "completion_length": 249.93305110931396, "epoch": 1.2031937633597385, "grad_norm": 0.1657871907998651, "kl": 0.1009368896484375, "learning_rate": 4.992972519305216e-07, "loss": 0.0001, "reward": 1.7625000774860382, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643238186836, "rewards/format_reward_func": 0.9955357164144516, "step": 7176 }, { "completion_length": 254.86608219146729, "epoch": 1.203529066599606, "grad_norm": 0.19653879742103245, "kl": 0.065093994140625, "learning_rate": 4.992963528352843e-07, "loss": 0.0001, "reward": 1.7625000774860382, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7758928798139095, "rewards/format_reward_func": 0.9866071492433548, "step": 7178 }, { "completion_length": 242.56251049041748, "epoch": 1.2038643698394735, "grad_norm": 0.22354122095251158, "kl": 0.1406707763671875, "learning_rate": 4.992954531660742e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143297791481, "rewards/format_reward_func": 1.0, "step": 7180 }, { "completion_length": 245.2544765472412, "epoch": 1.2041996730793412, "grad_norm": 0.2367578009193638, "kl": 0.0679779052734375, "learning_rate": 4.992945529228937e-07, "loss": 0.0001, "reward": 1.7321429327130318, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321429029107094, "rewards/format_reward_func": 1.0, "step": 7182 }, { "completion_length": 257.9464416503906, "epoch": 1.2045349763192088, "grad_norm": 0.19217014082094047, "kl": 0.1060028076171875, "learning_rate": 4.992936521057446e-07, "loss": 0.0001, "reward": 1.7642858102917671, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857357859612, "rewards/format_reward_func": 1.0, "step": 7184 }, { "completion_length": 257.2276906967163, "epoch": 1.2048702795590762, "grad_norm": 0.2761550441553349, "kl": 0.1991729736328125, "learning_rate": 4.992927507146291e-07, "loss": 0.0002, "reward": 1.7785715088248253, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7785714659839869, "rewards/format_reward_func": 1.0, "step": 7186 }, { "completion_length": 251.54911708831787, "epoch": 1.2052055827989439, "grad_norm": 0.307690065103978, "kl": 0.085052490234375, "learning_rate": 4.992918487495492e-07, "loss": 0.0001, "reward": 1.801785759627819, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.8062500320374966, "rewards/format_reward_func": 0.9955357164144516, "step": 7188 }, { "completion_length": 255.63840579986572, "epoch": 1.2055408860388113, "grad_norm": 0.15497750976828262, "kl": 0.0872802734375, "learning_rate": 4.992909462105072e-07, "loss": 0.0001, "reward": 1.7375000640749931, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.74196432903409, "rewards/format_reward_func": 0.9955357164144516, "step": 7190 }, { "completion_length": 256.4151916503906, "epoch": 1.205876189278679, "grad_norm": 0.22940595595717467, "kl": 0.0673065185546875, "learning_rate": 4.992900430975048e-07, "loss": 0.0001, "reward": 1.7553572058677673, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214633762836, "rewards/format_reward_func": 0.9955357164144516, "step": 7192 }, { "completion_length": 254.82590675354004, "epoch": 1.2062114925185465, "grad_norm": 0.2030428580858518, "kl": 0.0630340576171875, "learning_rate": 4.992891394105445e-07, "loss": 0.0001, "reward": 1.7357143685221672, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 7194 }, { "completion_length": 249.6160831451416, "epoch": 1.206546795758414, "grad_norm": 0.22482483113253907, "kl": 0.1019287109375, "learning_rate": 4.992882351496281e-07, "loss": 0.0001, "reward": 1.7732143327593803, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7776786275207996, "rewards/format_reward_func": 0.9955357164144516, "step": 7196 }, { "completion_length": 254.1160831451416, "epoch": 1.2068820989982816, "grad_norm": 0.12312991086334027, "kl": 0.086456298828125, "learning_rate": 4.992873303147577e-07, "loss": 0.0001, "reward": 1.796428620815277, "reward_std": 0.025253813713788986, "rewards/equation_reward_func": 0.8053571507334709, "rewards/format_reward_func": 0.9910714328289032, "step": 7198 }, { "completion_length": 263.8794775009155, "epoch": 1.2072174022381492, "grad_norm": 0.10108203752975906, "kl": 0.229705810546875, "learning_rate": 4.992864249059354e-07, "loss": 0.0002, "reward": 1.7053572088479996, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7098214738070965, "rewards/format_reward_func": 0.9955357164144516, "step": 7200 }, { "completion_length": 249.06697750091553, "epoch": 1.2075527054780166, "grad_norm": 0.305712170662913, "kl": 0.169677734375, "learning_rate": 4.992855189231634e-07, "loss": 0.0002, "reward": 1.7464286386966705, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 7202 }, { "completion_length": 260.4241199493408, "epoch": 1.2078880087178843, "grad_norm": 0.11947172731424033, "kl": 0.0626373291015625, "learning_rate": 4.992846123664437e-07, "loss": 0.0001, "reward": 1.8035714849829674, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.803571455180645, "rewards/format_reward_func": 1.0, "step": 7204 }, { "completion_length": 260.3839445114136, "epoch": 1.208223311957752, "grad_norm": 0.17124817043019805, "kl": 0.0750732421875, "learning_rate": 4.992837052357783e-07, "loss": 0.0001, "reward": 1.7625000700354576, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7669643145054579, "rewards/format_reward_func": 0.9955357164144516, "step": 7206 }, { "completion_length": 258.6205463409424, "epoch": 1.2085586151976193, "grad_norm": 0.20993607262866407, "kl": 0.0743255615234375, "learning_rate": 4.992827975311695e-07, "loss": 0.0001, "reward": 1.7625000700354576, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643200933933, "rewards/format_reward_func": 0.9955357164144516, "step": 7208 }, { "completion_length": 249.5803689956665, "epoch": 1.208893918437487, "grad_norm": 0.18415055786807938, "kl": 0.0663604736328125, "learning_rate": 4.992818892526193e-07, "loss": 0.0001, "reward": 1.78035718947649, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7848214507102966, "rewards/format_reward_func": 0.9955357164144516, "step": 7210 }, { "completion_length": 252.821439743042, "epoch": 1.2092292216773544, "grad_norm": 0.1304870095299773, "kl": 0.0946197509765625, "learning_rate": 4.992809804001296e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8035714626312256, "rewards/format_reward_func": 1.0, "step": 7212 }, { "completion_length": 262.9776906967163, "epoch": 1.209564524917222, "grad_norm": 0.23406135216288343, "kl": 0.0801544189453125, "learning_rate": 4.992800709737029e-07, "loss": 0.0001, "reward": 1.7750000804662704, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 7214 }, { "completion_length": 260.1160840988159, "epoch": 1.2098998281570896, "grad_norm": 0.1897578285746444, "kl": 0.129669189453125, "learning_rate": 4.992791609733408e-07, "loss": 0.0001, "reward": 1.785714328289032, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7946428954601288, "rewards/format_reward_func": 0.9910714328289032, "step": 7216 }, { "completion_length": 256.3482275009155, "epoch": 1.210235131396957, "grad_norm": 0.23263252449293698, "kl": 0.1983489990234375, "learning_rate": 4.992782503990458e-07, "loss": 0.0002, "reward": 1.769642911851406, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7741071693599224, "rewards/format_reward_func": 0.9955357164144516, "step": 7218 }, { "completion_length": 257.5759048461914, "epoch": 1.2105704346368247, "grad_norm": 0.20205381240686107, "kl": 0.0634307861328125, "learning_rate": 4.992773392508198e-07, "loss": 0.0001, "reward": 1.7410715073347092, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7455357499420643, "rewards/format_reward_func": 0.9955357164144516, "step": 7220 }, { "completion_length": 266.7098388671875, "epoch": 1.2109057378766923, "grad_norm": 0.2924996226040406, "kl": 0.0713043212890625, "learning_rate": 4.99276427528665e-07, "loss": 0.0001, "reward": 1.796428643167019, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 7222 }, { "completion_length": 254.8035831451416, "epoch": 1.2112410411165597, "grad_norm": 0.1814333949957524, "kl": 0.100006103515625, "learning_rate": 4.992755152325833e-07, "loss": 0.0001, "reward": 1.744642935693264, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7491071578115225, "rewards/format_reward_func": 0.9955357164144516, "step": 7224 }, { "completion_length": 257.64287185668945, "epoch": 1.2115763443564274, "grad_norm": 0.24674960296846465, "kl": 0.1389617919921875, "learning_rate": 4.99274602362577e-07, "loss": 0.0001, "reward": 1.7660714909434319, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7705357596278191, "rewards/format_reward_func": 0.9955357164144516, "step": 7226 }, { "completion_length": 249.9107265472412, "epoch": 1.211911647596295, "grad_norm": 0.17770874745401008, "kl": 0.068023681640625, "learning_rate": 4.992736889186482e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857521772385, "rewards/format_reward_func": 1.0, "step": 7228 }, { "completion_length": 256.6250123977661, "epoch": 1.2122469508361624, "grad_norm": 0.2509594504708243, "kl": 0.1199951171875, "learning_rate": 4.992727749007988e-07, "loss": 0.0001, "reward": 1.7267857864499092, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7312500402331352, "rewards/format_reward_func": 0.9955357164144516, "step": 7230 }, { "completion_length": 267.089298248291, "epoch": 1.21258225407603, "grad_norm": 0.22356860342543525, "kl": 0.0696563720703125, "learning_rate": 4.992718603090312e-07, "loss": 0.0001, "reward": 1.7178572043776512, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7178571987897158, "rewards/format_reward_func": 1.0, "step": 7232 }, { "completion_length": 263.86608123779297, "epoch": 1.2129175573158975, "grad_norm": 0.4141292974356211, "kl": 0.0949554443359375, "learning_rate": 4.992709451433473e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.09091372694820166, "rewards/equation_reward_func": 0.801785733550787, "rewards/format_reward_func": 0.9910714328289032, "step": 7234 }, { "completion_length": 259.0000123977661, "epoch": 1.213252860555765, "grad_norm": 0.2666180229355076, "kl": 0.1144866943359375, "learning_rate": 4.992700294037493e-07, "loss": 0.0001, "reward": 1.7214286625385284, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7214285954833031, "rewards/format_reward_func": 1.0, "step": 7236 }, { "completion_length": 252.33929538726807, "epoch": 1.2135881637956327, "grad_norm": 0.08668010444846727, "kl": 0.1309967041015625, "learning_rate": 4.992691130902392e-07, "loss": 0.0001, "reward": 1.7875000834465027, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7919643186032772, "rewards/format_reward_func": 0.9955357164144516, "step": 7238 }, { "completion_length": 255.44197368621826, "epoch": 1.2139234670355001, "grad_norm": 0.09888979720455138, "kl": 0.122528076171875, "learning_rate": 4.992681962028193e-07, "loss": 0.0001, "reward": 1.860714316368103, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8607143089175224, "rewards/format_reward_func": 1.0, "step": 7240 }, { "completion_length": 248.6830472946167, "epoch": 1.2142587702753678, "grad_norm": 0.18925414301509572, "kl": 0.1031341552734375, "learning_rate": 4.992672787414914e-07, "loss": 0.0001, "reward": 1.7535714879631996, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714693367481, "rewards/format_reward_func": 1.0, "step": 7242 }, { "completion_length": 258.4464416503906, "epoch": 1.2145940735152354, "grad_norm": 0.2640904385696533, "kl": 0.074462890625, "learning_rate": 4.99266360706258e-07, "loss": 0.0001, "reward": 1.7892857939004898, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 7244 }, { "completion_length": 251.86162185668945, "epoch": 1.2149293767551028, "grad_norm": 0.31212532650075375, "kl": 0.06591796875, "learning_rate": 4.992654420971209e-07, "loss": 0.0001, "reward": 1.8071429133415222, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428798139095, "rewards/format_reward_func": 1.0, "step": 7246 }, { "completion_length": 261.6875114440918, "epoch": 1.2152646799949705, "grad_norm": 0.08009381221104957, "kl": 0.072265625, "learning_rate": 4.992645229140824e-07, "loss": 0.0001, "reward": 1.775000050663948, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 7248 }, { "completion_length": 255.63394260406494, "epoch": 1.215599983234838, "grad_norm": 0.16946388110095856, "kl": 0.06854248046875, "learning_rate": 4.992636031571444e-07, "loss": 0.0001, "reward": 1.6964286640286446, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.696428619325161, "rewards/format_reward_func": 1.0, "step": 7250 }, { "completion_length": 254.97769165039062, "epoch": 1.2159352864747055, "grad_norm": 0.23666184664199402, "kl": 0.078399658203125, "learning_rate": 4.992626828263093e-07, "loss": 0.0001, "reward": 1.7392857819795609, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7392857410013676, "rewards/format_reward_func": 1.0, "step": 7252 }, { "completion_length": 253.17858028411865, "epoch": 1.2162705897145731, "grad_norm": 0.008383486510034467, "kl": 0.1063690185546875, "learning_rate": 4.992617619215791e-07, "loss": 0.0001, "reward": 1.753571480512619, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714600235224, "rewards/format_reward_func": 1.0, "step": 7254 }, { "completion_length": 262.36608505249023, "epoch": 1.2166058929544405, "grad_norm": 0.23928214815805107, "kl": 0.1030426025390625, "learning_rate": 4.992608404429558e-07, "loss": 0.0001, "reward": 1.7330357879400253, "reward_std": 0.044194172602146864, "rewards/equation_reward_func": 0.7348214592784643, "rewards/format_reward_func": 0.9982142895460129, "step": 7256 }, { "completion_length": 254.133939743042, "epoch": 1.2169411961943082, "grad_norm": 0.16791549556043, "kl": 0.0996551513671875, "learning_rate": 4.992599183904417e-07, "loss": 0.0001, "reward": 1.7232143580913544, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7276786006987095, "rewards/format_reward_func": 0.9955357164144516, "step": 7258 }, { "completion_length": 255.5178689956665, "epoch": 1.2172764994341758, "grad_norm": 0.25849978845251953, "kl": 0.084625244140625, "learning_rate": 4.992589957640388e-07, "loss": 0.0001, "reward": 1.7428572103381157, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7517857402563095, "rewards/format_reward_func": 0.9910714328289032, "step": 7260 }, { "completion_length": 254.0759048461914, "epoch": 1.2176118026740435, "grad_norm": 0.18565102559484725, "kl": 0.154052734375, "learning_rate": 4.992580725637494e-07, "loss": 0.0002, "reward": 1.7370536476373672, "reward_std": 0.0587151157669723, "rewards/equation_reward_func": 0.7446428835391998, "rewards/format_reward_func": 0.9924107193946838, "step": 7262 }, { "completion_length": 259.30358505249023, "epoch": 1.2179471059139109, "grad_norm": 0.21800854720790241, "kl": 0.1430206298828125, "learning_rate": 4.992571487895753e-07, "loss": 0.0001, "reward": 1.7232143431901932, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7276785969734192, "rewards/format_reward_func": 0.9955357164144516, "step": 7264 }, { "completion_length": 250.0759048461914, "epoch": 1.2182824091537785, "grad_norm": 0.1164286611566817, "kl": 0.0711212158203125, "learning_rate": 4.99256224441519e-07, "loss": 0.0001, "reward": 1.7392857521772385, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857633531094, "rewards/format_reward_func": 1.0, "step": 7266 }, { "completion_length": 264.60715198516846, "epoch": 1.218617712393646, "grad_norm": 0.24969613066518914, "kl": 0.062957763671875, "learning_rate": 4.992552995195825e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000279396772, "rewards/format_reward_func": 1.0, "step": 7268 }, { "completion_length": 255.99554920196533, "epoch": 1.2189530156335135, "grad_norm": 0.19538401377886458, "kl": 0.0625152587890625, "learning_rate": 4.992543740237677e-07, "loss": 0.0001, "reward": 1.8339286297559738, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.8383928760886192, "rewards/format_reward_func": 0.9955357164144516, "step": 7270 }, { "completion_length": 250.9642972946167, "epoch": 1.2192883188733812, "grad_norm": 0.23034703579469626, "kl": 0.0625762939453125, "learning_rate": 4.99253447954077e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714752972126, "rewards/format_reward_func": 1.0, "step": 7272 }, { "completion_length": 263.1696557998657, "epoch": 1.2196236221132486, "grad_norm": 0.19438148140689746, "kl": 0.109344482421875, "learning_rate": 4.992525213105124e-07, "loss": 0.0001, "reward": 1.7178572416305542, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7267857454717159, "rewards/format_reward_func": 0.9910714328289032, "step": 7274 }, { "completion_length": 266.7678680419922, "epoch": 1.2199589253531162, "grad_norm": 0.20735329597287194, "kl": 0.077850341796875, "learning_rate": 4.992515940930762e-07, "loss": 0.0001, "reward": 1.7357143461704254, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7357143238186836, "rewards/format_reward_func": 1.0, "step": 7276 }, { "completion_length": 258.3035840988159, "epoch": 1.2202942285929839, "grad_norm": 0.14736460091258483, "kl": 0.1506500244140625, "learning_rate": 4.992506663017702e-07, "loss": 0.0002, "reward": 1.742857240140438, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 7278 }, { "completion_length": 248.61608409881592, "epoch": 1.2206295318328513, "grad_norm": 0.2677453481686041, "kl": 0.126617431640625, "learning_rate": 4.99249737936597e-07, "loss": 0.0001, "reward": 1.762500062584877, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7669643051922321, "rewards/format_reward_func": 0.9955357164144516, "step": 7280 }, { "completion_length": 259.7901887893677, "epoch": 1.220964835072719, "grad_norm": 0.492668665701258, "kl": 0.084564208984375, "learning_rate": 4.992488089975583e-07, "loss": 0.0001, "reward": 1.7160715237259865, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7205357477068901, "rewards/format_reward_func": 0.9955357164144516, "step": 7282 }, { "completion_length": 264.9196548461914, "epoch": 1.2213001383125865, "grad_norm": 0.181095995568355, "kl": 0.087677001953125, "learning_rate": 4.992478794846565e-07, "loss": 0.0001, "reward": 1.723214328289032, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7366071715950966, "rewards/format_reward_func": 0.9866071492433548, "step": 7284 }, { "completion_length": 267.1651906967163, "epoch": 1.221635441552454, "grad_norm": 0.2169639788945554, "kl": 0.0669708251953125, "learning_rate": 4.992469493978937e-07, "loss": 0.0001, "reward": 1.7946429178118706, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7991071790456772, "rewards/format_reward_func": 0.9955357164144516, "step": 7286 }, { "completion_length": 256.1875123977661, "epoch": 1.2219707447923216, "grad_norm": 0.6186112811288419, "kl": 0.09869384765625, "learning_rate": 4.992460187372719e-07, "loss": 0.0001, "reward": 1.7178572192788124, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7178571857511997, "rewards/format_reward_func": 1.0, "step": 7288 }, { "completion_length": 260.57590103149414, "epoch": 1.222306048032189, "grad_norm": 0.16121117399200693, "kl": 0.0945587158203125, "learning_rate": 4.992450875027935e-07, "loss": 0.0001, "reward": 1.7160715088248253, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7205357514321804, "rewards/format_reward_func": 0.9955357164144516, "step": 7290 }, { "completion_length": 260.7410879135132, "epoch": 1.2226413512720566, "grad_norm": 0.22727721872779744, "kl": 0.06622314453125, "learning_rate": 4.992441556944604e-07, "loss": 0.0001, "reward": 1.7357143610715866, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143331319094, "rewards/format_reward_func": 1.0, "step": 7292 }, { "completion_length": 259.4107265472412, "epoch": 1.2229766545119243, "grad_norm": 0.24415005335576428, "kl": 0.1076812744140625, "learning_rate": 4.992432233122749e-07, "loss": 0.0001, "reward": 1.7607143744826317, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7607143204659224, "rewards/format_reward_func": 1.0, "step": 7294 }, { "completion_length": 257.56697845458984, "epoch": 1.2233119577517917, "grad_norm": 0.31739442791663675, "kl": 0.0947113037109375, "learning_rate": 4.99242290356239e-07, "loss": 0.0001, "reward": 1.7678571939468384, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 1.0, "step": 7296 }, { "completion_length": 270.28126430511475, "epoch": 1.2236472609916593, "grad_norm": 0.1385651283554177, "kl": 0.0988616943359375, "learning_rate": 4.99241356826355e-07, "loss": 0.0001, "reward": 1.7785715162754059, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714454948902, "rewards/format_reward_func": 1.0, "step": 7298 }, { "completion_length": 266.92857933044434, "epoch": 1.223982564231527, "grad_norm": 0.2780453894697533, "kl": 0.0828094482421875, "learning_rate": 4.992404227226249e-07, "loss": 0.0001, "reward": 1.7553571984171867, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.759821455925703, "rewards/format_reward_func": 0.9955357164144516, "step": 7300 }, { "completion_length": 266.64733123779297, "epoch": 1.2243178674713944, "grad_norm": 0.1710191122109286, "kl": 0.0935516357421875, "learning_rate": 4.99239488045051e-07, "loss": 0.0001, "reward": 1.7446429505944252, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7491071745753288, "rewards/format_reward_func": 0.9955357164144516, "step": 7302 }, { "completion_length": 269.4509057998657, "epoch": 1.224653170711262, "grad_norm": 0.24567936420836164, "kl": 0.071990966796875, "learning_rate": 4.992385527936354e-07, "loss": 0.0001, "reward": 1.7714286521077156, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7803571820259094, "rewards/format_reward_func": 0.9910714328289032, "step": 7304 }, { "completion_length": 267.95090770721436, "epoch": 1.2249884739511296, "grad_norm": 0.2317099030341503, "kl": 0.098785400390625, "learning_rate": 4.992376169683803e-07, "loss": 0.0001, "reward": 1.8071429207921028, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428872644901, "rewards/format_reward_func": 1.0, "step": 7306 }, { "completion_length": 262.07590675354004, "epoch": 1.225323777190997, "grad_norm": 0.21304988210294232, "kl": 0.065704345703125, "learning_rate": 4.992366805692877e-07, "loss": 0.0001, "reward": 1.7696429193019867, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7741071693599224, "rewards/format_reward_func": 0.9955357164144516, "step": 7308 }, { "completion_length": 267.8169775009155, "epoch": 1.2256590804308647, "grad_norm": 0.226278049448281, "kl": 0.0792388916015625, "learning_rate": 4.992357435963599e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7696428820490837, "rewards/format_reward_func": 0.9910714328289032, "step": 7310 }, { "completion_length": 261.1964454650879, "epoch": 1.225994383670732, "grad_norm": 0.24893292964449734, "kl": 0.0957183837890625, "learning_rate": 4.992348060495989e-07, "loss": 0.0001, "reward": 1.783928632736206, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7883928716182709, "rewards/format_reward_func": 0.9955357164144516, "step": 7312 }, { "completion_length": 266.98662185668945, "epoch": 1.2263296869105997, "grad_norm": 0.24435019680281253, "kl": 0.075714111328125, "learning_rate": 4.99233867929007e-07, "loss": 0.0001, "reward": 1.7250000834465027, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7250000312924385, "rewards/format_reward_func": 1.0, "step": 7314 }, { "completion_length": 245.55358219146729, "epoch": 1.2266649901504674, "grad_norm": 0.174986762818187, "kl": 0.0735015869140625, "learning_rate": 4.992329292345865e-07, "loss": 0.0001, "reward": 1.7428572252392769, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7428571656346321, "rewards/format_reward_func": 1.0, "step": 7316 }, { "completion_length": 262.0759029388428, "epoch": 1.227000293390335, "grad_norm": 0.05003726187541759, "kl": 0.06793212890625, "learning_rate": 4.992319899663391e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.05555839091539383, "rewards/equation_reward_func": 0.7839285992085934, "rewards/format_reward_func": 0.9910714328289032, "step": 7318 }, { "completion_length": 249.6071538925171, "epoch": 1.2273355966302024, "grad_norm": 0.0020244057203651937, "kl": 0.06622314453125, "learning_rate": 4.992310501242676e-07, "loss": 0.0001, "reward": 1.8142857551574707, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.8142857402563095, "rewards/format_reward_func": 1.0, "step": 7320 }, { "completion_length": 260.71430110931396, "epoch": 1.22767089987007, "grad_norm": 0.20053090389361175, "kl": 0.091400146484375, "learning_rate": 4.992301097083735e-07, "loss": 0.0001, "reward": 1.732142947614193, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7321428842842579, "rewards/format_reward_func": 1.0, "step": 7322 }, { "completion_length": 261.8348331451416, "epoch": 1.2280062031099375, "grad_norm": 0.2737092773881806, "kl": 0.0717315673828125, "learning_rate": 4.992291687186595e-07, "loss": 0.0001, "reward": 1.7285715192556381, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7285714671015739, "rewards/format_reward_func": 1.0, "step": 7324 }, { "completion_length": 256.0134057998657, "epoch": 1.228341506349805, "grad_norm": 0.33311744361146584, "kl": 0.130706787109375, "learning_rate": 4.992282271551274e-07, "loss": 0.0001, "reward": 1.7428572103381157, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 7326 }, { "completion_length": 249.45536994934082, "epoch": 1.2286768095896727, "grad_norm": 0.3027524607523363, "kl": 0.1055755615234375, "learning_rate": 4.992272850177795e-07, "loss": 0.0001, "reward": 1.8089286237955093, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8133928813040257, "rewards/format_reward_func": 0.9955357164144516, "step": 7328 }, { "completion_length": 257.183048248291, "epoch": 1.2290121128295401, "grad_norm": 0.2134971880773326, "kl": 0.068511962890625, "learning_rate": 4.992263423066182e-07, "loss": 0.0001, "reward": 1.7410715073347092, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7455357573926449, "rewards/format_reward_func": 0.9955357164144516, "step": 7330 }, { "completion_length": 265.4241189956665, "epoch": 1.2293474160694078, "grad_norm": 0.09134287624357415, "kl": 0.0912628173828125, "learning_rate": 4.992253990216453e-07, "loss": 0.0001, "reward": 1.7839286550879478, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7883928641676903, "rewards/format_reward_func": 0.9955357164144516, "step": 7332 }, { "completion_length": 255.87054634094238, "epoch": 1.2296827193092754, "grad_norm": 0.2729861269599726, "kl": 0.0726165771484375, "learning_rate": 4.992244551628631e-07, "loss": 0.0001, "reward": 1.7142857909202576, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7142857499420643, "rewards/format_reward_func": 1.0, "step": 7334 }, { "completion_length": 243.59375858306885, "epoch": 1.2300180225491428, "grad_norm": 0.12997354741489184, "kl": 0.0819549560546875, "learning_rate": 4.992235107302738e-07, "loss": 0.0001, "reward": 1.7071429044008255, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7071428969502449, "rewards/format_reward_func": 1.0, "step": 7336 }, { "completion_length": 253.1562623977661, "epoch": 1.2303533257890105, "grad_norm": 0.13434226070764227, "kl": 0.089324951171875, "learning_rate": 4.992225657238797e-07, "loss": 0.0001, "reward": 1.7000000923871994, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7000000346451998, "rewards/format_reward_func": 1.0, "step": 7338 }, { "completion_length": 242.4419765472412, "epoch": 1.230688629028878, "grad_norm": 0.1688100720296217, "kl": 0.079833984375, "learning_rate": 4.992216201436827e-07, "loss": 0.0001, "reward": 1.739285796880722, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857428640127, "rewards/format_reward_func": 1.0, "step": 7340 }, { "completion_length": 240.30358123779297, "epoch": 1.2310239322687455, "grad_norm": 0.22757167938869166, "kl": 0.0682830810546875, "learning_rate": 4.992206739896851e-07, "loss": 0.0001, "reward": 1.8214286416769028, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8214285932481289, "rewards/format_reward_func": 1.0, "step": 7342 }, { "completion_length": 256.2857265472412, "epoch": 1.2313592355086131, "grad_norm": 0.24103032576905367, "kl": 0.0616302490234375, "learning_rate": 4.992197272618893e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 7344 }, { "completion_length": 249.7500114440918, "epoch": 1.2316945387484806, "grad_norm": 0.22890032159343376, "kl": 0.084991455078125, "learning_rate": 4.992187799602972e-07, "loss": 0.0001, "reward": 1.6910715252161026, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.6955357491970062, "rewards/format_reward_func": 0.9955357164144516, "step": 7346 }, { "completion_length": 239.84376335144043, "epoch": 1.2320298419883482, "grad_norm": 0.11993653416671786, "kl": 0.0696258544921875, "learning_rate": 4.992178320849109e-07, "loss": 0.0001, "reward": 1.714285783469677, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7142857611179352, "rewards/format_reward_func": 1.0, "step": 7348 }, { "completion_length": 229.56697463989258, "epoch": 1.2323651452282158, "grad_norm": 0.283880895706985, "kl": 0.1734161376953125, "learning_rate": 4.992168836357329e-07, "loss": 0.0002, "reward": 1.8250000551342964, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8250000290572643, "rewards/format_reward_func": 1.0, "step": 7350 }, { "completion_length": 243.64733219146729, "epoch": 1.2327004484680832, "grad_norm": 0.2535059072968625, "kl": 0.0676727294921875, "learning_rate": 4.992159346127652e-07, "loss": 0.0001, "reward": 1.7464286461472511, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 7352 }, { "completion_length": 237.9464406967163, "epoch": 1.2330357517079509, "grad_norm": 0.3366521986910057, "kl": 0.0709991455078125, "learning_rate": 4.9921498501601e-07, "loss": 0.0001, "reward": 1.7035715356469154, "reward_std": 0.0858629634603858, "rewards/equation_reward_func": 0.7035714499652386, "rewards/format_reward_func": 1.0, "step": 7354 }, { "completion_length": 230.95983219146729, "epoch": 1.2333710549478185, "grad_norm": 0.3199696882096981, "kl": 0.098846435546875, "learning_rate": 4.992140348454695e-07, "loss": 0.0001, "reward": 1.728571504354477, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7285714503377676, "rewards/format_reward_func": 1.0, "step": 7356 }, { "completion_length": 232.62054824829102, "epoch": 1.233706358187686, "grad_norm": 0.20618256158180728, "kl": 0.0677337646484375, "learning_rate": 4.992130841011461e-07, "loss": 0.0001, "reward": 1.725000075995922, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7250000257045031, "rewards/format_reward_func": 1.0, "step": 7358 }, { "completion_length": 226.79018878936768, "epoch": 1.2340416614275536, "grad_norm": 0.24268664351870567, "kl": 0.0743560791015625, "learning_rate": 4.992121327830415e-07, "loss": 0.0001, "reward": 1.8071429133415222, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428760886192, "rewards/format_reward_func": 1.0, "step": 7360 }, { "completion_length": 229.5535831451416, "epoch": 1.2343769646674212, "grad_norm": 0.21594958900650493, "kl": 0.066436767578125, "learning_rate": 4.992111808911583e-07, "loss": 0.0001, "reward": 1.775000050663948, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000543892384, "rewards/format_reward_func": 1.0, "step": 7362 }, { "completion_length": 237.1830472946167, "epoch": 1.2347122679072886, "grad_norm": 0.2437006081039398, "kl": 0.0667266845703125, "learning_rate": 4.992102284254985e-07, "loss": 0.0001, "reward": 1.717857226729393, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7178571671247482, "rewards/format_reward_func": 1.0, "step": 7364 }, { "completion_length": 232.06697463989258, "epoch": 1.2350475711471562, "grad_norm": 0.21956220909973093, "kl": 0.06585693359375, "learning_rate": 4.992092753860644e-07, "loss": 0.0001, "reward": 1.810714341700077, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8107143044471741, "rewards/format_reward_func": 1.0, "step": 7366 }, { "completion_length": 231.67858123779297, "epoch": 1.2353828743870237, "grad_norm": 0.22599447075235282, "kl": 0.0912017822265625, "learning_rate": 4.992083217728581e-07, "loss": 0.0001, "reward": 1.775000087916851, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 7368 }, { "completion_length": 221.7634038925171, "epoch": 1.2357181776268913, "grad_norm": 0.2089522879086932, "kl": 0.0640411376953125, "learning_rate": 4.992073675858818e-07, "loss": 0.0001, "reward": 1.7678572311997414, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7678571604192257, "rewards/format_reward_func": 1.0, "step": 7370 }, { "completion_length": 225.31251049041748, "epoch": 1.236053480866759, "grad_norm": 0.27154299086942546, "kl": 0.0687255859375, "learning_rate": 4.992064128251379e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7392857633531094, "rewards/format_reward_func": 1.0, "step": 7372 }, { "completion_length": 231.61608219146729, "epoch": 1.2363887841066263, "grad_norm": 0.2652996298189875, "kl": 0.061309814453125, "learning_rate": 4.992054574906284e-07, "loss": 0.0001, "reward": 1.7482143491506577, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526786029338837, "rewards/format_reward_func": 0.9955357164144516, "step": 7374 }, { "completion_length": 231.45983409881592, "epoch": 1.236724087346494, "grad_norm": 0.1997123223499637, "kl": 0.067901611328125, "learning_rate": 4.992045015823555e-07, "loss": 0.0001, "reward": 1.735714353621006, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7357143312692642, "rewards/format_reward_func": 1.0, "step": 7376 }, { "completion_length": 226.44643878936768, "epoch": 1.2370593905863616, "grad_norm": 0.22936699167594957, "kl": 0.0925445556640625, "learning_rate": 4.992035451003214e-07, "loss": 0.0001, "reward": 1.7821429073810577, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 7378 }, { "completion_length": 226.43304538726807, "epoch": 1.237394693826229, "grad_norm": 0.3561236632263674, "kl": 0.072845458984375, "learning_rate": 4.992025880445284e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.782142873853445, "rewards/format_reward_func": 1.0, "step": 7380 }, { "completion_length": 225.39733123779297, "epoch": 1.2377299970660967, "grad_norm": 0.13552781415865434, "kl": 0.06146240234375, "learning_rate": 4.992016304149786e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 7382 }, { "completion_length": 225.352689743042, "epoch": 1.2380653003059643, "grad_norm": 0.16034323470596654, "kl": 0.0637054443359375, "learning_rate": 4.992006722116743e-07, "loss": 0.0001, "reward": 1.7714286223053932, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.771428594365716, "rewards/format_reward_func": 1.0, "step": 7384 }, { "completion_length": 234.2009038925171, "epoch": 1.2384006035458317, "grad_norm": 0.11643361950936694, "kl": 0.06671142578125, "learning_rate": 4.991997134346176e-07, "loss": 0.0001, "reward": 1.725000061094761, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7250000424683094, "rewards/format_reward_func": 1.0, "step": 7386 }, { "completion_length": 237.43304538726807, "epoch": 1.2387359067856993, "grad_norm": 0.34066513169603546, "kl": 0.0608978271484375, "learning_rate": 4.991987540838108e-07, "loss": 0.0001, "reward": 1.7464286610484123, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 7388 }, { "completion_length": 227.40626049041748, "epoch": 1.2390712100255667, "grad_norm": 0.14493342284575672, "kl": 0.064849853515625, "learning_rate": 4.99197794159256e-07, "loss": 0.0001, "reward": 1.8142857551574707, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857439815998, "rewards/format_reward_func": 1.0, "step": 7390 }, { "completion_length": 232.2991189956665, "epoch": 1.2394065132654344, "grad_norm": 0.27793165057230773, "kl": 0.067626953125, "learning_rate": 4.991968336609556e-07, "loss": 0.0001, "reward": 1.7571429386734962, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 7392 }, { "completion_length": 225.19643878936768, "epoch": 1.239741816505302, "grad_norm": 0.2876380698092266, "kl": 0.07135009765625, "learning_rate": 4.991958725889116e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 7394 }, { "completion_length": 229.30358123779297, "epoch": 1.2400771197451697, "grad_norm": 0.21288203957645194, "kl": 0.064697265625, "learning_rate": 4.991949109431264e-07, "loss": 0.0001, "reward": 1.7357143610715866, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7357143089175224, "rewards/format_reward_func": 1.0, "step": 7396 }, { "completion_length": 221.82143783569336, "epoch": 1.240412422985037, "grad_norm": 0.10024101777540191, "kl": 0.0673828125, "learning_rate": 4.99193948723602e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7571428883820772, "rewards/format_reward_func": 1.0, "step": 7398 }, { "completion_length": 226.58036708831787, "epoch": 1.2407477262249047, "grad_norm": 0.22284009957725762, "kl": 0.0872955322265625, "learning_rate": 4.991929859303408e-07, "loss": 0.0001, "reward": 1.710714377462864, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7107143141329288, "rewards/format_reward_func": 1.0, "step": 7400 }, { "completion_length": 223.60268783569336, "epoch": 1.2410830294647721, "grad_norm": 0.2600750417552333, "kl": 0.06585693359375, "learning_rate": 4.99192022563345e-07, "loss": 0.0001, "reward": 1.79642865806818, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 7402 }, { "completion_length": 227.6919755935669, "epoch": 1.2414183327046397, "grad_norm": 0.26431094508941017, "kl": 0.0618896484375, "learning_rate": 4.991910586226166e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7392857447266579, "rewards/format_reward_func": 1.0, "step": 7404 }, { "completion_length": 221.63840198516846, "epoch": 1.2417536359445074, "grad_norm": 0.34389640598822085, "kl": 0.0604095458984375, "learning_rate": 4.991900941081583e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 7406 }, { "completion_length": 225.4419755935669, "epoch": 1.2420889391843748, "grad_norm": 0.38325653075172783, "kl": 0.096160888671875, "learning_rate": 4.991891290199717e-07, "loss": 0.0001, "reward": 1.8071429058909416, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8071428909897804, "rewards/format_reward_func": 1.0, "step": 7408 }, { "completion_length": 223.94197463989258, "epoch": 1.2424242424242424, "grad_norm": 0.16090653148401493, "kl": 0.065399169921875, "learning_rate": 4.991881633580594e-07, "loss": 0.0001, "reward": 1.7357143610715866, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 7410 }, { "completion_length": 228.5000123977661, "epoch": 1.24275954566411, "grad_norm": 0.28773662489496515, "kl": 0.074371337890625, "learning_rate": 4.991871971224237e-07, "loss": 0.0001, "reward": 1.7665179297327995, "reward_std": 0.0776554741896689, "rewards/equation_reward_func": 0.7678571846336126, "rewards/format_reward_func": 0.9986607171595097, "step": 7412 }, { "completion_length": 230.33929634094238, "epoch": 1.2430948489039775, "grad_norm": 0.16326829694279263, "kl": 0.0712127685546875, "learning_rate": 4.991862303130666e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 7414 }, { "completion_length": 225.54018783569336, "epoch": 1.2434301521438451, "grad_norm": 0.14390607940358557, "kl": 0.0926513671875, "learning_rate": 4.991852629299904e-07, "loss": 0.0001, "reward": 1.800000049173832, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000230967999, "rewards/format_reward_func": 1.0, "step": 7416 }, { "completion_length": 222.42858123779297, "epoch": 1.2437654553837127, "grad_norm": 0.22509298933939004, "kl": 0.0763702392578125, "learning_rate": 4.991842949731974e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 7418 }, { "completion_length": 221.15179538726807, "epoch": 1.2441007586235802, "grad_norm": 0.28116774174842096, "kl": 0.0690460205078125, "learning_rate": 4.991833264426896e-07, "loss": 0.0001, "reward": 1.7714286297559738, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 7420 }, { "completion_length": 218.59376049041748, "epoch": 1.2444360618634478, "grad_norm": 0.2843754473417763, "kl": 0.06597900390625, "learning_rate": 4.991823573384695e-07, "loss": 0.0001, "reward": 1.8178572058677673, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8178571686148643, "rewards/format_reward_func": 1.0, "step": 7422 }, { "completion_length": 222.20983219146729, "epoch": 1.2447713651033152, "grad_norm": 0.20690636300290868, "kl": 0.0728607177734375, "learning_rate": 4.991813876605392e-07, "loss": 0.0001, "reward": 1.7714286297559738, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 7424 }, { "completion_length": 218.0178680419922, "epoch": 1.2451066683431828, "grad_norm": 0.13231542882177258, "kl": 0.06787109375, "learning_rate": 4.99180417408901e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 7426 }, { "completion_length": 221.59375667572021, "epoch": 1.2454419715830505, "grad_norm": 0.33948420671007135, "kl": 0.0762939453125, "learning_rate": 4.99179446583557e-07, "loss": 0.0001, "reward": 1.807142935693264, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.8071428760886192, "rewards/format_reward_func": 1.0, "step": 7428 }, { "completion_length": 222.44197463989258, "epoch": 1.245777274822918, "grad_norm": 0.18968566266064457, "kl": 0.0654754638671875, "learning_rate": 4.991784751845096e-07, "loss": 0.0001, "reward": 1.796428620815277, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7964286208152771, "rewards/format_reward_func": 1.0, "step": 7430 }, { "completion_length": 221.1741180419922, "epoch": 1.2461125780627855, "grad_norm": 0.13362021561969276, "kl": 0.0767059326171875, "learning_rate": 4.99177503211761e-07, "loss": 0.0001, "reward": 1.82857146859169, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8285714574158192, "rewards/format_reward_func": 1.0, "step": 7432 }, { "completion_length": 220.6696538925171, "epoch": 1.2464478813026532, "grad_norm": 0.40983168744085197, "kl": 0.0979766845703125, "learning_rate": 4.991765306653133e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 7434 }, { "completion_length": 217.53572273254395, "epoch": 1.2467831845425206, "grad_norm": 0.24499121073236454, "kl": 0.073272705078125, "learning_rate": 4.991755575451689e-07, "loss": 0.0001, "reward": 1.7750000432133675, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 7436 }, { "completion_length": 215.0267972946167, "epoch": 1.2471184877823882, "grad_norm": 0.246875958035262, "kl": 0.086761474609375, "learning_rate": 4.991745838513299e-07, "loss": 0.0001, "reward": 1.8250000476837158, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.8250000178813934, "rewards/format_reward_func": 1.0, "step": 7438 }, { "completion_length": 212.84822463989258, "epoch": 1.2474537910222558, "grad_norm": 0.20762540243109817, "kl": 0.0724029541015625, "learning_rate": 4.991736095837987e-07, "loss": 0.0001, "reward": 1.7892857864499092, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 7440 }, { "completion_length": 221.09822463989258, "epoch": 1.2477890942621233, "grad_norm": 0.1869219659646194, "kl": 0.0802001953125, "learning_rate": 4.991726347425774e-07, "loss": 0.0001, "reward": 1.7392857521772385, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857559025288, "rewards/format_reward_func": 1.0, "step": 7442 }, { "completion_length": 232.01340293884277, "epoch": 1.248124397501991, "grad_norm": 0.27557571789481994, "kl": 0.072235107421875, "learning_rate": 4.991716593276684e-07, "loss": 0.0001, "reward": 1.7839286252856255, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7883928865194321, "rewards/format_reward_func": 0.9955357164144516, "step": 7444 }, { "completion_length": 220.5937614440918, "epoch": 1.2484597007418583, "grad_norm": 0.18519122568768503, "kl": 0.07379150390625, "learning_rate": 4.991706833390738e-07, "loss": 0.0001, "reward": 1.8392857536673546, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.839285746216774, "rewards/format_reward_func": 1.0, "step": 7446 }, { "completion_length": 232.02679443359375, "epoch": 1.248795003981726, "grad_norm": 0.24044948180768533, "kl": 0.0777587890625, "learning_rate": 4.99169706776796e-07, "loss": 0.0001, "reward": 1.7482143566012383, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7526786103844643, "rewards/format_reward_func": 0.9955357164144516, "step": 7448 }, { "completion_length": 228.03125858306885, "epoch": 1.2491303072215936, "grad_norm": 0.11412891327814846, "kl": 0.0772857666015625, "learning_rate": 4.991687296408371e-07, "loss": 0.0001, "reward": 1.7678571939468384, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.767857164144516, "rewards/format_reward_func": 1.0, "step": 7450 }, { "completion_length": 232.45090293884277, "epoch": 1.2494656104614612, "grad_norm": 0.22571443973421887, "kl": 0.069427490234375, "learning_rate": 4.991677519311993e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 7452 }, { "completion_length": 234.48215293884277, "epoch": 1.2498009137013286, "grad_norm": 0.21476238687167007, "kl": 0.0712890625, "learning_rate": 4.99166773647885e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 7454 }, { "completion_length": 230.04018783569336, "epoch": 1.2501362169411963, "grad_norm": 0.25465075300402357, "kl": 0.0692291259765625, "learning_rate": 4.991657947908964e-07, "loss": 0.0001, "reward": 1.7357143387198448, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7357143238186836, "rewards/format_reward_func": 1.0, "step": 7456 }, { "completion_length": 229.64286613464355, "epoch": 1.2504715201810637, "grad_norm": 0.6685778048694985, "kl": 0.0740203857421875, "learning_rate": 4.991648153602358e-07, "loss": 0.0001, "reward": 1.748214341700077, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7526786141097546, "rewards/format_reward_func": 0.9955357164144516, "step": 7458 }, { "completion_length": 238.12054920196533, "epoch": 1.2508068234209313, "grad_norm": 0.27757295188385095, "kl": 0.0650787353515625, "learning_rate": 4.991638353559054e-07, "loss": 0.0001, "reward": 1.785714365541935, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 7460 }, { "completion_length": 231.2500114440918, "epoch": 1.251142126660799, "grad_norm": 0.25223749992442296, "kl": 0.0772247314453125, "learning_rate": 4.991628547779074e-07, "loss": 0.0001, "reward": 1.775000087916851, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 7462 }, { "completion_length": 232.54465579986572, "epoch": 1.2514774299006663, "grad_norm": 0.20840119266884935, "kl": 0.0738983154296875, "learning_rate": 4.991618736262442e-07, "loss": 0.0001, "reward": 1.7285714894533157, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7285714596509933, "rewards/format_reward_func": 1.0, "step": 7464 }, { "completion_length": 235.50447845458984, "epoch": 1.251812733140534, "grad_norm": 0.24398954293589886, "kl": 0.07470703125, "learning_rate": 4.991608919009179e-07, "loss": 0.0001, "reward": 1.7678571939468384, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571753203869, "rewards/format_reward_func": 1.0, "step": 7466 }, { "completion_length": 242.83929443359375, "epoch": 1.2521480363804014, "grad_norm": 0.19045043384066485, "kl": 0.0765533447265625, "learning_rate": 4.991599096019309e-07, "loss": 0.0001, "reward": 1.6910714954137802, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.6955357603728771, "rewards/format_reward_func": 0.9955357164144516, "step": 7468 }, { "completion_length": 229.02233219146729, "epoch": 1.252483339620269, "grad_norm": 0.2739781670065189, "kl": 0.090850830078125, "learning_rate": 4.991589267292854e-07, "loss": 0.0001, "reward": 1.7357143610715866, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143145054579, "rewards/format_reward_func": 1.0, "step": 7470 }, { "completion_length": 234.63393688201904, "epoch": 1.2528186428601367, "grad_norm": 0.5638522629412447, "kl": 0.0961456298828125, "learning_rate": 4.991579432829837e-07, "loss": 0.0001, "reward": 1.8160714879631996, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8205357380211353, "rewards/format_reward_func": 0.9955357164144516, "step": 7472 }, { "completion_length": 227.69644165039062, "epoch": 1.2531539461000043, "grad_norm": 0.24793558166991733, "kl": 0.0794677734375, "learning_rate": 4.99156959263028e-07, "loss": 0.0001, "reward": 1.789285771548748, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.789285734295845, "rewards/format_reward_func": 1.0, "step": 7474 }, { "completion_length": 231.8973331451416, "epoch": 1.2534892493398717, "grad_norm": 0.10141429069354206, "kl": 0.0692138671875, "learning_rate": 4.991559746694206e-07, "loss": 0.0001, "reward": 1.796428620815277, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.796428594738245, "rewards/format_reward_func": 1.0, "step": 7476 }, { "completion_length": 231.5446538925171, "epoch": 1.2538245525797393, "grad_norm": 0.2815565446267239, "kl": 0.0777587890625, "learning_rate": 4.991549895021638e-07, "loss": 0.0001, "reward": 1.721428669989109, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7214285954833031, "rewards/format_reward_func": 1.0, "step": 7478 }, { "completion_length": 230.41518878936768, "epoch": 1.2541598558196068, "grad_norm": 0.19892908493645406, "kl": 0.0809478759765625, "learning_rate": 4.991540037612598e-07, "loss": 0.0001, "reward": 1.8107143491506577, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8107143118977547, "rewards/format_reward_func": 1.0, "step": 7480 }, { "completion_length": 237.64733028411865, "epoch": 1.2544951590594744, "grad_norm": 0.21884502968045427, "kl": 0.0768890380859375, "learning_rate": 4.991530174467109e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 7482 }, { "completion_length": 236.6919755935669, "epoch": 1.254830462299342, "grad_norm": 0.16027566375000496, "kl": 0.0768585205078125, "learning_rate": 4.991520305585194e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 7484 }, { "completion_length": 251.01340293884277, "epoch": 1.2551657655392094, "grad_norm": 0.2896072683566364, "kl": 0.082977294921875, "learning_rate": 4.991510430966875e-07, "loss": 0.0001, "reward": 1.7232143804430962, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.727678619325161, "rewards/format_reward_func": 0.9955357164144516, "step": 7486 }, { "completion_length": 236.20537090301514, "epoch": 1.255501068779077, "grad_norm": 0.24092087177097757, "kl": 0.0662078857421875, "learning_rate": 4.991500550612176e-07, "loss": 0.0001, "reward": 1.814285784959793, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.8142857551574707, "rewards/format_reward_func": 1.0, "step": 7488 }, { "completion_length": 239.6919765472412, "epoch": 1.2558363720189447, "grad_norm": 0.27121315571398896, "kl": 0.0826873779296875, "learning_rate": 4.991490664521119e-07, "loss": 0.0001, "reward": 1.7178572118282318, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7178571783006191, "rewards/format_reward_func": 1.0, "step": 7490 }, { "completion_length": 239.20983028411865, "epoch": 1.2561716752588121, "grad_norm": 0.16039280148087817, "kl": 0.0776519775390625, "learning_rate": 4.991480772693726e-07, "loss": 0.0001, "reward": 1.760714367032051, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 7492 }, { "completion_length": 231.55358219146729, "epoch": 1.2565069784986798, "grad_norm": 0.25401639410075905, "kl": 0.0705413818359375, "learning_rate": 4.99147087513002e-07, "loss": 0.0001, "reward": 1.7553571984171867, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7598214522004128, "rewards/format_reward_func": 0.9955357164144516, "step": 7494 }, { "completion_length": 236.26340293884277, "epoch": 1.2568422817385474, "grad_norm": 0.31600402330566846, "kl": 0.071044921875, "learning_rate": 4.991460971830026e-07, "loss": 0.0001, "reward": 1.7303571999073029, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7348214536905289, "rewards/format_reward_func": 0.9955357164144516, "step": 7496 }, { "completion_length": 236.80358123779297, "epoch": 1.2571775849784148, "grad_norm": 0.38116439183130013, "kl": 0.0737762451171875, "learning_rate": 4.991451062793763e-07, "loss": 0.0001, "reward": 1.7714286297559738, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714286148548126, "rewards/format_reward_func": 1.0, "step": 7498 }, { "completion_length": 237.85715198516846, "epoch": 1.2575128882182824, "grad_norm": 0.2152823007031557, "kl": 0.0875701904296875, "learning_rate": 4.991441148021258e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143055647612, "rewards/format_reward_func": 1.0, "step": 7500 }, { "completion_length": 239.27233219146729, "epoch": 1.2578481914581499, "grad_norm": 0.23772054865269418, "kl": 0.0738372802734375, "learning_rate": 4.991431227512531e-07, "loss": 0.0001, "reward": 1.7857143133878708, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143320143223, "rewards/format_reward_func": 1.0, "step": 7502 }, { "completion_length": 233.86607933044434, "epoch": 1.2581834946980175, "grad_norm": 0.16101616603817884, "kl": 0.11566162109375, "learning_rate": 4.991421301267604e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571753203869, "rewards/format_reward_func": 1.0, "step": 7504 }, { "completion_length": 234.1071548461914, "epoch": 1.2585187979378851, "grad_norm": 0.2618933580358117, "kl": 0.07000732421875, "learning_rate": 4.991411369286503e-07, "loss": 0.0001, "reward": 1.7642858028411865, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857544124126, "rewards/format_reward_func": 1.0, "step": 7506 }, { "completion_length": 238.22768783569336, "epoch": 1.2588541011777528, "grad_norm": 0.2759809096611727, "kl": 0.073516845703125, "learning_rate": 4.99140143156925e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 7508 }, { "completion_length": 229.99107933044434, "epoch": 1.2591894044176202, "grad_norm": 0.13394452526505557, "kl": 0.0980377197265625, "learning_rate": 4.991391488115866e-07, "loss": 0.0001, "reward": 1.7857143357396126, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 7510 }, { "completion_length": 234.29018688201904, "epoch": 1.2595247076574878, "grad_norm": 0.6323557898045636, "kl": 0.08587646484375, "learning_rate": 4.991381538926374e-07, "loss": 0.0001, "reward": 1.7589286118745804, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7633928898721933, "rewards/format_reward_func": 0.9955357164144516, "step": 7512 }, { "completion_length": 229.91965293884277, "epoch": 1.2598600108973552, "grad_norm": 0.40135163456317896, "kl": 0.07708740234375, "learning_rate": 4.991371584000799e-07, "loss": 0.0001, "reward": 1.7660714834928513, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7705357354134321, "rewards/format_reward_func": 0.9955357164144516, "step": 7514 }, { "completion_length": 227.17858028411865, "epoch": 1.2601953141372229, "grad_norm": 0.9782226270408743, "kl": 0.07330322265625, "learning_rate": 4.991361623339164e-07, "loss": 0.0001, "reward": 1.7535714879631996, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 7516 }, { "completion_length": 232.57143878936768, "epoch": 1.2605306173770905, "grad_norm": 0.2562746459472891, "kl": 0.0767974853515625, "learning_rate": 4.991351656941489e-07, "loss": 0.0001, "reward": 1.8000000640749931, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000193715096, "rewards/format_reward_func": 1.0, "step": 7518 }, { "completion_length": 241.55358219146729, "epoch": 1.260865920616958, "grad_norm": 0.2472584451345171, "kl": 0.0744781494140625, "learning_rate": 4.9913416848078e-07, "loss": 0.0001, "reward": 1.800000049173832, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 7520 }, { "completion_length": 234.4107255935669, "epoch": 1.2612012238568255, "grad_norm": 0.36177741112164086, "kl": 0.0704498291015625, "learning_rate": 4.991331706938118e-07, "loss": 0.0001, "reward": 1.7607143446803093, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 7522 }, { "completion_length": 242.14286708831787, "epoch": 1.261536527096693, "grad_norm": 0.2512713998995826, "kl": 0.073394775390625, "learning_rate": 4.991321723332467e-07, "loss": 0.0001, "reward": 1.73392865806818, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7383928820490837, "rewards/format_reward_func": 0.9955357164144516, "step": 7524 }, { "completion_length": 238.54911994934082, "epoch": 1.2618718303365606, "grad_norm": 0.23311703994054409, "kl": 0.072662353515625, "learning_rate": 4.99131173399087e-07, "loss": 0.0001, "reward": 1.7553572058677673, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7598214596509933, "rewards/format_reward_func": 0.9955357164144516, "step": 7526 }, { "completion_length": 240.94197463989258, "epoch": 1.2622071335764282, "grad_norm": 0.23393040111609653, "kl": 0.07373046875, "learning_rate": 4.99130173891335e-07, "loss": 0.0001, "reward": 1.7928571850061417, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571626543999, "rewards/format_reward_func": 1.0, "step": 7528 }, { "completion_length": 238.2544755935669, "epoch": 1.2625424368162959, "grad_norm": 0.2596660312043446, "kl": 0.07232666015625, "learning_rate": 4.991291738099929e-07, "loss": 0.0001, "reward": 1.733928643167019, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7383928932249546, "rewards/format_reward_func": 0.9955357164144516, "step": 7530 }, { "completion_length": 227.68304538726807, "epoch": 1.2628777400561633, "grad_norm": 0.1490256935867386, "kl": 0.0784912109375, "learning_rate": 4.991281731550631e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 1.0, "step": 7532 }, { "completion_length": 226.21876049041748, "epoch": 1.263213043296031, "grad_norm": 0.33889375367672325, "kl": 0.081512451171875, "learning_rate": 4.991271719265477e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000204890966, "rewards/format_reward_func": 1.0, "step": 7534 }, { "completion_length": 229.75893592834473, "epoch": 1.2635483465358983, "grad_norm": 0.19253892513658385, "kl": 0.07342529296875, "learning_rate": 4.991261701244494e-07, "loss": 0.0001, "reward": 1.764285795390606, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 7536 }, { "completion_length": 232.67411994934082, "epoch": 1.263883649775766, "grad_norm": 0.1851524664373545, "kl": 0.0771484375, "learning_rate": 4.991251677487702e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143335044384, "rewards/format_reward_func": 1.0, "step": 7538 }, { "completion_length": 228.49108123779297, "epoch": 1.2642189530156336, "grad_norm": 0.3447227639574531, "kl": 0.0768890380859375, "learning_rate": 4.991241647995126e-07, "loss": 0.0001, "reward": 1.7571429386734962, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428809314966, "rewards/format_reward_func": 1.0, "step": 7540 }, { "completion_length": 223.60715293884277, "epoch": 1.264554256255501, "grad_norm": 0.21475977305995408, "kl": 0.079498291015625, "learning_rate": 4.991231612766786e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 1.0, "step": 7542 }, { "completion_length": 228.71429538726807, "epoch": 1.2648895594953686, "grad_norm": 0.26691773585830647, "kl": 0.1363372802734375, "learning_rate": 4.99122157180271e-07, "loss": 0.0001, "reward": 1.792857214808464, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571626543999, "rewards/format_reward_func": 1.0, "step": 7544 }, { "completion_length": 220.34822368621826, "epoch": 1.265224862735236, "grad_norm": 0.20614781368504556, "kl": 0.094970703125, "learning_rate": 4.991211525102916e-07, "loss": 0.0001, "reward": 1.7821429371833801, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 7546 }, { "completion_length": 224.1696538925171, "epoch": 1.2655601659751037, "grad_norm": 0.23147580973292636, "kl": 0.0751190185546875, "learning_rate": 4.99120147266743e-07, "loss": 0.0001, "reward": 1.7125000730156898, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7169643249362707, "rewards/format_reward_func": 0.9955357164144516, "step": 7548 }, { "completion_length": 221.75447273254395, "epoch": 1.2658954692149713, "grad_norm": 0.6172895430411022, "kl": 0.0866851806640625, "learning_rate": 4.991191414496274e-07, "loss": 0.0001, "reward": 1.7714286297559738, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 7550 }, { "completion_length": 218.9553680419922, "epoch": 1.266230772454839, "grad_norm": 0.24688172383365634, "kl": 0.11077880859375, "learning_rate": 4.991181350589473e-07, "loss": 0.0001, "reward": 1.7892857864499092, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857529222965, "rewards/format_reward_func": 1.0, "step": 7552 }, { "completion_length": 233.47768878936768, "epoch": 1.2665660756947064, "grad_norm": 0.36008920665302907, "kl": 0.0836181640625, "learning_rate": 4.991171280947047e-07, "loss": 0.0001, "reward": 1.7160715013742447, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7205357477068901, "rewards/format_reward_func": 0.9955357164144516, "step": 7554 }, { "completion_length": 221.57590103149414, "epoch": 1.266901378934574, "grad_norm": 0.2618971345873311, "kl": 0.087188720703125, "learning_rate": 4.991161205569023e-07, "loss": 0.0001, "reward": 1.7375000715255737, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7419643122702837, "rewards/format_reward_func": 0.9955357164144516, "step": 7556 }, { "completion_length": 224.26340293884277, "epoch": 1.2672366821744414, "grad_norm": 0.331582401421873, "kl": 0.1146087646484375, "learning_rate": 4.991151124455422e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 7558 }, { "completion_length": 220.55358219146729, "epoch": 1.267571985414309, "grad_norm": 0.25120591195005054, "kl": 0.08917236328125, "learning_rate": 4.991141037606266e-07, "loss": 0.0001, "reward": 1.7375000938773155, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7419643141329288, "rewards/format_reward_func": 0.9955357164144516, "step": 7560 }, { "completion_length": 212.33483123779297, "epoch": 1.2679072886541767, "grad_norm": 0.40357051105146413, "kl": 0.08935546875, "learning_rate": 4.991130945021581e-07, "loss": 0.0001, "reward": 1.7464286535978317, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286126196384, "rewards/format_reward_func": 1.0, "step": 7562 }, { "completion_length": 219.60268878936768, "epoch": 1.2682425918940443, "grad_norm": 0.40444888759563763, "kl": 0.0786590576171875, "learning_rate": 4.991120846701388e-07, "loss": 0.0001, "reward": 1.7696429342031479, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7741071581840515, "rewards/format_reward_func": 0.9955357164144516, "step": 7564 }, { "completion_length": 226.77679538726807, "epoch": 1.2685778951339117, "grad_norm": 0.5012331277544384, "kl": 0.18408203125, "learning_rate": 4.991110742645712e-07, "loss": 0.0002, "reward": 1.75178574770689, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7562500275671482, "rewards/format_reward_func": 0.9955357164144516, "step": 7566 }, { "completion_length": 227.01786708831787, "epoch": 1.2689131983737794, "grad_norm": 0.41045920652982976, "kl": 0.0800323486328125, "learning_rate": 4.991100632854575e-07, "loss": 0.0001, "reward": 1.7125000804662704, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7169643137603998, "rewards/format_reward_func": 0.9955357164144516, "step": 7568 }, { "completion_length": 228.75000953674316, "epoch": 1.2692485016136468, "grad_norm": 0.41643280522481385, "kl": 0.0793914794921875, "learning_rate": 4.991090517328001e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 7570 }, { "completion_length": 214.71875953674316, "epoch": 1.2695838048535144, "grad_norm": 1.6521458727305902, "kl": 0.091827392578125, "learning_rate": 4.991080396066013e-07, "loss": 0.0001, "reward": 1.7375001013278961, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7419643104076385, "rewards/format_reward_func": 0.9955357164144516, "step": 7572 }, { "completion_length": 211.85715103149414, "epoch": 1.269919108093382, "grad_norm": 0.7937714569865104, "kl": 0.311981201171875, "learning_rate": 4.991070269068633e-07, "loss": 0.0003, "reward": 1.7678572088479996, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 1.0, "step": 7574 }, { "completion_length": 220.1384038925171, "epoch": 1.2702544113332495, "grad_norm": 2.7509181354526198, "kl": 0.102203369140625, "learning_rate": 4.991060136335887e-07, "loss": 0.0001, "reward": 1.7821428999304771, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 7576 }, { "completion_length": 223.99554443359375, "epoch": 1.270589714573117, "grad_norm": 0.6848668616946066, "kl": 0.173553466796875, "learning_rate": 4.991049997867797e-07, "loss": 0.0002, "reward": 1.7571429312229156, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7571428790688515, "rewards/format_reward_func": 1.0, "step": 7578 }, { "completion_length": 222.53572368621826, "epoch": 1.2709250178129845, "grad_norm": 0.6383890615748599, "kl": 0.078521728515625, "learning_rate": 4.991039853664386e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 7580 }, { "completion_length": 222.75447463989258, "epoch": 1.2712603210528521, "grad_norm": 0.6796629798388669, "kl": 0.0955047607421875, "learning_rate": 4.991029703725677e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 7582 }, { "completion_length": 224.05358219146729, "epoch": 1.2715956242927198, "grad_norm": 0.6087448935727924, "kl": 0.0820770263671875, "learning_rate": 4.991019548051696e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 7584 }, { "completion_length": 222.65179347991943, "epoch": 1.2719309275325874, "grad_norm": 0.2702490964335112, "kl": 0.08392333984375, "learning_rate": 4.991009386642463e-07, "loss": 0.0001, "reward": 1.74642863124609, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.746428593993187, "rewards/format_reward_func": 1.0, "step": 7586 }, { "completion_length": 220.91072463989258, "epoch": 1.2722662307724548, "grad_norm": 0.9077446820456878, "kl": 0.0844268798828125, "learning_rate": 4.990999219498002e-07, "loss": 0.0001, "reward": 1.7571429163217545, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 7588 }, { "completion_length": 222.08929347991943, "epoch": 1.2726015340123225, "grad_norm": 0.39540866250941975, "kl": 0.0833282470703125, "learning_rate": 4.990989046618338e-07, "loss": 0.0001, "reward": 1.803571455180645, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8035714663565159, "rewards/format_reward_func": 1.0, "step": 7590 }, { "completion_length": 220.80804538726807, "epoch": 1.2729368372521899, "grad_norm": 1.953295584845841, "kl": 0.0895843505859375, "learning_rate": 4.990978868003494e-07, "loss": 0.0001, "reward": 1.7303572073578835, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7348214611411095, "rewards/format_reward_func": 0.9955357164144516, "step": 7592 }, { "completion_length": 219.47768878936768, "epoch": 1.2732721404920575, "grad_norm": 0.25660438766797417, "kl": 0.0877685546875, "learning_rate": 4.990968683653492e-07, "loss": 0.0001, "reward": 1.7678572311997414, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7678571790456772, "rewards/format_reward_func": 1.0, "step": 7594 }, { "completion_length": 221.82590293884277, "epoch": 1.2736074437319251, "grad_norm": 0.4354199758652535, "kl": 0.088165283203125, "learning_rate": 4.990958493568358e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 7596 }, { "completion_length": 211.41518783569336, "epoch": 1.2739427469717926, "grad_norm": 0.20289326916956163, "kl": 0.0782318115234375, "learning_rate": 4.990948297748113e-07, "loss": 0.0001, "reward": 1.8071429133415222, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428798139095, "rewards/format_reward_func": 1.0, "step": 7598 }, { "completion_length": 218.65179538726807, "epoch": 1.2742780502116602, "grad_norm": 0.48082271814419353, "kl": 0.08978271484375, "learning_rate": 4.990938096192782e-07, "loss": 0.0001, "reward": 1.7165179327130318, "reward_std": 0.06755394907668233, "rewards/equation_reward_func": 0.7178571857511997, "rewards/format_reward_func": 0.9986607171595097, "step": 7600 }, { "completion_length": 213.29018688201904, "epoch": 1.2746133534515276, "grad_norm": 0.44731461791586646, "kl": 0.09454345703125, "learning_rate": 4.990927888902387e-07, "loss": 0.0001, "reward": 1.767410784959793, "reward_std": 0.0460882093757391, "rewards/equation_reward_func": 0.7705357484519482, "rewards/format_reward_func": 0.9968750029802322, "step": 7602 }, { "completion_length": 218.48661708831787, "epoch": 1.2749486566913952, "grad_norm": 0.4131093057482022, "kl": 0.0911865234375, "learning_rate": 4.990917675876953e-07, "loss": 0.0001, "reward": 1.6982143893837929, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7026785910129547, "rewards/format_reward_func": 0.9955357164144516, "step": 7604 }, { "completion_length": 209.18750762939453, "epoch": 1.2752839599312629, "grad_norm": 0.5033205739515128, "kl": 0.077178955078125, "learning_rate": 4.990907457116503e-07, "loss": 0.0001, "reward": 1.7285715341567993, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7285714522004128, "rewards/format_reward_func": 1.0, "step": 7606 }, { "completion_length": 212.22768783569336, "epoch": 1.2756192631711305, "grad_norm": 0.4563452985721969, "kl": 0.07989501953125, "learning_rate": 4.990897232621061e-07, "loss": 0.0001, "reward": 1.8178571984171867, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8178571537137032, "rewards/format_reward_func": 1.0, "step": 7608 }, { "completion_length": 213.19643878936768, "epoch": 1.275954566410998, "grad_norm": 0.5585952840788717, "kl": 0.07147216796875, "learning_rate": 4.990887002390649e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 7610 }, { "completion_length": 213.9285831451416, "epoch": 1.2762898696508655, "grad_norm": 0.32565894244767235, "kl": 0.0990753173828125, "learning_rate": 4.990876766425292e-07, "loss": 0.0001, "reward": 1.7732143625617027, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7776786014437675, "rewards/format_reward_func": 0.9955357164144516, "step": 7612 }, { "completion_length": 216.44197368621826, "epoch": 1.276625172890733, "grad_norm": 0.2877972666853063, "kl": 0.0857696533203125, "learning_rate": 4.990866524725013e-07, "loss": 0.0001, "reward": 1.7714286223053932, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714286148548126, "rewards/format_reward_func": 1.0, "step": 7614 }, { "completion_length": 208.59822273254395, "epoch": 1.2769604761306006, "grad_norm": 3.039888470742808, "kl": 0.1195831298828125, "learning_rate": 4.990856277289836e-07, "loss": 0.0001, "reward": 1.787500038743019, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7919643148779869, "rewards/format_reward_func": 0.9955357164144516, "step": 7616 }, { "completion_length": 198.33483028411865, "epoch": 1.2772957793704682, "grad_norm": 0.004197014084676649, "kl": 0.0860595703125, "learning_rate": 4.990846024119785e-07, "loss": 0.0001, "reward": 1.8000000789761543, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000230967999, "rewards/format_reward_func": 1.0, "step": 7618 }, { "completion_length": 197.69643878936768, "epoch": 1.2776310826103356, "grad_norm": 0.23318182594139808, "kl": 0.092559814453125, "learning_rate": 4.990835765214882e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 7620 }, { "completion_length": 202.15625953674316, "epoch": 1.2779663858502033, "grad_norm": 1.2937995948418413, "kl": 0.087005615234375, "learning_rate": 4.990825500575152e-07, "loss": 0.0001, "reward": 1.7785715088248253, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 1.0, "step": 7622 }, { "completion_length": 196.40626049041748, "epoch": 1.278301689090071, "grad_norm": 0.3271859286057177, "kl": 0.100067138671875, "learning_rate": 4.990815230200618e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 7624 }, { "completion_length": 192.56697368621826, "epoch": 1.2786369923299383, "grad_norm": 0.3072899399621595, "kl": 0.098114013671875, "learning_rate": 4.990804954091302e-07, "loss": 0.0001, "reward": 1.792857214808464, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 7626 }, { "completion_length": 194.65625858306885, "epoch": 1.278972295569806, "grad_norm": 0.315932345161278, "kl": 0.0919036865234375, "learning_rate": 4.990794672247232e-07, "loss": 0.0001, "reward": 1.7392857819795609, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7392857596278191, "rewards/format_reward_func": 1.0, "step": 7628 }, { "completion_length": 198.43750953674316, "epoch": 1.2793075988096736, "grad_norm": 0.0880141059924044, "kl": 0.0974578857421875, "learning_rate": 4.990784384668428e-07, "loss": 0.0001, "reward": 1.778571479022503, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 7630 }, { "completion_length": 188.29465293884277, "epoch": 1.279642902049541, "grad_norm": 0.25524708052962836, "kl": 0.106048583984375, "learning_rate": 4.990774091354915e-07, "loss": 0.0001, "reward": 1.7714286223053932, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714286055415869, "rewards/format_reward_func": 1.0, "step": 7632 }, { "completion_length": 187.6160774230957, "epoch": 1.2799782052894086, "grad_norm": 0.27881454543754314, "kl": 0.1068115234375, "learning_rate": 4.990763792306717e-07, "loss": 0.0001, "reward": 1.753571517765522, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714525729418, "rewards/format_reward_func": 1.0, "step": 7634 }, { "completion_length": 188.61161518096924, "epoch": 1.280313508529276, "grad_norm": 0.31344919963315515, "kl": 0.11016845703125, "learning_rate": 4.990753487523857e-07, "loss": 0.0001, "reward": 1.760714367032051, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 7636 }, { "completion_length": 193.87054538726807, "epoch": 1.2806488117691437, "grad_norm": 0.2240318405017893, "kl": 0.0923614501953125, "learning_rate": 4.990743177006358e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714454948902, "rewards/format_reward_func": 1.0, "step": 7638 }, { "completion_length": 194.4017915725708, "epoch": 1.2809841150090113, "grad_norm": 0.22470951217114793, "kl": 0.105865478515625, "learning_rate": 4.990732860754246e-07, "loss": 0.0001, "reward": 1.7785714715719223, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714697092772, "rewards/format_reward_func": 1.0, "step": 7640 }, { "completion_length": 188.56250953674316, "epoch": 1.281319418248879, "grad_norm": 0.37580742577286996, "kl": 0.088165283203125, "learning_rate": 4.990722538767543e-07, "loss": 0.0001, "reward": 1.7571429461240768, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 7642 }, { "completion_length": 188.35715103149414, "epoch": 1.2816547214887464, "grad_norm": 0.1599668296641991, "kl": 0.10662841796875, "learning_rate": 4.990712211046273e-07, "loss": 0.0001, "reward": 1.7285715118050575, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7285714633762836, "rewards/format_reward_func": 1.0, "step": 7644 }, { "completion_length": 192.56697368621826, "epoch": 1.281990024728614, "grad_norm": 0.31148905908911567, "kl": 0.083526611328125, "learning_rate": 4.990701877590461e-07, "loss": 0.0001, "reward": 1.7285714969038963, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7285714633762836, "rewards/format_reward_func": 1.0, "step": 7646 }, { "completion_length": 194.48215293884277, "epoch": 1.2823253279684814, "grad_norm": 0.20670813668367558, "kl": 0.090240478515625, "learning_rate": 4.990691538400128e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 7648 }, { "completion_length": 188.84822177886963, "epoch": 1.282660631208349, "grad_norm": 0.14393064848231346, "kl": 0.095794677734375, "learning_rate": 4.990681193475301e-07, "loss": 0.0001, "reward": 1.792857214808464, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 1.0, "step": 7650 }, { "completion_length": 190.66072177886963, "epoch": 1.2829959344482167, "grad_norm": 0.17232390997388092, "kl": 0.104400634765625, "learning_rate": 4.990670842816002e-07, "loss": 0.0001, "reward": 1.7486608028411865, "reward_std": 0.03219861118122935, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 0.9986607171595097, "step": 7652 }, { "completion_length": 192.68304443359375, "epoch": 1.283331237688084, "grad_norm": 0.2523200705211646, "kl": 0.0912628173828125, "learning_rate": 4.990660486422256e-07, "loss": 0.0001, "reward": 1.8214286118745804, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8214286006987095, "rewards/format_reward_func": 1.0, "step": 7654 }, { "completion_length": 192.41965007781982, "epoch": 1.2836665409279517, "grad_norm": 0.09130844397646869, "kl": 0.083648681640625, "learning_rate": 4.990650124294085e-07, "loss": 0.0001, "reward": 1.7321429178118706, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7321428917348385, "rewards/format_reward_func": 1.0, "step": 7656 }, { "completion_length": 198.56250953674316, "epoch": 1.2840018441678192, "grad_norm": 0.2058536185574342, "kl": 0.0894012451171875, "learning_rate": 4.990639756431514e-07, "loss": 0.0001, "reward": 1.8035714849829674, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8035714663565159, "rewards/format_reward_func": 1.0, "step": 7658 }, { "completion_length": 192.67411613464355, "epoch": 1.2843371474076868, "grad_norm": 0.4357492926407639, "kl": 0.0915069580078125, "learning_rate": 4.990629382834567e-07, "loss": 0.0001, "reward": 1.7357143685221672, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7357143182307482, "rewards/format_reward_func": 1.0, "step": 7660 }, { "completion_length": 199.83482837677002, "epoch": 1.2846724506475544, "grad_norm": 0.31513147134974884, "kl": 0.083404541015625, "learning_rate": 4.990619003503268e-07, "loss": 0.0001, "reward": 1.7250000908970833, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7250000312924385, "rewards/format_reward_func": 1.0, "step": 7662 }, { "completion_length": 194.86161613464355, "epoch": 1.285007753887422, "grad_norm": 0.19215721495269542, "kl": 0.096099853515625, "learning_rate": 4.990608618437641e-07, "loss": 0.0001, "reward": 1.7714286148548126, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 7664 }, { "completion_length": 198.66072273254395, "epoch": 1.2853430571272895, "grad_norm": 0.2876697309370327, "kl": 0.0840911865234375, "learning_rate": 4.990598227637708e-07, "loss": 0.0001, "reward": 1.6950893551111221, "reward_std": 0.02714784862473607, "rewards/equation_reward_func": 0.6964286249130964, "rewards/format_reward_func": 0.9986607171595097, "step": 7666 }, { "completion_length": 206.60268688201904, "epoch": 1.285678360367157, "grad_norm": 0.2944065370264894, "kl": 0.0862579345703125, "learning_rate": 4.990587831103495e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 7668 }, { "completion_length": 210.10715293884277, "epoch": 1.2860136636070245, "grad_norm": 0.23531668996266744, "kl": 0.0872650146484375, "learning_rate": 4.990577428835026e-07, "loss": 0.0001, "reward": 1.7821429371833801, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 7670 }, { "completion_length": 209.81697177886963, "epoch": 1.2863489668468922, "grad_norm": 0.31108048300835284, "kl": 0.0871734619140625, "learning_rate": 4.990567020832325e-07, "loss": 0.0001, "reward": 1.7035715132951736, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7035714574158192, "rewards/format_reward_func": 1.0, "step": 7672 }, { "completion_length": 200.08482933044434, "epoch": 1.2866842700867598, "grad_norm": 0.30616690330063095, "kl": 0.07989501953125, "learning_rate": 4.990556607095414e-07, "loss": 0.0001, "reward": 1.8142858073115349, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8142857365310192, "rewards/format_reward_func": 1.0, "step": 7674 }, { "completion_length": 200.36161518096924, "epoch": 1.2870195733266272, "grad_norm": 0.31553270795768634, "kl": 0.0896759033203125, "learning_rate": 4.990546187624319e-07, "loss": 0.0001, "reward": 1.7821429371833801, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428701281548, "rewards/format_reward_func": 1.0, "step": 7676 }, { "completion_length": 207.80357837677002, "epoch": 1.2873548765664948, "grad_norm": 0.4448366525953187, "kl": 0.09698486328125, "learning_rate": 4.990535762419062e-07, "loss": 0.0001, "reward": 1.7553572207689285, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7598214596509933, "rewards/format_reward_func": 0.9955357164144516, "step": 7678 }, { "completion_length": 207.01340198516846, "epoch": 1.2876901798063622, "grad_norm": 0.3822823166526979, "kl": 0.075531005859375, "learning_rate": 4.990525331479669e-07, "loss": 0.0001, "reward": 1.7714286148548126, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714286092668772, "rewards/format_reward_func": 1.0, "step": 7680 }, { "completion_length": 213.92858123779297, "epoch": 1.2880254830462299, "grad_norm": 0.1842616328582102, "kl": 0.0823974609375, "learning_rate": 4.990514894806164e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 7682 }, { "completion_length": 213.35268783569336, "epoch": 1.2883607862860975, "grad_norm": 0.3047119390670931, "kl": 0.0763092041015625, "learning_rate": 4.99050445239857e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 7684 }, { "completion_length": 201.67858123779297, "epoch": 1.2886960895259651, "grad_norm": 0.24882400891042786, "kl": 0.0864715576171875, "learning_rate": 4.990494004256911e-07, "loss": 0.0001, "reward": 1.7750000730156898, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 7686 }, { "completion_length": 210.66518783569336, "epoch": 1.2890313927658326, "grad_norm": 0.23940484799827028, "kl": 0.0796661376953125, "learning_rate": 4.990483550381211e-07, "loss": 0.0001, "reward": 1.764285795390606, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 7688 }, { "completion_length": 215.89733123779297, "epoch": 1.2893666960057002, "grad_norm": 0.30294980556103956, "kl": 0.0738983154296875, "learning_rate": 4.990473090771494e-07, "loss": 0.0001, "reward": 1.814285770058632, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8142857439815998, "rewards/format_reward_func": 1.0, "step": 7690 }, { "completion_length": 215.60715103149414, "epoch": 1.2897019992455676, "grad_norm": 0.07984567628802022, "kl": 0.0729522705078125, "learning_rate": 4.990462625427786e-07, "loss": 0.0001, "reward": 1.7392857670783997, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857521772385, "rewards/format_reward_func": 1.0, "step": 7692 }, { "completion_length": 209.93304538726807, "epoch": 1.2900373024854352, "grad_norm": 0.2513657048792789, "kl": 0.0680999755859375, "learning_rate": 4.990452154350109e-07, "loss": 0.0001, "reward": 1.7843750640749931, "reward_std": 0.05240166140720248, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 0.9986607171595097, "step": 7694 }, { "completion_length": 220.35268878936768, "epoch": 1.2903726057253029, "grad_norm": 0.23234477772818157, "kl": 0.0713348388671875, "learning_rate": 4.990441677538489e-07, "loss": 0.0001, "reward": 1.7250000685453415, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7250000275671482, "rewards/format_reward_func": 1.0, "step": 7696 }, { "completion_length": 209.14733028411865, "epoch": 1.2907079089651705, "grad_norm": 0.2776459759810098, "kl": 0.0708160400390625, "learning_rate": 4.990431194992946e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 7698 }, { "completion_length": 220.24554443359375, "epoch": 1.291043212205038, "grad_norm": 0.3385624563063057, "kl": 0.0813751220703125, "learning_rate": 4.99042070671351e-07, "loss": 0.0001, "reward": 1.692857250571251, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.6928571835160255, "rewards/format_reward_func": 1.0, "step": 7700 }, { "completion_length": 218.83483028411865, "epoch": 1.2913785154449056, "grad_norm": 0.22457947375299586, "kl": 0.0742645263671875, "learning_rate": 4.9904102127002e-07, "loss": 0.0001, "reward": 1.8357143551111221, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.835714302957058, "rewards/format_reward_func": 1.0, "step": 7702 }, { "completion_length": 218.10268783569336, "epoch": 1.291713818684773, "grad_norm": 0.4024637053056976, "kl": 0.079010009765625, "learning_rate": 4.990399712953044e-07, "loss": 0.0001, "reward": 1.7357143461704254, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7357143349945545, "rewards/format_reward_func": 1.0, "step": 7704 }, { "completion_length": 217.05804824829102, "epoch": 1.2920491219246406, "grad_norm": 0.15498067836171614, "kl": 0.0782928466796875, "learning_rate": 4.990389207472064e-07, "loss": 0.0001, "reward": 1.771428644657135, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714286111295223, "rewards/format_reward_func": 1.0, "step": 7706 }, { "completion_length": 222.46875953674316, "epoch": 1.2923844251645082, "grad_norm": 0.3098561962327767, "kl": 0.0785369873046875, "learning_rate": 4.990378696257284e-07, "loss": 0.0001, "reward": 1.7700893506407738, "reward_std": 0.06250318721868098, "rewards/equation_reward_func": 0.7714285925030708, "rewards/format_reward_func": 0.9986607171595097, "step": 7708 }, { "completion_length": 226.8259038925171, "epoch": 1.2927197284043757, "grad_norm": 0.23701639769112526, "kl": 0.0764312744140625, "learning_rate": 4.990368179308728e-07, "loss": 0.0001, "reward": 1.764285795390606, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 7710 }, { "completion_length": 217.90625858306885, "epoch": 1.2930550316442433, "grad_norm": 0.1698667963121498, "kl": 0.0675811767578125, "learning_rate": 4.990357656626423e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 1.0, "step": 7712 }, { "completion_length": 224.6205472946167, "epoch": 1.2933903348841107, "grad_norm": 0.239622530564761, "kl": 0.0841827392578125, "learning_rate": 4.990347128210391e-07, "loss": 0.0001, "reward": 1.810714341700077, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8107143193483353, "rewards/format_reward_func": 1.0, "step": 7714 }, { "completion_length": 233.29911518096924, "epoch": 1.2937256381239783, "grad_norm": 0.2877922547255133, "kl": 0.076263427734375, "learning_rate": 4.990336594060656e-07, "loss": 0.0001, "reward": 1.7160715088248253, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7205357383936644, "rewards/format_reward_func": 0.9955357164144516, "step": 7716 }, { "completion_length": 237.74108219146729, "epoch": 1.294060941363846, "grad_norm": 0.17369064662715267, "kl": 0.07440185546875, "learning_rate": 4.990326054177243e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7464286126196384, "rewards/format_reward_func": 1.0, "step": 7718 }, { "completion_length": 234.40625953674316, "epoch": 1.2943962446037136, "grad_norm": 0.1456112782303269, "kl": 0.0776824951171875, "learning_rate": 4.990315508560176e-07, "loss": 0.0001, "reward": 1.696428656578064, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.6964286025613546, "rewards/format_reward_func": 1.0, "step": 7720 }, { "completion_length": 227.63840293884277, "epoch": 1.294731547843581, "grad_norm": 0.2851940854186735, "kl": 0.07769775390625, "learning_rate": 4.99030495720948e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 7722 }, { "completion_length": 225.8571548461914, "epoch": 1.2950668510834487, "grad_norm": 0.16598813182764588, "kl": 0.081024169921875, "learning_rate": 4.990294400125177e-07, "loss": 0.0001, "reward": 1.7553571909666061, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7598214577883482, "rewards/format_reward_func": 0.9955357164144516, "step": 7724 }, { "completion_length": 238.36608219146729, "epoch": 1.295402154323316, "grad_norm": 0.29466927347255356, "kl": 0.079254150390625, "learning_rate": 4.990283837307294e-07, "loss": 0.0001, "reward": 1.7571429386734962, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 7726 }, { "completion_length": 237.75447463989258, "epoch": 1.2957374575631837, "grad_norm": 0.2723778547154022, "kl": 0.09173583984375, "learning_rate": 4.990273268755855e-07, "loss": 0.0001, "reward": 1.7214286550879478, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7214286159723997, "rewards/format_reward_func": 1.0, "step": 7728 }, { "completion_length": 229.79465293884277, "epoch": 1.2960727608030513, "grad_norm": 0.23247535891431542, "kl": 0.0806121826171875, "learning_rate": 4.990262694470882e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 7730 }, { "completion_length": 233.72768878936768, "epoch": 1.2964080640429188, "grad_norm": 0.23696849547676943, "kl": 0.0730438232421875, "learning_rate": 4.990252114452403e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7857143059372902, "rewards/format_reward_func": 1.0, "step": 7732 }, { "completion_length": 232.67411708831787, "epoch": 1.2967433672827864, "grad_norm": 0.26712344526497833, "kl": 0.0766448974609375, "learning_rate": 4.990241528700439e-07, "loss": 0.0001, "reward": 1.8000000640749931, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000193715096, "rewards/format_reward_func": 1.0, "step": 7734 }, { "completion_length": 239.3169755935669, "epoch": 1.2970786705226538, "grad_norm": 0.3596598026506631, "kl": 0.077606201171875, "learning_rate": 4.990230937215016e-07, "loss": 0.0001, "reward": 1.6946429312229156, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.6991071812808514, "rewards/format_reward_func": 0.9955357164144516, "step": 7736 }, { "completion_length": 228.24554634094238, "epoch": 1.2974139737625214, "grad_norm": 0.20678090185850923, "kl": 0.0703277587890625, "learning_rate": 4.990220339996158e-07, "loss": 0.0001, "reward": 1.7392858043313026, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7392857428640127, "rewards/format_reward_func": 1.0, "step": 7738 }, { "completion_length": 234.41965293884277, "epoch": 1.297749277002389, "grad_norm": 0.15463451116909518, "kl": 0.075775146484375, "learning_rate": 4.99020973704389e-07, "loss": 0.0001, "reward": 1.7410715073347092, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7455357480794191, "rewards/format_reward_func": 0.9955357164144516, "step": 7740 }, { "completion_length": 231.96429634094238, "epoch": 1.2980845802422567, "grad_norm": 0.2654924669106703, "kl": 0.0730743408203125, "learning_rate": 4.990199128358234e-07, "loss": 0.0001, "reward": 1.7535714879631996, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714693367481, "rewards/format_reward_func": 1.0, "step": 7742 }, { "completion_length": 229.4196538925171, "epoch": 1.2984198834821241, "grad_norm": 0.20582707966686384, "kl": 0.081146240234375, "learning_rate": 4.990188513939219e-07, "loss": 0.0001, "reward": 1.707142911851406, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7071428894996643, "rewards/format_reward_func": 1.0, "step": 7744 }, { "completion_length": 219.16518878936768, "epoch": 1.2987551867219918, "grad_norm": 0.13035483487149235, "kl": 0.0706787109375, "learning_rate": 4.990177893786865e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 7746 }, { "completion_length": 227.17858123779297, "epoch": 1.2990904899618592, "grad_norm": 0.2269454033186533, "kl": 0.0691375732421875, "learning_rate": 4.990167267901199e-07, "loss": 0.0001, "reward": 1.7857143357396126, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143059372902, "rewards/format_reward_func": 1.0, "step": 7748 }, { "completion_length": 229.66965198516846, "epoch": 1.2994257932017268, "grad_norm": 0.30071007329818733, "kl": 0.086517333984375, "learning_rate": 4.990156636282243e-07, "loss": 0.0001, "reward": 1.7714286223053932, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 7750 }, { "completion_length": 231.19643878936768, "epoch": 1.2997610964415944, "grad_norm": 0.2517132251564651, "kl": 0.1706085205078125, "learning_rate": 4.990145998930025e-07, "loss": 0.0002, "reward": 1.7607143595814705, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 7752 }, { "completion_length": 223.51786708831787, "epoch": 1.3000963996814618, "grad_norm": 0.11299277167844926, "kl": 0.0755767822265625, "learning_rate": 4.990135355844567e-07, "loss": 0.0001, "reward": 1.8285714760422707, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8285714499652386, "rewards/format_reward_func": 1.0, "step": 7754 }, { "completion_length": 225.43304443359375, "epoch": 1.3004317029213295, "grad_norm": 0.2308531874026641, "kl": 0.0781707763671875, "learning_rate": 4.990124707025894e-07, "loss": 0.0001, "reward": 1.7500000521540642, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 7756 }, { "completion_length": 229.23661613464355, "epoch": 1.3007670061611971, "grad_norm": 0.21235278465773202, "kl": 0.084625244140625, "learning_rate": 4.990114052474031e-07, "loss": 0.0001, "reward": 1.7892857864499092, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 7758 }, { "completion_length": 230.1116189956665, "epoch": 1.3011023094010645, "grad_norm": 0.21202217971475054, "kl": 0.0723724365234375, "learning_rate": 4.990103392189002e-07, "loss": 0.0001, "reward": 1.807142935693264, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8071428798139095, "rewards/format_reward_func": 1.0, "step": 7760 }, { "completion_length": 226.2678689956665, "epoch": 1.3014376126409322, "grad_norm": 0.3171202168947194, "kl": 0.081817626953125, "learning_rate": 4.990092726170832e-07, "loss": 0.0001, "reward": 1.728571504354477, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7285714671015739, "rewards/format_reward_func": 1.0, "step": 7762 }, { "completion_length": 231.81697463989258, "epoch": 1.3017729158807998, "grad_norm": 0.07547602806092736, "kl": 0.079559326171875, "learning_rate": 4.990082054419544e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.07071067858487368, "rewards/equation_reward_func": 0.7875000312924385, "rewards/format_reward_func": 0.9910714328289032, "step": 7764 }, { "completion_length": 221.40179538726807, "epoch": 1.3021082191206672, "grad_norm": 0.19411017636005276, "kl": 0.075775146484375, "learning_rate": 4.990071376935165e-07, "loss": 0.0001, "reward": 1.7821429297327995, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 7766 }, { "completion_length": 222.508939743042, "epoch": 1.3024435223605348, "grad_norm": 0.2174186617837793, "kl": 0.0889739990234375, "learning_rate": 4.99006069371772e-07, "loss": 0.0001, "reward": 1.708928644657135, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7133928798139095, "rewards/format_reward_func": 0.9955357164144516, "step": 7768 }, { "completion_length": 218.52233123779297, "epoch": 1.3027788256004023, "grad_norm": 0.30772843925684945, "kl": 0.07672119140625, "learning_rate": 4.990050004767229e-07, "loss": 0.0001, "reward": 1.792857214808464, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 1.0, "step": 7770 }, { "completion_length": 224.69197463989258, "epoch": 1.30311412884027, "grad_norm": 0.3108382816493444, "kl": 0.084442138671875, "learning_rate": 4.990039310083722e-07, "loss": 0.0001, "reward": 1.728571504354477, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7285714615136385, "rewards/format_reward_func": 1.0, "step": 7772 }, { "completion_length": 228.41965103149414, "epoch": 1.3034494320801375, "grad_norm": 0.14404724983058562, "kl": 0.0753021240234375, "learning_rate": 4.99002860966722e-07, "loss": 0.0001, "reward": 1.8178571984171867, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.817857164889574, "rewards/format_reward_func": 1.0, "step": 7774 }, { "completion_length": 216.98661518096924, "epoch": 1.3037847353200052, "grad_norm": 0.2621768758533317, "kl": 0.0835113525390625, "learning_rate": 4.990017903517748e-07, "loss": 0.0001, "reward": 1.746428668498993, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 7776 }, { "completion_length": 212.20090103149414, "epoch": 1.3041200385598726, "grad_norm": 0.3576231235033641, "kl": 0.077362060546875, "learning_rate": 4.990007191635333e-07, "loss": 0.0001, "reward": 1.7821428999304771, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 7778 }, { "completion_length": 217.42858028411865, "epoch": 1.3044553417997402, "grad_norm": 0.33123491228549057, "kl": 0.0748291015625, "learning_rate": 4.989996474019998e-07, "loss": 0.0001, "reward": 1.7464286535978317, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 7780 }, { "completion_length": 223.04018878936768, "epoch": 1.3047906450396076, "grad_norm": 0.2896579109829537, "kl": 0.0826416015625, "learning_rate": 4.989985750671768e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7750000469386578, "rewards/format_reward_func": 1.0, "step": 7782 }, { "completion_length": 224.85268878936768, "epoch": 1.3051259482794753, "grad_norm": 0.38177411239092823, "kl": 0.0991973876953125, "learning_rate": 4.989975021590668e-07, "loss": 0.0001, "reward": 1.739285796880722, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7482143193483353, "rewards/format_reward_func": 0.9910714328289032, "step": 7784 }, { "completion_length": 217.80358123779297, "epoch": 1.305461251519343, "grad_norm": 0.391786171910137, "kl": 0.09320068359375, "learning_rate": 4.989964286776721e-07, "loss": 0.0001, "reward": 1.7486607655882835, "reward_std": 0.06250318652018905, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 0.9986607171595097, "step": 7786 }, { "completion_length": 219.24108219146729, "epoch": 1.3057965547592103, "grad_norm": 0.29750057352726367, "kl": 0.0730133056640625, "learning_rate": 4.989953546229954e-07, "loss": 0.0001, "reward": 1.7660714909434319, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7705357372760773, "rewards/format_reward_func": 0.9955357164144516, "step": 7788 }, { "completion_length": 228.39733123779297, "epoch": 1.306131857999078, "grad_norm": 0.339059597344952, "kl": 0.06829833984375, "learning_rate": 4.98994279995039e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 7790 }, { "completion_length": 224.32143783569336, "epoch": 1.3064671612389454, "grad_norm": 0.17052533501531209, "kl": 0.0645294189453125, "learning_rate": 4.989932047938055e-07, "loss": 0.0001, "reward": 1.778571479022503, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 7792 }, { "completion_length": 232.91965579986572, "epoch": 1.306802464478813, "grad_norm": 0.3939521238618866, "kl": 0.0809478759765625, "learning_rate": 4.989921290192974e-07, "loss": 0.0001, "reward": 1.7982143312692642, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8026786036789417, "rewards/format_reward_func": 0.9955357164144516, "step": 7794 }, { "completion_length": 225.34375953674316, "epoch": 1.3071377677186806, "grad_norm": 0.14989733463861954, "kl": 0.069671630859375, "learning_rate": 4.98991052671517e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 7796 }, { "completion_length": 224.74107933044434, "epoch": 1.3074730709585483, "grad_norm": 0.22708778689946796, "kl": 0.081268310546875, "learning_rate": 4.989899757504669e-07, "loss": 0.0001, "reward": 1.739285796880722, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7392857372760773, "rewards/format_reward_func": 1.0, "step": 7798 }, { "completion_length": 237.43751049041748, "epoch": 1.3078083741984157, "grad_norm": 0.2108386163132425, "kl": 0.1372833251953125, "learning_rate": 4.989888982561495e-07, "loss": 0.0001, "reward": 1.7464286163449287, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7553571779280901, "rewards/format_reward_func": 0.9910714328289032, "step": 7800 }, { "completion_length": 234.86161708831787, "epoch": 1.3081436774382833, "grad_norm": 0.1809852379315868, "kl": 0.1102294921875, "learning_rate": 4.989878201885674e-07, "loss": 0.0001, "reward": 1.7285714969038963, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7375000342726707, "rewards/format_reward_func": 0.9910714328289032, "step": 7802 }, { "completion_length": 234.46875858306885, "epoch": 1.3084789806781507, "grad_norm": 0.2641798629492874, "kl": 0.0806427001953125, "learning_rate": 4.98986741547723e-07, "loss": 0.0001, "reward": 1.7504464983940125, "reward_std": 0.09028238197788596, "rewards/equation_reward_func": 0.7562500443309546, "rewards/format_reward_func": 0.9941964335739613, "step": 7804 }, { "completion_length": 229.58483219146729, "epoch": 1.3088142839180184, "grad_norm": 0.1348543022563862, "kl": 0.0857391357421875, "learning_rate": 4.989856623336188e-07, "loss": 0.0001, "reward": 1.7464286461472511, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 7806 }, { "completion_length": 236.95090103149414, "epoch": 1.309149587157886, "grad_norm": 0.2442230067538243, "kl": 0.08929443359375, "learning_rate": 4.989845825462574e-07, "loss": 0.0001, "reward": 1.7410714998841286, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7455357387661934, "rewards/format_reward_func": 0.9955357164144516, "step": 7808 }, { "completion_length": 239.0134048461914, "epoch": 1.3094848903977534, "grad_norm": 0.11169604253691609, "kl": 0.089385986328125, "learning_rate": 4.98983502185641e-07, "loss": 0.0001, "reward": 1.7964286357164383, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7964285872876644, "rewards/format_reward_func": 1.0, "step": 7810 }, { "completion_length": 237.43304634094238, "epoch": 1.309820193637621, "grad_norm": 0.33155649606336196, "kl": 0.0821075439453125, "learning_rate": 4.989824212517724e-07, "loss": 0.0001, "reward": 1.7446429133415222, "reward_std": 0.0883883461356163, "rewards/equation_reward_func": 0.7580357417464256, "rewards/format_reward_func": 0.9866071492433548, "step": 7812 }, { "completion_length": 242.7500123977661, "epoch": 1.3101554968774884, "grad_norm": 0.2148534984579552, "kl": 0.0830078125, "learning_rate": 4.989813397446538e-07, "loss": 0.0001, "reward": 1.782589353621006, "reward_std": 0.039774756878614426, "rewards/equation_reward_func": 0.797321442514658, "rewards/format_reward_func": 0.9852678626775742, "step": 7814 }, { "completion_length": 236.7142972946167, "epoch": 1.310490800117356, "grad_norm": 0.2801228926143477, "kl": 0.090118408203125, "learning_rate": 4.98980257664288e-07, "loss": 0.0001, "reward": 1.79464291036129, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7991071753203869, "rewards/format_reward_func": 0.9955357164144516, "step": 7816 }, { "completion_length": 237.13394165039062, "epoch": 1.3108261033572237, "grad_norm": 0.21059345642327687, "kl": 0.1118316650390625, "learning_rate": 4.989791750106772e-07, "loss": 0.0001, "reward": 1.7325893640518188, "reward_std": 0.05492704175412655, "rewards/equation_reward_func": 0.7383928783237934, "rewards/format_reward_func": 0.994196429848671, "step": 7818 }, { "completion_length": 240.52679634094238, "epoch": 1.3111614065970914, "grad_norm": 0.22267002073074174, "kl": 0.0712890625, "learning_rate": 4.989780917838241e-07, "loss": 0.0001, "reward": 1.7732143327593803, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7776786126196384, "rewards/format_reward_func": 0.9955357164144516, "step": 7820 }, { "completion_length": 231.9375114440918, "epoch": 1.3114967098369588, "grad_norm": 0.29483440744225925, "kl": 0.0724029541015625, "learning_rate": 4.989770079837311e-07, "loss": 0.0001, "reward": 1.7767857536673546, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7812500223517418, "rewards/format_reward_func": 0.9955357164144516, "step": 7822 }, { "completion_length": 228.0134048461914, "epoch": 1.3118320130768264, "grad_norm": 0.18827096899708526, "kl": 0.111663818359375, "learning_rate": 4.989759236104007e-07, "loss": 0.0001, "reward": 1.7785714715719223, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 7824 }, { "completion_length": 228.86608219146729, "epoch": 1.3121673163166938, "grad_norm": 0.2393960966488759, "kl": 0.08990478515625, "learning_rate": 4.989748386638354e-07, "loss": 0.0001, "reward": 1.7339286357164383, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7383928969502449, "rewards/format_reward_func": 0.9955357164144516, "step": 7826 }, { "completion_length": 234.2901906967163, "epoch": 1.3125026195565614, "grad_norm": 0.3691026151200785, "kl": 0.083038330078125, "learning_rate": 4.989737531440378e-07, "loss": 0.0001, "reward": 1.7732143625617027, "reward_std": 0.138895976357162, "rewards/equation_reward_func": 0.8044643178582191, "rewards/format_reward_func": 0.9687500149011612, "step": 7828 }, { "completion_length": 225.30804538726807, "epoch": 1.312837922796429, "grad_norm": 0.17235853000671505, "kl": 0.159576416015625, "learning_rate": 4.989726670510102e-07, "loss": 0.0002, "reward": 1.7660714834928513, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7705357447266579, "rewards/format_reward_func": 0.9955357164144516, "step": 7830 }, { "completion_length": 223.16518783569336, "epoch": 1.3131732260362965, "grad_norm": 0.20867123219482434, "kl": 0.0762176513671875, "learning_rate": 4.989715803847553e-07, "loss": 0.0001, "reward": 1.828571505844593, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8285714462399483, "rewards/format_reward_func": 1.0, "step": 7832 }, { "completion_length": 233.1919755935669, "epoch": 1.3135085292761641, "grad_norm": 0.24279714655234397, "kl": 0.09161376953125, "learning_rate": 4.989704931452754e-07, "loss": 0.0001, "reward": 1.8071429058909416, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071429058909416, "rewards/format_reward_func": 1.0, "step": 7834 }, { "completion_length": 231.41072368621826, "epoch": 1.3138438325160318, "grad_norm": 0.16989311883932803, "kl": 0.1334381103515625, "learning_rate": 4.989694053325732e-07, "loss": 0.0001, "reward": 1.7285715192556381, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7285714540630579, "rewards/format_reward_func": 1.0, "step": 7836 }, { "completion_length": 230.12054634094238, "epoch": 1.3141791357558992, "grad_norm": 0.26715548521941174, "kl": 0.0696868896484375, "learning_rate": 4.989683169466511e-07, "loss": 0.0001, "reward": 1.7982143387198448, "reward_std": 0.06313453428447247, "rewards/equation_reward_func": 0.8116071671247482, "rewards/format_reward_func": 0.9866071492433548, "step": 7838 }, { "completion_length": 231.8928689956665, "epoch": 1.3145144389957668, "grad_norm": 0.269292185061531, "kl": 0.0734405517578125, "learning_rate": 4.989672279875116e-07, "loss": 0.0001, "reward": 1.7732143551111221, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7866071686148643, "rewards/format_reward_func": 0.9866071492433548, "step": 7840 }, { "completion_length": 235.4910831451416, "epoch": 1.3148497422356344, "grad_norm": 0.20359318194734094, "kl": 0.06927490234375, "learning_rate": 4.989661384551573e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 7842 }, { "completion_length": 236.55358505249023, "epoch": 1.3151850454755019, "grad_norm": 0.5545695180093116, "kl": 0.2422637939453125, "learning_rate": 4.989650483495906e-07, "loss": 0.0002, "reward": 1.7660715132951736, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9955357164144516, "step": 7844 }, { "completion_length": 223.20983123779297, "epoch": 1.3155203487153695, "grad_norm": 0.5333929645774035, "kl": 0.2826690673828125, "learning_rate": 4.98963957670814e-07, "loss": 0.0003, "reward": 1.751785770058632, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500424683094, "rewards/format_reward_func": 0.9955357164144516, "step": 7846 }, { "completion_length": 225.40625953674316, "epoch": 1.315855651955237, "grad_norm": 0.20162221324994153, "kl": 0.5379486083984375, "learning_rate": 4.9896286641883e-07, "loss": 0.0005, "reward": 1.7571429386734962, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7660714499652386, "rewards/format_reward_func": 0.9910714328289032, "step": 7848 }, { "completion_length": 240.8705472946167, "epoch": 1.3161909551951045, "grad_norm": 0.5158776214210989, "kl": 0.2147674560546875, "learning_rate": 4.989617745936413e-07, "loss": 0.0002, "reward": 1.6625000685453415, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.6848214585334063, "rewards/format_reward_func": 0.977678582072258, "step": 7850 }, { "completion_length": 229.63840293884277, "epoch": 1.3165262584349722, "grad_norm": 0.3106276145668767, "kl": 0.5753936767578125, "learning_rate": 4.989606821952503e-07, "loss": 0.0006, "reward": 1.7892857566475868, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.798214316368103, "rewards/format_reward_func": 0.9910714328289032, "step": 7852 }, { "completion_length": 231.4866180419922, "epoch": 1.3168615616748398, "grad_norm": 0.20367040132891345, "kl": 0.349151611328125, "learning_rate": 4.989595892236594e-07, "loss": 0.0003, "reward": 1.7500000670552254, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7589286006987095, "rewards/format_reward_func": 0.9910714328289032, "step": 7854 }, { "completion_length": 246.17411518096924, "epoch": 1.3171968649147072, "grad_norm": 0.36064056384011967, "kl": 0.149871826171875, "learning_rate": 4.989584956788713e-07, "loss": 0.0001, "reward": 1.6415179520845413, "reward_std": 0.10796005232259631, "rewards/equation_reward_func": 0.6785714626312256, "rewards/format_reward_func": 0.9629464484751225, "step": 7856 }, { "completion_length": 232.4509038925171, "epoch": 1.3175321681545749, "grad_norm": 0.27023069215816137, "kl": 0.454803466796875, "learning_rate": 4.989574015608883e-07, "loss": 0.0005, "reward": 1.6946429386734962, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7169643137603998, "rewards/format_reward_func": 0.9776785783469677, "step": 7858 }, { "completion_length": 233.8303680419922, "epoch": 1.3178674713944423, "grad_norm": 0.3203622502088371, "kl": 0.268585205078125, "learning_rate": 4.989563068697133e-07, "loss": 0.0003, "reward": 1.7535715028643608, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 7860 }, { "completion_length": 230.17411613464355, "epoch": 1.31820277463431, "grad_norm": 0.29306740052637414, "kl": 0.418975830078125, "learning_rate": 4.989552116053485e-07, "loss": 0.0004, "reward": 1.7821429297327995, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7910714522004128, "rewards/format_reward_func": 0.9910714328289032, "step": 7862 }, { "completion_length": 230.42858123779297, "epoch": 1.3185380778741775, "grad_norm": 0.2370092201280241, "kl": 0.0861053466796875, "learning_rate": 4.989541157677964e-07, "loss": 0.0001, "reward": 1.7589286416769028, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7723214514553547, "rewards/format_reward_func": 0.9866071492433548, "step": 7864 }, { "completion_length": 227.3750114440918, "epoch": 1.318873381114045, "grad_norm": 0.26860458742969134, "kl": 0.0887451171875, "learning_rate": 4.989530193570596e-07, "loss": 0.0001, "reward": 1.7214286774396896, "reward_std": 0.1010152529925108, "rewards/equation_reward_func": 0.7303571738302708, "rewards/format_reward_func": 0.9910714328289032, "step": 7866 }, { "completion_length": 216.43304634094238, "epoch": 1.3192086843539126, "grad_norm": 0.2723609670431003, "kl": 0.082366943359375, "learning_rate": 4.989519223731408e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714454948902, "rewards/format_reward_func": 1.0, "step": 7868 }, { "completion_length": 218.71876049041748, "epoch": 1.31954398759378, "grad_norm": 0.38728502112057167, "kl": 0.2922515869140625, "learning_rate": 4.989508248160423e-07, "loss": 0.0003, "reward": 1.78035718947649, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7848214544355869, "rewards/format_reward_func": 0.9955357164144516, "step": 7870 }, { "completion_length": 219.24108123779297, "epoch": 1.3198792908336476, "grad_norm": 0.27127451546841486, "kl": 0.170257568359375, "learning_rate": 4.989497266857666e-07, "loss": 0.0002, "reward": 1.730357214808464, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7348214574158192, "rewards/format_reward_func": 0.9955357164144516, "step": 7872 }, { "completion_length": 219.01340198516846, "epoch": 1.3202145940735153, "grad_norm": 0.283198873620819, "kl": 0.092987060546875, "learning_rate": 4.989486279823164e-07, "loss": 0.0001, "reward": 1.7946429327130318, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7991071678698063, "rewards/format_reward_func": 0.9955357164144516, "step": 7874 }, { "completion_length": 222.977689743042, "epoch": 1.320549897313383, "grad_norm": 0.29053446930371934, "kl": 0.107940673828125, "learning_rate": 4.989475287056941e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 7876 }, { "completion_length": 221.5089406967163, "epoch": 1.3208852005532503, "grad_norm": 0.31560908109020885, "kl": 0.1349945068359375, "learning_rate": 4.989464288559024e-07, "loss": 0.0001, "reward": 1.7357143685221672, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143238186836, "rewards/format_reward_func": 1.0, "step": 7878 }, { "completion_length": 208.75000953674316, "epoch": 1.321220503793118, "grad_norm": 0.23600630732348715, "kl": 0.0809326171875, "learning_rate": 4.989453284329436e-07, "loss": 0.0001, "reward": 1.7379464954137802, "reward_std": 0.03724937466904521, "rewards/equation_reward_func": 0.7392857316881418, "rewards/format_reward_func": 0.9986607171595097, "step": 7880 }, { "completion_length": 216.7232265472412, "epoch": 1.3215558070329854, "grad_norm": 0.3341347334253455, "kl": 0.1131591796875, "learning_rate": 4.989442274368204e-07, "loss": 0.0001, "reward": 1.7803571969270706, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7848214693367481, "rewards/format_reward_func": 0.9955357164144516, "step": 7882 }, { "completion_length": 215.28125953674316, "epoch": 1.321891110272853, "grad_norm": 0.1194486987207903, "kl": 0.1649322509765625, "learning_rate": 4.989431258675353e-07, "loss": 0.0002, "reward": 1.7571429088711739, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7571428790688515, "rewards/format_reward_func": 1.0, "step": 7884 }, { "completion_length": 213.87054538726807, "epoch": 1.3222264135127206, "grad_norm": 0.16692747952837317, "kl": 0.9911956787109375, "learning_rate": 4.989420237250909e-07, "loss": 0.001, "reward": 1.7892857789993286, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857398837805, "rewards/format_reward_func": 1.0, "step": 7886 }, { "completion_length": 208.29911613464355, "epoch": 1.322561716752588, "grad_norm": 0.43172098055364655, "kl": 0.154571533203125, "learning_rate": 4.989409210094895e-07, "loss": 0.0002, "reward": 1.798214353621006, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8026785925030708, "rewards/format_reward_func": 0.9955357164144516, "step": 7888 }, { "completion_length": 211.33036518096924, "epoch": 1.3228970199924557, "grad_norm": 0.33694847774059483, "kl": 0.096923828125, "learning_rate": 4.989398177207339e-07, "loss": 0.0001, "reward": 1.7321429327130318, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7321428749710321, "rewards/format_reward_func": 1.0, "step": 7890 }, { "completion_length": 211.00893783569336, "epoch": 1.3232323232323233, "grad_norm": 0.29598731413356466, "kl": 0.2599945068359375, "learning_rate": 4.989387138588265e-07, "loss": 0.0003, "reward": 1.8017857521772385, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.806250024586916, "rewards/format_reward_func": 0.9955357164144516, "step": 7892 }, { "completion_length": 220.1741180419922, "epoch": 1.3235676264721907, "grad_norm": 0.2904979857112134, "kl": 0.0965576171875, "learning_rate": 4.989376094237699e-07, "loss": 0.0001, "reward": 1.717857226729393, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7267857305705547, "rewards/format_reward_func": 0.9910714328289032, "step": 7894 }, { "completion_length": 220.26786613464355, "epoch": 1.3239029297120584, "grad_norm": 0.27770757884590075, "kl": 0.145721435546875, "learning_rate": 4.989365044155667e-07, "loss": 0.0001, "reward": 1.6714286506175995, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.6803571786731482, "rewards/format_reward_func": 0.9910714328289032, "step": 7896 }, { "completion_length": 217.25447368621826, "epoch": 1.324238232951926, "grad_norm": 0.32922269503566337, "kl": 0.196502685546875, "learning_rate": 4.989353988342192e-07, "loss": 0.0002, "reward": 1.7392857894301414, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.748214315623045, "rewards/format_reward_func": 0.9910714328289032, "step": 7898 }, { "completion_length": 223.9107265472412, "epoch": 1.3245735361917934, "grad_norm": 0.998183898998436, "kl": 0.1063232421875, "learning_rate": 4.989342926797303e-07, "loss": 0.0001, "reward": 1.7660714834928513, "reward_std": 0.0883883461356163, "rewards/equation_reward_func": 0.7794643230736256, "rewards/format_reward_func": 0.9866071492433548, "step": 7900 }, { "completion_length": 226.12054538726807, "epoch": 1.324908839431661, "grad_norm": 0.1907288726707059, "kl": 0.097930908203125, "learning_rate": 4.989331859521022e-07, "loss": 0.0001, "reward": 1.7178572043776512, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7267857529222965, "rewards/format_reward_func": 0.9910714328289032, "step": 7902 }, { "completion_length": 224.27233219146729, "epoch": 1.3252441426715285, "grad_norm": 2.421709001433735, "kl": 0.112579345703125, "learning_rate": 4.989320786513376e-07, "loss": 0.0001, "reward": 1.7267857864499092, "reward_std": 0.09343910869210958, "rewards/equation_reward_func": 0.740178607404232, "rewards/format_reward_func": 0.9866071492433548, "step": 7904 }, { "completion_length": 226.09822463989258, "epoch": 1.325579445911396, "grad_norm": 1.1546552639652499, "kl": 0.12115478515625, "learning_rate": 4.989309707774391e-07, "loss": 0.0001, "reward": 1.769642896950245, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.7830357477068901, "rewards/format_reward_func": 0.9866071492433548, "step": 7906 }, { "completion_length": 221.54018878936768, "epoch": 1.3259147491512637, "grad_norm": 0.2796337690765062, "kl": 0.14959716796875, "learning_rate": 4.989298623304094e-07, "loss": 0.0001, "reward": 1.7964286357164383, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 7908 }, { "completion_length": 210.87501049041748, "epoch": 1.3262500523911314, "grad_norm": 0.3019066657629454, "kl": 0.1559600830078125, "learning_rate": 4.989287533102506e-07, "loss": 0.0002, "reward": 1.8178572058677673, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8178571611642838, "rewards/format_reward_func": 1.0, "step": 7910 }, { "completion_length": 220.26340103149414, "epoch": 1.3265853556309988, "grad_norm": 0.34559487640089015, "kl": 0.151519775390625, "learning_rate": 4.989276437169656e-07, "loss": 0.0002, "reward": 1.7535714954137802, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 7912 }, { "completion_length": 213.41518688201904, "epoch": 1.3269206588708664, "grad_norm": 0.292567252173144, "kl": 0.08819580078125, "learning_rate": 4.989265335505569e-07, "loss": 0.0001, "reward": 1.760714367032051, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143074274063, "rewards/format_reward_func": 1.0, "step": 7914 }, { "completion_length": 208.04465103149414, "epoch": 1.3272559621107338, "grad_norm": 0.3393569470550142, "kl": 0.0819091796875, "learning_rate": 4.989254228110271e-07, "loss": 0.0001, "reward": 1.7714286595582962, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 7916 }, { "completion_length": 214.92858028411865, "epoch": 1.3275912653506015, "grad_norm": 0.0855442589689345, "kl": 0.0947113037109375, "learning_rate": 4.989243114983785e-07, "loss": 0.0001, "reward": 1.8142857775092125, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857551574707, "rewards/format_reward_func": 1.0, "step": 7918 }, { "completion_length": 214.98661708831787, "epoch": 1.327926568590469, "grad_norm": 0.23092207006843854, "kl": 0.0896759033203125, "learning_rate": 4.98923199612614e-07, "loss": 0.0001, "reward": 1.7267857939004898, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7401785831898451, "rewards/format_reward_func": 0.9866071492433548, "step": 7920 }, { "completion_length": 211.44197273254395, "epoch": 1.3282618718303365, "grad_norm": 0.2068555217674292, "kl": 0.0815582275390625, "learning_rate": 4.989220871537359e-07, "loss": 0.0001, "reward": 1.7750000730156898, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 7922 }, { "completion_length": 212.82590198516846, "epoch": 1.3285971750702041, "grad_norm": 0.28652744944235065, "kl": 0.0796966552734375, "learning_rate": 4.98920974121747e-07, "loss": 0.0001, "reward": 1.8321429044008255, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8321428708732128, "rewards/format_reward_func": 1.0, "step": 7924 }, { "completion_length": 206.47768688201904, "epoch": 1.3289324783100716, "grad_norm": 0.29701924863495033, "kl": 0.09295654296875, "learning_rate": 4.989198605166495e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 7926 }, { "completion_length": 210.23215103149414, "epoch": 1.3292677815499392, "grad_norm": 0.39862120982234434, "kl": 0.09503173828125, "learning_rate": 4.989187463384463e-07, "loss": 0.0001, "reward": 1.7250000834465027, "reward_std": 0.0858629634603858, "rewards/equation_reward_func": 0.7250000350177288, "rewards/format_reward_func": 1.0, "step": 7928 }, { "completion_length": 223.2232255935669, "epoch": 1.3296030847898068, "grad_norm": 0.3389688200507532, "kl": 0.0911407470703125, "learning_rate": 4.989176315871398e-07, "loss": 0.0001, "reward": 1.74821437895298, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7526786029338837, "rewards/format_reward_func": 0.9955357164144516, "step": 7930 }, { "completion_length": 211.71429538726807, "epoch": 1.3299383880296745, "grad_norm": 0.30581841187049746, "kl": 0.0901336669921875, "learning_rate": 4.989165162627328e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.767857164144516, "rewards/format_reward_func": 1.0, "step": 7932 }, { "completion_length": 223.56251049041748, "epoch": 1.3302736912695419, "grad_norm": 0.3911307845060985, "kl": 0.0928497314453125, "learning_rate": 4.989154003652275e-07, "loss": 0.0001, "reward": 1.7928572222590446, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 7934 }, { "completion_length": 214.97322463989258, "epoch": 1.3306089945094095, "grad_norm": 0.3465375626222913, "kl": 0.102874755859375, "learning_rate": 4.989142838946267e-07, "loss": 0.0001, "reward": 1.7428572252392769, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7428571619093418, "rewards/format_reward_func": 1.0, "step": 7936 }, { "completion_length": 215.17411708831787, "epoch": 1.330944297749277, "grad_norm": 0.2738688404966579, "kl": 0.0842132568359375, "learning_rate": 4.98913166850933e-07, "loss": 0.0001, "reward": 1.792857214808464, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571589291096, "rewards/format_reward_func": 1.0, "step": 7938 }, { "completion_length": 220.07590293884277, "epoch": 1.3312796009891446, "grad_norm": 0.20492354543887023, "kl": 0.379638671875, "learning_rate": 4.989120492341489e-07, "loss": 0.0004, "reward": 1.76071435213089, "reward_std": 0.05555838719010353, "rewards/equation_reward_func": 0.7607142999768257, "rewards/format_reward_func": 1.0, "step": 7940 }, { "completion_length": 215.39733123779297, "epoch": 1.3316149042290122, "grad_norm": 0.30510096492084604, "kl": 0.0955047607421875, "learning_rate": 4.989109310442769e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 7942 }, { "completion_length": 218.67858028411865, "epoch": 1.3319502074688796, "grad_norm": 0.34981667176896825, "kl": 6.00408935546875, "learning_rate": 4.989098122813197e-07, "loss": 0.006, "reward": 1.7892857864499092, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 1.0, "step": 7944 }, { "completion_length": 206.76786708831787, "epoch": 1.3322855107087472, "grad_norm": 0.5514828094624111, "kl": 0.338348388671875, "learning_rate": 4.989086929452797e-07, "loss": 0.0003, "reward": 1.7678572088479996, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.767857177183032, "rewards/format_reward_func": 1.0, "step": 7946 }, { "completion_length": 218.44197273254395, "epoch": 1.3326208139486146, "grad_norm": 0.21252407205890636, "kl": 0.2454071044921875, "learning_rate": 4.989075730361598e-07, "loss": 0.0002, "reward": 1.7785714715719223, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 7948 }, { "completion_length": 212.97768688201904, "epoch": 1.3329561171884823, "grad_norm": 0.24461102951428326, "kl": 0.1788330078125, "learning_rate": 4.989064525539622e-07, "loss": 0.0002, "reward": 1.7928571850061417, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571719676256, "rewards/format_reward_func": 1.0, "step": 7950 }, { "completion_length": 213.28126049041748, "epoch": 1.33329142042835, "grad_norm": 0.25165262044320724, "kl": 0.157684326171875, "learning_rate": 4.989053314986898e-07, "loss": 0.0002, "reward": 1.7964286357164383, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964286059141159, "rewards/format_reward_func": 1.0, "step": 7952 }, { "completion_length": 218.63840007781982, "epoch": 1.3336267236682176, "grad_norm": 0.3567360154219791, "kl": 0.103912353515625, "learning_rate": 4.989042098703449e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 7954 }, { "completion_length": 222.22768783569336, "epoch": 1.333962026908085, "grad_norm": 0.251345285427576, "kl": 0.21942138671875, "learning_rate": 4.989030876689303e-07, "loss": 0.0002, "reward": 1.7892857864499092, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 7956 }, { "completion_length": 233.54465293884277, "epoch": 1.3342973301479526, "grad_norm": 0.3240709202095466, "kl": 0.26336669921875, "learning_rate": 4.989019648944486e-07, "loss": 0.0003, "reward": 1.739285796880722, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7392857484519482, "rewards/format_reward_func": 1.0, "step": 7958 }, { "completion_length": 227.4107255935669, "epoch": 1.33463263338782, "grad_norm": 0.3387491696363639, "kl": 0.09735107421875, "learning_rate": 4.989008415469022e-07, "loss": 0.0001, "reward": 1.8178572058677673, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8178571630269289, "rewards/format_reward_func": 1.0, "step": 7960 }, { "completion_length": 239.0803689956665, "epoch": 1.3349679366276876, "grad_norm": 0.3606421949515841, "kl": 0.183502197265625, "learning_rate": 4.988997176262937e-07, "loss": 0.0002, "reward": 1.7642858028411865, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857320606709, "rewards/format_reward_func": 1.0, "step": 7962 }, { "completion_length": 229.78126049041748, "epoch": 1.3353032398675553, "grad_norm": 0.16454299451021592, "kl": 0.39007568359375, "learning_rate": 4.988985931326259e-07, "loss": 0.0004, "reward": 1.7392857894301414, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857372760773, "rewards/format_reward_func": 1.0, "step": 7964 }, { "completion_length": 231.1250114440918, "epoch": 1.3356385431074227, "grad_norm": 0.4889025325932433, "kl": 0.591400146484375, "learning_rate": 4.988974680659012e-07, "loss": 0.0006, "reward": 1.7964286357164383, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 7966 }, { "completion_length": 232.08483409881592, "epoch": 1.3359738463472903, "grad_norm": 0.40254854105558957, "kl": 0.818389892578125, "learning_rate": 4.988963424261221e-07, "loss": 0.0008, "reward": 1.772321529686451, "reward_std": 0.06944798585027456, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 0.9973214343190193, "step": 7968 }, { "completion_length": 230.89733600616455, "epoch": 1.336309149587158, "grad_norm": 0.43390298939426425, "kl": 1.7117919921875, "learning_rate": 4.988952162132916e-07, "loss": 0.0017, "reward": 1.779464341700077, "reward_std": 0.06944798701442778, "rewards/equation_reward_func": 0.7821428962051868, "rewards/format_reward_func": 0.9973214343190193, "step": 7970 }, { "completion_length": 222.60715293884277, "epoch": 1.3366444528270254, "grad_norm": 0.2671454665293245, "kl": 0.3524322509765625, "learning_rate": 4.988940894274118e-07, "loss": 0.0004, "reward": 1.81428574770689, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857365310192, "rewards/format_reward_func": 1.0, "step": 7972 }, { "completion_length": 231.09375953674316, "epoch": 1.336979756066893, "grad_norm": 0.20695948130143144, "kl": 0.834014892578125, "learning_rate": 4.988929620684857e-07, "loss": 0.0008, "reward": 1.7558036223053932, "reward_std": 0.032198611879721284, "rewards/equation_reward_func": 0.757142897695303, "rewards/format_reward_func": 0.9986607171595097, "step": 7974 }, { "completion_length": 231.2187623977661, "epoch": 1.3373150593067606, "grad_norm": 0.25623959963115334, "kl": 0.179534912109375, "learning_rate": 4.988918341365156e-07, "loss": 0.0002, "reward": 1.7464286461472511, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7553571574389935, "rewards/format_reward_func": 0.9910714328289032, "step": 7976 }, { "completion_length": 232.86161708831787, "epoch": 1.337650362546628, "grad_norm": 0.07860704228809769, "kl": 0.182830810546875, "learning_rate": 4.988907056315043e-07, "loss": 0.0002, "reward": 1.7272322252392769, "reward_std": 0.03219861118122935, "rewards/equation_reward_func": 0.7285714615136385, "rewards/format_reward_func": 0.9986607171595097, "step": 7978 }, { "completion_length": 241.88394165039062, "epoch": 1.3379856657864957, "grad_norm": 0.3020735349351519, "kl": 0.331207275390625, "learning_rate": 4.988895765534542e-07, "loss": 0.0003, "reward": 1.7017857879400253, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7062500528991222, "rewards/format_reward_func": 0.9955357164144516, "step": 7980 }, { "completion_length": 235.10268878936768, "epoch": 1.338320969026363, "grad_norm": 1.859077809209031, "kl": 0.191925048828125, "learning_rate": 4.98888446902368e-07, "loss": 0.0002, "reward": 1.7428572252392769, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7428571693599224, "rewards/format_reward_func": 1.0, "step": 7982 }, { "completion_length": 242.65180015563965, "epoch": 1.3386562722662307, "grad_norm": 0.3395116480501293, "kl": 0.2183990478515625, "learning_rate": 4.988873166782484e-07, "loss": 0.0002, "reward": 1.7700893580913544, "reward_std": 0.07260471256449819, "rewards/equation_reward_func": 0.7714285887777805, "rewards/format_reward_func": 0.9986607171595097, "step": 7984 }, { "completion_length": 244.06697463989258, "epoch": 1.3389915755060984, "grad_norm": 0.30610268249495964, "kl": 0.224273681640625, "learning_rate": 4.98886185881098e-07, "loss": 0.0002, "reward": 1.7392858117818832, "reward_std": 0.09596448857337236, "rewards/equation_reward_func": 0.7392857372760773, "rewards/format_reward_func": 1.0, "step": 7986 }, { "completion_length": 248.25001049041748, "epoch": 1.339326878745966, "grad_norm": 0.23843987852462686, "kl": 0.1561737060546875, "learning_rate": 4.988850545109193e-07, "loss": 0.0002, "reward": 1.7357143461704254, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.735714316368103, "rewards/format_reward_func": 1.0, "step": 7988 }, { "completion_length": 247.65180015563965, "epoch": 1.3396621819858334, "grad_norm": 0.2668127447982816, "kl": 0.113250732421875, "learning_rate": 4.988839225677147e-07, "loss": 0.0001, "reward": 1.7589286416769028, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7633928917348385, "rewards/format_reward_func": 0.9955357164144516, "step": 7990 }, { "completion_length": 243.83483123779297, "epoch": 1.339997485225701, "grad_norm": 0.17925726284968613, "kl": 0.08819580078125, "learning_rate": 4.988827900514873e-07, "loss": 0.0001, "reward": 1.7482143566012383, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.752678606659174, "rewards/format_reward_func": 0.9955357164144516, "step": 7992 }, { "completion_length": 238.2321548461914, "epoch": 1.3403327884655685, "grad_norm": 0.17101341252068045, "kl": 0.1177520751953125, "learning_rate": 4.988816569622392e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 7994 }, { "completion_length": 232.06697463989258, "epoch": 1.340668091705436, "grad_norm": 0.22431370698058403, "kl": 0.083343505859375, "learning_rate": 4.988805232999735e-07, "loss": 0.0001, "reward": 1.8160714730620384, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.8205357417464256, "rewards/format_reward_func": 0.9955357164144516, "step": 7996 }, { "completion_length": 240.4375123977661, "epoch": 1.3410033949453037, "grad_norm": 0.2647862614895852, "kl": 0.082122802734375, "learning_rate": 4.988793890646924e-07, "loss": 0.0001, "reward": 1.7892857864499092, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857640981674, "rewards/format_reward_func": 1.0, "step": 7998 }, { "completion_length": 235.17858123779297, "epoch": 1.3413386981851712, "grad_norm": 0.18667197312641198, "kl": 0.0843658447265625, "learning_rate": 4.988782542563988e-07, "loss": 0.0001, "reward": 1.7392857819795609, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857447266579, "rewards/format_reward_func": 1.0, "step": 8000 }, { "completion_length": 243.3928689956665, "epoch": 1.3416740014250388, "grad_norm": 0.30499551404800335, "kl": 0.139678955078125, "learning_rate": 4.988771188750949e-07, "loss": 0.0001, "reward": 1.835714377462864, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8357142992317677, "rewards/format_reward_func": 1.0, "step": 8002 }, { "completion_length": 248.0000123977661, "epoch": 1.3420093046649062, "grad_norm": 0.2519452088794451, "kl": 0.1127777099609375, "learning_rate": 4.988759829207839e-07, "loss": 0.0001, "reward": 1.7821429371833801, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 8004 }, { "completion_length": 234.8169755935669, "epoch": 1.3423446079047738, "grad_norm": 0.2282132101143147, "kl": 0.1110076904296875, "learning_rate": 4.98874846393468e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571790456772, "rewards/format_reward_func": 1.0, "step": 8006 }, { "completion_length": 240.383939743042, "epoch": 1.3426799111446415, "grad_norm": 0.3600678021417168, "kl": 0.17425537109375, "learning_rate": 4.988737092931499e-07, "loss": 0.0002, "reward": 1.7232143729925156, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7276786044239998, "rewards/format_reward_func": 0.9955357164144516, "step": 8008 }, { "completion_length": 247.48662090301514, "epoch": 1.343015214384509, "grad_norm": 0.3601856091328208, "kl": 0.108642578125, "learning_rate": 4.988725716198322e-07, "loss": 0.0001, "reward": 1.7714286521077156, "reward_std": 0.09091372601687908, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 8010 }, { "completion_length": 234.50447368621826, "epoch": 1.3433505176243765, "grad_norm": 0.30609978360331364, "kl": 0.09759521484375, "learning_rate": 4.988714333735176e-07, "loss": 0.0001, "reward": 1.8035714849829674, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 8012 }, { "completion_length": 229.9419765472412, "epoch": 1.3436858208642442, "grad_norm": 0.12760156152712052, "kl": 0.085052490234375, "learning_rate": 4.988702945542088e-07, "loss": 0.0001, "reward": 1.8321429193019867, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8321428783237934, "rewards/format_reward_func": 1.0, "step": 8014 }, { "completion_length": 243.6428680419922, "epoch": 1.3440211241041116, "grad_norm": 0.39674180978928336, "kl": 0.156829833984375, "learning_rate": 4.988691551619081e-07, "loss": 0.0002, "reward": 1.7517857775092125, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7562500461935997, "rewards/format_reward_func": 0.9955357164144516, "step": 8016 }, { "completion_length": 233.2544755935669, "epoch": 1.3443564273439792, "grad_norm": 0.16498579602363317, "kl": 0.084625244140625, "learning_rate": 4.988680151966186e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 8018 }, { "completion_length": 240.07590293884277, "epoch": 1.3446917305838468, "grad_norm": 0.26198383378203854, "kl": 0.0823822021484375, "learning_rate": 4.988668746583424e-07, "loss": 0.0001, "reward": 1.7517857626080513, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500350177288, "rewards/format_reward_func": 0.9955357164144516, "step": 8020 }, { "completion_length": 243.4553680419922, "epoch": 1.3450270338237142, "grad_norm": 0.27333683560494554, "kl": 0.0854644775390625, "learning_rate": 4.988657335470826e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 8022 }, { "completion_length": 246.2901906967163, "epoch": 1.3453623370635819, "grad_norm": 0.43851686807388973, "kl": 0.092041015625, "learning_rate": 4.988645918628414e-07, "loss": 0.0001, "reward": 1.789285771548748, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 8024 }, { "completion_length": 235.3928680419922, "epoch": 1.3456976403034493, "grad_norm": 0.30842295436356104, "kl": 0.08282470703125, "learning_rate": 4.988634496056218e-07, "loss": 0.0001, "reward": 1.7982143387198448, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.8026785999536514, "rewards/format_reward_func": 0.9955357164144516, "step": 8026 }, { "completion_length": 238.07590293884277, "epoch": 1.346032943543317, "grad_norm": 0.12391568944034573, "kl": 0.0708465576171875, "learning_rate": 4.988623067754262e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 8028 }, { "completion_length": 243.37947273254395, "epoch": 1.3463682467831846, "grad_norm": 0.18952970799683763, "kl": 0.0840911865234375, "learning_rate": 4.988611633722573e-07, "loss": 0.0001, "reward": 1.7357143834233284, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.735714316368103, "rewards/format_reward_func": 1.0, "step": 8030 }, { "completion_length": 247.5401906967163, "epoch": 1.3467035500230522, "grad_norm": 0.22351549042268146, "kl": 0.08148193359375, "learning_rate": 4.988600193961177e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571790456772, "rewards/format_reward_func": 1.0, "step": 8032 }, { "completion_length": 243.38393688201904, "epoch": 1.3470388532629196, "grad_norm": 0.37835809624396216, "kl": 0.085357666015625, "learning_rate": 4.988588748470101e-07, "loss": 0.0001, "reward": 1.7571429386734962, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 8034 }, { "completion_length": 238.62054634094238, "epoch": 1.3473741565027872, "grad_norm": 0.23108849325790354, "kl": 0.08673095703125, "learning_rate": 4.988577297249371e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428772062063, "rewards/format_reward_func": 1.0, "step": 8036 }, { "completion_length": 245.7053680419922, "epoch": 1.3477094597426547, "grad_norm": 0.0852437054563733, "kl": 0.0871124267578125, "learning_rate": 4.988565840299014e-07, "loss": 0.0001, "reward": 1.6982143744826317, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7026786133646965, "rewards/format_reward_func": 0.9955357164144516, "step": 8038 }, { "completion_length": 256.2857246398926, "epoch": 1.3480447629825223, "grad_norm": 0.27334831001457666, "kl": 0.09234619140625, "learning_rate": 4.988554377619054e-07, "loss": 0.0001, "reward": 1.7214286252856255, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7214286103844643, "rewards/format_reward_func": 1.0, "step": 8040 }, { "completion_length": 240.62500953674316, "epoch": 1.34838006622239, "grad_norm": 0.3848916028092399, "kl": 0.1457366943359375, "learning_rate": 4.988542909209521e-07, "loss": 0.0001, "reward": 1.8000000789761543, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.800000037997961, "rewards/format_reward_func": 1.0, "step": 8042 }, { "completion_length": 237.4375123977661, "epoch": 1.3487153694622576, "grad_norm": 0.24599279522305645, "kl": 0.0876922607421875, "learning_rate": 4.988531435070438e-07, "loss": 0.0001, "reward": 1.8000000640749931, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000193715096, "rewards/format_reward_func": 1.0, "step": 8044 }, { "completion_length": 258.0759038925171, "epoch": 1.349050672702125, "grad_norm": 0.4030775726183148, "kl": 0.0973968505859375, "learning_rate": 4.988519955201834e-07, "loss": 0.0001, "reward": 1.710714377462864, "reward_std": 0.1060660146176815, "rewards/equation_reward_func": 0.7196428924798965, "rewards/format_reward_func": 0.9910714328289032, "step": 8046 }, { "completion_length": 248.3214406967163, "epoch": 1.3493859759419926, "grad_norm": 0.30576816556802056, "kl": 0.12847900390625, "learning_rate": 4.988508469603735e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 8048 }, { "completion_length": 241.42858123779297, "epoch": 1.34972127918186, "grad_norm": 0.26936985666206786, "kl": 0.1039581298828125, "learning_rate": 4.988496978276166e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571850061417, "rewards/format_reward_func": 1.0, "step": 8050 }, { "completion_length": 255.56251525878906, "epoch": 1.3500565824217277, "grad_norm": 0.32989257568241814, "kl": 0.133758544921875, "learning_rate": 4.988485481219154e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7392857354134321, "rewards/format_reward_func": 1.0, "step": 8052 }, { "completion_length": 242.9866189956665, "epoch": 1.3503918856615953, "grad_norm": 0.44298681376559773, "kl": 0.0977783203125, "learning_rate": 4.988473978432725e-07, "loss": 0.0001, "reward": 1.8000000789761543, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000081956387, "rewards/format_reward_func": 1.0, "step": 8054 }, { "completion_length": 257.57590198516846, "epoch": 1.3507271889014627, "grad_norm": 0.380273329373798, "kl": 0.085113525390625, "learning_rate": 4.988462469916908e-07, "loss": 0.0001, "reward": 1.7982143312692642, "reward_std": 0.0732360603287816, "rewards/equation_reward_func": 0.8116071745753288, "rewards/format_reward_func": 0.9866071492433548, "step": 8056 }, { "completion_length": 251.74108505249023, "epoch": 1.3510624921413303, "grad_norm": 0.23128181541135234, "kl": 0.1066436767578125, "learning_rate": 4.988450955671727e-07, "loss": 0.0001, "reward": 1.7696429044008255, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.774107176810503, "rewards/format_reward_func": 0.9955357164144516, "step": 8058 }, { "completion_length": 239.02679824829102, "epoch": 1.3513977953811978, "grad_norm": 0.3129928101399441, "kl": 0.1588897705078125, "learning_rate": 4.988439435697209e-07, "loss": 0.0002, "reward": 1.7500000819563866, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7589286118745804, "rewards/format_reward_func": 0.9910714328289032, "step": 8060 }, { "completion_length": 244.73215293884277, "epoch": 1.3517330986210654, "grad_norm": 0.31500260650227535, "kl": 0.1183929443359375, "learning_rate": 4.98842790999338e-07, "loss": 0.0001, "reward": 1.7964286133646965, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7964286096394062, "rewards/format_reward_func": 1.0, "step": 8062 }, { "completion_length": 266.3928699493408, "epoch": 1.352068401860933, "grad_norm": 0.19326811694448875, "kl": 0.0916748046875, "learning_rate": 4.98841637856027e-07, "loss": 0.0001, "reward": 1.7910714820027351, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7955357357859612, "rewards/format_reward_func": 0.9955357164144516, "step": 8064 }, { "completion_length": 251.99554824829102, "epoch": 1.3524037051008007, "grad_norm": 0.28242915098566646, "kl": 0.0901641845703125, "learning_rate": 4.9884048413979e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571775555611, "rewards/format_reward_func": 1.0, "step": 8066 }, { "completion_length": 254.74108409881592, "epoch": 1.352739008340668, "grad_norm": 0.21990591678680338, "kl": 0.1272125244140625, "learning_rate": 4.988393298506301e-07, "loss": 0.0001, "reward": 1.7267858013510704, "reward_std": 0.0732360603287816, "rewards/equation_reward_func": 0.7401785962283611, "rewards/format_reward_func": 0.9866071492433548, "step": 8068 }, { "completion_length": 261.4910831451416, "epoch": 1.3530743115805357, "grad_norm": 0.19448140488390295, "kl": 0.101318359375, "learning_rate": 4.988381749885498e-07, "loss": 0.0001, "reward": 1.8000000417232513, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.800000037997961, "rewards/format_reward_func": 1.0, "step": 8070 }, { "completion_length": 260.87501335144043, "epoch": 1.3534096148204031, "grad_norm": 0.3282465087207066, "kl": 0.15740966796875, "learning_rate": 4.988370195535517e-07, "loss": 0.0002, "reward": 1.7375000640749931, "reward_std": 0.0883883461356163, "rewards/equation_reward_func": 0.7508928887546062, "rewards/format_reward_func": 0.9866071492433548, "step": 8072 }, { "completion_length": 268.11162185668945, "epoch": 1.3537449180602708, "grad_norm": 1.0868446090043193, "kl": 0.2178192138671875, "learning_rate": 4.988358635456385e-07, "loss": 0.0002, "reward": 1.721428632736206, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7303571570664644, "rewards/format_reward_func": 0.9910714328289032, "step": 8074 }, { "completion_length": 260.8928680419922, "epoch": 1.3540802213001384, "grad_norm": 0.16292994704342872, "kl": 0.0915069580078125, "learning_rate": 4.988347069648129e-07, "loss": 0.0001, "reward": 1.787500075995922, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7919643074274063, "rewards/format_reward_func": 0.9955357164144516, "step": 8076 }, { "completion_length": 269.1830472946167, "epoch": 1.3544155245400058, "grad_norm": 0.2766759911148509, "kl": 0.10125732421875, "learning_rate": 4.988335498110776e-07, "loss": 0.0001, "reward": 1.7321429252624512, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7410714700818062, "rewards/format_reward_func": 0.9910714328289032, "step": 8078 }, { "completion_length": 270.62947940826416, "epoch": 1.3547508277798734, "grad_norm": 0.24982035488239168, "kl": 0.084320068359375, "learning_rate": 4.988323920844352e-07, "loss": 0.0001, "reward": 1.7964286506175995, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 8080 }, { "completion_length": 262.83037090301514, "epoch": 1.3550861310197408, "grad_norm": 0.12586729994994952, "kl": 0.0821075439453125, "learning_rate": 4.988312337848883e-07, "loss": 0.0001, "reward": 1.7125000804662704, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7169643081724644, "rewards/format_reward_func": 0.9955357164144516, "step": 8082 }, { "completion_length": 277.20983695983887, "epoch": 1.3554214342596085, "grad_norm": 0.35417812775222424, "kl": 0.243377685546875, "learning_rate": 4.988300749124397e-07, "loss": 0.0002, "reward": 1.6589286550879478, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.6723214648663998, "rewards/format_reward_func": 0.9866071492433548, "step": 8084 }, { "completion_length": 268.1919755935669, "epoch": 1.3557567374994761, "grad_norm": 0.35492565059691283, "kl": 0.2265167236328125, "learning_rate": 4.988289154670919e-07, "loss": 0.0002, "reward": 1.6982143893837929, "reward_std": 0.11364216171205044, "rewards/equation_reward_func": 0.7205357477068901, "rewards/format_reward_func": 0.977678582072258, "step": 8086 }, { "completion_length": 275.0803689956665, "epoch": 1.3560920407393438, "grad_norm": 0.12388342800816557, "kl": 0.2826995849609375, "learning_rate": 4.988277554488478e-07, "loss": 0.0003, "reward": 1.7517857998609543, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7562500312924385, "rewards/format_reward_func": 0.9955357164144516, "step": 8088 }, { "completion_length": 258.0134038925171, "epoch": 1.3564273439792112, "grad_norm": 0.3590281267788517, "kl": 0.2235107421875, "learning_rate": 4.988265948577099e-07, "loss": 0.0002, "reward": 1.7767857760190964, "reward_std": 0.09343910962343216, "rewards/equation_reward_func": 0.7901785932481289, "rewards/format_reward_func": 0.9866071492433548, "step": 8090 }, { "completion_length": 263.65625953674316, "epoch": 1.3567626472190788, "grad_norm": 0.2747411302913466, "kl": 0.1129913330078125, "learning_rate": 4.98825433693681e-07, "loss": 0.0001, "reward": 1.7250000461935997, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7339285984635353, "rewards/format_reward_func": 0.9910714328289032, "step": 8092 }, { "completion_length": 279.2901906967163, "epoch": 1.3570979504589462, "grad_norm": 0.33908673465572003, "kl": 0.1176605224609375, "learning_rate": 4.988242719567636e-07, "loss": 0.0001, "reward": 1.6571429520845413, "reward_std": 0.10606601648032665, "rewards/equation_reward_func": 0.6839285977184772, "rewards/format_reward_func": 0.9732142984867096, "step": 8094 }, { "completion_length": 262.75894260406494, "epoch": 1.3574332536988138, "grad_norm": 0.245226238939219, "kl": 0.1348724365234375, "learning_rate": 4.988231096469606e-07, "loss": 0.0001, "reward": 1.7375000640749931, "reward_std": 0.07828682195395231, "rewards/equation_reward_func": 0.7508928775787354, "rewards/format_reward_func": 0.9866071492433548, "step": 8096 }, { "completion_length": 273.8214416503906, "epoch": 1.3577685569386815, "grad_norm": 0.21272801489659302, "kl": 0.1975250244140625, "learning_rate": 4.988219467642743e-07, "loss": 0.0002, "reward": 1.7857143357396126, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 8098 }, { "completion_length": 266.92412090301514, "epoch": 1.358103860178549, "grad_norm": 0.1923993590207927, "kl": 0.10992431640625, "learning_rate": 4.988207833087078e-07, "loss": 0.0001, "reward": 1.7178572341799736, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7178571783006191, "rewards/format_reward_func": 1.0, "step": 8100 }, { "completion_length": 273.35715198516846, "epoch": 1.3584391634184165, "grad_norm": 0.2306174121200835, "kl": 0.5870361328125, "learning_rate": 4.988196192802636e-07, "loss": 0.0006, "reward": 1.7642857730388641, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7732143066823483, "rewards/format_reward_func": 0.9910714328289032, "step": 8102 }, { "completion_length": 253.38393878936768, "epoch": 1.3587744666582842, "grad_norm": 0.2780585896084185, "kl": 0.104766845703125, "learning_rate": 4.988184546789444e-07, "loss": 0.0001, "reward": 1.7665179446339607, "reward_std": 0.037249374436214566, "rewards/equation_reward_func": 0.7678571753203869, "rewards/format_reward_func": 0.9986607171595097, "step": 8104 }, { "completion_length": 264.9151887893677, "epoch": 1.3591097698981516, "grad_norm": 0.21535603932789843, "kl": 0.2724151611328125, "learning_rate": 4.988172895047528e-07, "loss": 0.0003, "reward": 1.7214286103844643, "reward_std": 0.0909137288108468, "rewards/equation_reward_func": 0.7392857559025288, "rewards/format_reward_func": 0.9821428656578064, "step": 8106 }, { "completion_length": 257.81697940826416, "epoch": 1.3594450731380192, "grad_norm": 0.1236021672041698, "kl": 0.0832977294921875, "learning_rate": 4.988161237576915e-07, "loss": 0.0001, "reward": 1.7625000476837158, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7669643238186836, "rewards/format_reward_func": 0.9955357164144516, "step": 8108 }, { "completion_length": 266.21429920196533, "epoch": 1.3597803763778868, "grad_norm": 0.28031837613588595, "kl": 0.12738037109375, "learning_rate": 4.988149574377633e-07, "loss": 0.0001, "reward": 1.6785715073347092, "reward_std": 0.09091372694820166, "rewards/equation_reward_func": 0.6875000447034836, "rewards/format_reward_func": 0.9910714328289032, "step": 8110 }, { "completion_length": 260.03572940826416, "epoch": 1.3601156796177543, "grad_norm": 0.24458932541296974, "kl": 0.3977203369140625, "learning_rate": 4.988137905449708e-07, "loss": 0.0004, "reward": 1.7678571939468384, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7767857424914837, "rewards/format_reward_func": 0.9910714328289032, "step": 8112 }, { "completion_length": 261.80804920196533, "epoch": 1.360450982857622, "grad_norm": 0.249895327320769, "kl": 0.093292236328125, "learning_rate": 4.988126230793167e-07, "loss": 0.0001, "reward": 1.7339286282658577, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7383928969502449, "rewards/format_reward_func": 0.9955357164144516, "step": 8114 }, { "completion_length": 255.57590293884277, "epoch": 1.3607862860974893, "grad_norm": 0.3183229100046573, "kl": 0.235931396484375, "learning_rate": 4.988114550408037e-07, "loss": 0.0002, "reward": 1.7607143372297287, "reward_std": 0.05555839091539383, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 0.9821428656578064, "step": 8116 }, { "completion_length": 257.883939743042, "epoch": 1.361121589337357, "grad_norm": 0.215454386421447, "kl": 0.0789642333984375, "learning_rate": 4.988102864294344e-07, "loss": 0.0001, "reward": 1.7821429297327995, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7910714522004128, "rewards/format_reward_func": 0.9910714328289032, "step": 8118 }, { "completion_length": 252.44644165039062, "epoch": 1.3614568925772246, "grad_norm": 0.12456854677300534, "kl": 0.2279205322265625, "learning_rate": 4.988091172452117e-07, "loss": 0.0002, "reward": 1.730357214808464, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7348214536905289, "rewards/format_reward_func": 0.9955357164144516, "step": 8120 }, { "completion_length": 245.48215579986572, "epoch": 1.3617921958170922, "grad_norm": 0.32630944614624346, "kl": 0.2685546875, "learning_rate": 4.988079474881381e-07, "loss": 0.0003, "reward": 1.7839286476373672, "reward_std": 0.06313453521579504, "rewards/equation_reward_func": 0.7973214536905289, "rewards/format_reward_func": 0.9866071492433548, "step": 8122 }, { "completion_length": 252.8169755935669, "epoch": 1.3621274990569596, "grad_norm": 0.24195474875844628, "kl": 0.3311767578125, "learning_rate": 4.988067771582163e-07, "loss": 0.0003, "reward": 1.7696429267525673, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7830357439815998, "rewards/format_reward_func": 0.9866071492433548, "step": 8124 }, { "completion_length": 259.477689743042, "epoch": 1.3624628022968273, "grad_norm": 0.7045995087371681, "kl": 1.045379638671875, "learning_rate": 4.98805606255449e-07, "loss": 0.001, "reward": 1.7071429267525673, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7250000275671482, "rewards/format_reward_func": 0.9821428656578064, "step": 8126 }, { "completion_length": 275.4866199493408, "epoch": 1.3627981055366947, "grad_norm": 0.31304647795741647, "kl": 1.1280517578125, "learning_rate": 4.988044347798392e-07, "loss": 0.0011, "reward": 1.708928644657135, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7133928816765547, "rewards/format_reward_func": 0.9955357164144516, "step": 8128 }, { "completion_length": 248.54912090301514, "epoch": 1.3631334087765623, "grad_norm": 0.23159962235353818, "kl": 0.232452392578125, "learning_rate": 4.988032627313892e-07, "loss": 0.0002, "reward": 1.769642911851406, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7741071693599224, "rewards/format_reward_func": 0.9955357164144516, "step": 8130 }, { "completion_length": 267.6607255935669, "epoch": 1.36346871201643, "grad_norm": 0.20785342484097794, "kl": 0.69158935546875, "learning_rate": 4.988020901101017e-07, "loss": 0.0007, "reward": 1.719642922282219, "reward_std": 0.09343910869210958, "rewards/equation_reward_func": 0.7330357357859612, "rewards/format_reward_func": 0.9866071492433548, "step": 8132 }, { "completion_length": 255.8348331451416, "epoch": 1.3638040152562974, "grad_norm": 0.5485224272178171, "kl": 0.561676025390625, "learning_rate": 4.988009169159798e-07, "loss": 0.0006, "reward": 1.7339286357164383, "reward_std": 0.08333758544176817, "rewards/equation_reward_func": 0.7473214641213417, "rewards/format_reward_func": 0.9866071492433548, "step": 8134 }, { "completion_length": 251.4196548461914, "epoch": 1.364139318496165, "grad_norm": 0.37869015687501495, "kl": 0.3833465576171875, "learning_rate": 4.987997431490257e-07, "loss": 0.0004, "reward": 1.7428571805357933, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7517857514321804, "rewards/format_reward_func": 0.9910714328289032, "step": 8136 }, { "completion_length": 246.7589349746704, "epoch": 1.3644746217360324, "grad_norm": 0.29526207045490876, "kl": 0.158782958984375, "learning_rate": 4.987985688092426e-07, "loss": 0.0002, "reward": 1.7982143387198448, "reward_std": 0.09343910962343216, "rewards/equation_reward_func": 0.820535734295845, "rewards/format_reward_func": 0.977678582072258, "step": 8138 }, { "completion_length": 261.4687614440918, "epoch": 1.3648099249759, "grad_norm": 0.33114232485365375, "kl": 0.33135986328125, "learning_rate": 4.987973938966326e-07, "loss": 0.0003, "reward": 1.798214353621006, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.802678607404232, "rewards/format_reward_func": 0.9955357164144516, "step": 8140 }, { "completion_length": 251.94197463989258, "epoch": 1.3651452282157677, "grad_norm": 0.4965083376527316, "kl": 0.1101531982421875, "learning_rate": 4.98796218411199e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.0858629634603858, "rewards/equation_reward_func": 0.7392857410013676, "rewards/format_reward_func": 1.0, "step": 8142 }, { "completion_length": 242.477689743042, "epoch": 1.3654805314556353, "grad_norm": 0.32604095360515667, "kl": 0.1768951416015625, "learning_rate": 4.987950423529442e-07, "loss": 0.0002, "reward": 1.8142857775092125, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8142857365310192, "rewards/format_reward_func": 1.0, "step": 8144 }, { "completion_length": 250.02679538726807, "epoch": 1.3658158346955027, "grad_norm": 0.28201790087899403, "kl": 0.1674957275390625, "learning_rate": 4.98793865721871e-07, "loss": 0.0002, "reward": 1.7303572222590446, "reward_std": 0.07828682195395231, "rewards/equation_reward_func": 0.7437500394880772, "rewards/format_reward_func": 0.9866071492433548, "step": 8146 }, { "completion_length": 237.24554538726807, "epoch": 1.3661511379353704, "grad_norm": 0.5227818679067141, "kl": 0.1568145751953125, "learning_rate": 4.987926885179821e-07, "loss": 0.0002, "reward": 1.8000000566244125, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000156462193, "rewards/format_reward_func": 1.0, "step": 8148 }, { "completion_length": 239.01786708831787, "epoch": 1.3664864411752378, "grad_norm": 0.20125727672007632, "kl": 0.110504150390625, "learning_rate": 4.987915107412801e-07, "loss": 0.0001, "reward": 1.769642911851406, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7741071693599224, "rewards/format_reward_func": 0.9955357164144516, "step": 8150 }, { "completion_length": 231.80358409881592, "epoch": 1.3668217444151054, "grad_norm": 0.26540877315998423, "kl": 0.13861083984375, "learning_rate": 4.987903323917678e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.8035714477300644, "rewards/format_reward_func": 1.0, "step": 8152 }, { "completion_length": 234.2232265472412, "epoch": 1.367157047654973, "grad_norm": 2.742825030886559, "kl": 0.16217041015625, "learning_rate": 4.987891534694479e-07, "loss": 0.0002, "reward": 1.7303571999073029, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7348214704543352, "rewards/format_reward_func": 0.9955357164144516, "step": 8154 }, { "completion_length": 237.60268783569336, "epoch": 1.3674923508948404, "grad_norm": 0.8197647544954363, "kl": 0.338653564453125, "learning_rate": 4.987879739743232e-07, "loss": 0.0003, "reward": 1.7214286401867867, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7214285954833031, "rewards/format_reward_func": 1.0, "step": 8156 }, { "completion_length": 238.80358123779297, "epoch": 1.367827654134708, "grad_norm": 0.17306658774659567, "kl": 0.164398193359375, "learning_rate": 4.987867939063963e-07, "loss": 0.0002, "reward": 1.821428619325161, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8214285932481289, "rewards/format_reward_func": 1.0, "step": 8158 }, { "completion_length": 240.7366189956665, "epoch": 1.3681629573745755, "grad_norm": 0.2755680868704616, "kl": 0.3265380859375, "learning_rate": 4.987856132656701e-07, "loss": 0.0003, "reward": 1.7767857909202576, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7812500335276127, "rewards/format_reward_func": 0.9955357164144516, "step": 8160 }, { "completion_length": 242.43750953674316, "epoch": 1.3684982606144431, "grad_norm": 0.34220703415178927, "kl": 0.2494964599609375, "learning_rate": 4.987844320521469e-07, "loss": 0.0002, "reward": 1.7392857745289803, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7392857372760773, "rewards/format_reward_func": 1.0, "step": 8162 }, { "completion_length": 244.96875858306885, "epoch": 1.3688335638543108, "grad_norm": 0.3656314574025403, "kl": 0.13720703125, "learning_rate": 4.987832502658299e-07, "loss": 0.0001, "reward": 1.8178571835160255, "reward_std": 0.04545686487108469, "rewards/equation_reward_func": 0.8267857357859612, "rewards/format_reward_func": 0.9910714328289032, "step": 8164 }, { "completion_length": 248.82143878936768, "epoch": 1.3691688670941784, "grad_norm": 0.5701026727666527, "kl": 0.2044219970703125, "learning_rate": 4.987820679067215e-07, "loss": 0.0002, "reward": 1.7839286401867867, "reward_std": 0.09343910869210958, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 8166 }, { "completion_length": 248.11608219146729, "epoch": 1.3695041703340458, "grad_norm": 0.1886594138713511, "kl": 0.4144134521484375, "learning_rate": 4.987808849748246e-07, "loss": 0.0004, "reward": 1.7839286476373672, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7883928678929806, "rewards/format_reward_func": 0.9955357164144516, "step": 8168 }, { "completion_length": 244.99108219146729, "epoch": 1.3698394735739134, "grad_norm": 0.21902865172820737, "kl": 0.2199249267578125, "learning_rate": 4.987797014701418e-07, "loss": 0.0002, "reward": 1.753571480512619, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7625000327825546, "rewards/format_reward_func": 0.9910714328289032, "step": 8170 }, { "completion_length": 245.6517972946167, "epoch": 1.3701747768137809, "grad_norm": 0.2876812085996151, "kl": 0.0823822021484375, "learning_rate": 4.98778517392676e-07, "loss": 0.0001, "reward": 1.7625000700354576, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7669643126428127, "rewards/format_reward_func": 0.9955357164144516, "step": 8172 }, { "completion_length": 251.2901906967163, "epoch": 1.3705100800536485, "grad_norm": 0.23891560103695314, "kl": 0.3800506591796875, "learning_rate": 4.987773327424297e-07, "loss": 0.0004, "reward": 1.7535714879631996, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 8174 }, { "completion_length": 239.9553680419922, "epoch": 1.3708453832935161, "grad_norm": 0.16681507824174535, "kl": 0.07891845703125, "learning_rate": 4.987761475194058e-07, "loss": 0.0001, "reward": 1.7464286237955093, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.755357176065445, "rewards/format_reward_func": 0.9910714328289032, "step": 8176 }, { "completion_length": 246.50001049041748, "epoch": 1.3711806865333838, "grad_norm": 0.1460767895717453, "kl": 0.605316162109375, "learning_rate": 4.98774961723607e-07, "loss": 0.0006, "reward": 1.728571504354477, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7375000230967999, "rewards/format_reward_func": 0.9910714328289032, "step": 8178 }, { "completion_length": 240.90626049041748, "epoch": 1.3715159897732512, "grad_norm": 0.27316633410956814, "kl": 0.10211181640625, "learning_rate": 4.987737753550359e-07, "loss": 0.0001, "reward": 1.7125000432133675, "reward_std": 0.061871841782703996, "rewards/equation_reward_func": 0.7258928939700127, "rewards/format_reward_func": 0.9866071529686451, "step": 8180 }, { "completion_length": 247.2544765472412, "epoch": 1.3718512930131188, "grad_norm": 0.27321212078962503, "kl": 0.32537841796875, "learning_rate": 4.987725884136954e-07, "loss": 0.0003, "reward": 1.787500075995922, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7919643223285675, "rewards/format_reward_func": 0.9955357164144516, "step": 8182 }, { "completion_length": 235.66072273254395, "epoch": 1.3721865962529862, "grad_norm": 0.26702454806950976, "kl": 0.393646240234375, "learning_rate": 4.987714008995882e-07, "loss": 0.0004, "reward": 1.778571493923664, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 8184 }, { "completion_length": 236.3928689956665, "epoch": 1.3725218994928539, "grad_norm": 0.2859289731323632, "kl": 0.1675567626953125, "learning_rate": 4.987702128127169e-07, "loss": 0.0002, "reward": 1.7428572326898575, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 8186 }, { "completion_length": 235.7053680419922, "epoch": 1.3728572027327215, "grad_norm": 0.1957172849424329, "kl": 0.1398468017578125, "learning_rate": 4.987690241530844e-07, "loss": 0.0001, "reward": 1.7321429252624512, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7321428842842579, "rewards/format_reward_func": 1.0, "step": 8188 }, { "completion_length": 238.45983028411865, "epoch": 1.373192505972589, "grad_norm": 0.15376641020518758, "kl": 0.145050048828125, "learning_rate": 4.987678349206933e-07, "loss": 0.0001, "reward": 1.751785770058632, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7562500387430191, "rewards/format_reward_func": 0.9955357164144516, "step": 8190 }, { "completion_length": 248.5044765472412, "epoch": 1.3735278092124565, "grad_norm": 0.27968755909602633, "kl": 0.077972412109375, "learning_rate": 4.987666451155465e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.09091372787952423, "rewards/equation_reward_func": 0.7946428842842579, "rewards/format_reward_func": 0.9910714328289032, "step": 8192 }, { "completion_length": 242.2500123977661, "epoch": 1.373863112452324, "grad_norm": 0.2436045067533531, "kl": 0.108734130859375, "learning_rate": 4.987654547376466e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 8194 }, { "completion_length": 240.0937614440918, "epoch": 1.3741984156921916, "grad_norm": 0.3174198813775443, "kl": 0.0972442626953125, "learning_rate": 4.987642637869965e-07, "loss": 0.0001, "reward": 1.7625000774860382, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643089175224, "rewards/format_reward_func": 0.9955357164144516, "step": 8196 }, { "completion_length": 241.19643878936768, "epoch": 1.3745337189320592, "grad_norm": 0.32104686249339653, "kl": 0.245880126953125, "learning_rate": 4.987630722635986e-07, "loss": 0.0002, "reward": 1.7839286550879478, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 8198 }, { "completion_length": 238.9062614440918, "epoch": 1.3748690221719269, "grad_norm": 0.22119044107872407, "kl": 0.101409912109375, "learning_rate": 4.98761880167456e-07, "loss": 0.0001, "reward": 1.721428632736206, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7214286010712385, "rewards/format_reward_func": 1.0, "step": 8200 }, { "completion_length": 242.1294765472412, "epoch": 1.3752043254117943, "grad_norm": 0.19621369458051252, "kl": 0.0926971435546875, "learning_rate": 4.987606874985713e-07, "loss": 0.0001, "reward": 1.7750000432133675, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 8202 }, { "completion_length": 252.14287185668945, "epoch": 1.375539628651662, "grad_norm": 0.20777406668459847, "kl": 0.213043212890625, "learning_rate": 4.987594942569473e-07, "loss": 0.0002, "reward": 1.800000049173832, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 8204 }, { "completion_length": 246.38840198516846, "epoch": 1.3758749318915293, "grad_norm": 0.25030524166954016, "kl": 0.1303863525390625, "learning_rate": 4.987583004425867e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 8206 }, { "completion_length": 256.933048248291, "epoch": 1.376210235131397, "grad_norm": 0.21811990945534226, "kl": 0.104339599609375, "learning_rate": 4.987571060554922e-07, "loss": 0.0001, "reward": 1.7553572282195091, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7598214596509933, "rewards/format_reward_func": 0.9955357164144516, "step": 8208 }, { "completion_length": 247.2410831451416, "epoch": 1.3765455383712646, "grad_norm": 0.30362258829253413, "kl": 0.1866455078125, "learning_rate": 4.987559110956667e-07, "loss": 0.0002, "reward": 1.7946429029107094, "reward_std": 0.07828682195395231, "rewards/equation_reward_func": 0.808035746216774, "rewards/format_reward_func": 0.9866071492433548, "step": 8210 }, { "completion_length": 244.8705472946167, "epoch": 1.376880841611132, "grad_norm": 0.14472449444708704, "kl": 0.1186981201171875, "learning_rate": 4.987547155631128e-07, "loss": 0.0001, "reward": 1.7589286416769028, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7633928954601288, "rewards/format_reward_func": 0.9955357164144516, "step": 8212 }, { "completion_length": 250.7812623977661, "epoch": 1.3772161448509996, "grad_norm": 0.21972446651081873, "kl": 0.090972900390625, "learning_rate": 4.987535194578333e-07, "loss": 0.0001, "reward": 1.7732143923640251, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7776786014437675, "rewards/format_reward_func": 0.9955357164144516, "step": 8214 }, { "completion_length": 243.51786708831787, "epoch": 1.377551448090867, "grad_norm": 0.23292783449066065, "kl": 0.0885467529296875, "learning_rate": 4.98752322779831e-07, "loss": 0.0001, "reward": 1.771428644657135, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 8216 }, { "completion_length": 252.18751049041748, "epoch": 1.3778867513307347, "grad_norm": 0.20195243651174724, "kl": 0.0833892822265625, "learning_rate": 4.987511255291087e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7946428954601288, "rewards/format_reward_func": 0.9910714328289032, "step": 8218 }, { "completion_length": 240.0446538925171, "epoch": 1.3782220545706023, "grad_norm": 0.17906265132759394, "kl": 0.0956573486328125, "learning_rate": 4.987499277056689e-07, "loss": 0.0001, "reward": 1.798214316368103, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.8026785962283611, "rewards/format_reward_func": 0.9955357164144516, "step": 8220 }, { "completion_length": 254.49108505249023, "epoch": 1.37855735781047, "grad_norm": 0.3025207003237902, "kl": 0.0892486572265625, "learning_rate": 4.987487293095148e-07, "loss": 0.0001, "reward": 1.7732143700122833, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7776785995811224, "rewards/format_reward_func": 0.9955357164144516, "step": 8222 }, { "completion_length": 254.6741180419922, "epoch": 1.3788926610503374, "grad_norm": 0.17416299428135643, "kl": 0.0872802734375, "learning_rate": 4.987475303406486e-07, "loss": 0.0001, "reward": 1.7053572237491608, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7098214644938707, "rewards/format_reward_func": 0.9955357164144516, "step": 8224 }, { "completion_length": 243.1026906967163, "epoch": 1.379227964290205, "grad_norm": 0.007376292518863739, "kl": 0.0882415771484375, "learning_rate": 4.987463307990735e-07, "loss": 0.0001, "reward": 1.7875000908970833, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7919643186032772, "rewards/format_reward_func": 0.9955357164144516, "step": 8226 }, { "completion_length": 243.65179634094238, "epoch": 1.3795632675300724, "grad_norm": 0.3491150701987759, "kl": 0.29400634765625, "learning_rate": 4.987451306847922e-07, "loss": 0.0003, "reward": 1.666071504354477, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.6705357562750578, "rewards/format_reward_func": 0.9955357164144516, "step": 8228 }, { "completion_length": 259.85268783569336, "epoch": 1.37989857076994, "grad_norm": 0.2254974501549908, "kl": 0.18572998046875, "learning_rate": 4.987439299978072e-07, "loss": 0.0002, "reward": 1.7500000596046448, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 8230 }, { "completion_length": 248.71876335144043, "epoch": 1.3802338740098077, "grad_norm": 0.005612214152327968, "kl": 0.0982513427734375, "learning_rate": 4.987427287381215e-07, "loss": 0.0001, "reward": 1.7446429058909416, "reward_std": 0.007576144300401211, "rewards/equation_reward_func": 0.74910718947649, "rewards/format_reward_func": 0.9955357164144516, "step": 8232 }, { "completion_length": 242.35268783569336, "epoch": 1.380569177249675, "grad_norm": 0.15117705551384866, "kl": 0.1826324462890625, "learning_rate": 4.987415269057379e-07, "loss": 0.0002, "reward": 1.8178571984171867, "reward_std": 0.04545686487108469, "rewards/equation_reward_func": 0.8267857395112514, "rewards/format_reward_func": 0.9910714328289032, "step": 8234 }, { "completion_length": 248.83036994934082, "epoch": 1.3809044804895427, "grad_norm": 0.16819016833733927, "kl": 0.1221923828125, "learning_rate": 4.987403245006591e-07, "loss": 0.0001, "reward": 1.7553571984171867, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214633762836, "rewards/format_reward_func": 0.9955357164144516, "step": 8236 }, { "completion_length": 257.6607303619385, "epoch": 1.3812397837294104, "grad_norm": 0.20559730867779458, "kl": 0.1548309326171875, "learning_rate": 4.987391215228878e-07, "loss": 0.0002, "reward": 1.7482143566012383, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7526786029338837, "rewards/format_reward_func": 0.9955357164144516, "step": 8238 }, { "completion_length": 244.97769260406494, "epoch": 1.3815750869692778, "grad_norm": 0.14968388550478384, "kl": 0.2997283935546875, "learning_rate": 4.987379179724267e-07, "loss": 0.0003, "reward": 1.778571493923664, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 8240 }, { "completion_length": 253.08037090301514, "epoch": 1.3819103902091454, "grad_norm": 0.16791201111642334, "kl": 0.1606903076171875, "learning_rate": 4.987367138492787e-07, "loss": 0.0002, "reward": 1.7660715207457542, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7705357447266579, "rewards/format_reward_func": 0.9955357164144516, "step": 8242 }, { "completion_length": 250.44644260406494, "epoch": 1.382245693449013, "grad_norm": 0.2818567106401476, "kl": 0.3727569580078125, "learning_rate": 4.987355091534467e-07, "loss": 0.0004, "reward": 1.7714286521077156, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7714285776019096, "rewards/format_reward_func": 1.0, "step": 8244 }, { "completion_length": 241.80358219146729, "epoch": 1.3825809966888805, "grad_norm": 0.21332612942021542, "kl": 0.193572998046875, "learning_rate": 4.987343038849333e-07, "loss": 0.0002, "reward": 1.7571429461240768, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428827941418, "rewards/format_reward_func": 1.0, "step": 8246 }, { "completion_length": 244.90179443359375, "epoch": 1.382916299928748, "grad_norm": 0.29629913715379597, "kl": 0.24725341796875, "learning_rate": 4.987330980437413e-07, "loss": 0.0002, "reward": 1.7750000432133675, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 8248 }, { "completion_length": 258.2053689956665, "epoch": 1.3832516031686155, "grad_norm": 0.3395190955566177, "kl": 0.3449859619140625, "learning_rate": 4.987318916298734e-07, "loss": 0.0003, "reward": 1.7517857775092125, "reward_std": 0.09848987217992544, "rewards/equation_reward_func": 0.765178594738245, "rewards/format_reward_func": 0.9866071492433548, "step": 8250 }, { "completion_length": 249.21429634094238, "epoch": 1.3835869064084831, "grad_norm": 0.44822066564718804, "kl": 0.234832763671875, "learning_rate": 4.987306846433325e-07, "loss": 0.0002, "reward": 1.7446429282426834, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7491071820259094, "rewards/format_reward_func": 0.9955357164144516, "step": 8252 }, { "completion_length": 242.1071548461914, "epoch": 1.3839222096483508, "grad_norm": 0.3920342374420616, "kl": 0.464263916015625, "learning_rate": 4.987294770841214e-07, "loss": 0.0005, "reward": 1.733928643167019, "reward_std": 0.09343910962343216, "rewards/equation_reward_func": 0.7473214566707611, "rewards/format_reward_func": 0.9866071492433548, "step": 8254 }, { "completion_length": 240.27233028411865, "epoch": 1.3842575128882184, "grad_norm": 0.1406635263351389, "kl": 0.174072265625, "learning_rate": 4.987282689522428e-07, "loss": 0.0002, "reward": 1.7464286386966705, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464285790920258, "rewards/format_reward_func": 1.0, "step": 8256 }, { "completion_length": 239.23215198516846, "epoch": 1.3845928161280858, "grad_norm": 0.179425887440818, "kl": 0.1930084228515625, "learning_rate": 4.987270602476995e-07, "loss": 0.0002, "reward": 1.7750000804662704, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 8258 }, { "completion_length": 242.12947273254395, "epoch": 1.3849281193679535, "grad_norm": 0.3154012365668129, "kl": 0.10003662109375, "learning_rate": 4.987258509704942e-07, "loss": 0.0001, "reward": 1.7392857819795609, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7392857559025288, "rewards/format_reward_func": 1.0, "step": 8260 }, { "completion_length": 234.19643783569336, "epoch": 1.3852634226078209, "grad_norm": 0.16562049076361046, "kl": 0.114166259765625, "learning_rate": 4.9872464112063e-07, "loss": 0.0001, "reward": 1.821428619325161, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8214285969734192, "rewards/format_reward_func": 1.0, "step": 8262 }, { "completion_length": 234.68750953674316, "epoch": 1.3855987258476885, "grad_norm": 0.7369731607621153, "kl": 0.1848907470703125, "learning_rate": 4.987234306981093e-07, "loss": 0.0002, "reward": 1.7428572326898575, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7428571805357933, "rewards/format_reward_func": 1.0, "step": 8264 }, { "completion_length": 233.227689743042, "epoch": 1.3859340290875561, "grad_norm": 0.02821810625385667, "kl": 0.226776123046875, "learning_rate": 4.98722219702935e-07, "loss": 0.0002, "reward": 1.7732143253087997, "reward_std": 0.017677669413387775, "rewards/equation_reward_func": 0.7776786126196384, "rewards/format_reward_func": 0.9955357164144516, "step": 8266 }, { "completion_length": 228.66072463989258, "epoch": 1.3862693323274236, "grad_norm": 0.19036897284281554, "kl": 0.1602020263671875, "learning_rate": 4.9872100813511e-07, "loss": 0.0002, "reward": 1.782142922282219, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 8268 }, { "completion_length": 240.06251049041748, "epoch": 1.3866046355672912, "grad_norm": 0.28445586166487996, "kl": 0.0912628173828125, "learning_rate": 4.98719795994637e-07, "loss": 0.0001, "reward": 1.7321429550647736, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.732142873108387, "rewards/format_reward_func": 1.0, "step": 8270 }, { "completion_length": 228.25001049041748, "epoch": 1.3869399388071586, "grad_norm": 0.36868253449781974, "kl": 0.4998779296875, "learning_rate": 4.987185832815188e-07, "loss": 0.0005, "reward": 1.7928572073578835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571626543999, "rewards/format_reward_func": 1.0, "step": 8272 }, { "completion_length": 231.93304634094238, "epoch": 1.3872752420470262, "grad_norm": 0.2135789845077452, "kl": 0.1878509521484375, "learning_rate": 4.987173699957582e-07, "loss": 0.0002, "reward": 1.7803571820259094, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7848214656114578, "rewards/format_reward_func": 0.9955357164144516, "step": 8274 }, { "completion_length": 233.14733219146729, "epoch": 1.3876105452868939, "grad_norm": 0.09741457644301535, "kl": 0.0953216552734375, "learning_rate": 4.98716156137358e-07, "loss": 0.0001, "reward": 1.775000050663948, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 8276 }, { "completion_length": 231.3259038925171, "epoch": 1.3879458485267615, "grad_norm": 0.3273387632249645, "kl": 0.5626983642578125, "learning_rate": 4.98714941706321e-07, "loss": 0.0006, "reward": 1.7857143506407738, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 8278 }, { "completion_length": 228.06697463989258, "epoch": 1.388281151766629, "grad_norm": 0.4202954088324719, "kl": 0.4383392333984375, "learning_rate": 4.9871372670265e-07, "loss": 0.0004, "reward": 1.7750000432133675, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.775000037625432, "rewards/format_reward_func": 1.0, "step": 8280 }, { "completion_length": 228.60715293884277, "epoch": 1.3886164550064966, "grad_norm": 0.28889603554498616, "kl": 0.09014892578125, "learning_rate": 4.987125111263477e-07, "loss": 0.0001, "reward": 1.7607143446803093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143241912127, "rewards/format_reward_func": 1.0, "step": 8282 }, { "completion_length": 225.5535831451416, "epoch": 1.388951758246364, "grad_norm": 0.33761219684242044, "kl": 0.2540435791015625, "learning_rate": 4.987112949774171e-07, "loss": 0.0003, "reward": 1.7214286550879478, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7214286141097546, "rewards/format_reward_func": 1.0, "step": 8284 }, { "completion_length": 226.7634048461914, "epoch": 1.3892870614862316, "grad_norm": 0.19009764938273654, "kl": 0.253997802734375, "learning_rate": 4.987100782558608e-07, "loss": 0.0003, "reward": 1.7678572237491608, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 8286 }, { "completion_length": 235.06251049041748, "epoch": 1.3896223647260992, "grad_norm": 0.25891486225934435, "kl": 0.2195892333984375, "learning_rate": 4.987088609616818e-07, "loss": 0.0002, "reward": 1.7500000596046448, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 8288 }, { "completion_length": 226.99554538726807, "epoch": 1.3899576679659666, "grad_norm": 0.1975587721492033, "kl": 0.310394287109375, "learning_rate": 4.987076430948827e-07, "loss": 0.0003, "reward": 1.717857226729393, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7178571783006191, "rewards/format_reward_func": 1.0, "step": 8290 }, { "completion_length": 238.50447463989258, "epoch": 1.3902929712058343, "grad_norm": 0.2642575662903022, "kl": 0.089019775390625, "learning_rate": 4.987064246554664e-07, "loss": 0.0001, "reward": 1.7696429416537285, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7741071842610836, "rewards/format_reward_func": 0.9955357164144516, "step": 8292 }, { "completion_length": 230.88840293884277, "epoch": 1.3906282744457017, "grad_norm": 0.1691942612744324, "kl": 0.2000579833984375, "learning_rate": 4.987052056434357e-07, "loss": 0.0002, "reward": 1.7750000655651093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 8294 }, { "completion_length": 237.10268783569336, "epoch": 1.3909635776855693, "grad_norm": 0.330917499427202, "kl": 0.1829986572265625, "learning_rate": 4.987039860587933e-07, "loss": 0.0002, "reward": 1.7375000715255737, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7419643048197031, "rewards/format_reward_func": 0.9955357164144516, "step": 8296 }, { "completion_length": 244.22322463989258, "epoch": 1.391298880925437, "grad_norm": 0.17135511845815518, "kl": 0.373077392578125, "learning_rate": 4.987027659015423e-07, "loss": 0.0004, "reward": 1.735714353621006, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7446428909897804, "rewards/format_reward_func": 0.9910714328289032, "step": 8298 }, { "completion_length": 252.75001335144043, "epoch": 1.3916341841653046, "grad_norm": 0.33960622184873085, "kl": 0.1415557861328125, "learning_rate": 4.987015451716853e-07, "loss": 0.0001, "reward": 1.6946429386734962, "reward_std": 0.09848987031728029, "rewards/equation_reward_func": 0.6991071663796902, "rewards/format_reward_func": 0.9955357164144516, "step": 8300 }, { "completion_length": 245.071439743042, "epoch": 1.391969487405172, "grad_norm": 0.2723528900900101, "kl": 0.122833251953125, "learning_rate": 4.987003238692251e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000394880772, "rewards/format_reward_func": 1.0, "step": 8302 }, { "completion_length": 241.62054824829102, "epoch": 1.3923047906450396, "grad_norm": 0.2372067895679325, "kl": 0.095794677734375, "learning_rate": 4.986991019941644e-07, "loss": 0.0001, "reward": 1.7660714834928513, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7705357484519482, "rewards/format_reward_func": 0.9955357164144516, "step": 8304 }, { "completion_length": 236.24108409881592, "epoch": 1.392640093884907, "grad_norm": 0.27936471533894636, "kl": 0.105682373046875, "learning_rate": 4.986978795465063e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 8306 }, { "completion_length": 236.5089406967163, "epoch": 1.3929753971247747, "grad_norm": 0.23005213349580997, "kl": 0.1139984130859375, "learning_rate": 4.986966565262534e-07, "loss": 0.0001, "reward": 1.8178572058677673, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.817857164889574, "rewards/format_reward_func": 1.0, "step": 8308 }, { "completion_length": 232.9285831451416, "epoch": 1.3933107003646423, "grad_norm": 0.3397062470054958, "kl": 0.1307373046875, "learning_rate": 4.986954329334087e-07, "loss": 0.0001, "reward": 1.7892857789993286, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857268452644, "rewards/format_reward_func": 1.0, "step": 8310 }, { "completion_length": 248.77679634094238, "epoch": 1.39364600360451, "grad_norm": 0.1851405173069177, "kl": 0.1040496826171875, "learning_rate": 4.986942087679749e-07, "loss": 0.0001, "reward": 1.771428644657135, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7714286148548126, "rewards/format_reward_func": 1.0, "step": 8312 }, { "completion_length": 254.46429824829102, "epoch": 1.3939813068443774, "grad_norm": 0.28245811292805495, "kl": 0.1212158203125, "learning_rate": 4.986929840299547e-07, "loss": 0.0001, "reward": 1.7410714998841286, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7455357424914837, "rewards/format_reward_func": 0.9955357164144516, "step": 8314 }, { "completion_length": 233.6875114440918, "epoch": 1.394316610084245, "grad_norm": 0.22962909738151074, "kl": 0.161712646484375, "learning_rate": 4.986917587193511e-07, "loss": 0.0002, "reward": 1.725000061094761, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7250000424683094, "rewards/format_reward_func": 1.0, "step": 8316 }, { "completion_length": 232.30358409881592, "epoch": 1.3946519133241124, "grad_norm": 0.3743388791517971, "kl": 0.281494140625, "learning_rate": 4.986905328361668e-07, "loss": 0.0003, "reward": 1.7785715013742447, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 8318 }, { "completion_length": 236.19197463989258, "epoch": 1.39498721656398, "grad_norm": 0.18783598559833184, "kl": 0.1347198486328125, "learning_rate": 4.986893063804048e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714469850063, "rewards/format_reward_func": 1.0, "step": 8320 }, { "completion_length": 230.33483219146729, "epoch": 1.3953225198038477, "grad_norm": 0.21152856279262738, "kl": 0.2019805908203125, "learning_rate": 4.986880793520677e-07, "loss": 0.0002, "reward": 1.7571429461240768, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428790688515, "rewards/format_reward_func": 1.0, "step": 8322 }, { "completion_length": 230.64733219146729, "epoch": 1.395657823043715, "grad_norm": 0.1390615445266548, "kl": 0.3101959228515625, "learning_rate": 4.986868517511585e-07, "loss": 0.0003, "reward": 1.8000000566244125, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8000000230967999, "rewards/format_reward_func": 1.0, "step": 8324 }, { "completion_length": 248.91518878936768, "epoch": 1.3959931262835827, "grad_norm": 0.24380398211878274, "kl": 0.0861358642578125, "learning_rate": 4.9868562357768e-07, "loss": 0.0001, "reward": 1.7964286357164383, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964286059141159, "rewards/format_reward_func": 1.0, "step": 8326 }, { "completion_length": 237.41965293884277, "epoch": 1.3963284295234502, "grad_norm": 0.2780829880264992, "kl": 0.08013916015625, "learning_rate": 4.98684394831635e-07, "loss": 0.0001, "reward": 1.7571429386734962, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 8328 }, { "completion_length": 233.1473331451416, "epoch": 1.3966637327633178, "grad_norm": 0.25010869340527275, "kl": 0.0888214111328125, "learning_rate": 4.986831655130262e-07, "loss": 0.0001, "reward": 1.766071505844593, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7705357372760773, "rewards/format_reward_func": 0.9955357164144516, "step": 8330 }, { "completion_length": 228.9866189956665, "epoch": 1.3969990360031854, "grad_norm": 0.2859377036608154, "kl": 0.0903472900390625, "learning_rate": 4.986819356218565e-07, "loss": 0.0001, "reward": 1.8000000715255737, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 8332 }, { "completion_length": 236.1384048461914, "epoch": 1.397334339243053, "grad_norm": 0.2847175423965275, "kl": 0.1934661865234375, "learning_rate": 4.98680705158129e-07, "loss": 0.0002, "reward": 1.7535714879631996, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7535714656114578, "rewards/format_reward_func": 1.0, "step": 8334 }, { "completion_length": 239.97769260406494, "epoch": 1.3976696424829205, "grad_norm": 0.24022825278721455, "kl": 0.14080810546875, "learning_rate": 4.986794741218462e-07, "loss": 0.0001, "reward": 1.7196429520845413, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7241071723401546, "rewards/format_reward_func": 0.9955357164144516, "step": 8336 }, { "completion_length": 244.83037090301514, "epoch": 1.398004945722788, "grad_norm": 0.20960399495324336, "kl": 0.2378387451171875, "learning_rate": 4.986782425130111e-07, "loss": 0.0002, "reward": 1.7750000581145287, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 8338 }, { "completion_length": 238.55804443359375, "epoch": 1.3983402489626555, "grad_norm": 0.1458231825218131, "kl": 0.10693359375, "learning_rate": 4.986770103316263e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 8340 }, { "completion_length": 245.87947750091553, "epoch": 1.3986755522025232, "grad_norm": 0.4447577806999814, "kl": 0.1026153564453125, "learning_rate": 4.98675777577695e-07, "loss": 0.0001, "reward": 1.6696429327130318, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.6830357573926449, "rewards/format_reward_func": 0.9866071492433548, "step": 8342 }, { "completion_length": 246.63840198516846, "epoch": 1.3990108554423908, "grad_norm": 0.5346485602738086, "kl": 0.145660400390625, "learning_rate": 4.986745442512198e-07, "loss": 0.0001, "reward": 1.7982143387198448, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8026786036789417, "rewards/format_reward_func": 0.9955357164144516, "step": 8344 }, { "completion_length": 245.3125123977661, "epoch": 1.3993461586822582, "grad_norm": 0.2319599019557805, "kl": 0.086669921875, "learning_rate": 4.986733103522037e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857450991869, "rewards/format_reward_func": 1.0, "step": 8346 }, { "completion_length": 258.2634038925171, "epoch": 1.3996814619221258, "grad_norm": 0.42665677837153787, "kl": 0.156463623046875, "learning_rate": 4.986720758806493e-07, "loss": 0.0002, "reward": 1.7464286386966705, "reward_std": 0.08586296625435352, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 0.9821428656578064, "step": 8348 }, { "completion_length": 239.5178680419922, "epoch": 1.4000167651619932, "grad_norm": 0.23925510003351777, "kl": 0.1253509521484375, "learning_rate": 4.986708408365596e-07, "loss": 0.0001, "reward": 1.7928572297096252, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571626543999, "rewards/format_reward_func": 1.0, "step": 8350 }, { "completion_length": 256.7232255935669, "epoch": 1.4003520684018609, "grad_norm": 0.29109935293740163, "kl": 0.097412109375, "learning_rate": 4.986696052199373e-07, "loss": 0.0001, "reward": 1.7464286163449287, "reward_std": 0.0858629634603858, "rewards/equation_reward_func": 0.7553571909666061, "rewards/format_reward_func": 0.9910714328289032, "step": 8352 }, { "completion_length": 257.20537090301514, "epoch": 1.4006873716417285, "grad_norm": 0.17636591360078244, "kl": 0.255035400390625, "learning_rate": 4.986683690307856e-07, "loss": 0.0003, "reward": 1.6839286461472511, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.6883928999304771, "rewards/format_reward_func": 0.9955357164144516, "step": 8354 }, { "completion_length": 260.008939743042, "epoch": 1.4010226748815962, "grad_norm": 0.20055391467891595, "kl": 0.430389404296875, "learning_rate": 4.986671322691071e-07, "loss": 0.0004, "reward": 1.6892857998609543, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.6982143372297287, "rewards/format_reward_func": 0.9910714328289032, "step": 8356 }, { "completion_length": 259.1562604904175, "epoch": 1.4013579781214636, "grad_norm": 0.5636506182839675, "kl": 0.3524169921875, "learning_rate": 4.986658949349046e-07, "loss": 0.0004, "reward": 1.7232143580913544, "reward_std": 0.08838834706693888, "rewards/equation_reward_func": 0.7366071753203869, "rewards/format_reward_func": 0.9866071492433548, "step": 8358 }, { "completion_length": 255.43304634094238, "epoch": 1.4016932813613312, "grad_norm": 0.5249611321278694, "kl": 0.516448974609375, "learning_rate": 4.98664657028181e-07, "loss": 0.0005, "reward": 1.7500000596046448, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7589286118745804, "rewards/format_reward_func": 0.9910714328289032, "step": 8360 }, { "completion_length": 260.5312623977661, "epoch": 1.4020285846011986, "grad_norm": 0.36025818419775324, "kl": 0.0961761474609375, "learning_rate": 4.986634185489391e-07, "loss": 0.0001, "reward": 1.7218750789761543, "reward_std": 0.09028238290920854, "rewards/equation_reward_func": 0.7276786044239998, "rewards/format_reward_func": 0.9941964335739613, "step": 8362 }, { "completion_length": 256.7142963409424, "epoch": 1.4023638878410662, "grad_norm": 0.19437306007039692, "kl": 0.405792236328125, "learning_rate": 4.986621794971819e-07, "loss": 0.0004, "reward": 1.7428572326898575, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7517857402563095, "rewards/format_reward_func": 0.9910714328289032, "step": 8364 }, { "completion_length": 257.4955463409424, "epoch": 1.4026991910809339, "grad_norm": 0.5732167804087518, "kl": 0.6250762939453125, "learning_rate": 4.986609398729121e-07, "loss": 0.0006, "reward": 1.7714286670088768, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 8366 }, { "completion_length": 262.8080472946167, "epoch": 1.4030344943208013, "grad_norm": 0.33261515236998473, "kl": 0.738128662109375, "learning_rate": 4.986596996761327e-07, "loss": 0.0007, "reward": 1.7375000715255737, "reward_std": 0.0883883461356163, "rewards/equation_reward_func": 0.7419643104076385, "rewards/format_reward_func": 0.9955357164144516, "step": 8368 }, { "completion_length": 258.48662281036377, "epoch": 1.403369797560669, "grad_norm": 0.4386239574155109, "kl": 1.22271728515625, "learning_rate": 4.986584589068465e-07, "loss": 0.0012, "reward": 1.7758929282426834, "reward_std": 0.04419417306780815, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 0.9973214343190193, "step": 8370 }, { "completion_length": 256.2455472946167, "epoch": 1.4037051008005366, "grad_norm": 0.27849866199909995, "kl": 1.667144775390625, "learning_rate": 4.986572175650562e-07, "loss": 0.0017, "reward": 1.7357143610715866, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7446428872644901, "rewards/format_reward_func": 0.9910714328289032, "step": 8372 }, { "completion_length": 243.52233028411865, "epoch": 1.404040404040404, "grad_norm": 0.25705270920538403, "kl": 0.59259033203125, "learning_rate": 4.986559756507649e-07, "loss": 0.0006, "reward": 1.7625000774860382, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7669643107801676, "rewards/format_reward_func": 0.9955357164144516, "step": 8374 }, { "completion_length": 251.6294755935669, "epoch": 1.4043757072802716, "grad_norm": 0.15817553115601674, "kl": 1.261627197265625, "learning_rate": 4.986547331639753e-07, "loss": 0.0013, "reward": 1.7321429550647736, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7410714533179998, "rewards/format_reward_func": 0.9910714328289032, "step": 8376 }, { "completion_length": 253.94644165039062, "epoch": 1.4047110105201392, "grad_norm": 0.14483387736198614, "kl": 1.48773193359375, "learning_rate": 4.986534901046903e-07, "loss": 0.0015, "reward": 1.814285770058632, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857365310192, "rewards/format_reward_func": 1.0, "step": 8378 }, { "completion_length": 250.39733505249023, "epoch": 1.4050463137600067, "grad_norm": 0.20242507143435626, "kl": 0.5549163818359375, "learning_rate": 4.986522464729129e-07, "loss": 0.0006, "reward": 1.7616072297096252, "reward_std": 0.07071067788638175, "rewards/equation_reward_func": 0.7732143178582191, "rewards/format_reward_func": 0.9883928671479225, "step": 8380 }, { "completion_length": 261.4821548461914, "epoch": 1.4053816169998743, "grad_norm": 0.3677156368563927, "kl": 0.290008544921875, "learning_rate": 4.986510022686456e-07, "loss": 0.0003, "reward": 1.8062500581145287, "reward_std": 0.07197336968965828, "rewards/equation_reward_func": 0.8223214484751225, "rewards/format_reward_func": 0.9839285798370838, "step": 8382 }, { "completion_length": 269.2812623977661, "epoch": 1.4057169202397417, "grad_norm": 0.5339161928148323, "kl": 0.60052490234375, "learning_rate": 4.986497574918917e-07, "loss": 0.0006, "reward": 1.7718750163912773, "reward_std": 0.10669736191630363, "rewards/equation_reward_func": 0.8053571619093418, "rewards/format_reward_func": 0.9665178768336773, "step": 8384 }, { "completion_length": 261.0803689956665, "epoch": 1.4060522234796093, "grad_norm": 0.21607394771384367, "kl": 0.5733642578125, "learning_rate": 4.986485121426538e-07, "loss": 0.0006, "reward": 1.728571467101574, "reward_std": 0.07071067858487368, "rewards/equation_reward_func": 0.746428582817316, "rewards/format_reward_func": 0.9821428656578064, "step": 8386 }, { "completion_length": 281.7589416503906, "epoch": 1.406387526719477, "grad_norm": 0.23092201842398968, "kl": 0.65911865234375, "learning_rate": 4.986472662209348e-07, "loss": 0.0007, "reward": 1.6799107789993286, "reward_std": 0.11932427063584328, "rewards/equation_reward_func": 0.715178593993187, "rewards/format_reward_func": 0.9647321589291096, "step": 8388 }, { "completion_length": 263.16519355773926, "epoch": 1.4067228299593446, "grad_norm": 0.30303730808498497, "kl": 0.55340576171875, "learning_rate": 4.986460197267376e-07, "loss": 0.0006, "reward": 1.7633929401636124, "reward_std": 0.07197336852550507, "rewards/equation_reward_func": 0.779464315623045, "rewards/format_reward_func": 0.9839285835623741, "step": 8390 }, { "completion_length": 287.9866180419922, "epoch": 1.407058133199212, "grad_norm": 0.3113564380005515, "kl": 0.73895263671875, "learning_rate": 4.986447726600651e-07, "loss": 0.0007, "reward": 1.6491071954369545, "reward_std": 0.128794448915869, "rewards/equation_reward_func": 0.6919643133878708, "rewards/format_reward_func": 0.9571428783237934, "step": 8392 }, { "completion_length": 260.9285821914673, "epoch": 1.4073934364390797, "grad_norm": 0.5877392569400088, "kl": 0.58837890625, "learning_rate": 4.986435250209201e-07, "loss": 0.0006, "reward": 1.7267857939004898, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7312500365078449, "rewards/format_reward_func": 0.9955357164144516, "step": 8394 }, { "completion_length": 270.6071538925171, "epoch": 1.407728739678947, "grad_norm": 0.37487510306904775, "kl": 1.393402099609375, "learning_rate": 4.986422768093056e-07, "loss": 0.0014, "reward": 1.771875038743019, "reward_std": 0.05997780826874077, "rewards/equation_reward_func": 0.786607176065445, "rewards/format_reward_func": 0.9852678664028645, "step": 8396 }, { "completion_length": 257.44197273254395, "epoch": 1.4080640429188147, "grad_norm": 0.1787303931438916, "kl": 0.350555419921875, "learning_rate": 4.986410280252244e-07, "loss": 0.0004, "reward": 1.7986607998609543, "reward_std": 0.05240166233852506, "rewards/equation_reward_func": 0.8000000193715096, "rewards/format_reward_func": 0.9986607171595097, "step": 8398 }, { "completion_length": 261.2142972946167, "epoch": 1.4083993461586823, "grad_norm": 0.3143881061853852, "kl": 0.587188720703125, "learning_rate": 4.986397786686793e-07, "loss": 0.0006, "reward": 1.7607143595814705, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 0.9821428656578064, "step": 8400 }, { "completion_length": 263.7767972946167, "epoch": 1.4087346493985498, "grad_norm": 0.608426074512046, "kl": 0.341552734375, "learning_rate": 4.986385287396733e-07, "loss": 0.0003, "reward": 1.7392857894301414, "reward_std": 0.10606601554900408, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 0.9821428656578064, "step": 8402 }, { "completion_length": 274.3080472946167, "epoch": 1.4090699526384174, "grad_norm": 0.6285536610000946, "kl": 0.3597412109375, "learning_rate": 4.986372782382092e-07, "loss": 0.0004, "reward": 1.692857213318348, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.701785746961832, "rewards/format_reward_func": 0.9910714328289032, "step": 8404 }, { "completion_length": 260.9910831451416, "epoch": 1.4094052558782848, "grad_norm": 0.10934691631824027, "kl": 0.264923095703125, "learning_rate": 4.986360271642898e-07, "loss": 0.0003, "reward": 1.7553571984171867, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7687500305473804, "rewards/format_reward_func": 0.9866071492433548, "step": 8406 }, { "completion_length": 262.0044755935669, "epoch": 1.4097405591181524, "grad_norm": 0.14820756319286144, "kl": 0.15960693359375, "learning_rate": 4.986347755179181e-07, "loss": 0.0002, "reward": 1.8107143267989159, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107142932713032, "rewards/format_reward_func": 1.0, "step": 8408 }, { "completion_length": 260.5714406967163, "epoch": 1.41007586235802, "grad_norm": 0.21250860071479305, "kl": 0.140960693359375, "learning_rate": 4.98633523299097e-07, "loss": 0.0001, "reward": 1.7303572222590446, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7348214592784643, "rewards/format_reward_func": 0.9955357164144516, "step": 8410 }, { "completion_length": 258.4509029388428, "epoch": 1.4104111655978877, "grad_norm": 0.355366517952804, "kl": 0.192138671875, "learning_rate": 4.986322705078294e-07, "loss": 0.0002, "reward": 1.7053572237491608, "reward_std": 0.12374368496239185, "rewards/equation_reward_func": 0.7187500242143869, "rewards/format_reward_func": 0.9866071492433548, "step": 8412 }, { "completion_length": 248.47322845458984, "epoch": 1.4107464688377551, "grad_norm": 0.24897194525629096, "kl": 0.14501953125, "learning_rate": 4.98631017144118e-07, "loss": 0.0001, "reward": 1.7928571924567223, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 8414 }, { "completion_length": 233.80804634094238, "epoch": 1.4110817720776228, "grad_norm": 0.22718979397807326, "kl": 0.15289306640625, "learning_rate": 4.986297632079659e-07, "loss": 0.0002, "reward": 1.767857201397419, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571790456772, "rewards/format_reward_func": 1.0, "step": 8416 }, { "completion_length": 247.35268783569336, "epoch": 1.4114170753174902, "grad_norm": 0.1969943143254704, "kl": 0.1800537109375, "learning_rate": 4.986285086993759e-07, "loss": 0.0002, "reward": 1.7500000819563866, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 8418 }, { "completion_length": 248.67412090301514, "epoch": 1.4117523785573578, "grad_norm": 0.3579731369561279, "kl": 0.17572021484375, "learning_rate": 4.986272536183509e-07, "loss": 0.0002, "reward": 1.771428607404232, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 8420 }, { "completion_length": 245.44197463989258, "epoch": 1.4120876817972254, "grad_norm": 0.19579481260834086, "kl": 0.20904541015625, "learning_rate": 4.986259979648938e-07, "loss": 0.0002, "reward": 1.6642857864499092, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.6732143200933933, "rewards/format_reward_func": 0.9910714328289032, "step": 8422 }, { "completion_length": 250.2053680419922, "epoch": 1.4124229850370928, "grad_norm": 0.4279326961042182, "kl": 0.214385986328125, "learning_rate": 4.986247417390074e-07, "loss": 0.0002, "reward": 1.7553572058677673, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7598214522004128, "rewards/format_reward_func": 0.9955357164144516, "step": 8424 }, { "completion_length": 239.1562623977661, "epoch": 1.4127582882769605, "grad_norm": 0.3513471847587535, "kl": 0.18414306640625, "learning_rate": 4.986234849406947e-07, "loss": 0.0002, "reward": 1.7629465088248253, "reward_std": 0.06250318652018905, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 0.9986607171595097, "step": 8426 }, { "completion_length": 245.0714406967163, "epoch": 1.413093591516828, "grad_norm": 0.3502444509464687, "kl": 0.243865966796875, "learning_rate": 4.986222275699585e-07, "loss": 0.0002, "reward": 1.8375000655651093, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.8419643118977547, "rewards/format_reward_func": 0.9955357164144516, "step": 8428 }, { "completion_length": 245.3125114440918, "epoch": 1.4134288947566955, "grad_norm": 0.437842987118793, "kl": 0.224090576171875, "learning_rate": 4.986209696268018e-07, "loss": 0.0002, "reward": 1.739285796880722, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7392857559025288, "rewards/format_reward_func": 1.0, "step": 8430 }, { "completion_length": 247.5312623977661, "epoch": 1.4137641979965632, "grad_norm": 0.19685596006311468, "kl": 0.1822509765625, "learning_rate": 4.986197111112275e-07, "loss": 0.0002, "reward": 1.7392857819795609, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7392857484519482, "rewards/format_reward_func": 1.0, "step": 8432 }, { "completion_length": 244.63393783569336, "epoch": 1.4140995012364308, "grad_norm": 0.16053489041924518, "kl": 0.25244140625, "learning_rate": 4.986184520232383e-07, "loss": 0.0003, "reward": 1.7696429044008255, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7741071842610836, "rewards/format_reward_func": 0.9955357164144516, "step": 8434 }, { "completion_length": 242.8973331451416, "epoch": 1.4144348044762982, "grad_norm": 0.11632207449364086, "kl": 0.202850341796875, "learning_rate": 4.986171923628373e-07, "loss": 0.0002, "reward": 1.8035715073347092, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.803571455180645, "rewards/format_reward_func": 1.0, "step": 8436 }, { "completion_length": 248.72769165039062, "epoch": 1.4147701077161658, "grad_norm": 0.3687579739976894, "kl": 0.3116455078125, "learning_rate": 4.986159321300274e-07, "loss": 0.0003, "reward": 1.7767857685685158, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7812500298023224, "rewards/format_reward_func": 0.9955357164144516, "step": 8438 }, { "completion_length": 246.8928680419922, "epoch": 1.4151054109560333, "grad_norm": 0.25257908439965243, "kl": 0.21636962890625, "learning_rate": 4.986146713248115e-07, "loss": 0.0002, "reward": 1.7642857804894447, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 8440 }, { "completion_length": 254.77679538726807, "epoch": 1.415440714195901, "grad_norm": 0.3316500270046741, "kl": 0.380126953125, "learning_rate": 4.986134099471923e-07, "loss": 0.0004, "reward": 1.757142923772335, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428846567869, "rewards/format_reward_func": 1.0, "step": 8442 }, { "completion_length": 244.6116189956665, "epoch": 1.4157760174357685, "grad_norm": 0.13862563902299166, "kl": 0.159881591796875, "learning_rate": 4.986121479971729e-07, "loss": 0.0002, "reward": 1.7357143312692642, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7357142996042967, "rewards/format_reward_func": 1.0, "step": 8444 }, { "completion_length": 241.01340770721436, "epoch": 1.4161113206756362, "grad_norm": 0.18927875554614454, "kl": 0.397674560546875, "learning_rate": 4.986108854747561e-07, "loss": 0.0004, "reward": 1.7620536461472511, "reward_std": 0.05366435460746288, "rewards/equation_reward_func": 0.7696428932249546, "rewards/format_reward_func": 0.9924107193946838, "step": 8446 }, { "completion_length": 251.79465579986572, "epoch": 1.4164466239155036, "grad_norm": 0.1946390969329135, "kl": 0.354339599609375, "learning_rate": 4.986096223799449e-07, "loss": 0.0004, "reward": 1.805357202887535, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.8187500312924385, "rewards/format_reward_func": 0.9866071492433548, "step": 8448 }, { "completion_length": 262.28126525878906, "epoch": 1.4167819271553712, "grad_norm": 0.15546482453488383, "kl": 0.262237548828125, "learning_rate": 4.986083587127423e-07, "loss": 0.0003, "reward": 1.7732143476605415, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7776785977184772, "rewards/format_reward_func": 0.9955357164144516, "step": 8450 }, { "completion_length": 250.01786994934082, "epoch": 1.4171172303952386, "grad_norm": 0.25181161802683366, "kl": 0.218536376953125, "learning_rate": 4.98607094473151e-07, "loss": 0.0002, "reward": 1.796428643167019, "reward_std": 0.08586296625435352, "rewards/equation_reward_func": 0.8053571656346321, "rewards/format_reward_func": 0.9910714328289032, "step": 8452 }, { "completion_length": 255.62500858306885, "epoch": 1.4174525336351063, "grad_norm": 0.3271275806533319, "kl": 0.414764404296875, "learning_rate": 4.98605829661174e-07, "loss": 0.0004, "reward": 1.7678571939468384, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.776785746216774, "rewards/format_reward_func": 0.9910714328289032, "step": 8454 }, { "completion_length": 256.46875858306885, "epoch": 1.417787836874974, "grad_norm": 0.47129680996280954, "kl": 0.260772705078125, "learning_rate": 4.986045642768141e-07, "loss": 0.0003, "reward": 1.764285758137703, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 8456 }, { "completion_length": 259.5714387893677, "epoch": 1.4181231401148413, "grad_norm": 0.09692164782946838, "kl": 0.162628173828125, "learning_rate": 4.986032983200745e-07, "loss": 0.0002, "reward": 1.789285771548748, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7982143089175224, "rewards/format_reward_func": 0.9910714328289032, "step": 8458 }, { "completion_length": 256.38840198516846, "epoch": 1.418458443354709, "grad_norm": 0.23052555459158877, "kl": 0.2418212890625, "learning_rate": 4.986020317909577e-07, "loss": 0.0002, "reward": 1.7785714864730835, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7875000275671482, "rewards/format_reward_func": 0.9910714328289032, "step": 8460 }, { "completion_length": 259.8259048461914, "epoch": 1.4187937465945764, "grad_norm": 0.21238298783978266, "kl": 0.22589111328125, "learning_rate": 4.98600764689467e-07, "loss": 0.0002, "reward": 1.7500000894069672, "reward_std": 0.04040610231459141, "rewards/equation_reward_func": 0.7589285895228386, "rewards/format_reward_func": 0.9910714328289032, "step": 8462 }, { "completion_length": 255.8125123977661, "epoch": 1.419129049834444, "grad_norm": 0.10720861997222418, "kl": 0.178863525390625, "learning_rate": 4.985994970156052e-07, "loss": 0.0002, "reward": 1.7410714775323868, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7455357313156128, "rewards/format_reward_func": 0.9955357164144516, "step": 8464 }, { "completion_length": 263.75001335144043, "epoch": 1.4194643530743116, "grad_norm": 0.2796565830384278, "kl": 0.134246826171875, "learning_rate": 4.985982287693751e-07, "loss": 0.0001, "reward": 1.7803572043776512, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7848214693367481, "rewards/format_reward_func": 0.9955357164144516, "step": 8466 }, { "completion_length": 269.77679920196533, "epoch": 1.4197996563141793, "grad_norm": 0.1424100872454886, "kl": 0.282989501953125, "learning_rate": 4.985969599507797e-07, "loss": 0.0003, "reward": 1.741071492433548, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.745535746216774, "rewards/format_reward_func": 0.9955357164144516, "step": 8468 }, { "completion_length": 255.5759038925171, "epoch": 1.4201349595540467, "grad_norm": 0.10532280309336838, "kl": 0.143096923828125, "learning_rate": 4.98595690559822e-07, "loss": 0.0001, "reward": 1.7446429505944252, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7491071708500385, "rewards/format_reward_func": 0.9955357164144516, "step": 8470 }, { "completion_length": 252.64733505249023, "epoch": 1.4204702627939143, "grad_norm": 0.4641752137066568, "kl": 0.17669677734375, "learning_rate": 4.985944205965048e-07, "loss": 0.0002, "reward": 1.723214365541935, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7276786006987095, "rewards/format_reward_func": 0.9955357164144516, "step": 8472 }, { "completion_length": 248.41965198516846, "epoch": 1.4208055660337817, "grad_norm": 0.20406659597155857, "kl": 0.146026611328125, "learning_rate": 4.985931500608311e-07, "loss": 0.0001, "reward": 1.7714286223053932, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285980910063, "rewards/format_reward_func": 1.0, "step": 8474 }, { "completion_length": 246.37500953674316, "epoch": 1.4211408692736494, "grad_norm": 0.25091824175709215, "kl": 0.225311279296875, "learning_rate": 4.985918789528037e-07, "loss": 0.0002, "reward": 1.732142947614193, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7321428842842579, "rewards/format_reward_func": 1.0, "step": 8476 }, { "completion_length": 252.33929920196533, "epoch": 1.421476172513517, "grad_norm": 0.3030296561947238, "kl": 0.157745361328125, "learning_rate": 4.985906072724257e-07, "loss": 0.0002, "reward": 1.735714353621006, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7357143219560385, "rewards/format_reward_func": 1.0, "step": 8478 }, { "completion_length": 263.6250114440918, "epoch": 1.4218114757533844, "grad_norm": 0.3695755041875649, "kl": 0.115570068359375, "learning_rate": 4.985893350196999e-07, "loss": 0.0001, "reward": 1.7125000804662704, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7169643212109804, "rewards/format_reward_func": 0.9955357164144516, "step": 8480 }, { "completion_length": 256.9732255935669, "epoch": 1.422146778993252, "grad_norm": 0.1439424708583325, "kl": 0.1053314208984375, "learning_rate": 4.985880621946294e-07, "loss": 0.0001, "reward": 1.7392858043313026, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.748214315623045, "rewards/format_reward_func": 0.9910714328289032, "step": 8482 }, { "completion_length": 263.15626335144043, "epoch": 1.4224820822331194, "grad_norm": 0.281594071406156, "kl": 0.112548828125, "learning_rate": 4.985867887972169e-07, "loss": 0.0001, "reward": 1.6857143715023994, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7035714685916901, "rewards/format_reward_func": 0.9821428656578064, "step": 8484 }, { "completion_length": 253.27679920196533, "epoch": 1.422817385472987, "grad_norm": 0.20893025538573007, "kl": 0.092681884765625, "learning_rate": 4.985855148274655e-07, "loss": 0.0001, "reward": 1.7767857611179352, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7812500298023224, "rewards/format_reward_func": 0.9955357164144516, "step": 8486 }, { "completion_length": 250.0714406967163, "epoch": 1.4231526887128547, "grad_norm": 0.16251693933127415, "kl": 0.122161865234375, "learning_rate": 4.985842402853781e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857357859612, "rewards/format_reward_func": 1.0, "step": 8488 }, { "completion_length": 253.47769260406494, "epoch": 1.4234879919527224, "grad_norm": 0.2043458697797928, "kl": 0.140777587890625, "learning_rate": 4.985829651709575e-07, "loss": 0.0001, "reward": 1.8303571939468384, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8348214440047741, "rewards/format_reward_func": 0.9955357164144516, "step": 8490 }, { "completion_length": 246.39733123779297, "epoch": 1.4238232951925898, "grad_norm": 0.20184167414718476, "kl": 0.106292724609375, "learning_rate": 4.985816894842069e-07, "loss": 0.0001, "reward": 1.8035714849829674, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714626312256, "rewards/format_reward_func": 1.0, "step": 8492 }, { "completion_length": 245.11608219146729, "epoch": 1.4241585984324574, "grad_norm": 0.1438843833872152, "kl": 0.119415283203125, "learning_rate": 4.985804132251289e-07, "loss": 0.0001, "reward": 1.769642911851406, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7741071749478579, "rewards/format_reward_func": 0.9955357164144516, "step": 8494 }, { "completion_length": 258.9151887893677, "epoch": 1.4244939016723248, "grad_norm": 0.26308992695053623, "kl": 0.12542724609375, "learning_rate": 4.985791363937267e-07, "loss": 0.0001, "reward": 1.771428644657135, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 8496 }, { "completion_length": 251.2410831451416, "epoch": 1.4248292049121924, "grad_norm": 0.1744191837380716, "kl": 0.1085205078125, "learning_rate": 4.985778589900032e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 8498 }, { "completion_length": 242.3348331451416, "epoch": 1.42516450815206, "grad_norm": 0.25796914063104437, "kl": 0.091278076171875, "learning_rate": 4.985765810139614e-07, "loss": 0.0001, "reward": 1.7660714834928513, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9955357164144516, "step": 8500 }, { "completion_length": 243.51340198516846, "epoch": 1.4254998113919275, "grad_norm": 0.27267829180547887, "kl": 0.1027984619140625, "learning_rate": 4.98575302465604e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000193715096, "rewards/format_reward_func": 1.0, "step": 8502 }, { "completion_length": 253.4553689956665, "epoch": 1.4258351146317951, "grad_norm": 0.25539325970142795, "kl": 0.093658447265625, "learning_rate": 4.985740233449341e-07, "loss": 0.0001, "reward": 1.7482143491506577, "reward_std": 0.10354063473641872, "rewards/equation_reward_func": 0.7616071701049805, "rewards/format_reward_func": 0.9866071492433548, "step": 8504 }, { "completion_length": 227.38393688201904, "epoch": 1.4261704178716628, "grad_norm": 0.2182021184194296, "kl": 0.0866546630859375, "learning_rate": 4.985727436519547e-07, "loss": 0.0001, "reward": 1.7821428999304771, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428962051868, "rewards/format_reward_func": 1.0, "step": 8506 }, { "completion_length": 233.50447368621826, "epoch": 1.4265057211115302, "grad_norm": 0.12847280880524686, "kl": 0.1015472412109375, "learning_rate": 4.985714633866685e-07, "loss": 0.0001, "reward": 1.728571504354477, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7285714484751225, "rewards/format_reward_func": 1.0, "step": 8508 }, { "completion_length": 247.74108219146729, "epoch": 1.4268410243513978, "grad_norm": 0.17515847834387244, "kl": 0.113861083984375, "learning_rate": 4.985701825490789e-07, "loss": 0.0001, "reward": 1.7125000953674316, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.716964328661561, "rewards/format_reward_func": 0.9955357164144516, "step": 8510 }, { "completion_length": 241.46429824829102, "epoch": 1.4271763275912654, "grad_norm": 0.18851509196918953, "kl": 0.089111328125, "learning_rate": 4.985689011391884e-07, "loss": 0.0001, "reward": 1.7821429073810577, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428999304771, "rewards/format_reward_func": 1.0, "step": 8512 }, { "completion_length": 236.56697463989258, "epoch": 1.4275116308311329, "grad_norm": 0.1949627540989246, "kl": 0.0927734375, "learning_rate": 4.985676191570001e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 8514 }, { "completion_length": 228.66518878936768, "epoch": 1.4278469340710005, "grad_norm": 0.2164410413970927, "kl": 0.0898284912109375, "learning_rate": 4.985663366025171e-07, "loss": 0.0001, "reward": 1.7857143357396126, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 8516 }, { "completion_length": 230.446439743042, "epoch": 1.428182237310868, "grad_norm": 0.10347510631654368, "kl": 0.11016845703125, "learning_rate": 4.985650534757421e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428827941418, "rewards/format_reward_func": 1.0, "step": 8518 }, { "completion_length": 238.00447368621826, "epoch": 1.4285175405507355, "grad_norm": 0.1783372713511173, "kl": 0.0781707763671875, "learning_rate": 4.985637697766783e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.767857177183032, "rewards/format_reward_func": 1.0, "step": 8520 }, { "completion_length": 246.2767972946167, "epoch": 1.4288528437906032, "grad_norm": 0.2876657858107366, "kl": 0.0819549560546875, "learning_rate": 4.985624855053286e-07, "loss": 0.0001, "reward": 1.7321429178118706, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7321428935974836, "rewards/format_reward_func": 1.0, "step": 8522 }, { "completion_length": 230.7544755935669, "epoch": 1.4291881470304708, "grad_norm": 0.22398410695846024, "kl": 0.0896148681640625, "learning_rate": 4.985612006616957e-07, "loss": 0.0001, "reward": 1.7892857939004898, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 1.0, "step": 8524 }, { "completion_length": 241.4241189956665, "epoch": 1.4295234502703382, "grad_norm": 0.5155656556852308, "kl": 0.111846923828125, "learning_rate": 4.985599152457829e-07, "loss": 0.0001, "reward": 1.764285795390606, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 8526 }, { "completion_length": 237.36608219146729, "epoch": 1.4298587535102059, "grad_norm": 0.12106024005112531, "kl": 0.085052490234375, "learning_rate": 4.985586292575929e-07, "loss": 0.0001, "reward": 1.7678571790456772, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571697324514, "rewards/format_reward_func": 1.0, "step": 8528 }, { "completion_length": 227.28572845458984, "epoch": 1.4301940567500733, "grad_norm": 0.15222247755437548, "kl": 0.0845947265625, "learning_rate": 4.985573426971289e-07, "loss": 0.0001, "reward": 1.7714286223053932, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 8530 }, { "completion_length": 240.3705472946167, "epoch": 1.430529359989941, "grad_norm": 0.23859971475872568, "kl": 0.091400146484375, "learning_rate": 4.985560555643937e-07, "loss": 0.0001, "reward": 1.78035718947649, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7848214544355869, "rewards/format_reward_func": 0.9955357164144516, "step": 8532 }, { "completion_length": 243.41072463989258, "epoch": 1.4308646632298085, "grad_norm": 0.22536579496061887, "kl": 0.081878662109375, "learning_rate": 4.985547678593903e-07, "loss": 0.0001, "reward": 1.7232143357396126, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7276785969734192, "rewards/format_reward_func": 0.9955357164144516, "step": 8534 }, { "completion_length": 247.3750123977661, "epoch": 1.431199966469676, "grad_norm": 0.2802477726314655, "kl": 0.079833984375, "learning_rate": 4.985534795821217e-07, "loss": 0.0001, "reward": 1.728571504354477, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7285714633762836, "rewards/format_reward_func": 1.0, "step": 8536 }, { "completion_length": 244.95983123779297, "epoch": 1.4315352697095436, "grad_norm": 0.22365417759999096, "kl": 0.0858612060546875, "learning_rate": 4.985521907325907e-07, "loss": 0.0001, "reward": 1.7160715386271477, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7205357328057289, "rewards/format_reward_func": 0.9955357164144516, "step": 8538 }, { "completion_length": 247.64286708831787, "epoch": 1.431870572949411, "grad_norm": 0.29738940177670603, "kl": 0.0897216796875, "learning_rate": 4.985509013108005e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 8540 }, { "completion_length": 242.60269260406494, "epoch": 1.4322058761892786, "grad_norm": 0.12314884860826378, "kl": 0.092803955078125, "learning_rate": 4.985496113167539e-07, "loss": 0.0001, "reward": 1.8321429044008255, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8321428932249546, "rewards/format_reward_func": 1.0, "step": 8542 }, { "completion_length": 236.54018878936768, "epoch": 1.4325411794291463, "grad_norm": 0.28216533234311264, "kl": 0.093536376953125, "learning_rate": 4.985483207504541e-07, "loss": 0.0001, "reward": 1.7428572326898575, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 8544 }, { "completion_length": 234.81250953674316, "epoch": 1.432876482669014, "grad_norm": 0.22984926963704252, "kl": 0.110687255859375, "learning_rate": 4.985470296119038e-07, "loss": 0.0001, "reward": 1.7892857789993286, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 8546 }, { "completion_length": 232.24554634094238, "epoch": 1.4332117859088813, "grad_norm": 0.11927480159230068, "kl": 0.120819091796875, "learning_rate": 4.985457379011061e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 8548 }, { "completion_length": 230.32143783569336, "epoch": 1.433547089148749, "grad_norm": 0.20098209710952783, "kl": 0.1114654541015625, "learning_rate": 4.98544445618064e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428719907999, "rewards/format_reward_func": 1.0, "step": 8550 }, { "completion_length": 237.6294755935669, "epoch": 1.4338823923886164, "grad_norm": 0.12974919344947425, "kl": 0.113372802734375, "learning_rate": 4.985431527627804e-07, "loss": 0.0001, "reward": 1.800000049173832, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.8089285865426064, "rewards/format_reward_func": 0.9910714328289032, "step": 8552 }, { "completion_length": 242.09375953674316, "epoch": 1.434217695628484, "grad_norm": 0.1779069025850947, "kl": 0.11090087890625, "learning_rate": 4.985418593352583e-07, "loss": 0.0001, "reward": 1.801785796880722, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.8062500208616257, "rewards/format_reward_func": 0.9955357164144516, "step": 8554 }, { "completion_length": 253.00894165039062, "epoch": 1.4345529988683516, "grad_norm": 0.39904585725146463, "kl": 0.11761474609375, "learning_rate": 4.985405653355006e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7732143327593803, "rewards/format_reward_func": 0.9910714328289032, "step": 8556 }, { "completion_length": 241.35268783569336, "epoch": 1.434888302108219, "grad_norm": 0.20989425840657888, "kl": 0.116424560546875, "learning_rate": 4.985392707635104e-07, "loss": 0.0001, "reward": 1.7535714656114578, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714600235224, "rewards/format_reward_func": 1.0, "step": 8558 }, { "completion_length": 254.6339406967163, "epoch": 1.4352236053480867, "grad_norm": 0.5080392816810477, "kl": 0.169158935546875, "learning_rate": 4.985379756192908e-07, "loss": 0.0002, "reward": 1.7428572177886963, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7517857514321804, "rewards/format_reward_func": 0.9910714328289032, "step": 8560 }, { "completion_length": 250.40625953674316, "epoch": 1.435558908587954, "grad_norm": 0.40974130131501063, "kl": 0.1434326171875, "learning_rate": 4.985366799028445e-07, "loss": 0.0001, "reward": 1.7361607775092125, "reward_std": 0.05997780663892627, "rewards/equation_reward_func": 0.741964302957058, "rewards/format_reward_func": 0.9941964335739613, "step": 8562 }, { "completion_length": 238.5759038925171, "epoch": 1.4358942118278217, "grad_norm": 0.3081459571029257, "kl": 0.13323974609375, "learning_rate": 4.985353836141746e-07, "loss": 0.0001, "reward": 1.742857187986374, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7517857477068901, "rewards/format_reward_func": 0.9910714328289032, "step": 8564 }, { "completion_length": 245.1116189956665, "epoch": 1.4362295150676894, "grad_norm": 0.17198720339414358, "kl": 0.1402435302734375, "learning_rate": 4.985340867532841e-07, "loss": 0.0001, "reward": 1.7285714969038963, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7285714708268642, "rewards/format_reward_func": 1.0, "step": 8566 }, { "completion_length": 245.3437614440918, "epoch": 1.436564818307557, "grad_norm": 0.19688696870277422, "kl": 0.13861083984375, "learning_rate": 4.98532789320176e-07, "loss": 0.0001, "reward": 1.7464286237955093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464286163449287, "rewards/format_reward_func": 1.0, "step": 8568 }, { "completion_length": 236.9910831451416, "epoch": 1.4369001215474244, "grad_norm": 0.1986506605536204, "kl": 0.130035400390625, "learning_rate": 4.985314913148534e-07, "loss": 0.0001, "reward": 1.7910715118050575, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7955357432365417, "rewards/format_reward_func": 0.9955357164144516, "step": 8570 }, { "completion_length": 236.37054634094238, "epoch": 1.437235424787292, "grad_norm": 0.1790702839307691, "kl": 0.146270751953125, "learning_rate": 4.98530192737319e-07, "loss": 0.0001, "reward": 1.7964286133646965, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.796428594738245, "rewards/format_reward_func": 1.0, "step": 8572 }, { "completion_length": 252.28126049041748, "epoch": 1.4375707280271595, "grad_norm": 0.224827230192582, "kl": 0.1280517578125, "learning_rate": 4.98528893587576e-07, "loss": 0.0001, "reward": 1.7526786476373672, "reward_std": 0.05682107945904136, "rewards/equation_reward_func": 0.7589285969734192, "rewards/format_reward_func": 0.9937500059604645, "step": 8574 }, { "completion_length": 242.94643878936768, "epoch": 1.437906031267027, "grad_norm": 0.40589939211072706, "kl": 0.128082275390625, "learning_rate": 4.985275938656273e-07, "loss": 0.0001, "reward": 1.7571429386734962, "reward_std": 0.11111677810549736, "rewards/equation_reward_func": 0.7750000394880772, "rewards/format_reward_func": 0.9821428656578064, "step": 8576 }, { "completion_length": 240.07590293884277, "epoch": 1.4382413345068947, "grad_norm": 0.2912371749687889, "kl": 0.328521728515625, "learning_rate": 4.98526293571476e-07, "loss": 0.0003, "reward": 1.705357238650322, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7098214663565159, "rewards/format_reward_func": 0.9955357164144516, "step": 8578 }, { "completion_length": 242.60715293884277, "epoch": 1.4385766377467624, "grad_norm": 0.21713702252679135, "kl": 0.2657470703125, "learning_rate": 4.98524992705125e-07, "loss": 0.0003, "reward": 1.742857202887535, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571805357933, "rewards/format_reward_func": 1.0, "step": 8580 }, { "completion_length": 248.51340293884277, "epoch": 1.4389119409866298, "grad_norm": 0.34041115823987345, "kl": 0.33697509765625, "learning_rate": 4.985236912665773e-07, "loss": 0.0003, "reward": 1.7482143640518188, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7526785917580128, "rewards/format_reward_func": 0.9955357164144516, "step": 8582 }, { "completion_length": 252.89733219146729, "epoch": 1.4392472442264974, "grad_norm": 0.3167723579300486, "kl": 0.2979736328125, "learning_rate": 4.985223892558359e-07, "loss": 0.0003, "reward": 1.757142923772335, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 8584 }, { "completion_length": 232.28125953674316, "epoch": 1.4395825474663648, "grad_norm": 0.14006470671982402, "kl": 0.265777587890625, "learning_rate": 4.985210866729038e-07, "loss": 0.0003, "reward": 1.7053572237491608, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.709821455180645, "rewards/format_reward_func": 0.9955357164144516, "step": 8586 }, { "completion_length": 240.96429347991943, "epoch": 1.4399178507062325, "grad_norm": 0.5398088833976759, "kl": 0.422210693359375, "learning_rate": 4.985197835177841e-07, "loss": 0.0004, "reward": 1.7107143625617027, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.719642885029316, "rewards/format_reward_func": 0.9910714328289032, "step": 8588 }, { "completion_length": 252.01340293884277, "epoch": 1.4402531539461, "grad_norm": 0.19388137471372102, "kl": 0.404083251953125, "learning_rate": 4.985184797904797e-07, "loss": 0.0004, "reward": 1.7017858028411865, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.706250037997961, "rewards/format_reward_func": 0.9955357164144516, "step": 8590 }, { "completion_length": 249.28572750091553, "epoch": 1.4405884571859675, "grad_norm": 0.1371797585529847, "kl": 0.428375244140625, "learning_rate": 4.985171754909936e-07, "loss": 0.0004, "reward": 1.7428572252392769, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7517857365310192, "rewards/format_reward_func": 0.9910714328289032, "step": 8592 }, { "completion_length": 244.2500114440918, "epoch": 1.4409237604258351, "grad_norm": 0.23078455389106672, "kl": 0.218597412109375, "learning_rate": 4.985158706193287e-07, "loss": 0.0002, "reward": 1.7750000581145287, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7839285936206579, "rewards/format_reward_func": 0.9910714328289032, "step": 8594 }, { "completion_length": 251.49108219146729, "epoch": 1.4412590636657026, "grad_norm": 0.2085646350562671, "kl": 0.250274658203125, "learning_rate": 4.985145651754883e-07, "loss": 0.0003, "reward": 1.7625000551342964, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643238186836, "rewards/format_reward_func": 0.9955357164144516, "step": 8596 }, { "completion_length": 253.1517972946167, "epoch": 1.4415943669055702, "grad_norm": 0.19498147177236294, "kl": 0.826019287109375, "learning_rate": 4.985132591594751e-07, "loss": 0.0008, "reward": 1.7857143357396126, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143022119999, "rewards/format_reward_func": 1.0, "step": 8598 }, { "completion_length": 234.71876049041748, "epoch": 1.4419296701454378, "grad_norm": 0.544497343527771, "kl": 0.1741943359375, "learning_rate": 4.985119525712924e-07, "loss": 0.0002, "reward": 1.76071435213089, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143297791481, "rewards/format_reward_func": 1.0, "step": 8600 }, { "completion_length": 230.98661708831787, "epoch": 1.4422649733853055, "grad_norm": 0.2627565818988088, "kl": 0.214813232421875, "learning_rate": 4.98510645410943e-07, "loss": 0.0002, "reward": 1.716071479022503, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7205357514321804, "rewards/format_reward_func": 0.9955357164144516, "step": 8602 }, { "completion_length": 233.70090103149414, "epoch": 1.4426002766251729, "grad_norm": 0.6021051909045801, "kl": 0.244903564453125, "learning_rate": 4.985093376784298e-07, "loss": 0.0002, "reward": 1.7660715132951736, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7705357521772385, "rewards/format_reward_func": 0.9955357164144516, "step": 8604 }, { "completion_length": 235.54911518096924, "epoch": 1.4429355798650405, "grad_norm": 0.2665148810564702, "kl": 0.25433349609375, "learning_rate": 4.98508029373756e-07, "loss": 0.0003, "reward": 1.7142858058214188, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7142857424914837, "rewards/format_reward_func": 1.0, "step": 8606 }, { "completion_length": 232.60268688201904, "epoch": 1.443270883104908, "grad_norm": 0.34224759146463907, "kl": 0.1395263671875, "learning_rate": 4.985067204969247e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428827941418, "rewards/format_reward_func": 1.0, "step": 8608 }, { "completion_length": 233.10715579986572, "epoch": 1.4436061863447756, "grad_norm": 0.19504296285660103, "kl": 0.133758544921875, "learning_rate": 4.985054110479387e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428827941418, "rewards/format_reward_func": 1.0, "step": 8610 }, { "completion_length": 239.98215293884277, "epoch": 1.4439414895846432, "grad_norm": 0.17531776073423436, "kl": 0.121978759765625, "learning_rate": 4.985041010268011e-07, "loss": 0.0001, "reward": 1.7250000685453415, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7250000387430191, "rewards/format_reward_func": 1.0, "step": 8612 }, { "completion_length": 219.0044755935669, "epoch": 1.4442767928245106, "grad_norm": 0.5596495225303464, "kl": 0.151397705078125, "learning_rate": 4.98502790433515e-07, "loss": 0.0002, "reward": 1.8142857402563095, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8142857477068901, "rewards/format_reward_func": 1.0, "step": 8614 }, { "completion_length": 229.80358219146729, "epoch": 1.4446120960643782, "grad_norm": 0.15586386632307309, "kl": 0.11224365234375, "learning_rate": 4.985014792680833e-07, "loss": 0.0001, "reward": 1.7035714834928513, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.703571455553174, "rewards/format_reward_func": 1.0, "step": 8616 }, { "completion_length": 245.8526906967163, "epoch": 1.4449473993042456, "grad_norm": 0.13640709963299535, "kl": 0.118865966796875, "learning_rate": 4.98500167530509e-07, "loss": 0.0001, "reward": 1.6883929371833801, "reward_std": 0.06692260596901178, "rewards/equation_reward_func": 0.6946429014205933, "rewards/format_reward_func": 0.9937500059604645, "step": 8618 }, { "completion_length": 227.4687614440918, "epoch": 1.4452827025441133, "grad_norm": 0.2478581313144655, "kl": 0.0968017578125, "learning_rate": 4.984988552207952e-07, "loss": 0.0001, "reward": 1.7535715252161026, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 8620 }, { "completion_length": 242.321439743042, "epoch": 1.445618005783981, "grad_norm": 0.4948357753835724, "kl": 0.118927001953125, "learning_rate": 4.984975423389449e-07, "loss": 0.0001, "reward": 1.7375000789761543, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419643253087997, "rewards/format_reward_func": 0.9955357164144516, "step": 8622 }, { "completion_length": 230.17858123779297, "epoch": 1.4459533090238486, "grad_norm": 0.13151233430921153, "kl": 0.130401611328125, "learning_rate": 4.984962288849611e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.767857164144516, "rewards/format_reward_func": 1.0, "step": 8624 }, { "completion_length": 234.3616189956665, "epoch": 1.446288612263716, "grad_norm": 0.3987436135276987, "kl": 0.14691162109375, "learning_rate": 4.98494914858847e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7553571704775095, "rewards/format_reward_func": 0.9910714328289032, "step": 8626 }, { "completion_length": 239.37054920196533, "epoch": 1.4466239155035836, "grad_norm": 0.5486163983262353, "kl": 0.307373046875, "learning_rate": 4.984936002606053e-07, "loss": 0.0003, "reward": 1.8000000566244125, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8000000342726707, "rewards/format_reward_func": 1.0, "step": 8628 }, { "completion_length": 230.57590675354004, "epoch": 1.446959218743451, "grad_norm": 0.12681436673323587, "kl": 0.11016845703125, "learning_rate": 4.984922850902392e-07, "loss": 0.0001, "reward": 1.7375000715255737, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419643197208643, "rewards/format_reward_func": 0.9955357164144516, "step": 8630 }, { "completion_length": 241.89286994934082, "epoch": 1.4472945219833186, "grad_norm": 0.18086561065557644, "kl": 0.263641357421875, "learning_rate": 4.984909693477518e-07, "loss": 0.0003, "reward": 1.7285714894533157, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7285714596509933, "rewards/format_reward_func": 1.0, "step": 8632 }, { "completion_length": 242.1562623977661, "epoch": 1.4476298252231863, "grad_norm": 0.2787489245249078, "kl": 0.3875732421875, "learning_rate": 4.98489653033146e-07, "loss": 0.0004, "reward": 1.7125000804662704, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7169643305242062, "rewards/format_reward_func": 0.9955357164144516, "step": 8634 }, { "completion_length": 245.22322463989258, "epoch": 1.4479651284630537, "grad_norm": 0.2814785129957571, "kl": 0.5368499755859375, "learning_rate": 4.98488336146425e-07, "loss": 0.0005, "reward": 1.7464286237955093, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 8636 }, { "completion_length": 240.5000114440918, "epoch": 1.4483004317029213, "grad_norm": 0.24754685535957013, "kl": 0.8140869140625, "learning_rate": 4.984870186875916e-07, "loss": 0.0008, "reward": 1.7821429297327995, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 8638 }, { "completion_length": 232.0669755935669, "epoch": 1.448635734942789, "grad_norm": 0.24005853574977867, "kl": 0.136199951171875, "learning_rate": 4.984857006566489e-07, "loss": 0.0001, "reward": 1.7071429565548897, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7071428876370192, "rewards/format_reward_func": 1.0, "step": 8640 }, { "completion_length": 239.9776906967163, "epoch": 1.4489710381826564, "grad_norm": 0.3229400482830985, "kl": 0.5002593994140625, "learning_rate": 4.984843820536002e-07, "loss": 0.0005, "reward": 1.7500000596046448, "reward_std": 0.08081220369786024, "rewards/equation_reward_func": 0.7589285913854837, "rewards/format_reward_func": 0.9910714328289032, "step": 8642 }, { "completion_length": 246.5714406967163, "epoch": 1.449306341422524, "grad_norm": 0.19913141938149045, "kl": 0.5619049072265625, "learning_rate": 4.984830628784482e-07, "loss": 0.0006, "reward": 1.74642863124609, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7553571835160255, "rewards/format_reward_func": 0.9910714328289032, "step": 8644 }, { "completion_length": 256.48215675354004, "epoch": 1.4496416446623916, "grad_norm": 0.1987641534881554, "kl": 0.1080780029296875, "learning_rate": 4.984817431311961e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7767857350409031, "rewards/format_reward_func": 0.9910714328289032, "step": 8646 }, { "completion_length": 243.52679634094238, "epoch": 1.449976947902259, "grad_norm": 0.388932344482841, "kl": 0.1192169189453125, "learning_rate": 4.984804228118468e-07, "loss": 0.0001, "reward": 1.741071492433548, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 0.987500011920929, "step": 8648 }, { "completion_length": 253.69643688201904, "epoch": 1.4503122511421267, "grad_norm": 0.12055179889101425, "kl": 0.193603515625, "learning_rate": 4.984791019204034e-07, "loss": 0.0002, "reward": 1.7267857640981674, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7401786055415869, "rewards/format_reward_func": 0.9866071492433548, "step": 8650 }, { "completion_length": 256.64287185668945, "epoch": 1.450647554381994, "grad_norm": 0.45740297022280063, "kl": 0.192108154296875, "learning_rate": 4.984777804568692e-07, "loss": 0.0002, "reward": 1.785714328289032, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 8652 }, { "completion_length": 255.66519165039062, "epoch": 1.4509828576218617, "grad_norm": 0.17801379402730066, "kl": 0.08984375, "learning_rate": 4.984764584212469e-07, "loss": 0.0001, "reward": 1.7803572192788124, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7848214618861675, "rewards/format_reward_func": 0.9955357164144516, "step": 8654 }, { "completion_length": 267.5491199493408, "epoch": 1.4513181608617294, "grad_norm": 0.11665353846037967, "kl": 0.127777099609375, "learning_rate": 4.984751358135396e-07, "loss": 0.0001, "reward": 1.714285783469677, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.7232143133878708, "rewards/format_reward_func": 0.9910714328289032, "step": 8656 }, { "completion_length": 263.14287090301514, "epoch": 1.451653464101597, "grad_norm": 0.25408979673938825, "kl": 0.13909912109375, "learning_rate": 4.984738126337504e-07, "loss": 0.0001, "reward": 1.6875000521540642, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7008928880095482, "rewards/format_reward_func": 0.9866071492433548, "step": 8658 }, { "completion_length": 272.9107255935669, "epoch": 1.4519887673414644, "grad_norm": 0.18260584338052813, "kl": 0.2103271484375, "learning_rate": 4.984724888818824e-07, "loss": 0.0002, "reward": 1.7982143387198448, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.8026785962283611, "rewards/format_reward_func": 0.9955357164144516, "step": 8660 }, { "completion_length": 264.65625953674316, "epoch": 1.452324070581332, "grad_norm": 0.06142023819391897, "kl": 0.0996246337890625, "learning_rate": 4.984711645579387e-07, "loss": 0.0001, "reward": 1.7571429088711739, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7660714685916901, "rewards/format_reward_func": 0.9910714328289032, "step": 8662 }, { "completion_length": 263.11162090301514, "epoch": 1.4526593738211995, "grad_norm": 0.1881334188221849, "kl": 0.0886993408203125, "learning_rate": 4.984698396619221e-07, "loss": 0.0001, "reward": 1.7464286163449287, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 0.9821428656578064, "step": 8664 }, { "completion_length": 257.37500953674316, "epoch": 1.452994677061067, "grad_norm": 0.18317443366305186, "kl": 0.09564208984375, "learning_rate": 4.98468514193836e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7875000275671482, "rewards/format_reward_func": 0.9910714328289032, "step": 8666 }, { "completion_length": 268.60715770721436, "epoch": 1.4533299803009347, "grad_norm": 0.4507416161216136, "kl": 0.1070556640625, "learning_rate": 4.984671881536831e-07, "loss": 0.0001, "reward": 1.6857143715023994, "reward_std": 0.11111677810549736, "rewards/equation_reward_func": 0.7035714704543352, "rewards/format_reward_func": 0.9821428656578064, "step": 8668 }, { "completion_length": 267.8214387893677, "epoch": 1.4536652835408022, "grad_norm": 0.1606135561413552, "kl": 0.0889129638671875, "learning_rate": 4.984658615414666e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.8169643022119999, "rewards/format_reward_func": 0.9955357164144516, "step": 8670 }, { "completion_length": 254.62947463989258, "epoch": 1.4540005867806698, "grad_norm": 0.13900528444515145, "kl": 0.0863189697265625, "learning_rate": 4.984645343571896e-07, "loss": 0.0001, "reward": 1.7875000685453415, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7919643148779869, "rewards/format_reward_func": 0.9955357164144516, "step": 8672 }, { "completion_length": 272.78126525878906, "epoch": 1.4543358900205372, "grad_norm": 0.20889585300332983, "kl": 0.0862579345703125, "learning_rate": 4.984632066008551e-07, "loss": 0.0001, "reward": 1.7571428939700127, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 0.9821428656578064, "step": 8674 }, { "completion_length": 270.0491189956665, "epoch": 1.4546711932604048, "grad_norm": 0.23130697896746683, "kl": 0.1115264892578125, "learning_rate": 4.984618782724662e-07, "loss": 0.0001, "reward": 1.7473215013742447, "reward_std": 0.06439722282812, "rewards/equation_reward_func": 0.7491071745753288, "rewards/format_reward_func": 0.9982142895460129, "step": 8676 }, { "completion_length": 267.8303699493408, "epoch": 1.4550064965002725, "grad_norm": 0.32041604349225783, "kl": 0.11407470703125, "learning_rate": 4.98460549372026e-07, "loss": 0.0001, "reward": 1.7392858117818832, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7482143118977547, "rewards/format_reward_func": 0.9910714328289032, "step": 8678 }, { "completion_length": 265.58037185668945, "epoch": 1.45534179974014, "grad_norm": 0.14919159431849408, "kl": 0.1207122802734375, "learning_rate": 4.984592198995373e-07, "loss": 0.0001, "reward": 1.7125000581145287, "reward_std": 0.06313453428447247, "rewards/equation_reward_func": 0.725892897695303, "rewards/format_reward_func": 0.9866071492433548, "step": 8680 }, { "completion_length": 265.38394260406494, "epoch": 1.4556771029800075, "grad_norm": 0.3035788406369713, "kl": 0.1330718994140625, "learning_rate": 4.984578898550035e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 8682 }, { "completion_length": 254.71875953674316, "epoch": 1.4560124062198752, "grad_norm": 0.19167303109979877, "kl": 0.13134765625, "learning_rate": 4.984565592384275e-07, "loss": 0.0001, "reward": 1.6964286416769028, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7053571902215481, "rewards/format_reward_func": 0.9910714328289032, "step": 8684 }, { "completion_length": 265.8303689956665, "epoch": 1.4563477094597426, "grad_norm": 0.18865444325673764, "kl": 0.0995635986328125, "learning_rate": 4.984552280498124e-07, "loss": 0.0001, "reward": 1.8321429044008255, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.832142885774374, "rewards/format_reward_func": 1.0, "step": 8686 }, { "completion_length": 256.9241247177124, "epoch": 1.4566830126996102, "grad_norm": 0.8027363666604862, "kl": 0.3691558837890625, "learning_rate": 4.984538962891612e-07, "loss": 0.0004, "reward": 1.7750000655651093, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 8688 }, { "completion_length": 261.95536708831787, "epoch": 1.4570183159394778, "grad_norm": 0.19079471714756488, "kl": 0.086334228515625, "learning_rate": 4.984525639564771e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7875000350177288, "rewards/format_reward_func": 0.9910714328289032, "step": 8690 }, { "completion_length": 242.62947463989258, "epoch": 1.4573536191793452, "grad_norm": 0.16115868086484084, "kl": 0.103118896484375, "learning_rate": 4.984512310517629e-07, "loss": 0.0001, "reward": 1.764732226729393, "reward_std": 0.056189734023064375, "rewards/equation_reward_func": 0.770535733550787, "rewards/format_reward_func": 0.9941964335739613, "step": 8692 }, { "completion_length": 257.6428699493408, "epoch": 1.4576889224192129, "grad_norm": 0.18748173456319275, "kl": 0.13623046875, "learning_rate": 4.98449897575022e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 8694 }, { "completion_length": 253.9732255935669, "epoch": 1.4580242256590803, "grad_norm": 0.0670366887196531, "kl": 0.087493896484375, "learning_rate": 4.984485635262573e-07, "loss": 0.0001, "reward": 1.7625000476837158, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7669643275439739, "rewards/format_reward_func": 0.9955357164144516, "step": 8696 }, { "completion_length": 255.08483123779297, "epoch": 1.458359528898948, "grad_norm": 0.46439941711117166, "kl": 0.146087646484375, "learning_rate": 4.984472289054718e-07, "loss": 0.0001, "reward": 1.7660714983940125, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7705357521772385, "rewards/format_reward_func": 0.9955357164144516, "step": 8698 }, { "completion_length": 258.120548248291, "epoch": 1.4586948321388156, "grad_norm": 0.13445293436482703, "kl": 0.2231903076171875, "learning_rate": 4.984458937126687e-07, "loss": 0.0002, "reward": 1.782142922282219, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 8700 }, { "completion_length": 253.5178689956665, "epoch": 1.4590301353786832, "grad_norm": 0.2825612021039923, "kl": 0.203125, "learning_rate": 4.984445579478512e-07, "loss": 0.0002, "reward": 1.7535715103149414, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 8702 }, { "completion_length": 255.8616180419922, "epoch": 1.4593654386185506, "grad_norm": 0.0662004696664487, "kl": 0.160614013671875, "learning_rate": 4.98443221611022e-07, "loss": 0.0002, "reward": 1.7928571701049805, "reward_std": 0.030304577201604843, "rewards/equation_reward_func": 0.8017857428640127, "rewards/format_reward_func": 0.9910714328289032, "step": 8704 }, { "completion_length": 247.165189743042, "epoch": 1.4597007418584182, "grad_norm": 0.14048445465509524, "kl": 0.146881103515625, "learning_rate": 4.984418847021845e-07, "loss": 0.0001, "reward": 1.8321429044008255, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.832142885774374, "rewards/format_reward_func": 1.0, "step": 8706 }, { "completion_length": 253.55804538726807, "epoch": 1.4600360450982857, "grad_norm": 0.13718770727586044, "kl": 0.219635009765625, "learning_rate": 4.984405472213415e-07, "loss": 0.0002, "reward": 1.789285771548748, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 8708 }, { "completion_length": 253.86161613464355, "epoch": 1.4603713483381533, "grad_norm": 0.17726014798555442, "kl": 0.115692138671875, "learning_rate": 4.984392091684965e-07, "loss": 0.0001, "reward": 1.7821429297327995, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 8710 }, { "completion_length": 258.8660840988159, "epoch": 1.460706651578021, "grad_norm": 0.10583912091027659, "kl": 0.15997314453125, "learning_rate": 4.984378705436521e-07, "loss": 0.0002, "reward": 1.7464286610484123, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 8712 }, { "completion_length": 246.37054347991943, "epoch": 1.4610419548178886, "grad_norm": 0.19567685530098944, "kl": 0.1878814697265625, "learning_rate": 4.984365313468117e-07, "loss": 0.0002, "reward": 1.7928571626543999, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.8017857372760773, "rewards/format_reward_func": 0.9910714328289032, "step": 8714 }, { "completion_length": 252.37054824829102, "epoch": 1.461377258057756, "grad_norm": 0.2787780610294813, "kl": 0.104888916015625, "learning_rate": 4.984351915779783e-07, "loss": 0.0001, "reward": 1.7214286401867867, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7303571738302708, "rewards/format_reward_func": 0.9910714328289032, "step": 8716 }, { "completion_length": 252.12947750091553, "epoch": 1.4617125612976236, "grad_norm": 0.3540296381410723, "kl": 1.019378662109375, "learning_rate": 4.98433851237155e-07, "loss": 0.001, "reward": 1.7285714894533157, "reward_std": 0.03030457627028227, "rewards/equation_reward_func": 0.7375000361353159, "rewards/format_reward_func": 0.9910714328289032, "step": 8718 }, { "completion_length": 251.0625114440918, "epoch": 1.462047864537491, "grad_norm": 0.29658882871831993, "kl": 0.2493743896484375, "learning_rate": 4.984325103243448e-07, "loss": 0.0002, "reward": 1.7125000730156898, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.716964315623045, "rewards/format_reward_func": 0.9955357164144516, "step": 8720 }, { "completion_length": 255.5446538925171, "epoch": 1.4623831677773587, "grad_norm": 0.15793259635685353, "kl": 0.204254150390625, "learning_rate": 4.984311688395508e-07, "loss": 0.0002, "reward": 1.789285771548748, "reward_std": 0.03535533882677555, "rewards/equation_reward_func": 0.7982143051922321, "rewards/format_reward_func": 0.9910714328289032, "step": 8722 }, { "completion_length": 245.4553689956665, "epoch": 1.4627184710172263, "grad_norm": 0.20221390732780958, "kl": 0.2862548828125, "learning_rate": 4.984298267827763e-07, "loss": 0.0003, "reward": 1.735714353621006, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7357143256813288, "rewards/format_reward_func": 1.0, "step": 8724 }, { "completion_length": 256.0401906967163, "epoch": 1.4630537742570937, "grad_norm": 0.18251496359166494, "kl": 0.3800048828125, "learning_rate": 4.984284841540242e-07, "loss": 0.0004, "reward": 1.7053572162985802, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7098214589059353, "rewards/format_reward_func": 0.9955357164144516, "step": 8726 }, { "completion_length": 239.9241189956665, "epoch": 1.4633890774969613, "grad_norm": 0.16568506817127004, "kl": 0.2138671875, "learning_rate": 4.984271409532975e-07, "loss": 0.0002, "reward": 1.6928572282195091, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.6928571704775095, "rewards/format_reward_func": 1.0, "step": 8728 }, { "completion_length": 242.13840198516846, "epoch": 1.4637243807368288, "grad_norm": 0.1784867004860529, "kl": 0.3534393310546875, "learning_rate": 4.984257971805995e-07, "loss": 0.0004, "reward": 1.7678571939468384, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571827709675, "rewards/format_reward_func": 1.0, "step": 8730 }, { "completion_length": 248.75447368621826, "epoch": 1.4640596839766964, "grad_norm": 0.1337191378943348, "kl": 0.240386962890625, "learning_rate": 4.984244528359332e-07, "loss": 0.0002, "reward": 1.7339286357164383, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7383929006755352, "rewards/format_reward_func": 0.9955357164144516, "step": 8732 }, { "completion_length": 245.96429824829102, "epoch": 1.464394987216564, "grad_norm": 0.2954717061176378, "kl": 0.18841552734375, "learning_rate": 4.984231079193016e-07, "loss": 0.0002, "reward": 1.7285715341567993, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7285714633762836, "rewards/format_reward_func": 1.0, "step": 8734 }, { "completion_length": 241.88393783569336, "epoch": 1.4647302904564317, "grad_norm": 0.15578577592582596, "kl": 0.1372833251953125, "learning_rate": 4.98421762430708e-07, "loss": 0.0001, "reward": 1.7482143491506577, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7526786103844643, "rewards/format_reward_func": 0.9955357164144516, "step": 8736 }, { "completion_length": 246.76340293884277, "epoch": 1.465065593696299, "grad_norm": 0.18416701370666422, "kl": 0.2396087646484375, "learning_rate": 4.984204163701554e-07, "loss": 0.0002, "reward": 1.7892857789993286, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 1.0, "step": 8738 }, { "completion_length": 235.80358028411865, "epoch": 1.4654008969361667, "grad_norm": 0.27698587198850594, "kl": 0.2167816162109375, "learning_rate": 4.984190697376469e-07, "loss": 0.0002, "reward": 1.7446429207921028, "reward_std": 0.09848987311124802, "rewards/equation_reward_func": 0.7580357454717159, "rewards/format_reward_func": 0.9866071492433548, "step": 8740 }, { "completion_length": 236.6919755935669, "epoch": 1.4657362001760341, "grad_norm": 0.41206796353337627, "kl": 0.5396728515625, "learning_rate": 4.984177225331856e-07, "loss": 0.0005, "reward": 1.733928643167019, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7383928820490837, "rewards/format_reward_func": 0.9955357164144516, "step": 8742 }, { "completion_length": 242.49108505249023, "epoch": 1.4660715034159018, "grad_norm": 0.7406925620636272, "kl": 0.42578125, "learning_rate": 4.984163747567745e-07, "loss": 0.0004, "reward": 1.7964286357164383, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.796428594738245, "rewards/format_reward_func": 1.0, "step": 8744 }, { "completion_length": 230.26786613464355, "epoch": 1.4664068066557694, "grad_norm": 0.15986850405720687, "kl": 0.114776611328125, "learning_rate": 4.984150264084169e-07, "loss": 0.0001, "reward": 1.798214353621006, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8017857447266579, "rewards/format_reward_func": 0.9964285716414452, "step": 8746 }, { "completion_length": 246.60715579986572, "epoch": 1.4667421098956368, "grad_norm": 0.21336797543590305, "kl": 0.48760986328125, "learning_rate": 4.984136774881158e-07, "loss": 0.0005, "reward": 1.7857143506407738, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 8748 }, { "completion_length": 232.2544755935669, "epoch": 1.4670774131355044, "grad_norm": 0.2915242898738858, "kl": 0.111358642578125, "learning_rate": 4.984123279958742e-07, "loss": 0.0001, "reward": 1.7160715013742447, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.720535745844245, "rewards/format_reward_func": 0.9955357164144516, "step": 8750 }, { "completion_length": 236.7544755935669, "epoch": 1.4674127163753719, "grad_norm": 0.43406847290546435, "kl": 0.15557861328125, "learning_rate": 4.984109779316955e-07, "loss": 0.0002, "reward": 1.750000074505806, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 8752 }, { "completion_length": 233.5937623977661, "epoch": 1.4677480196152395, "grad_norm": 0.1601888555299674, "kl": 0.16204833984375, "learning_rate": 4.984096272955825e-07, "loss": 0.0002, "reward": 1.7892857640981674, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857398837805, "rewards/format_reward_func": 1.0, "step": 8754 }, { "completion_length": 237.0669755935669, "epoch": 1.4680833228551071, "grad_norm": 0.1986376468777605, "kl": 0.101776123046875, "learning_rate": 4.984082760875383e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 1.0, "step": 8756 }, { "completion_length": 230.04465198516846, "epoch": 1.4684186260949748, "grad_norm": 0.11074341297077231, "kl": 0.114471435546875, "learning_rate": 4.984069243075663e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143245637417, "rewards/format_reward_func": 1.0, "step": 8758 }, { "completion_length": 236.65179824829102, "epoch": 1.4687539293348422, "grad_norm": 0.15408371005615754, "kl": 0.10626220703125, "learning_rate": 4.984055719556695e-07, "loss": 0.0001, "reward": 1.8178571984171867, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8178571704775095, "rewards/format_reward_func": 1.0, "step": 8760 }, { "completion_length": 239.63840198516846, "epoch": 1.4690892325747098, "grad_norm": 0.21080123821184735, "kl": 0.1241455078125, "learning_rate": 4.984042190318509e-07, "loss": 0.0001, "reward": 1.760714367032051, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143260538578, "rewards/format_reward_func": 1.0, "step": 8762 }, { "completion_length": 237.87054443359375, "epoch": 1.4694245358145772, "grad_norm": 0.2850582997015366, "kl": 0.1470184326171875, "learning_rate": 4.984028655361138e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 8764 }, { "completion_length": 240.77679824829102, "epoch": 1.4697598390544448, "grad_norm": 0.2321030600462851, "kl": 0.164703369140625, "learning_rate": 4.98401511468461e-07, "loss": 0.0002, "reward": 1.7357143461704254, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.735714316368103, "rewards/format_reward_func": 1.0, "step": 8766 }, { "completion_length": 236.86608123779297, "epoch": 1.4700951422943125, "grad_norm": 0.3982264017600752, "kl": 0.1109619140625, "learning_rate": 4.984001568288961e-07, "loss": 0.0001, "reward": 1.7464286163449287, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 8768 }, { "completion_length": 239.758939743042, "epoch": 1.47043044553418, "grad_norm": 0.5475701173802654, "kl": 0.11822509765625, "learning_rate": 4.983988016174216e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 8770 }, { "completion_length": 238.2901906967163, "epoch": 1.4707657487740475, "grad_norm": 0.13873530815564392, "kl": 0.13739013671875, "learning_rate": 4.983974458340412e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000223517418, "rewards/format_reward_func": 1.0, "step": 8772 }, { "completion_length": 240.0044765472412, "epoch": 1.4711010520139152, "grad_norm": 0.20762954442612053, "kl": 0.152435302734375, "learning_rate": 4.983960894787577e-07, "loss": 0.0002, "reward": 1.73392865806818, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7383928913623095, "rewards/format_reward_func": 0.9955357164144516, "step": 8774 }, { "completion_length": 240.7009048461914, "epoch": 1.4714363552537826, "grad_norm": 0.24434572005939045, "kl": 0.20208740234375, "learning_rate": 4.983947325515743e-07, "loss": 0.0002, "reward": 1.7535714879631996, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 8776 }, { "completion_length": 237.5937623977661, "epoch": 1.4717716584936502, "grad_norm": 0.14532140615552122, "kl": 0.213592529296875, "learning_rate": 4.983933750524941e-07, "loss": 0.0002, "reward": 1.7642857730388641, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 8778 }, { "completion_length": 237.67858219146729, "epoch": 1.4721069617335178, "grad_norm": 0.21694863813568013, "kl": 0.153778076171875, "learning_rate": 4.983920169815203e-07, "loss": 0.0002, "reward": 1.782142922282219, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 8780 }, { "completion_length": 237.61608505249023, "epoch": 1.4724422649733853, "grad_norm": 0.1438045442306162, "kl": 0.290191650390625, "learning_rate": 4.98390658338656e-07, "loss": 0.0003, "reward": 1.7607143819332123, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 8782 }, { "completion_length": 238.18751049041748, "epoch": 1.472777568213253, "grad_norm": 0.21501601267495266, "kl": 0.1873779296875, "learning_rate": 4.983892991239043e-07, "loss": 0.0002, "reward": 1.750000074505806, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 8784 }, { "completion_length": 237.9419755935669, "epoch": 1.4731128714531203, "grad_norm": 0.22531915179161718, "kl": 0.19091796875, "learning_rate": 4.983879393372683e-07, "loss": 0.0002, "reward": 1.8071429133415222, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.8071428909897804, "rewards/format_reward_func": 1.0, "step": 8786 }, { "completion_length": 240.008939743042, "epoch": 1.473448174692988, "grad_norm": 0.26067253479642094, "kl": 0.281646728515625, "learning_rate": 4.983865789787512e-07, "loss": 0.0003, "reward": 1.7196429371833801, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7241071835160255, "rewards/format_reward_func": 0.9955357164144516, "step": 8788 }, { "completion_length": 238.67858123779297, "epoch": 1.4737834779328556, "grad_norm": 0.08043931843097116, "kl": 0.195220947265625, "learning_rate": 4.983852180483561e-07, "loss": 0.0002, "reward": 1.7964286133646965, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.805357163771987, "rewards/format_reward_func": 0.9910714328289032, "step": 8790 }, { "completion_length": 239.61161708831787, "epoch": 1.4741187811727232, "grad_norm": 0.13354989199160783, "kl": 0.142486572265625, "learning_rate": 4.983838565460861e-07, "loss": 0.0001, "reward": 1.8071429282426834, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428798139095, "rewards/format_reward_func": 1.0, "step": 8792 }, { "completion_length": 224.60715293884277, "epoch": 1.4744540844125906, "grad_norm": 0.08819330980406609, "kl": 0.14404296875, "learning_rate": 4.983824944719445e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857142984867096, "rewards/format_reward_func": 1.0, "step": 8794 }, { "completion_length": 235.8928680419922, "epoch": 1.4747893876524583, "grad_norm": 0.1696130647851362, "kl": 0.256622314453125, "learning_rate": 4.983811318259341e-07, "loss": 0.0003, "reward": 1.782142922282219, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 8796 }, { "completion_length": 251.71429538726807, "epoch": 1.4751246908923257, "grad_norm": 0.24888454972454868, "kl": 0.257476806640625, "learning_rate": 4.983797686080584e-07, "loss": 0.0003, "reward": 1.74642863124609, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464285995811224, "rewards/format_reward_func": 1.0, "step": 8798 }, { "completion_length": 242.61161518096924, "epoch": 1.4754599941321933, "grad_norm": 0.24204832950998906, "kl": 0.138885498046875, "learning_rate": 4.983784048183203e-07, "loss": 0.0001, "reward": 1.7357143834233284, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 8800 }, { "completion_length": 247.0357265472412, "epoch": 1.475795297372061, "grad_norm": 0.2644117340159772, "kl": 0.148284912109375, "learning_rate": 4.98377040456723e-07, "loss": 0.0001, "reward": 1.7910714820027351, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7955357432365417, "rewards/format_reward_func": 0.9955357164144516, "step": 8802 }, { "completion_length": 244.20983219146729, "epoch": 1.4761306006119284, "grad_norm": 0.10373343324288181, "kl": 0.2305908203125, "learning_rate": 4.983756755232698e-07, "loss": 0.0002, "reward": 1.7196429520845413, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7241071723401546, "rewards/format_reward_func": 0.9955357164144516, "step": 8804 }, { "completion_length": 245.05804634094238, "epoch": 1.476465903851796, "grad_norm": 0.2235878076098335, "kl": 0.140228271484375, "learning_rate": 4.983743100179635e-07, "loss": 0.0001, "reward": 1.7553572282195091, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.759821455925703, "rewards/format_reward_func": 0.9955357164144516, "step": 8806 }, { "completion_length": 237.69644165039062, "epoch": 1.4768012070916634, "grad_norm": 0.40044982762353, "kl": 0.15948486328125, "learning_rate": 4.983729439408076e-07, "loss": 0.0002, "reward": 1.84285718947649, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8428571783006191, "rewards/format_reward_func": 1.0, "step": 8808 }, { "completion_length": 237.32590675354004, "epoch": 1.477136510331531, "grad_norm": 0.3943765992765898, "kl": 0.201751708984375, "learning_rate": 4.983715772918051e-07, "loss": 0.0002, "reward": 1.7571429386734962, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428827941418, "rewards/format_reward_func": 1.0, "step": 8810 }, { "completion_length": 228.9642972946167, "epoch": 1.4774718135713987, "grad_norm": 0.257684754189661, "kl": 0.1728515625, "learning_rate": 4.98370210070959e-07, "loss": 0.0002, "reward": 1.77946437895298, "reward_std": 0.049244935158640146, "rewards/equation_reward_func": 0.7812500223517418, "rewards/format_reward_func": 0.9982142895460129, "step": 8812 }, { "completion_length": 240.70983219146729, "epoch": 1.4778071168112663, "grad_norm": 0.2669589546020435, "kl": 0.183319091796875, "learning_rate": 4.983688422782727e-07, "loss": 0.0002, "reward": 1.7642857730388641, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 8814 }, { "completion_length": 237.05358219146729, "epoch": 1.4781424200511337, "grad_norm": 0.18092645813421443, "kl": 0.289398193359375, "learning_rate": 4.983674739137492e-07, "loss": 0.0003, "reward": 1.7142857909202576, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7142857536673546, "rewards/format_reward_func": 1.0, "step": 8816 }, { "completion_length": 242.8750114440918, "epoch": 1.4784777232910014, "grad_norm": 0.12930487883190817, "kl": 0.164398193359375, "learning_rate": 4.983661049773918e-07, "loss": 0.0002, "reward": 1.7821428999304771, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 8818 }, { "completion_length": 226.96429824829102, "epoch": 1.4788130265308688, "grad_norm": 0.4132107732816038, "kl": 0.1771240234375, "learning_rate": 4.983647354692034e-07, "loss": 0.0002, "reward": 1.81428574770689, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8142857551574707, "rewards/format_reward_func": 1.0, "step": 8820 }, { "completion_length": 243.04018783569336, "epoch": 1.4791483297707364, "grad_norm": 0.39982853624464665, "kl": 0.329010009765625, "learning_rate": 4.983633653891872e-07, "loss": 0.0003, "reward": 1.7446429207921028, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7491071745753288, "rewards/format_reward_func": 0.9955357164144516, "step": 8822 }, { "completion_length": 252.696439743042, "epoch": 1.479483633010604, "grad_norm": 0.5636208493610942, "kl": 0.17095947265625, "learning_rate": 4.983619947373467e-07, "loss": 0.0002, "reward": 1.7678572461009026, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.767857164144516, "rewards/format_reward_func": 1.0, "step": 8824 }, { "completion_length": 242.7857255935669, "epoch": 1.4798189362504715, "grad_norm": 0.22129015934686244, "kl": 0.28814697265625, "learning_rate": 4.983606235136847e-07, "loss": 0.0003, "reward": 1.7571429088711739, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428827941418, "rewards/format_reward_func": 1.0, "step": 8826 }, { "completion_length": 237.4687614440918, "epoch": 1.480154239490339, "grad_norm": 0.2303722888334993, "kl": 0.1688385009765625, "learning_rate": 4.983592517182044e-07, "loss": 0.0002, "reward": 1.7089286521077156, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7133928760886192, "rewards/format_reward_func": 0.9955357164144516, "step": 8828 }, { "completion_length": 235.96429634094238, "epoch": 1.4804895427302065, "grad_norm": 0.2179046244872161, "kl": 0.308685302734375, "learning_rate": 4.98357879350909e-07, "loss": 0.0003, "reward": 1.7357143685221672, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7446428909897804, "rewards/format_reward_func": 0.9910714328289032, "step": 8830 }, { "completion_length": 221.7366189956665, "epoch": 1.4808248459700741, "grad_norm": 0.12126952172765075, "kl": 0.15643310546875, "learning_rate": 4.983565064118017e-07, "loss": 0.0002, "reward": 1.7750000655651093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 1.0, "step": 8832 }, { "completion_length": 234.1428689956665, "epoch": 1.4811601492099418, "grad_norm": 0.31648037804345736, "kl": 0.201019287109375, "learning_rate": 4.983551329008858e-07, "loss": 0.0002, "reward": 1.7232143506407738, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7276786137372255, "rewards/format_reward_func": 0.9955357164144516, "step": 8834 }, { "completion_length": 233.80804538726807, "epoch": 1.4814954524498094, "grad_norm": 0.22828808628135586, "kl": 0.19964599609375, "learning_rate": 4.983537588181641e-07, "loss": 0.0002, "reward": 1.678571529686451, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.678571455180645, "rewards/format_reward_func": 1.0, "step": 8836 }, { "completion_length": 223.83036708831787, "epoch": 1.4818307556896768, "grad_norm": 0.25069049217082295, "kl": 0.148529052734375, "learning_rate": 4.9835238416364e-07, "loss": 0.0001, "reward": 1.8178571909666061, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.817857176065445, "rewards/format_reward_func": 1.0, "step": 8838 }, { "completion_length": 228.49554634094238, "epoch": 1.4821660589295444, "grad_norm": 0.18364389650314558, "kl": 0.13134765625, "learning_rate": 4.983510089373167e-07, "loss": 0.0001, "reward": 1.7410714849829674, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7455357424914837, "rewards/format_reward_func": 0.9955357164144516, "step": 8840 }, { "completion_length": 227.50447368621826, "epoch": 1.4825013621694119, "grad_norm": 0.3162348652446139, "kl": 0.1231842041015625, "learning_rate": 4.983496331391973e-07, "loss": 0.0001, "reward": 1.7250000908970833, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7250000350177288, "rewards/format_reward_func": 1.0, "step": 8842 }, { "completion_length": 225.16072463989258, "epoch": 1.4828366654092795, "grad_norm": 0.1262953317962862, "kl": 0.11572265625, "learning_rate": 4.983482567692848e-07, "loss": 0.0001, "reward": 1.8107143342494965, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107143118977547, "rewards/format_reward_func": 1.0, "step": 8844 }, { "completion_length": 236.4375114440918, "epoch": 1.4831719686491471, "grad_norm": 0.348430568533721, "kl": 0.1246185302734375, "learning_rate": 4.983468798275827e-07, "loss": 0.0001, "reward": 1.7142857983708382, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.714285746216774, "rewards/format_reward_func": 1.0, "step": 8846 }, { "completion_length": 222.82143688201904, "epoch": 1.4835072718890145, "grad_norm": 0.10965358167212523, "kl": 0.1109466552734375, "learning_rate": 4.98345502314094e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 8848 }, { "completion_length": 237.7366180419922, "epoch": 1.4838425751288822, "grad_norm": 0.2539963287004196, "kl": 0.1054534912109375, "learning_rate": 4.983441242288219e-07, "loss": 0.0001, "reward": 1.7464286088943481, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7464285902678967, "rewards/format_reward_func": 1.0, "step": 8850 }, { "completion_length": 236.00447750091553, "epoch": 1.4841778783687498, "grad_norm": 0.2132569693177203, "kl": 0.104949951171875, "learning_rate": 4.983427455717694e-07, "loss": 0.0001, "reward": 1.7000000849366188, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7000000309199095, "rewards/format_reward_func": 1.0, "step": 8852 }, { "completion_length": 235.29019355773926, "epoch": 1.4845131816086172, "grad_norm": 0.178193772294305, "kl": 0.125152587890625, "learning_rate": 4.983413663429399e-07, "loss": 0.0001, "reward": 1.7482143715023994, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7526786029338837, "rewards/format_reward_func": 0.9955357164144516, "step": 8854 }, { "completion_length": 231.33929538726807, "epoch": 1.4848484848484849, "grad_norm": 0.37231971986180934, "kl": 0.13079833984375, "learning_rate": 4.983399865423365e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 8856 }, { "completion_length": 230.4821538925171, "epoch": 1.4851837880883525, "grad_norm": 0.23070799188104596, "kl": 0.146514892578125, "learning_rate": 4.983386061699624e-07, "loss": 0.0001, "reward": 1.7732143476605415, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7776785977184772, "rewards/format_reward_func": 0.9955357164144516, "step": 8858 }, { "completion_length": 249.3571538925171, "epoch": 1.48551909132822, "grad_norm": 0.4236600611044629, "kl": 0.24822998046875, "learning_rate": 4.983372252258207e-07, "loss": 0.0002, "reward": 1.6915179640054703, "reward_std": 0.0625031883828342, "rewards/equation_reward_func": 0.7017857395112514, "rewards/format_reward_func": 0.9897321499884129, "step": 8860 }, { "completion_length": 248.9151906967163, "epoch": 1.4858543945680875, "grad_norm": 0.39799241693952037, "kl": 0.156219482421875, "learning_rate": 4.983358437099147e-07, "loss": 0.0002, "reward": 1.7125000730156898, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7169643193483353, "rewards/format_reward_func": 0.9955357164144516, "step": 8862 }, { "completion_length": 245.5714406967163, "epoch": 1.486189697807955, "grad_norm": 0.17593234268182617, "kl": 0.192962646484375, "learning_rate": 4.983344616222475e-07, "loss": 0.0002, "reward": 1.6982143595814705, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7026786133646965, "rewards/format_reward_func": 0.9955357164144516, "step": 8864 }, { "completion_length": 234.52233123779297, "epoch": 1.4865250010478226, "grad_norm": 0.13123695463945143, "kl": 0.311370849609375, "learning_rate": 4.983330789628221e-07, "loss": 0.0003, "reward": 1.8107143491506577, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107143044471741, "rewards/format_reward_func": 1.0, "step": 8866 }, { "completion_length": 239.7678680419922, "epoch": 1.4868603042876902, "grad_norm": 0.12358910005630577, "kl": 0.30950927734375, "learning_rate": 4.983316957316421e-07, "loss": 0.0003, "reward": 1.7535714879631996, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 8868 }, { "completion_length": 232.96875953674316, "epoch": 1.4871956075275579, "grad_norm": 0.17331399232615474, "kl": 0.1680908203125, "learning_rate": 4.983303119287104e-07, "loss": 0.0002, "reward": 1.76607146859169, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7705357633531094, "rewards/format_reward_func": 0.9955357164144516, "step": 8870 }, { "completion_length": 232.16072368621826, "epoch": 1.4875309107674253, "grad_norm": 0.26597933576708865, "kl": 0.197265625, "learning_rate": 4.983289275540302e-07, "loss": 0.0002, "reward": 1.732142947614193, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428880095482, "rewards/format_reward_func": 1.0, "step": 8872 }, { "completion_length": 233.13840579986572, "epoch": 1.487866214007293, "grad_norm": 0.1892458028270941, "kl": 0.406097412109375, "learning_rate": 4.983275426076048e-07, "loss": 0.0004, "reward": 1.7678572088479996, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 8874 }, { "completion_length": 235.5982255935669, "epoch": 1.4882015172471603, "grad_norm": 0.23906095397175636, "kl": 0.17352294921875, "learning_rate": 4.983261570894373e-07, "loss": 0.0002, "reward": 1.7660714983940125, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7705357298254967, "rewards/format_reward_func": 0.9955357164144516, "step": 8876 }, { "completion_length": 230.30358123779297, "epoch": 1.488536820487028, "grad_norm": 0.2635429378102701, "kl": 0.173614501953125, "learning_rate": 4.98324770999531e-07, "loss": 0.0002, "reward": 1.7392858117818832, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7392857521772385, "rewards/format_reward_func": 1.0, "step": 8878 }, { "completion_length": 244.73215866088867, "epoch": 1.4888721237268956, "grad_norm": 0.20412065409466093, "kl": 0.16748046875, "learning_rate": 4.983233843378889e-07, "loss": 0.0002, "reward": 1.7553571984171867, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214708268642, "rewards/format_reward_func": 0.9955357164144516, "step": 8880 }, { "completion_length": 231.58036994934082, "epoch": 1.489207426966763, "grad_norm": 0.39266692298473066, "kl": 0.111083984375, "learning_rate": 4.983219971045143e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571790456772, "rewards/format_reward_func": 1.0, "step": 8882 }, { "completion_length": 222.5982255935669, "epoch": 1.4895427302066306, "grad_norm": 0.15750322995007576, "kl": 0.106170654296875, "learning_rate": 4.983206092994104e-07, "loss": 0.0001, "reward": 1.841071456670761, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8455357328057289, "rewards/format_reward_func": 0.9955357164144516, "step": 8884 }, { "completion_length": 234.8125123977661, "epoch": 1.489878033446498, "grad_norm": 0.20708568340897027, "kl": 0.2044830322265625, "learning_rate": 4.983192209225805e-07, "loss": 0.0002, "reward": 1.7428572326898575, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7428571674972773, "rewards/format_reward_func": 1.0, "step": 8886 }, { "completion_length": 238.50447368621826, "epoch": 1.4902133366863657, "grad_norm": 0.3230453112977611, "kl": 0.141143798828125, "learning_rate": 4.983178319740276e-07, "loss": 0.0001, "reward": 1.7625000700354576, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7669643126428127, "rewards/format_reward_func": 0.9955357164144516, "step": 8888 }, { "completion_length": 231.5401906967163, "epoch": 1.4905486399262333, "grad_norm": 0.23606147243547548, "kl": 0.140899658203125, "learning_rate": 4.983164424537549e-07, "loss": 0.0001, "reward": 1.7464286535978317, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 8890 }, { "completion_length": 232.88840293884277, "epoch": 1.490883943166101, "grad_norm": 0.21897116798986696, "kl": 0.1290740966796875, "learning_rate": 4.983150523617658e-07, "loss": 0.0001, "reward": 1.789285771548748, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 8892 }, { "completion_length": 237.89287185668945, "epoch": 1.4912192464059684, "grad_norm": 0.21563554948529737, "kl": 0.155853271484375, "learning_rate": 4.983136616980633e-07, "loss": 0.0002, "reward": 1.730357214808464, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7348214499652386, "rewards/format_reward_func": 0.9955357164144516, "step": 8894 }, { "completion_length": 246.1384048461914, "epoch": 1.491554549645836, "grad_norm": 0.303244848945181, "kl": 0.187408447265625, "learning_rate": 4.983122704626507e-07, "loss": 0.0002, "reward": 1.6964286342263222, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.6964286155998707, "rewards/format_reward_func": 1.0, "step": 8896 }, { "completion_length": 242.7634048461914, "epoch": 1.4918898528857034, "grad_norm": 0.3079019603547445, "kl": 0.136077880859375, "learning_rate": 4.983108786555312e-07, "loss": 0.0001, "reward": 1.8160714730620384, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.8205357454717159, "rewards/format_reward_func": 0.9955357164144516, "step": 8898 }, { "completion_length": 242.7500114440918, "epoch": 1.492225156125571, "grad_norm": 0.37232604700225147, "kl": 0.154022216796875, "learning_rate": 4.98309486276708e-07, "loss": 0.0002, "reward": 1.760714367032051, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7607143297791481, "rewards/format_reward_func": 1.0, "step": 8900 }, { "completion_length": 242.83483123779297, "epoch": 1.4925604593654387, "grad_norm": 0.1844494338084748, "kl": 0.201995849609375, "learning_rate": 4.983080933261842e-07, "loss": 0.0002, "reward": 1.7375000715255737, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7419643178582191, "rewards/format_reward_func": 0.9955357164144516, "step": 8902 }, { "completion_length": 236.4776906967163, "epoch": 1.492895762605306, "grad_norm": 0.25817305870319135, "kl": 0.271881103515625, "learning_rate": 4.983066998039632e-07, "loss": 0.0003, "reward": 1.7357143759727478, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7357143349945545, "rewards/format_reward_func": 1.0, "step": 8904 }, { "completion_length": 243.10715293884277, "epoch": 1.4932310658451737, "grad_norm": 0.20426186955697542, "kl": 0.264862060546875, "learning_rate": 4.983053057100482e-07, "loss": 0.0003, "reward": 1.725000075995922, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7250000312924385, "rewards/format_reward_func": 1.0, "step": 8906 }, { "completion_length": 247.6919755935669, "epoch": 1.4935663690850411, "grad_norm": 0.1531023894494314, "kl": 0.41204833984375, "learning_rate": 4.983039110444422e-07, "loss": 0.0004, "reward": 1.6678572371602058, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.6767857410013676, "rewards/format_reward_func": 0.9910714328289032, "step": 8908 }, { "completion_length": 231.68304538726807, "epoch": 1.4939016723249088, "grad_norm": 0.2368336569619978, "kl": 0.259857177734375, "learning_rate": 4.983025158071485e-07, "loss": 0.0003, "reward": 1.775000087916851, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 8910 }, { "completion_length": 249.6250114440918, "epoch": 1.4942369755647764, "grad_norm": 0.1959684567937748, "kl": 0.44342041015625, "learning_rate": 4.983011199981704e-07, "loss": 0.0004, "reward": 1.7464286386966705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 8912 }, { "completion_length": 249.63393878936768, "epoch": 1.494572278804644, "grad_norm": 0.20096044788713058, "kl": 0.470428466796875, "learning_rate": 4.982997236175111e-07, "loss": 0.0005, "reward": 1.7053572088479996, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7089286148548126, "rewards/format_reward_func": 0.9964285716414452, "step": 8914 }, { "completion_length": 250.508939743042, "epoch": 1.4949075820445115, "grad_norm": 0.37251539691184704, "kl": 0.377716064453125, "learning_rate": 4.982983266651737e-07, "loss": 0.0004, "reward": 1.7589286416769028, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7633928954601288, "rewards/format_reward_func": 0.9955357164144516, "step": 8916 }, { "completion_length": 235.14733219146729, "epoch": 1.495242885284379, "grad_norm": 0.11145203648541949, "kl": 0.121337890625, "learning_rate": 4.982969291411615e-07, "loss": 0.0001, "reward": 1.801785759627819, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.8062500171363354, "rewards/format_reward_func": 0.9955357164144516, "step": 8918 }, { "completion_length": 231.8884048461914, "epoch": 1.4955781885242465, "grad_norm": 0.19749947364265377, "kl": 0.156341552734375, "learning_rate": 4.982955310454777e-07, "loss": 0.0002, "reward": 1.7714286148548126, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714286223053932, "rewards/format_reward_func": 1.0, "step": 8920 }, { "completion_length": 233.74108219146729, "epoch": 1.4959134917641141, "grad_norm": 0.10809478877179468, "kl": 0.143829345703125, "learning_rate": 4.982941323781255e-07, "loss": 0.0001, "reward": 1.7571429461240768, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 8922 }, { "completion_length": 224.13393878936768, "epoch": 1.4962487950039818, "grad_norm": 0.25690059581071395, "kl": 0.108489990234375, "learning_rate": 4.982927331391083e-07, "loss": 0.0001, "reward": 1.8178571611642838, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8178571704775095, "rewards/format_reward_func": 1.0, "step": 8924 }, { "completion_length": 225.33483219146729, "epoch": 1.4965840982438494, "grad_norm": 0.21596429702799877, "kl": 0.1488037109375, "learning_rate": 4.98291333328429e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857447266579, "rewards/format_reward_func": 1.0, "step": 8926 }, { "completion_length": 225.4509038925171, "epoch": 1.4969194014837168, "grad_norm": 0.3113412682431212, "kl": 0.1666259765625, "learning_rate": 4.982899329460911e-07, "loss": 0.0002, "reward": 1.7535715028643608, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 8928 }, { "completion_length": 228.73215198516846, "epoch": 1.4972547047235845, "grad_norm": 0.10233896625503296, "kl": 0.1457366943359375, "learning_rate": 4.982885319920977e-07, "loss": 0.0001, "reward": 1.7821429297327995, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 8930 }, { "completion_length": 236.79465293884277, "epoch": 1.4975900079634519, "grad_norm": 0.24018858070649615, "kl": 0.164825439453125, "learning_rate": 4.98287130466452e-07, "loss": 0.0002, "reward": 1.7625000700354576, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643238186836, "rewards/format_reward_func": 0.9955357164144516, "step": 8932 }, { "completion_length": 220.77679538726807, "epoch": 1.4979253112033195, "grad_norm": 0.15697397655397416, "kl": 0.1427001953125, "learning_rate": 4.982857283691572e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714488476515, "rewards/format_reward_func": 1.0, "step": 8934 }, { "completion_length": 236.42858028411865, "epoch": 1.4982606144431871, "grad_norm": 0.12975874262251802, "kl": 0.12359619140625, "learning_rate": 4.982843257002167e-07, "loss": 0.0001, "reward": 1.742857202887535, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7428571749478579, "rewards/format_reward_func": 1.0, "step": 8936 }, { "completion_length": 226.4241180419922, "epoch": 1.4985959176830546, "grad_norm": 0.33443616062865283, "kl": 0.098114013671875, "learning_rate": 4.982829224596337e-07, "loss": 0.0001, "reward": 1.7589286416769028, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7633928805589676, "rewards/format_reward_func": 0.9955357164144516, "step": 8938 }, { "completion_length": 219.69197273254395, "epoch": 1.4989312209229222, "grad_norm": 0.6156013334847034, "kl": 0.187744140625, "learning_rate": 4.982815186474111e-07, "loss": 0.0002, "reward": 1.7571429312229156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 8940 }, { "completion_length": 227.27233219146729, "epoch": 1.4992665241627896, "grad_norm": 0.11772230548243262, "kl": 0.121368408203125, "learning_rate": 4.982801142635526e-07, "loss": 0.0001, "reward": 1.8107143342494965, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8107142969965935, "rewards/format_reward_func": 1.0, "step": 8942 }, { "completion_length": 223.13393783569336, "epoch": 1.4996018274026572, "grad_norm": 0.21252309938680947, "kl": 0.1238250732421875, "learning_rate": 4.982787093080612e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 8944 }, { "completion_length": 227.83036422729492, "epoch": 1.4999371306425249, "grad_norm": 0.26532050624733566, "kl": 0.21636962890625, "learning_rate": 4.982773037809402e-07, "loss": 0.0002, "reward": 1.796428643167019, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964285910129547, "rewards/format_reward_func": 1.0, "step": 8946 }, { "completion_length": 218.97322273254395, "epoch": 1.5002724338823925, "grad_norm": 0.26540863359360023, "kl": 0.182037353515625, "learning_rate": 4.982758976821927e-07, "loss": 0.0002, "reward": 1.730357214808464, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7348214723169804, "rewards/format_reward_func": 0.9955357164144516, "step": 8948 }, { "completion_length": 221.95983123779297, "epoch": 1.50060773712226, "grad_norm": 0.20609867460584388, "kl": 0.11181640625, "learning_rate": 4.982744910118221e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7857143059372902, "rewards/format_reward_func": 1.0, "step": 8950 }, { "completion_length": 216.87947463989258, "epoch": 1.5009430403621273, "grad_norm": 0.2493354640229781, "kl": 0.1196746826171875, "learning_rate": 4.982730837698314e-07, "loss": 0.0001, "reward": 1.7535715252161026, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714562982321, "rewards/format_reward_func": 1.0, "step": 8952 }, { "completion_length": 215.49108219146729, "epoch": 1.501278343601995, "grad_norm": 0.2773946950947669, "kl": 0.125640869140625, "learning_rate": 4.982716759562242e-07, "loss": 0.0001, "reward": 1.760714367032051, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143074274063, "rewards/format_reward_func": 1.0, "step": 8954 }, { "completion_length": 220.76340293884277, "epoch": 1.5016136468418626, "grad_norm": 0.21977779618397725, "kl": 0.131927490234375, "learning_rate": 4.982702675710034e-07, "loss": 0.0001, "reward": 1.7000000923871994, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7000000383704901, "rewards/format_reward_func": 1.0, "step": 8956 }, { "completion_length": 218.67858123779297, "epoch": 1.5019489500817302, "grad_norm": 0.23496077922526748, "kl": 0.100067138671875, "learning_rate": 4.982688586141725e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.757142897695303, "rewards/format_reward_func": 1.0, "step": 8958 }, { "completion_length": 211.27233219146729, "epoch": 1.5022842533215979, "grad_norm": 0.24280331098890967, "kl": 0.1251220703125, "learning_rate": 4.982674490857346e-07, "loss": 0.0001, "reward": 1.7821429073810577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 8960 }, { "completion_length": 225.0759038925171, "epoch": 1.5026195565614653, "grad_norm": 0.16089475492518973, "kl": 0.10797119140625, "learning_rate": 4.98266038985693e-07, "loss": 0.0001, "reward": 1.796428620815277, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 8962 }, { "completion_length": 212.95090007781982, "epoch": 1.5029548598013327, "grad_norm": 0.17764220536121444, "kl": 0.0988922119140625, "learning_rate": 4.982646283140509e-07, "loss": 0.0001, "reward": 1.8107143267989159, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107143044471741, "rewards/format_reward_func": 1.0, "step": 8964 }, { "completion_length": 213.56697368621826, "epoch": 1.5032901630412003, "grad_norm": 0.17043077666424078, "kl": 0.0966796875, "learning_rate": 4.982632170708117e-07, "loss": 0.0001, "reward": 1.8607143312692642, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8607143089175224, "rewards/format_reward_func": 1.0, "step": 8966 }, { "completion_length": 221.25893592834473, "epoch": 1.503625466281068, "grad_norm": 0.11552382456613353, "kl": 0.10955810546875, "learning_rate": 4.982618052559783e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 8968 }, { "completion_length": 218.31251049041748, "epoch": 1.5039607695209356, "grad_norm": 0.24386668329689457, "kl": 0.12744140625, "learning_rate": 4.982603928695543e-07, "loss": 0.0001, "reward": 1.8357143178582191, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8357143066823483, "rewards/format_reward_func": 1.0, "step": 8970 }, { "completion_length": 210.50000858306885, "epoch": 1.504296072760803, "grad_norm": 0.24801821338106403, "kl": 0.089141845703125, "learning_rate": 4.982589799115429e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428999304771, "rewards/format_reward_func": 1.0, "step": 8972 }, { "completion_length": 212.86608123779297, "epoch": 1.5046313760006707, "grad_norm": 0.11105424263655118, "kl": 0.097076416015625, "learning_rate": 4.982575663819471e-07, "loss": 0.0001, "reward": 1.7714286297559738, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7714285925030708, "rewards/format_reward_func": 1.0, "step": 8974 }, { "completion_length": 216.56251049041748, "epoch": 1.504966679240538, "grad_norm": 0.18365734597731484, "kl": 0.113494873046875, "learning_rate": 4.982561522807705e-07, "loss": 0.0001, "reward": 1.7750000804662704, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 8976 }, { "completion_length": 222.071439743042, "epoch": 1.5053019824804057, "grad_norm": 0.19870798357054514, "kl": 0.116790771484375, "learning_rate": 4.982547376080161e-07, "loss": 0.0001, "reward": 1.8107143566012383, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8107143081724644, "rewards/format_reward_func": 1.0, "step": 8978 }, { "completion_length": 218.92411708831787, "epoch": 1.5056372857202733, "grad_norm": 0.28292647884657346, "kl": 0.10693359375, "learning_rate": 4.982533223636872e-07, "loss": 0.0001, "reward": 1.8535714820027351, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8535714522004128, "rewards/format_reward_func": 1.0, "step": 8980 }, { "completion_length": 230.99554538726807, "epoch": 1.505972588960141, "grad_norm": 0.24335191206106543, "kl": 0.146392822265625, "learning_rate": 4.982519065477873e-07, "loss": 0.0001, "reward": 1.7678571939468384, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7678571864962578, "rewards/format_reward_func": 1.0, "step": 8982 }, { "completion_length": 231.1205472946167, "epoch": 1.5063078922000084, "grad_norm": 0.24953749066660705, "kl": 0.19580078125, "learning_rate": 4.982504901603192e-07, "loss": 0.0002, "reward": 1.753571517765522, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 8984 }, { "completion_length": 217.58482933044434, "epoch": 1.5066431954398758, "grad_norm": 0.31805716184155647, "kl": 0.17828369140625, "learning_rate": 4.982490732012867e-07, "loss": 0.0002, "reward": 1.7821429297327995, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428775787354, "rewards/format_reward_func": 1.0, "step": 8986 }, { "completion_length": 227.37501049041748, "epoch": 1.5069784986797434, "grad_norm": 0.0028478271534510976, "kl": 0.110992431640625, "learning_rate": 4.982476556706926e-07, "loss": 0.0001, "reward": 1.7464286535978317, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7464286126196384, "rewards/format_reward_func": 1.0, "step": 8988 }, { "completion_length": 229.70090198516846, "epoch": 1.507313801919611, "grad_norm": 0.3544715140069262, "kl": 0.20654296875, "learning_rate": 4.982462375685404e-07, "loss": 0.0002, "reward": 1.7821429297327995, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.782142873853445, "rewards/format_reward_func": 1.0, "step": 8990 }, { "completion_length": 215.92858123779297, "epoch": 1.5076491051594787, "grad_norm": 0.23203895801518692, "kl": 0.116851806640625, "learning_rate": 4.982448188948333e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571775555611, "rewards/format_reward_func": 1.0, "step": 8992 }, { "completion_length": 227.571439743042, "epoch": 1.507984408399346, "grad_norm": 0.2609133394304276, "kl": 0.152923583984375, "learning_rate": 4.982433996495747e-07, "loss": 0.0002, "reward": 1.7071429267525673, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7071428913623095, "rewards/format_reward_func": 1.0, "step": 8994 }, { "completion_length": 229.30804634094238, "epoch": 1.5083197116392137, "grad_norm": 0.1709798535313269, "kl": 0.137298583984375, "learning_rate": 4.982419798327676e-07, "loss": 0.0001, "reward": 1.6964286640286446, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.6964286174625158, "rewards/format_reward_func": 1.0, "step": 8996 }, { "completion_length": 227.90626049041748, "epoch": 1.5086550148790812, "grad_norm": 0.24870973516291606, "kl": 0.157684326171875, "learning_rate": 4.982405594444155e-07, "loss": 0.0002, "reward": 1.7857143431901932, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 8998 }, { "completion_length": 222.56250762939453, "epoch": 1.5089903181189488, "grad_norm": 0.5034636051894814, "kl": 0.1951904296875, "learning_rate": 4.982391384845216e-07, "loss": 0.0002, "reward": 1.7392858117818832, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857372760773, "rewards/format_reward_func": 1.0, "step": 9000 }, { "completion_length": 227.89286613464355, "epoch": 1.5093256213588164, "grad_norm": 0.19870862664022154, "kl": 0.1785888671875, "learning_rate": 4.982377169530892e-07, "loss": 0.0002, "reward": 1.7107143551111221, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.71071432903409, "rewards/format_reward_func": 1.0, "step": 9002 }, { "completion_length": 227.68750953674316, "epoch": 1.509660924598684, "grad_norm": 0.2525564638952568, "kl": 0.20538330078125, "learning_rate": 4.982362948501214e-07, "loss": 0.0002, "reward": 1.7035715207457542, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7035714536905289, "rewards/format_reward_func": 1.0, "step": 9004 }, { "completion_length": 233.28125953674316, "epoch": 1.5099962278385515, "grad_norm": 0.23294504953174144, "kl": 0.242095947265625, "learning_rate": 4.982348721756217e-07, "loss": 0.0002, "reward": 1.726785771548748, "reward_std": 0.04293148126453161, "rewards/equation_reward_func": 0.7312500271946192, "rewards/format_reward_func": 0.9955357164144516, "step": 9006 }, { "completion_length": 233.90179634094238, "epoch": 1.5103315310784189, "grad_norm": 0.4282919691411335, "kl": 0.20703125, "learning_rate": 4.982334489295933e-07, "loss": 0.0002, "reward": 1.7750000655651093, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7750000394880772, "rewards/format_reward_func": 1.0, "step": 9008 }, { "completion_length": 236.61607933044434, "epoch": 1.5106668343182865, "grad_norm": 0.492780322237796, "kl": 0.288299560546875, "learning_rate": 4.982320251120395e-07, "loss": 0.0003, "reward": 1.7571429312229156, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 9010 }, { "completion_length": 242.3750114440918, "epoch": 1.5110021375581542, "grad_norm": 0.233727013707272, "kl": 0.2298583984375, "learning_rate": 4.982306007229634e-07, "loss": 0.0002, "reward": 1.7089286521077156, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7133928909897804, "rewards/format_reward_func": 0.9955357164144516, "step": 9012 }, { "completion_length": 234.3080472946167, "epoch": 1.5113374407980218, "grad_norm": 0.1943769671417021, "kl": 0.2056884765625, "learning_rate": 4.982291757623685e-07, "loss": 0.0002, "reward": 1.7357143610715866, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7357143275439739, "rewards/format_reward_func": 1.0, "step": 9014 }, { "completion_length": 244.28572463989258, "epoch": 1.5116727440378894, "grad_norm": 0.592650509147069, "kl": 0.294830322265625, "learning_rate": 4.98227750230258e-07, "loss": 0.0003, "reward": 1.6875000819563866, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.6919643245637417, "rewards/format_reward_func": 0.9955357164144516, "step": 9016 }, { "completion_length": 233.68304824829102, "epoch": 1.5120080472777568, "grad_norm": 0.19249539570632418, "kl": 0.17291259765625, "learning_rate": 4.982263241266353e-07, "loss": 0.0002, "reward": 1.8250000476837158, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8250000290572643, "rewards/format_reward_func": 1.0, "step": 9018 }, { "completion_length": 221.12947463989258, "epoch": 1.5123433505176243, "grad_norm": 0.23756867356380112, "kl": 0.20294189453125, "learning_rate": 4.982248974515033e-07, "loss": 0.0002, "reward": 1.8178571835160255, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8178571723401546, "rewards/format_reward_func": 1.0, "step": 9020 }, { "completion_length": 232.18751049041748, "epoch": 1.5126786537574919, "grad_norm": 0.16497460287344504, "kl": 0.15875244140625, "learning_rate": 4.982234702048658e-07, "loss": 0.0002, "reward": 1.7571429312229156, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428790688515, "rewards/format_reward_func": 1.0, "step": 9022 }, { "completion_length": 223.31251049041748, "epoch": 1.5130139569973595, "grad_norm": 0.20123506275968653, "kl": 0.142669677734375, "learning_rate": 4.982220423867257e-07, "loss": 0.0001, "reward": 1.7821428999304771, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428962051868, "rewards/format_reward_func": 1.0, "step": 9024 }, { "completion_length": 242.10268878936768, "epoch": 1.5133492602372272, "grad_norm": 0.20078758437765387, "kl": 0.135955810546875, "learning_rate": 4.982206139970865e-07, "loss": 0.0001, "reward": 1.7428572103381157, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571823984385, "rewards/format_reward_func": 1.0, "step": 9026 }, { "completion_length": 230.040189743042, "epoch": 1.5136845634770946, "grad_norm": 0.2897700898573048, "kl": 0.130035400390625, "learning_rate": 4.982191850359514e-07, "loss": 0.0001, "reward": 1.796428620815277, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 9028 }, { "completion_length": 245.99108123779297, "epoch": 1.5140198667169622, "grad_norm": 0.23874503537560982, "kl": 0.152069091796875, "learning_rate": 4.982177555033236e-07, "loss": 0.0002, "reward": 1.7522322237491608, "reward_std": 0.0473508988507092, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 0.9986607171595097, "step": 9030 }, { "completion_length": 237.39733219146729, "epoch": 1.5143551699568296, "grad_norm": 0.10178434262825313, "kl": 0.1427001953125, "learning_rate": 4.982163253992066e-07, "loss": 0.0001, "reward": 1.7035715132951736, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7035714574158192, "rewards/format_reward_func": 1.0, "step": 9032 }, { "completion_length": 244.5625123977661, "epoch": 1.5146904731966973, "grad_norm": 0.23837435651216773, "kl": 0.157928466796875, "learning_rate": 4.982148947236036e-07, "loss": 0.0002, "reward": 1.7464286237955093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 9034 }, { "completion_length": 237.78126335144043, "epoch": 1.5150257764365649, "grad_norm": 0.41831335496057137, "kl": 0.15301513671875, "learning_rate": 4.982134634765178e-07, "loss": 0.0002, "reward": 1.7714286223053932, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 9036 }, { "completion_length": 243.93304824829102, "epoch": 1.5153610796764325, "grad_norm": 0.19669736268949578, "kl": 0.146759033203125, "learning_rate": 4.982120316579527e-07, "loss": 0.0001, "reward": 1.6750001087784767, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.6750000193715096, "rewards/format_reward_func": 1.0, "step": 9038 }, { "completion_length": 249.227689743042, "epoch": 1.5156963829163, "grad_norm": 0.11420041596317355, "kl": 0.165985107421875, "learning_rate": 4.982105992679113e-07, "loss": 0.0002, "reward": 1.7678571939468384, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7678571827709675, "rewards/format_reward_func": 1.0, "step": 9040 }, { "completion_length": 253.27233219146729, "epoch": 1.5160316861561673, "grad_norm": 0.5580972834206843, "kl": 0.196990966796875, "learning_rate": 4.982091663063972e-07, "loss": 0.0002, "reward": 1.7785715013742447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714454948902, "rewards/format_reward_func": 1.0, "step": 9042 }, { "completion_length": 249.9330472946167, "epoch": 1.516366989396035, "grad_norm": 0.3001462222001796, "kl": 0.154052734375, "learning_rate": 4.982077327734135e-07, "loss": 0.0002, "reward": 1.7803572192788124, "reward_std": 0.09848987124860287, "rewards/equation_reward_func": 0.7848214581608772, "rewards/format_reward_func": 0.9955357164144516, "step": 9044 }, { "completion_length": 249.7946548461914, "epoch": 1.5167022926359026, "grad_norm": 0.2560114961097288, "kl": 0.13873291015625, "learning_rate": 4.982062986689637e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 9046 }, { "completion_length": 249.946439743042, "epoch": 1.5170375958757703, "grad_norm": 0.22287832163678353, "kl": 0.1337890625, "learning_rate": 4.982048639930509e-07, "loss": 0.0001, "reward": 1.7285714969038963, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7285714484751225, "rewards/format_reward_func": 1.0, "step": 9048 }, { "completion_length": 241.42858409881592, "epoch": 1.5173728991156377, "grad_norm": 0.17462550405638447, "kl": 0.124725341796875, "learning_rate": 4.982034287456784e-07, "loss": 0.0001, "reward": 1.725000075995922, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7339286059141159, "rewards/format_reward_func": 0.9910714328289032, "step": 9050 }, { "completion_length": 241.64287090301514, "epoch": 1.5177082023555053, "grad_norm": 0.31667186359080757, "kl": 0.12017822265625, "learning_rate": 4.982019929268496e-07, "loss": 0.0001, "reward": 1.792857214808464, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571775555611, "rewards/format_reward_func": 1.0, "step": 9052 }, { "completion_length": 244.20983219146729, "epoch": 1.5180435055953727, "grad_norm": 0.7155592391967859, "kl": 0.233062744140625, "learning_rate": 4.982005565365678e-07, "loss": 0.0002, "reward": 1.7428571954369545, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7428571693599224, "rewards/format_reward_func": 1.0, "step": 9054 }, { "completion_length": 234.5044755935669, "epoch": 1.5183788088352403, "grad_norm": 0.19988312655943996, "kl": 0.1508026123046875, "learning_rate": 4.981991195748363e-07, "loss": 0.0002, "reward": 1.741071492433548, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7455357573926449, "rewards/format_reward_func": 0.9955357164144516, "step": 9056 }, { "completion_length": 238.2053680419922, "epoch": 1.518714112075108, "grad_norm": 0.2720902850609489, "kl": 0.113433837890625, "learning_rate": 4.981976820416584e-07, "loss": 0.0001, "reward": 1.7285715192556381, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7285714596509933, "rewards/format_reward_func": 1.0, "step": 9058 }, { "completion_length": 232.69197750091553, "epoch": 1.5190494153149756, "grad_norm": 0.20592691107488253, "kl": 0.165008544921875, "learning_rate": 4.981962439370374e-07, "loss": 0.0002, "reward": 1.8250000476837158, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8250000327825546, "rewards/format_reward_func": 1.0, "step": 9060 }, { "completion_length": 236.633939743042, "epoch": 1.519384718554843, "grad_norm": 0.31761312113051826, "kl": 0.1402587890625, "learning_rate": 4.981948052609767e-07, "loss": 0.0001, "reward": 1.8000000417232513, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8000000342726707, "rewards/format_reward_func": 1.0, "step": 9062 }, { "completion_length": 235.4241189956665, "epoch": 1.5197200217947104, "grad_norm": 0.1862511774783427, "kl": 0.136566162109375, "learning_rate": 4.981933660134795e-07, "loss": 0.0001, "reward": 1.7517857924103737, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7562500238418579, "rewards/format_reward_func": 0.9955357164144516, "step": 9064 }, { "completion_length": 234.3928689956665, "epoch": 1.520055325034578, "grad_norm": 0.22103160635442895, "kl": 0.165435791015625, "learning_rate": 4.981919261945491e-07, "loss": 0.0002, "reward": 1.7767857685685158, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7812500204890966, "rewards/format_reward_func": 0.9955357164144516, "step": 9066 }, { "completion_length": 234.696439743042, "epoch": 1.5203906282744457, "grad_norm": 0.5237816922040125, "kl": 0.127655029296875, "learning_rate": 4.981904858041889e-07, "loss": 0.0001, "reward": 1.6714286729693413, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.6714286096394062, "rewards/format_reward_func": 1.0, "step": 9068 }, { "completion_length": 233.1116180419922, "epoch": 1.5207259315143133, "grad_norm": 0.2222210056755971, "kl": 0.108612060546875, "learning_rate": 4.981890448424021e-07, "loss": 0.0001, "reward": 1.7642857655882835, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 9070 }, { "completion_length": 227.2232265472412, "epoch": 1.5210612347541808, "grad_norm": 0.3268976894652414, "kl": 0.09259033203125, "learning_rate": 4.981876033091922e-07, "loss": 0.0001, "reward": 1.766071505844593, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7696428820490837, "rewards/format_reward_func": 0.9964285790920258, "step": 9072 }, { "completion_length": 221.62947368621826, "epoch": 1.5213965379940484, "grad_norm": 0.11221103222961556, "kl": 0.10101318359375, "learning_rate": 4.981861612045624e-07, "loss": 0.0001, "reward": 1.8357143253087997, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8357143104076385, "rewards/format_reward_func": 1.0, "step": 9074 }, { "completion_length": 229.33483123779297, "epoch": 1.5217318412339158, "grad_norm": 0.16956317672941165, "kl": 0.0871429443359375, "learning_rate": 4.98184718528516e-07, "loss": 0.0001, "reward": 1.8178572058677673, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8178571723401546, "rewards/format_reward_func": 1.0, "step": 9076 }, { "completion_length": 228.6741180419922, "epoch": 1.5220671444737834, "grad_norm": 0.11080000675820717, "kl": 0.113494873046875, "learning_rate": 4.981832752810564e-07, "loss": 0.0001, "reward": 1.7642857506871223, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7642857357859612, "rewards/format_reward_func": 1.0, "step": 9078 }, { "completion_length": 231.3660831451416, "epoch": 1.522402447713651, "grad_norm": 0.074742168877599, "kl": 0.09735107421875, "learning_rate": 4.981818314621868e-07, "loss": 0.0001, "reward": 1.787500075995922, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7919643148779869, "rewards/format_reward_func": 0.9955357164144516, "step": 9080 }, { "completion_length": 230.1785831451416, "epoch": 1.5227377509535187, "grad_norm": 0.2248487524696226, "kl": 0.111541748046875, "learning_rate": 4.981803870719107e-07, "loss": 0.0001, "reward": 1.7821429297327995, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7821428775787354, "rewards/format_reward_func": 1.0, "step": 9082 }, { "completion_length": 240.22322273254395, "epoch": 1.5230730541933861, "grad_norm": 0.2183842155755358, "kl": 0.0999603271484375, "learning_rate": 4.981789421102313e-07, "loss": 0.0001, "reward": 1.8250000327825546, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8250000216066837, "rewards/format_reward_func": 1.0, "step": 9084 }, { "completion_length": 231.1384048461914, "epoch": 1.5234083574332535, "grad_norm": 0.20518936982415142, "kl": 0.0991363525390625, "learning_rate": 4.98177496577152e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714767873287, "rewards/format_reward_func": 1.0, "step": 9086 }, { "completion_length": 229.571439743042, "epoch": 1.5237436606731212, "grad_norm": 0.255223191177945, "kl": 0.094573974609375, "learning_rate": 4.981760504726759e-07, "loss": 0.0001, "reward": 1.8250000476837158, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8250000402331352, "rewards/format_reward_func": 1.0, "step": 9088 }, { "completion_length": 244.84822463989258, "epoch": 1.5240789639129888, "grad_norm": 0.24607461370798528, "kl": 0.1124420166015625, "learning_rate": 4.981746037968068e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.767857164144516, "rewards/format_reward_func": 1.0, "step": 9090 }, { "completion_length": 242.04911708831787, "epoch": 1.5244142671528564, "grad_norm": 0.09837423239495768, "kl": 0.1160888671875, "learning_rate": 4.981731565495475e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 9092 }, { "completion_length": 240.37947463989258, "epoch": 1.524749570392724, "grad_norm": 0.19651923643307115, "kl": 0.097412109375, "learning_rate": 4.981717087309018e-07, "loss": 0.0001, "reward": 1.79642865806818, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 9094 }, { "completion_length": 237.21875858306885, "epoch": 1.5250848736325915, "grad_norm": 0.14018062890921712, "kl": 0.09429931640625, "learning_rate": 4.981702603408726e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 9096 }, { "completion_length": 230.48215198516846, "epoch": 1.525420176872459, "grad_norm": 0.21898535817201434, "kl": 0.1555023193359375, "learning_rate": 4.981688113794636e-07, "loss": 0.0002, "reward": 1.803571492433548, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8035714477300644, "rewards/format_reward_func": 1.0, "step": 9098 }, { "completion_length": 244.7991180419922, "epoch": 1.5257554801123265, "grad_norm": 0.16935366185685477, "kl": 0.0987396240234375, "learning_rate": 4.981673618466779e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 9100 }, { "completion_length": 241.27233123779297, "epoch": 1.5260907833521942, "grad_norm": 0.33333039079707383, "kl": 0.0966796875, "learning_rate": 4.981659117425189e-07, "loss": 0.0001, "reward": 1.6571429520845413, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.6571428943425417, "rewards/format_reward_func": 1.0, "step": 9102 }, { "completion_length": 235.34376049041748, "epoch": 1.5264260865920618, "grad_norm": 0.2157871403374037, "kl": 0.09100341796875, "learning_rate": 4.9816446106699e-07, "loss": 0.0001, "reward": 1.7357143610715866, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 9104 }, { "completion_length": 235.5714406967163, "epoch": 1.5267613898319292, "grad_norm": 0.21741295384543297, "kl": 0.0951690673828125, "learning_rate": 4.981630098200946e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.760714303702116, "rewards/format_reward_func": 1.0, "step": 9106 }, { "completion_length": 239.915189743042, "epoch": 1.5270966930717969, "grad_norm": 0.3244411047147427, "kl": 0.1490478515625, "learning_rate": 4.981615580018357e-07, "loss": 0.0001, "reward": 1.7910714820027351, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7955357395112514, "rewards/format_reward_func": 0.9955357164144516, "step": 9108 }, { "completion_length": 240.0937614440918, "epoch": 1.5274319963116643, "grad_norm": 0.17487077329756923, "kl": 0.125091552734375, "learning_rate": 4.98160105612217e-07, "loss": 0.0001, "reward": 1.8000000342726707, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8000000342726707, "rewards/format_reward_func": 1.0, "step": 9110 }, { "completion_length": 251.28125953674316, "epoch": 1.527767299551532, "grad_norm": 0.26637816520531216, "kl": 0.140777587890625, "learning_rate": 4.981586526512418e-07, "loss": 0.0001, "reward": 1.8000000417232513, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.800000037997961, "rewards/format_reward_func": 1.0, "step": 9112 }, { "completion_length": 251.94197463989258, "epoch": 1.5281026027913995, "grad_norm": 0.20245928613813574, "kl": 0.1270904541015625, "learning_rate": 4.981571991189133e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 9114 }, { "completion_length": 242.29018878936768, "epoch": 1.5284379060312672, "grad_norm": 0.17394760960328057, "kl": 0.153533935546875, "learning_rate": 4.98155745015235e-07, "loss": 0.0002, "reward": 1.8214286118745804, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8214285969734192, "rewards/format_reward_func": 1.0, "step": 9116 }, { "completion_length": 247.1160831451416, "epoch": 1.5287732092711346, "grad_norm": 0.2279073006313851, "kl": 0.10235595703125, "learning_rate": 4.9815429034021e-07, "loss": 0.0001, "reward": 1.741071492433548, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7455357536673546, "rewards/format_reward_func": 0.9955357164144516, "step": 9118 }, { "completion_length": 242.0178689956665, "epoch": 1.529108512511002, "grad_norm": 0.2639212009108935, "kl": 0.1070709228515625, "learning_rate": 4.981528350938419e-07, "loss": 0.0001, "reward": 1.7017857879400253, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7062500435858965, "rewards/format_reward_func": 0.9955357164144516, "step": 9120 }, { "completion_length": 241.91965293884277, "epoch": 1.5294438157508696, "grad_norm": 0.22109535656033716, "kl": 0.105133056640625, "learning_rate": 4.98151379276134e-07, "loss": 0.0001, "reward": 1.796428605914116, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964286096394062, "rewards/format_reward_func": 1.0, "step": 9122 }, { "completion_length": 249.31697463989258, "epoch": 1.5297791189907373, "grad_norm": 0.15664011211006298, "kl": 0.098358154296875, "learning_rate": 4.981499228870895e-07, "loss": 0.0001, "reward": 1.7857143357396126, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.785714304074645, "rewards/format_reward_func": 1.0, "step": 9124 }, { "completion_length": 242.37054634094238, "epoch": 1.530114422230605, "grad_norm": 0.18290893825178986, "kl": 0.1169281005859375, "learning_rate": 4.981484659267121e-07, "loss": 0.0001, "reward": 1.775000087916851, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 1.0, "step": 9126 }, { "completion_length": 251.2991189956665, "epoch": 1.5304497254704723, "grad_norm": 0.25195483521072903, "kl": 0.1036376953125, "learning_rate": 4.981470083950047e-07, "loss": 0.0001, "reward": 1.7428572103381157, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571674972773, "rewards/format_reward_func": 1.0, "step": 9128 }, { "completion_length": 239.30358219146729, "epoch": 1.53078502871034, "grad_norm": 0.12518929441684795, "kl": 0.095672607421875, "learning_rate": 4.98145550291971e-07, "loss": 0.0001, "reward": 1.7642857432365417, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857600003481, "rewards/format_reward_func": 1.0, "step": 9130 }, { "completion_length": 237.91072463989258, "epoch": 1.5311203319502074, "grad_norm": 0.20083257203715613, "kl": 0.092926025390625, "learning_rate": 4.981440916176142e-07, "loss": 0.0001, "reward": 1.7250000983476639, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7250000238418579, "rewards/format_reward_func": 1.0, "step": 9132 }, { "completion_length": 247.0937614440918, "epoch": 1.531455635190075, "grad_norm": 0.22650875766293518, "kl": 0.103668212890625, "learning_rate": 4.981426323719377e-07, "loss": 0.0001, "reward": 1.728571504354477, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7285714708268642, "rewards/format_reward_func": 1.0, "step": 9134 }, { "completion_length": 243.86162090301514, "epoch": 1.5317909384299426, "grad_norm": 0.08014615946695847, "kl": 0.09442138671875, "learning_rate": 4.981411725549449e-07, "loss": 0.0001, "reward": 1.710714377462864, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7107143178582191, "rewards/format_reward_func": 1.0, "step": 9136 }, { "completion_length": 237.8616180419922, "epoch": 1.5321262416698103, "grad_norm": 0.20283591817892682, "kl": 0.085479736328125, "learning_rate": 4.98139712166639e-07, "loss": 0.0001, "reward": 1.7803572043776512, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7848214544355869, "rewards/format_reward_func": 0.9955357164144516, "step": 9138 }, { "completion_length": 243.31250953674316, "epoch": 1.5324615449096777, "grad_norm": 0.2042566525715254, "kl": 0.0798492431640625, "learning_rate": 4.981382512070235e-07, "loss": 0.0001, "reward": 1.7464286461472511, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 9140 }, { "completion_length": 237.3437614440918, "epoch": 1.532796848149545, "grad_norm": 0.2004883026580934, "kl": 0.088653564453125, "learning_rate": 4.981367896761019e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 9142 }, { "completion_length": 238.80358219146729, "epoch": 1.5331321513894127, "grad_norm": 0.19496765967296867, "kl": 0.090240478515625, "learning_rate": 4.981353275738772e-07, "loss": 0.0001, "reward": 1.7285714969038963, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7285714447498322, "rewards/format_reward_func": 1.0, "step": 9144 }, { "completion_length": 231.03125953674316, "epoch": 1.5334674546292804, "grad_norm": 0.17878598708437798, "kl": 0.0771331787109375, "learning_rate": 4.981338649003531e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 9146 }, { "completion_length": 241.41965579986572, "epoch": 1.533802757869148, "grad_norm": 0.1694416729522096, "kl": 0.092437744140625, "learning_rate": 4.981324016555328e-07, "loss": 0.0001, "reward": 1.7142857909202576, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7142857573926449, "rewards/format_reward_func": 1.0, "step": 9148 }, { "completion_length": 241.62947368621826, "epoch": 1.5341380611090156, "grad_norm": 0.3325622452794708, "kl": 0.0935211181640625, "learning_rate": 4.981309378394197e-07, "loss": 0.0001, "reward": 1.6928572282195091, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.6928571686148643, "rewards/format_reward_func": 1.0, "step": 9150 }, { "completion_length": 233.10268878936768, "epoch": 1.534473364348883, "grad_norm": 0.23292539109008156, "kl": 0.086761474609375, "learning_rate": 4.981294734520172e-07, "loss": 0.0001, "reward": 1.767857238650322, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 1.0, "step": 9152 }, { "completion_length": 237.7544765472412, "epoch": 1.5348086675887505, "grad_norm": 0.20881513610441563, "kl": 0.0846099853515625, "learning_rate": 4.981280084933287e-07, "loss": 0.0001, "reward": 1.78035718947649, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7848214730620384, "rewards/format_reward_func": 0.9955357164144516, "step": 9154 }, { "completion_length": 233.49554824829102, "epoch": 1.535143970828618, "grad_norm": 0.2662071000635271, "kl": 0.089324951171875, "learning_rate": 4.981265429633575e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 9156 }, { "completion_length": 238.50001049041748, "epoch": 1.5354792740684857, "grad_norm": 0.22534214293420757, "kl": 0.102813720703125, "learning_rate": 4.98125076862107e-07, "loss": 0.0001, "reward": 1.7357143461704254, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 9158 }, { "completion_length": 226.07590293884277, "epoch": 1.5358145773083534, "grad_norm": 0.2740503946863582, "kl": 0.1274871826171875, "learning_rate": 4.981236101895806e-07, "loss": 0.0001, "reward": 1.7607143446803093, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143260538578, "rewards/format_reward_func": 1.0, "step": 9160 }, { "completion_length": 219.53572463989258, "epoch": 1.5361498805482208, "grad_norm": 0.206061237532187, "kl": 0.0881195068359375, "learning_rate": 4.981221429457815e-07, "loss": 0.0001, "reward": 1.753571517765522, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714488476515, "rewards/format_reward_func": 1.0, "step": 9162 }, { "completion_length": 222.23661708831787, "epoch": 1.5364851837880884, "grad_norm": 0.1439280155706265, "kl": 0.093841552734375, "learning_rate": 4.981206751307135e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 9164 }, { "completion_length": 228.29018878936768, "epoch": 1.5368204870279558, "grad_norm": 0.24340325547414882, "kl": 0.092681884765625, "learning_rate": 4.981192067443795e-07, "loss": 0.0001, "reward": 1.7321429327130318, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428991854191, "rewards/format_reward_func": 1.0, "step": 9166 }, { "completion_length": 223.34376049041748, "epoch": 1.5371557902678235, "grad_norm": 0.18917360158958488, "kl": 0.10345458984375, "learning_rate": 4.981177377867831e-07, "loss": 0.0001, "reward": 1.7892857566475868, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857361584902, "rewards/format_reward_func": 1.0, "step": 9168 }, { "completion_length": 238.33483600616455, "epoch": 1.537491093507691, "grad_norm": 0.273360201970595, "kl": 0.10321044921875, "learning_rate": 4.981162682579278e-07, "loss": 0.0001, "reward": 1.7071429267525673, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7071428801864386, "rewards/format_reward_func": 1.0, "step": 9170 }, { "completion_length": 226.3482255935669, "epoch": 1.5378263967475587, "grad_norm": 0.32470792042468705, "kl": 0.098785400390625, "learning_rate": 4.981147981578168e-07, "loss": 0.0001, "reward": 1.8178571984171867, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.8178571574389935, "rewards/format_reward_func": 1.0, "step": 9172 }, { "completion_length": 234.78572273254395, "epoch": 1.5381616999874261, "grad_norm": 0.2515031157446506, "kl": 0.1017303466796875, "learning_rate": 4.981133274864535e-07, "loss": 0.0001, "reward": 1.7665178999304771, "reward_std": 0.027147849323228, "rewards/equation_reward_func": 0.7678571753203869, "rewards/format_reward_func": 0.9986607171595097, "step": 9174 }, { "completion_length": 229.47768688201904, "epoch": 1.5384970032272935, "grad_norm": 0.16447082251076406, "kl": 0.092315673828125, "learning_rate": 4.981118562438414e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 9176 }, { "completion_length": 234.4107255935669, "epoch": 1.5388323064671612, "grad_norm": 0.21182922552358208, "kl": 0.107757568359375, "learning_rate": 4.981103844299837e-07, "loss": 0.0001, "reward": 1.7750000804662704, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 1.0, "step": 9178 }, { "completion_length": 232.08483123779297, "epoch": 1.5391676097070288, "grad_norm": 0.1271282836822565, "kl": 0.09710693359375, "learning_rate": 4.981089120448839e-07, "loss": 0.0001, "reward": 1.8000000268220901, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.8000000342726707, "rewards/format_reward_func": 1.0, "step": 9180 }, { "completion_length": 251.49554920196533, "epoch": 1.5395029129468965, "grad_norm": 0.08852330164511354, "kl": 0.103973388671875, "learning_rate": 4.981074390885455e-07, "loss": 0.0001, "reward": 1.7607143372297287, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143223285675, "rewards/format_reward_func": 1.0, "step": 9182 }, { "completion_length": 247.26787090301514, "epoch": 1.5398382161867639, "grad_norm": 0.18896351681271314, "kl": 0.09765625, "learning_rate": 4.981059655609717e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 9184 }, { "completion_length": 245.33036708831787, "epoch": 1.5401735194266315, "grad_norm": 0.16901570124704474, "kl": 0.0955810546875, "learning_rate": 4.98104491462166e-07, "loss": 0.0001, "reward": 1.8035714626312256, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.803571455180645, "rewards/format_reward_func": 1.0, "step": 9186 }, { "completion_length": 244.08482933044434, "epoch": 1.540508822666499, "grad_norm": 0.21763164806003021, "kl": 0.0872344970703125, "learning_rate": 4.981030167921317e-07, "loss": 0.0001, "reward": 1.7553572058677673, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214671015739, "rewards/format_reward_func": 0.9955357164144516, "step": 9188 }, { "completion_length": 244.71876049041748, "epoch": 1.5408441259063665, "grad_norm": 0.26243133709106686, "kl": 0.0949249267578125, "learning_rate": 4.981015415508725e-07, "loss": 0.0001, "reward": 1.7107143625617027, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7107143141329288, "rewards/format_reward_func": 1.0, "step": 9190 }, { "completion_length": 242.61608028411865, "epoch": 1.5411794291462342, "grad_norm": 0.17520268835161978, "kl": 0.10418701171875, "learning_rate": 4.981000657383914e-07, "loss": 0.0001, "reward": 1.8000000268220901, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000417232513, "rewards/format_reward_func": 1.0, "step": 9192 }, { "completion_length": 239.6473331451416, "epoch": 1.5415147323861018, "grad_norm": 0.20087686035129643, "kl": 0.109222412109375, "learning_rate": 4.980985893546919e-07, "loss": 0.0001, "reward": 1.7428572475910187, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7428571730852127, "rewards/format_reward_func": 1.0, "step": 9194 }, { "completion_length": 244.56697463989258, "epoch": 1.5418500356259692, "grad_norm": 0.20778828417353182, "kl": 0.10235595703125, "learning_rate": 4.980971123997776e-07, "loss": 0.0001, "reward": 1.7303572297096252, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7348214574158192, "rewards/format_reward_func": 0.9955357164144516, "step": 9196 }, { "completion_length": 241.5357265472412, "epoch": 1.5421853388658366, "grad_norm": 0.28343013163751585, "kl": 0.094146728515625, "learning_rate": 4.980956348736516e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7875000275671482, "rewards/format_reward_func": 0.9910714328289032, "step": 9198 }, { "completion_length": 236.90179920196533, "epoch": 1.5425206421057043, "grad_norm": 0.19766081655551698, "kl": 0.102569580078125, "learning_rate": 4.980941567763176e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 9200 }, { "completion_length": 236.74554538726807, "epoch": 1.542855945345572, "grad_norm": 0.22589139544291423, "kl": 0.1072540283203125, "learning_rate": 4.980926781077788e-07, "loss": 0.0001, "reward": 1.735714353621006, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7357143182307482, "rewards/format_reward_func": 1.0, "step": 9202 }, { "completion_length": 238.62500858306885, "epoch": 1.5431912485854395, "grad_norm": 0.2167072922018885, "kl": 0.0994873046875, "learning_rate": 4.980911988680386e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 9204 }, { "completion_length": 237.29018783569336, "epoch": 1.543526551825307, "grad_norm": 0.23633367374109066, "kl": 0.095428466796875, "learning_rate": 4.980897190571006e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7321428880095482, "rewards/format_reward_func": 1.0, "step": 9206 }, { "completion_length": 243.02233219146729, "epoch": 1.5438618550651746, "grad_norm": 0.2340838662723435, "kl": 0.11846923828125, "learning_rate": 4.980882386749681e-07, "loss": 0.0001, "reward": 1.7339286655187607, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7383928894996643, "rewards/format_reward_func": 0.9955357164144516, "step": 9208 }, { "completion_length": 236.89733219146729, "epoch": 1.544197158305042, "grad_norm": 0.2706127020814316, "kl": 0.097137451171875, "learning_rate": 4.980867577216444e-07, "loss": 0.0001, "reward": 1.7214286550879478, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7214285880327225, "rewards/format_reward_func": 1.0, "step": 9210 }, { "completion_length": 233.38393878936768, "epoch": 1.5445324615449096, "grad_norm": 0.20047158656273134, "kl": 0.105621337890625, "learning_rate": 4.98085276197133e-07, "loss": 0.0001, "reward": 1.707142911851406, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7071428932249546, "rewards/format_reward_func": 1.0, "step": 9212 }, { "completion_length": 244.8303680419922, "epoch": 1.5448677647847773, "grad_norm": 0.134787846932301, "kl": 0.119598388671875, "learning_rate": 4.980837941014374e-07, "loss": 0.0001, "reward": 1.6964286714792252, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.6964286044239998, "rewards/format_reward_func": 1.0, "step": 9214 }, { "completion_length": 238.9776906967163, "epoch": 1.545203068024645, "grad_norm": 0.2149803153413945, "kl": 0.110198974609375, "learning_rate": 4.980823114345608e-07, "loss": 0.0001, "reward": 1.739285796880722, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7392857447266579, "rewards/format_reward_func": 1.0, "step": 9216 }, { "completion_length": 224.63393783569336, "epoch": 1.5455383712645123, "grad_norm": 0.24007473653044104, "kl": 0.105682373046875, "learning_rate": 4.980808281965068e-07, "loss": 0.0001, "reward": 1.7357143759727478, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7357143275439739, "rewards/format_reward_func": 1.0, "step": 9218 }, { "completion_length": 235.7634038925171, "epoch": 1.5458736745043797, "grad_norm": 0.1669212593409537, "kl": 0.110992431640625, "learning_rate": 4.980793443872788e-07, "loss": 0.0001, "reward": 1.7321429327130318, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428842842579, "rewards/format_reward_func": 1.0, "step": 9220 }, { "completion_length": 226.36161613464355, "epoch": 1.5462089777442474, "grad_norm": 0.16524317745228337, "kl": 0.111541748046875, "learning_rate": 4.980778600068801e-07, "loss": 0.0001, "reward": 1.7214286401867867, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7214286103844643, "rewards/format_reward_func": 1.0, "step": 9222 }, { "completion_length": 219.8169765472412, "epoch": 1.546544280984115, "grad_norm": 0.2477187774885109, "kl": 0.091156005859375, "learning_rate": 4.980763750553142e-07, "loss": 0.0001, "reward": 1.8071429207921028, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8071428835391998, "rewards/format_reward_func": 1.0, "step": 9224 }, { "completion_length": 235.0000114440918, "epoch": 1.5468795842239826, "grad_norm": 0.11273552378741418, "kl": 0.117584228515625, "learning_rate": 4.980748895325845e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 1.0, "step": 9226 }, { "completion_length": 224.48215293884277, "epoch": 1.5472148874638503, "grad_norm": 0.2502290193023174, "kl": 0.1129608154296875, "learning_rate": 4.980734034386944e-07, "loss": 0.0001, "reward": 1.7964286133646965, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.796428594738245, "rewards/format_reward_func": 1.0, "step": 9228 }, { "completion_length": 222.09375953674316, "epoch": 1.5475501907037177, "grad_norm": 0.1613240611923338, "kl": 0.113433837890625, "learning_rate": 4.980719167736474e-07, "loss": 0.0001, "reward": 1.7839286476373672, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.788392897695303, "rewards/format_reward_func": 0.9955357164144516, "step": 9230 }, { "completion_length": 226.97768688201904, "epoch": 1.547885493943585, "grad_norm": 0.6493610709705243, "kl": 0.1061248779296875, "learning_rate": 4.980704295374469e-07, "loss": 0.0001, "reward": 1.775000050663948, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 9232 }, { "completion_length": 222.69197463989258, "epoch": 1.5482207971834527, "grad_norm": 0.25523525723368407, "kl": 0.114501953125, "learning_rate": 4.980689417300963e-07, "loss": 0.0001, "reward": 1.76250009983778, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643126428127, "rewards/format_reward_func": 0.9955357164144516, "step": 9234 }, { "completion_length": 223.65625858306885, "epoch": 1.5485561004233204, "grad_norm": 0.15555612635792165, "kl": 0.1201171875, "learning_rate": 4.980674533515989e-07, "loss": 0.0001, "reward": 1.8285714462399483, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8285714536905289, "rewards/format_reward_func": 1.0, "step": 9236 }, { "completion_length": 216.60268783569336, "epoch": 1.548891403663188, "grad_norm": 0.0735118992899377, "kl": 0.10296630859375, "learning_rate": 4.980659644019584e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 9238 }, { "completion_length": 217.47322368621826, "epoch": 1.5492267069030554, "grad_norm": 0.25828737432664267, "kl": 0.104248046875, "learning_rate": 4.980644748811778e-07, "loss": 0.0001, "reward": 1.7464286610484123, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 9240 }, { "completion_length": 214.54911518096924, "epoch": 1.549562010142923, "grad_norm": 0.21234573043333244, "kl": 0.132171630859375, "learning_rate": 4.980629847892611e-07, "loss": 0.0001, "reward": 1.767857201397419, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571809083223, "rewards/format_reward_func": 1.0, "step": 9242 }, { "completion_length": 218.95982933044434, "epoch": 1.5498973133827905, "grad_norm": 0.15823434184499677, "kl": 0.100311279296875, "learning_rate": 4.980614941262113e-07, "loss": 0.0001, "reward": 1.7428572103381157, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 9244 }, { "completion_length": 212.95536708831787, "epoch": 1.550232616622658, "grad_norm": 0.21603125601891104, "kl": 0.103302001953125, "learning_rate": 4.98060002892032e-07, "loss": 0.0001, "reward": 1.82857146859169, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8285714648663998, "rewards/format_reward_func": 1.0, "step": 9246 }, { "completion_length": 217.14733123779297, "epoch": 1.5505679198625257, "grad_norm": 0.2851890565606586, "kl": 0.114898681640625, "learning_rate": 4.980585110867265e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571428827941418, "rewards/format_reward_func": 1.0, "step": 9248 }, { "completion_length": 213.21875858306885, "epoch": 1.5509032231023934, "grad_norm": 0.40561699254233524, "kl": 0.11175537109375, "learning_rate": 4.980570187102985e-07, "loss": 0.0001, "reward": 1.7267857864499092, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7312500402331352, "rewards/format_reward_func": 0.9955357164144516, "step": 9250 }, { "completion_length": 214.15179634094238, "epoch": 1.5512385263422608, "grad_norm": 0.2224340282727774, "kl": 0.097808837890625, "learning_rate": 4.980555257627511e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 9252 }, { "completion_length": 210.20090198516846, "epoch": 1.5515738295821282, "grad_norm": 0.1431330448665486, "kl": 0.11029052734375, "learning_rate": 4.980540322440881e-07, "loss": 0.0001, "reward": 1.832142896950245, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8321428745985031, "rewards/format_reward_func": 1.0, "step": 9254 }, { "completion_length": 215.96429443359375, "epoch": 1.5519091328219958, "grad_norm": 0.42489945876210056, "kl": 0.1027374267578125, "learning_rate": 4.980525381543126e-07, "loss": 0.0001, "reward": 1.7607143446803093, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143260538578, "rewards/format_reward_func": 1.0, "step": 9256 }, { "completion_length": 219.38840198516846, "epoch": 1.5522444360618635, "grad_norm": 0.24199594531058644, "kl": 0.11187744140625, "learning_rate": 4.980510434934283e-07, "loss": 0.0001, "reward": 1.7035714983940125, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7035714685916901, "rewards/format_reward_func": 1.0, "step": 9258 }, { "completion_length": 205.69197463989258, "epoch": 1.552579739301731, "grad_norm": 0.2722525309108444, "kl": 0.092529296875, "learning_rate": 4.980495482614384e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143223285675, "rewards/format_reward_func": 1.0, "step": 9260 }, { "completion_length": 218.0134038925171, "epoch": 1.5529150425415985, "grad_norm": 0.16731072750023818, "kl": 0.0987548828125, "learning_rate": 4.980480524583465e-07, "loss": 0.0001, "reward": 1.7571429088711739, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7660714574158192, "rewards/format_reward_func": 0.9910714328289032, "step": 9262 }, { "completion_length": 210.11608123779297, "epoch": 1.5532503457814661, "grad_norm": 0.20944642746491426, "kl": 0.098785400390625, "learning_rate": 4.98046556084156e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857264727354, "rewards/format_reward_func": 1.0, "step": 9264 }, { "completion_length": 220.58483123779297, "epoch": 1.5535856490213336, "grad_norm": 0.39444762467548544, "kl": 0.1048736572265625, "learning_rate": 4.980450591388705e-07, "loss": 0.0001, "reward": 1.7178572192788124, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7267857398837805, "rewards/format_reward_func": 0.9910714328289032, "step": 9266 }, { "completion_length": 219.38393688201904, "epoch": 1.5539209522612012, "grad_norm": 0.23693520520013314, "kl": 0.102447509765625, "learning_rate": 4.980435616224932e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 9268 }, { "completion_length": 218.48661613464355, "epoch": 1.5542562555010688, "grad_norm": 0.653163923404132, "kl": 0.10009765625, "learning_rate": 4.980420635350277e-07, "loss": 0.0001, "reward": 1.79464291036129, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7991071566939354, "rewards/format_reward_func": 0.9955357164144516, "step": 9270 }, { "completion_length": 212.60715293884277, "epoch": 1.5545915587409365, "grad_norm": 0.6311999205666676, "kl": 0.097198486328125, "learning_rate": 4.980405648764773e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.785714304074645, "rewards/format_reward_func": 1.0, "step": 9272 }, { "completion_length": 221.80358219146729, "epoch": 1.5549268619808039, "grad_norm": 0.7841224772161673, "kl": 0.11199951171875, "learning_rate": 4.980390656468456e-07, "loss": 0.0001, "reward": 1.776785783469677, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7812500260770321, "rewards/format_reward_func": 0.9955357164144516, "step": 9274 }, { "completion_length": 212.97322368621826, "epoch": 1.5552621652206713, "grad_norm": 0.4696213408353257, "kl": 0.111358642578125, "learning_rate": 4.980375658461361e-07, "loss": 0.0001, "reward": 1.7696429044008255, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7741071730852127, "rewards/format_reward_func": 0.9955357164144516, "step": 9276 }, { "completion_length": 213.40179443359375, "epoch": 1.555597468460539, "grad_norm": 0.18546115989892328, "kl": 0.10791015625, "learning_rate": 4.980360654743521e-07, "loss": 0.0001, "reward": 1.7750000730156898, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 9278 }, { "completion_length": 210.76786708831787, "epoch": 1.5559327717004066, "grad_norm": 0.25145691695231076, "kl": 0.109344482421875, "learning_rate": 4.98034564531497e-07, "loss": 0.0001, "reward": 1.737500049173832, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419643234461546, "rewards/format_reward_func": 0.9955357164144516, "step": 9280 }, { "completion_length": 213.77679443359375, "epoch": 1.5562680749402742, "grad_norm": 0.20233206188903105, "kl": 0.125946044921875, "learning_rate": 4.980330630175746e-07, "loss": 0.0001, "reward": 1.7526786401867867, "reward_std": 0.056821079924702644, "rewards/equation_reward_func": 0.7589286062866449, "rewards/format_reward_func": 0.9937500059604645, "step": 9282 }, { "completion_length": 215.31251049041748, "epoch": 1.5566033781801418, "grad_norm": 0.2149426775367761, "kl": 0.14520263671875, "learning_rate": 4.980315609325879e-07, "loss": 0.0001, "reward": 1.7571429088711739, "reward_std": 0.10101525206118822, "rewards/equation_reward_func": 0.7660714574158192, "rewards/format_reward_func": 0.9910714328289032, "step": 9284 }, { "completion_length": 222.44643878936768, "epoch": 1.5569386814200092, "grad_norm": 0.3231430975015793, "kl": 0.1226806640625, "learning_rate": 4.980300582765406e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7660714574158192, "rewards/format_reward_func": 0.9910714328289032, "step": 9286 }, { "completion_length": 218.65179634094238, "epoch": 1.5572739846598767, "grad_norm": 0.1417486913493435, "kl": 0.122344970703125, "learning_rate": 4.980285550494362e-07, "loss": 0.0001, "reward": 1.769642911851406, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.774107176810503, "rewards/format_reward_func": 0.9955357164144516, "step": 9288 }, { "completion_length": 223.09376049041748, "epoch": 1.5576092878997443, "grad_norm": 0.4358009612428243, "kl": 0.121795654296875, "learning_rate": 4.980270512512782e-07, "loss": 0.0001, "reward": 1.7535714656114578, "reward_std": 0.10606601648032665, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 0.9821428656578064, "step": 9290 }, { "completion_length": 208.5178680419922, "epoch": 1.557944591139612, "grad_norm": 0.276077930647653, "kl": 0.1182861328125, "learning_rate": 4.980255468820699e-07, "loss": 0.0001, "reward": 1.7750000357627869, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 9292 }, { "completion_length": 227.321439743042, "epoch": 1.5582798943794796, "grad_norm": 0.2746660299952982, "kl": 0.126617431640625, "learning_rate": 4.980240419418148e-07, "loss": 0.0001, "reward": 1.7285714745521545, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7285714577883482, "rewards/format_reward_func": 1.0, "step": 9294 }, { "completion_length": 205.32590103149414, "epoch": 1.558615197619347, "grad_norm": 0.251892460012717, "kl": 0.115447998046875, "learning_rate": 4.980225364305164e-07, "loss": 0.0001, "reward": 1.8071428760886192, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8071428947150707, "rewards/format_reward_func": 1.0, "step": 9296 }, { "completion_length": 221.91965293884277, "epoch": 1.5589505008592146, "grad_norm": 0.5319414903169392, "kl": 0.1319580078125, "learning_rate": 4.980210303481782e-07, "loss": 0.0001, "reward": 1.7446429133415222, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7491071671247482, "rewards/format_reward_func": 0.9955357164144516, "step": 9298 }, { "completion_length": 227.07590293884277, "epoch": 1.559285804099082, "grad_norm": 0.436605329446181, "kl": 0.13720703125, "learning_rate": 4.980195236948036e-07, "loss": 0.0001, "reward": 1.7053572311997414, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7098214644938707, "rewards/format_reward_func": 0.9955357164144516, "step": 9300 }, { "completion_length": 225.92858123779297, "epoch": 1.5596211073389497, "grad_norm": 0.2605545821872648, "kl": 0.1298828125, "learning_rate": 4.98018016470396e-07, "loss": 0.0001, "reward": 1.7750000804662704, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 9302 }, { "completion_length": 227.6607265472412, "epoch": 1.5599564105788173, "grad_norm": 0.4288816041377394, "kl": 0.13018798828125, "learning_rate": 4.980165086749592e-07, "loss": 0.0001, "reward": 1.7763393446803093, "reward_std": 0.07386740390211344, "rewards/equation_reward_func": 0.7839286029338837, "rewards/format_reward_func": 0.9924107193946838, "step": 9304 }, { "completion_length": 225.93750953674316, "epoch": 1.560291713818685, "grad_norm": 0.19834380017707282, "kl": 0.1331787109375, "learning_rate": 4.980150003084962e-07, "loss": 0.0001, "reward": 1.7553571984171867, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.759821455925703, "rewards/format_reward_func": 0.9955357164144516, "step": 9306 }, { "completion_length": 225.0312614440918, "epoch": 1.5606270170585523, "grad_norm": 0.1229449240722239, "kl": 0.127410888671875, "learning_rate": 4.980134913710108e-07, "loss": 0.0001, "reward": 1.7232143580913544, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7276786081492901, "rewards/format_reward_func": 0.9955357164144516, "step": 9308 }, { "completion_length": 226.49554634094238, "epoch": 1.5609623202984197, "grad_norm": 0.5843174232684581, "kl": 0.125244140625, "learning_rate": 4.980119818625064e-07, "loss": 0.0001, "reward": 1.7660715132951736, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7705357484519482, "rewards/format_reward_func": 0.9955357164144516, "step": 9310 }, { "completion_length": 233.28125858306885, "epoch": 1.5612976235382874, "grad_norm": 0.12572644726899307, "kl": 0.1334228515625, "learning_rate": 4.980104717829865e-07, "loss": 0.0001, "reward": 1.7517857775092125, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7562500312924385, "rewards/format_reward_func": 0.9955357164144516, "step": 9312 }, { "completion_length": 228.2098331451416, "epoch": 1.561632926778155, "grad_norm": 0.20672609168897751, "kl": 0.13775634765625, "learning_rate": 4.980089611324545e-07, "loss": 0.0001, "reward": 1.7285715118050575, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7285714652389288, "rewards/format_reward_func": 1.0, "step": 9314 }, { "completion_length": 232.10268878936768, "epoch": 1.5619682300180227, "grad_norm": 0.27711563932045935, "kl": 0.13140869140625, "learning_rate": 4.980074499109139e-07, "loss": 0.0001, "reward": 1.7071429342031479, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7071428894996643, "rewards/format_reward_func": 1.0, "step": 9316 }, { "completion_length": 235.50000953674316, "epoch": 1.56230353325789, "grad_norm": 0.2513881781273045, "kl": 0.143341064453125, "learning_rate": 4.980059381183682e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7321428749710321, "rewards/format_reward_func": 1.0, "step": 9318 }, { "completion_length": 222.54018688201904, "epoch": 1.5626388364977577, "grad_norm": 0.20648914774796778, "kl": 0.136016845703125, "learning_rate": 4.980044257548209e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 1.0, "step": 9320 }, { "completion_length": 235.7634048461914, "epoch": 1.5629741397376251, "grad_norm": 0.26204188469958983, "kl": 0.159210205078125, "learning_rate": 4.980029128202755e-07, "loss": 0.0002, "reward": 1.7553572058677673, "reward_std": 0.0833375845104456, "rewards/equation_reward_func": 0.7687500342726707, "rewards/format_reward_func": 0.9866071492433548, "step": 9322 }, { "completion_length": 215.79465293884277, "epoch": 1.5633094429774927, "grad_norm": 0.31719240838874485, "kl": 0.118438720703125, "learning_rate": 4.980013993147353e-07, "loss": 0.0001, "reward": 1.7928571626543999, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571775555611, "rewards/format_reward_func": 1.0, "step": 9324 }, { "completion_length": 211.04911518096924, "epoch": 1.5636447462173604, "grad_norm": 0.14979368350307748, "kl": 0.1125640869140625, "learning_rate": 4.97999885238204e-07, "loss": 0.0001, "reward": 1.8232143446803093, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8276785835623741, "rewards/format_reward_func": 0.9955357164144516, "step": 9326 }, { "completion_length": 223.22768878936768, "epoch": 1.563980049457228, "grad_norm": 0.31624646214512, "kl": 0.12713623046875, "learning_rate": 4.979983705906852e-07, "loss": 0.0001, "reward": 1.7500000968575478, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 9328 }, { "completion_length": 218.57590293884277, "epoch": 1.5643153526970954, "grad_norm": 0.395515407485876, "kl": 0.124420166015625, "learning_rate": 4.979968553721819e-07, "loss": 0.0001, "reward": 1.7660714983940125, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7705357484519482, "rewards/format_reward_func": 0.9955357164144516, "step": 9330 }, { "completion_length": 211.87054634094238, "epoch": 1.5646506559369628, "grad_norm": 0.2938857964215618, "kl": 0.10455322265625, "learning_rate": 4.97995339582698e-07, "loss": 0.0001, "reward": 1.7821429297327995, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 9332 }, { "completion_length": 212.06250953674316, "epoch": 1.5649859591768305, "grad_norm": 0.2958914012499181, "kl": 0.135986328125, "learning_rate": 4.97993823222237e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 9334 }, { "completion_length": 216.01786708831787, "epoch": 1.5653212624166981, "grad_norm": 0.22586410780715277, "kl": 0.13189697265625, "learning_rate": 4.979923062908022e-07, "loss": 0.0001, "reward": 1.7267858013510704, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7312500402331352, "rewards/format_reward_func": 0.9955357164144516, "step": 9336 }, { "completion_length": 210.58036422729492, "epoch": 1.5656565656565657, "grad_norm": 0.31286413540600805, "kl": 0.1175537109375, "learning_rate": 4.979907887883971e-07, "loss": 0.0001, "reward": 1.7571429386734962, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 9338 }, { "completion_length": 209.39286613464355, "epoch": 1.5659918688964332, "grad_norm": 0.004725622950303789, "kl": 0.134185791015625, "learning_rate": 4.979892707150253e-07, "loss": 0.0001, "reward": 1.8035714700818062, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 9340 }, { "completion_length": 210.72322368621826, "epoch": 1.5663271721363008, "grad_norm": 0.22215252473600736, "kl": 0.126220703125, "learning_rate": 4.979877520706902e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7785714510828257, "rewards/format_reward_func": 1.0, "step": 9342 }, { "completion_length": 211.70982837677002, "epoch": 1.5666624753761682, "grad_norm": 0.28259642807042173, "kl": 0.126251220703125, "learning_rate": 4.979862328553954e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7678571697324514, "rewards/format_reward_func": 1.0, "step": 9344 }, { "completion_length": 203.66072463989258, "epoch": 1.5669977786160358, "grad_norm": 0.11420190732138948, "kl": 0.1182861328125, "learning_rate": 4.979847130691442e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000081956387, "rewards/format_reward_func": 1.0, "step": 9346 }, { "completion_length": 204.75893878936768, "epoch": 1.5673330818559035, "grad_norm": 0.2543779500102239, "kl": 0.117279052734375, "learning_rate": 4.979831927119405e-07, "loss": 0.0001, "reward": 1.7035715132951736, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7035714611411095, "rewards/format_reward_func": 1.0, "step": 9348 }, { "completion_length": 214.9107265472412, "epoch": 1.567668385095771, "grad_norm": 0.2692869245101748, "kl": 0.16180419921875, "learning_rate": 4.979816717837874e-07, "loss": 0.0002, "reward": 1.7339286357164383, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7383928950875998, "rewards/format_reward_func": 0.9955357164144516, "step": 9350 }, { "completion_length": 200.07143783569336, "epoch": 1.5680036883356385, "grad_norm": 0.3329726441021534, "kl": 0.144622802734375, "learning_rate": 4.979801502846885e-07, "loss": 0.0001, "reward": 1.785714328289032, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.785714328289032, "rewards/format_reward_func": 1.0, "step": 9352 }, { "completion_length": 206.977689743042, "epoch": 1.568338991575506, "grad_norm": 0.18730600752968107, "kl": 0.13232421875, "learning_rate": 4.979786282146474e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143022119999, "rewards/format_reward_func": 1.0, "step": 9354 }, { "completion_length": 208.29018783569336, "epoch": 1.5686742948153736, "grad_norm": 0.13967090602322763, "kl": 0.225189208984375, "learning_rate": 4.979771055736677e-07, "loss": 0.0002, "reward": 1.807142898440361, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428909897804, "rewards/format_reward_func": 1.0, "step": 9356 }, { "completion_length": 207.01340293884277, "epoch": 1.5690095980552412, "grad_norm": 0.27378668060925143, "kl": 0.123443603515625, "learning_rate": 4.979755823617525e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857447266579, "rewards/format_reward_func": 1.0, "step": 9358 }, { "completion_length": 204.61607933044434, "epoch": 1.5693449012951088, "grad_norm": 0.1799442544954095, "kl": 0.1424560546875, "learning_rate": 4.979740585789057e-07, "loss": 0.0001, "reward": 1.7821429073810577, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428701281548, "rewards/format_reward_func": 1.0, "step": 9360 }, { "completion_length": 218.508939743042, "epoch": 1.5696802045349765, "grad_norm": 0.10917451343746819, "kl": 0.1553955078125, "learning_rate": 4.979725342251307e-07, "loss": 0.0002, "reward": 1.7892857789993286, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857640981674, "rewards/format_reward_func": 1.0, "step": 9362 }, { "completion_length": 199.85268878936768, "epoch": 1.5700155077748439, "grad_norm": 0.34883051529480286, "kl": 0.137542724609375, "learning_rate": 4.979710093004311e-07, "loss": 0.0001, "reward": 1.764285795390606, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 9364 }, { "completion_length": 213.64286613464355, "epoch": 1.5703508110147113, "grad_norm": 0.23714127105446917, "kl": 0.195068359375, "learning_rate": 4.9796948380481e-07, "loss": 0.0002, "reward": 1.7000000923871994, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7000000439584255, "rewards/format_reward_func": 1.0, "step": 9366 }, { "completion_length": 204.15179634094238, "epoch": 1.570686114254579, "grad_norm": 0.31596825075507784, "kl": 0.106353759765625, "learning_rate": 4.979679577382714e-07, "loss": 0.0001, "reward": 1.7428572103381157, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571730852127, "rewards/format_reward_func": 1.0, "step": 9368 }, { "completion_length": 200.46875858306885, "epoch": 1.5710214174944466, "grad_norm": 0.25781774042292127, "kl": 0.127532958984375, "learning_rate": 4.979664311008185e-07, "loss": 0.0001, "reward": 1.8107143342494965, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8107143118977547, "rewards/format_reward_func": 1.0, "step": 9370 }, { "completion_length": 207.22768783569336, "epoch": 1.5713567207343142, "grad_norm": 0.2883900615043515, "kl": 0.1649169921875, "learning_rate": 4.979649038924551e-07, "loss": 0.0002, "reward": 1.7642857730388641, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 9372 }, { "completion_length": 208.43304443359375, "epoch": 1.5716920239741816, "grad_norm": 0.26372455175352644, "kl": 0.18939208984375, "learning_rate": 4.979633761131845e-07, "loss": 0.0002, "reward": 1.7464286461472511, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7464286126196384, "rewards/format_reward_func": 1.0, "step": 9374 }, { "completion_length": 209.62947273254395, "epoch": 1.5720273272140493, "grad_norm": 0.3049719918557979, "kl": 0.19256591796875, "learning_rate": 4.979618477630102e-07, "loss": 0.0002, "reward": 1.7714286297559738, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714286111295223, "rewards/format_reward_func": 1.0, "step": 9376 }, { "completion_length": 201.66072273254395, "epoch": 1.5723626304539167, "grad_norm": 0.16845659994063056, "kl": 0.12188720703125, "learning_rate": 4.979603188419358e-07, "loss": 0.0001, "reward": 1.7392858117818832, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857410013676, "rewards/format_reward_func": 1.0, "step": 9378 }, { "completion_length": 205.45536518096924, "epoch": 1.5726979336937843, "grad_norm": 0.22718930926429257, "kl": 0.132293701171875, "learning_rate": 4.979587893499649e-07, "loss": 0.0001, "reward": 1.735714353621006, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 9380 }, { "completion_length": 209.20983028411865, "epoch": 1.573033236933652, "grad_norm": 0.2509327116323553, "kl": 0.1441650390625, "learning_rate": 4.979572592871009e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 9382 }, { "completion_length": 211.25000858306885, "epoch": 1.5733685401735196, "grad_norm": 0.08130233488984963, "kl": 0.2010498046875, "learning_rate": 4.979557286533473e-07, "loss": 0.0002, "reward": 1.800000049173832, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 9384 }, { "completion_length": 200.88840007781982, "epoch": 1.573703843413387, "grad_norm": 0.08435947022938288, "kl": 0.12811279296875, "learning_rate": 4.979541974487077e-07, "loss": 0.0001, "reward": 1.7450893595814705, "reward_std": 0.033461302518844604, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 0.9986607171595097, "step": 9386 }, { "completion_length": 216.1919755935669, "epoch": 1.5740391466532544, "grad_norm": 0.3632452957424689, "kl": 0.2205810546875, "learning_rate": 4.979526656731856e-07, "loss": 0.0002, "reward": 1.7303572222590446, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7348214574158192, "rewards/format_reward_func": 0.9955357164144516, "step": 9388 }, { "completion_length": 219.18304538726807, "epoch": 1.574374449893122, "grad_norm": 0.30401658940048676, "kl": 0.13153076171875, "learning_rate": 4.979511333267845e-07, "loss": 0.0001, "reward": 1.7267857939004898, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7312500402331352, "rewards/format_reward_func": 0.9955357164144516, "step": 9390 }, { "completion_length": 215.1741180419922, "epoch": 1.5747097531329897, "grad_norm": 0.24728973081644434, "kl": 0.20745849609375, "learning_rate": 4.979496004095081e-07, "loss": 0.0002, "reward": 1.7375001013278961, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7419643066823483, "rewards/format_reward_func": 0.9955357164144516, "step": 9392 }, { "completion_length": 207.53572273254395, "epoch": 1.5750450563728573, "grad_norm": 0.21061804157548694, "kl": 0.24725341796875, "learning_rate": 4.979480669213596e-07, "loss": 0.0002, "reward": 1.739285796880722, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857559025288, "rewards/format_reward_func": 1.0, "step": 9394 }, { "completion_length": 197.40179443359375, "epoch": 1.5753803596127247, "grad_norm": 0.26878445750989804, "kl": 0.260589599609375, "learning_rate": 4.979465328623428e-07, "loss": 0.0003, "reward": 1.782142922282219, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 9396 }, { "completion_length": 191.18750953674316, "epoch": 1.5757156628525923, "grad_norm": 0.26487623591318504, "kl": 0.123138427734375, "learning_rate": 4.979449982324612e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571753203869, "rewards/format_reward_func": 1.0, "step": 9398 }, { "completion_length": 193.94643878936768, "epoch": 1.5760509660924598, "grad_norm": 0.3455811579684974, "kl": 0.138336181640625, "learning_rate": 4.979434630317181e-07, "loss": 0.0001, "reward": 1.771428644657135, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 9400 }, { "completion_length": 200.52233123779297, "epoch": 1.5763862693323274, "grad_norm": 0.1336782326343173, "kl": 0.133148193359375, "learning_rate": 4.979419272601174e-07, "loss": 0.0001, "reward": 1.7464286461472511, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 9402 }, { "completion_length": 192.94643783569336, "epoch": 1.576721572572195, "grad_norm": 0.23079795904077532, "kl": 0.14453125, "learning_rate": 4.979403909176625e-07, "loss": 0.0001, "reward": 1.7821428999304771, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428868919611, "rewards/format_reward_func": 1.0, "step": 9404 }, { "completion_length": 203.18750858306885, "epoch": 1.5770568758120627, "grad_norm": 0.20095549925177159, "kl": 0.288726806640625, "learning_rate": 4.979388540043568e-07, "loss": 0.0003, "reward": 1.7750000730156898, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 9406 }, { "completion_length": 191.08036613464355, "epoch": 1.57739217905193, "grad_norm": 0.20087382205830567, "kl": 0.220916748046875, "learning_rate": 4.979373165202039e-07, "loss": 0.0002, "reward": 1.7535715103149414, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 9408 }, { "completion_length": 191.23661708831787, "epoch": 1.5777274822917975, "grad_norm": 0.13166586092674154, "kl": 0.138763427734375, "learning_rate": 4.979357784652073e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.753571467474103, "rewards/format_reward_func": 1.0, "step": 9410 }, { "completion_length": 188.56697177886963, "epoch": 1.5780627855316651, "grad_norm": 0.13549451743155633, "kl": 0.115631103515625, "learning_rate": 4.979342398393707e-07, "loss": 0.0001, "reward": 1.7214286476373672, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.721428606659174, "rewards/format_reward_func": 1.0, "step": 9412 }, { "completion_length": 181.10268592834473, "epoch": 1.5783980887715328, "grad_norm": 0.20587388166420736, "kl": 0.23712158203125, "learning_rate": 4.979327006426975e-07, "loss": 0.0002, "reward": 1.7053572311997414, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7098214700818062, "rewards/format_reward_func": 0.9955357164144516, "step": 9414 }, { "completion_length": 179.82143688201904, "epoch": 1.5787333920114004, "grad_norm": 0.324524725201001, "kl": 0.12750244140625, "learning_rate": 4.979311608751915e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321429029107094, "rewards/format_reward_func": 1.0, "step": 9416 }, { "completion_length": 181.51340103149414, "epoch": 1.579068695251268, "grad_norm": 0.363719970992371, "kl": 0.156158447265625, "learning_rate": 4.979296205368558e-07, "loss": 0.0002, "reward": 1.7821428999304771, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821429036557674, "rewards/format_reward_func": 1.0, "step": 9418 }, { "completion_length": 184.34375953674316, "epoch": 1.5794039984911354, "grad_norm": 0.2968390327039545, "kl": 0.14080810546875, "learning_rate": 4.979280796276943e-07, "loss": 0.0001, "reward": 1.74642863124609, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.746428607031703, "rewards/format_reward_func": 1.0, "step": 9420 }, { "completion_length": 184.85715103149414, "epoch": 1.5797393017310029, "grad_norm": 0.2681062341314168, "kl": 0.1266632080078125, "learning_rate": 4.979265381477104e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 9422 }, { "completion_length": 181.8125057220459, "epoch": 1.5800746049708705, "grad_norm": 0.18636219767722761, "kl": 0.105621337890625, "learning_rate": 4.979249960969077e-07, "loss": 0.0001, "reward": 1.7410715222358704, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.745535746216774, "rewards/format_reward_func": 0.9955357164144516, "step": 9424 }, { "completion_length": 192.44643592834473, "epoch": 1.5804099082107381, "grad_norm": 0.2592943293661732, "kl": 0.173614501953125, "learning_rate": 4.979234534752898e-07, "loss": 0.0002, "reward": 1.778571493923664, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 9426 }, { "completion_length": 180.08929347991943, "epoch": 1.5807452114506058, "grad_norm": 0.018409488738189025, "kl": 0.177093505859375, "learning_rate": 4.979219102828601e-07, "loss": 0.0002, "reward": 1.7678571939468384, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571902215481, "rewards/format_reward_func": 1.0, "step": 9428 }, { "completion_length": 188.42857933044434, "epoch": 1.5810805146904732, "grad_norm": 0.19724579011986249, "kl": 0.108642578125, "learning_rate": 4.979203665196222e-07, "loss": 0.0001, "reward": 1.7142857983708382, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7142857424914837, "rewards/format_reward_func": 1.0, "step": 9430 }, { "completion_length": 188.49554347991943, "epoch": 1.5814158179303408, "grad_norm": 0.37446507304918436, "kl": 0.10906982421875, "learning_rate": 4.979188221855797e-07, "loss": 0.0001, "reward": 1.7285714894533157, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7285714596509933, "rewards/format_reward_func": 1.0, "step": 9432 }, { "completion_length": 188.35268688201904, "epoch": 1.5817511211702082, "grad_norm": 0.28298486688243313, "kl": 0.1163330078125, "learning_rate": 4.979172772807363e-07, "loss": 0.0001, "reward": 1.7857143357396126, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 9434 }, { "completion_length": 180.41518592834473, "epoch": 1.5820864244100759, "grad_norm": 0.19784500966747084, "kl": 0.117919921875, "learning_rate": 4.979157318050953e-07, "loss": 0.0001, "reward": 1.7571429088711739, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 9436 }, { "completion_length": 182.25893783569336, "epoch": 1.5824217276499435, "grad_norm": 0.18481772827255594, "kl": 0.114227294921875, "learning_rate": 4.979141857586604e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.757142897695303, "rewards/format_reward_func": 1.0, "step": 9438 }, { "completion_length": 181.86607933044434, "epoch": 1.5827570308898111, "grad_norm": 0.40152965965342363, "kl": 0.295135498046875, "learning_rate": 4.979126391414352e-07, "loss": 0.0003, "reward": 1.7428572326898575, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 9440 }, { "completion_length": 188.20090293884277, "epoch": 1.5830923341296785, "grad_norm": 0.3072119809838924, "kl": 0.13543701171875, "learning_rate": 4.97911091953423e-07, "loss": 0.0001, "reward": 1.72857154160738, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7285714540630579, "rewards/format_reward_func": 1.0, "step": 9442 }, { "completion_length": 184.23661518096924, "epoch": 1.583427637369546, "grad_norm": 0.27136970912830893, "kl": 0.186431884765625, "learning_rate": 4.979095441946276e-07, "loss": 0.0002, "reward": 1.7785715013742447, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 9444 }, { "completion_length": 184.88393688201904, "epoch": 1.5837629406094136, "grad_norm": 0.2625038055494905, "kl": 0.111175537109375, "learning_rate": 4.979079958650525e-07, "loss": 0.0001, "reward": 1.735714353621006, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7357143126428127, "rewards/format_reward_func": 1.0, "step": 9446 }, { "completion_length": 188.89733028411865, "epoch": 1.5840982438492812, "grad_norm": 0.20991182672532305, "kl": 0.110870361328125, "learning_rate": 4.979064469647014e-07, "loss": 0.0001, "reward": 1.7392857819795609, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857410013676, "rewards/format_reward_func": 1.0, "step": 9448 }, { "completion_length": 187.84822273254395, "epoch": 1.5844335470891489, "grad_norm": 0.24590372324776322, "kl": 0.131500244140625, "learning_rate": 4.979048974935776e-07, "loss": 0.0001, "reward": 1.7642857655882835, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 9450 }, { "completion_length": 185.66072368621826, "epoch": 1.5847688503290163, "grad_norm": 0.15685101508432459, "kl": 0.204193115234375, "learning_rate": 4.97903347451685e-07, "loss": 0.0002, "reward": 1.7357143387198448, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.735714316368103, "rewards/format_reward_func": 1.0, "step": 9452 }, { "completion_length": 204.43750762939453, "epoch": 1.585104153568884, "grad_norm": 0.28556903563308955, "kl": 0.4090576171875, "learning_rate": 4.979017968390268e-07, "loss": 0.0004, "reward": 1.7250000983476639, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7250000182539225, "rewards/format_reward_func": 1.0, "step": 9454 }, { "completion_length": 192.05804538726807, "epoch": 1.5854394568087513, "grad_norm": 0.268773106820556, "kl": 0.1308441162109375, "learning_rate": 4.979002456556068e-07, "loss": 0.0001, "reward": 1.7625000774860382, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643126428127, "rewards/format_reward_func": 0.9955357164144516, "step": 9456 }, { "completion_length": 199.78125858306885, "epoch": 1.585774760048619, "grad_norm": 0.24722104897199124, "kl": 0.10369873046875, "learning_rate": 4.978986939014285e-07, "loss": 0.0001, "reward": 1.732142947614193, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7321428954601288, "rewards/format_reward_func": 1.0, "step": 9458 }, { "completion_length": 198.80804347991943, "epoch": 1.5861100632884866, "grad_norm": 0.20960824627267807, "kl": 0.140289306640625, "learning_rate": 4.978971415764955e-07, "loss": 0.0001, "reward": 1.7464286610484123, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 9460 }, { "completion_length": 203.90179443359375, "epoch": 1.5864453665283542, "grad_norm": 0.291625399876519, "kl": 0.148956298828125, "learning_rate": 4.978955886808114e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857544124126, "rewards/format_reward_func": 1.0, "step": 9462 }, { "completion_length": 196.58483123779297, "epoch": 1.5867806697682216, "grad_norm": 0.3246287484114839, "kl": 0.113006591796875, "learning_rate": 4.978940352143797e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 9464 }, { "completion_length": 186.13840198516846, "epoch": 1.587115973008089, "grad_norm": 0.4854994353952275, "kl": 0.113677978515625, "learning_rate": 4.97892481177204e-07, "loss": 0.0001, "reward": 1.721428669989109, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7214286103844643, "rewards/format_reward_func": 1.0, "step": 9466 }, { "completion_length": 197.19197368621826, "epoch": 1.5874512762479567, "grad_norm": 0.31571928329953813, "kl": 0.101104736328125, "learning_rate": 4.97890926569288e-07, "loss": 0.0001, "reward": 1.8000000715255737, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000230967999, "rewards/format_reward_func": 1.0, "step": 9468 }, { "completion_length": 198.43750953674316, "epoch": 1.5877865794878243, "grad_norm": 0.18825773566152013, "kl": 0.14898681640625, "learning_rate": 4.978893713906351e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571864962578, "rewards/format_reward_func": 1.0, "step": 9470 }, { "completion_length": 206.13840198516846, "epoch": 1.588121882727692, "grad_norm": 0.1629431048033093, "kl": 0.111083984375, "learning_rate": 4.97887815641249e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 9472 }, { "completion_length": 203.19197273254395, "epoch": 1.5884571859675594, "grad_norm": 0.33415035155337125, "kl": 0.104461669921875, "learning_rate": 4.978862593211331e-07, "loss": 0.0001, "reward": 1.7508929297327995, "reward_std": 0.08965103607624769, "rewards/equation_reward_func": 0.7526786141097546, "rewards/format_reward_func": 0.9982142895460129, "step": 9474 }, { "completion_length": 195.89286708831787, "epoch": 1.588792489207427, "grad_norm": 0.3242104161132054, "kl": 0.09625244140625, "learning_rate": 4.978847024302914e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000417232513, "rewards/format_reward_func": 1.0, "step": 9476 }, { "completion_length": 194.82590293884277, "epoch": 1.5891277924472944, "grad_norm": 0.2486906112229366, "kl": 0.0967254638671875, "learning_rate": 4.97883144968727e-07, "loss": 0.0001, "reward": 1.7392857819795609, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7392857540398836, "rewards/format_reward_func": 1.0, "step": 9478 }, { "completion_length": 196.92411422729492, "epoch": 1.589463095687162, "grad_norm": 0.19468741497020042, "kl": 0.10626220703125, "learning_rate": 4.978815869364437e-07, "loss": 0.0001, "reward": 1.7464286237955093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464286237955093, "rewards/format_reward_func": 1.0, "step": 9480 }, { "completion_length": 205.51786613464355, "epoch": 1.5897983989270297, "grad_norm": 0.21640894935551291, "kl": 0.12603759765625, "learning_rate": 4.978800283334451e-07, "loss": 0.0001, "reward": 1.7272322252392769, "reward_std": 0.04230013629421592, "rewards/equation_reward_func": 0.7285714596509933, "rewards/format_reward_func": 0.9986607171595097, "step": 9482 }, { "completion_length": 193.09375762939453, "epoch": 1.5901337021668973, "grad_norm": 0.15703601361285324, "kl": 0.100189208984375, "learning_rate": 4.978784691597347e-07, "loss": 0.0001, "reward": 1.7892857640981674, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7892857361584902, "rewards/format_reward_func": 1.0, "step": 9484 }, { "completion_length": 188.65179347991943, "epoch": 1.5904690054067647, "grad_norm": 0.2601317163988536, "kl": 0.136810302734375, "learning_rate": 4.978769094153163e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143245637417, "rewards/format_reward_func": 1.0, "step": 9486 }, { "completion_length": 196.0580472946167, "epoch": 1.5908043086466321, "grad_norm": 0.2600653373287085, "kl": 0.117095947265625, "learning_rate": 4.978753491001933e-07, "loss": 0.0001, "reward": 1.7857143133878708, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143245637417, "rewards/format_reward_func": 1.0, "step": 9488 }, { "completion_length": 200.73215198516846, "epoch": 1.5911396118864998, "grad_norm": 0.27887034524746496, "kl": 0.18231201171875, "learning_rate": 4.978737882143693e-07, "loss": 0.0002, "reward": 1.8178572058677673, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.817857164889574, "rewards/format_reward_func": 1.0, "step": 9490 }, { "completion_length": 202.33036708831787, "epoch": 1.5914749151263674, "grad_norm": 0.3410612068150405, "kl": 0.125335693359375, "learning_rate": 4.97872226757848e-07, "loss": 0.0001, "reward": 1.767857201397419, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7678571734577417, "rewards/format_reward_func": 1.0, "step": 9492 }, { "completion_length": 203.17858123779297, "epoch": 1.591810218366235, "grad_norm": 0.18929719663093758, "kl": 0.266021728515625, "learning_rate": 4.978706647306329e-07, "loss": 0.0003, "reward": 1.717857226729393, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7178571727126837, "rewards/format_reward_func": 1.0, "step": 9494 }, { "completion_length": 194.48661708831787, "epoch": 1.5921455216061027, "grad_norm": 0.44137712767320597, "kl": 0.210601806640625, "learning_rate": 4.978691021327276e-07, "loss": 0.0002, "reward": 1.7553572058677673, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7598214596509933, "rewards/format_reward_func": 0.9955357164144516, "step": 9496 }, { "completion_length": 208.91965007781982, "epoch": 1.59248082484597, "grad_norm": 0.3316997283825163, "kl": 0.11297607421875, "learning_rate": 4.978675389641357e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 9498 }, { "completion_length": 200.73215103149414, "epoch": 1.5928161280858375, "grad_norm": 0.27617819723690057, "kl": 0.2326507568359375, "learning_rate": 4.978659752248608e-07, "loss": 0.0002, "reward": 1.7714286223053932, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7714286111295223, "rewards/format_reward_func": 1.0, "step": 9500 }, { "completion_length": 210.42411613464355, "epoch": 1.5931514313257051, "grad_norm": 0.23732779196931864, "kl": 0.19146728515625, "learning_rate": 4.978644109149066e-07, "loss": 0.0002, "reward": 1.778571479022503, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 9502 }, { "completion_length": 206.71875858306885, "epoch": 1.5934867345655728, "grad_norm": 0.1889520164049944, "kl": 0.13079833984375, "learning_rate": 4.978628460342766e-07, "loss": 0.0001, "reward": 1.7714286148548126, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 9504 }, { "completion_length": 209.49107837677002, "epoch": 1.5938220378054404, "grad_norm": 0.28097707245087605, "kl": 0.125396728515625, "learning_rate": 4.978612805829745e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7571428846567869, "rewards/format_reward_func": 1.0, "step": 9506 }, { "completion_length": 219.37947368621826, "epoch": 1.5941573410453078, "grad_norm": 0.1472375894145825, "kl": 0.1482086181640625, "learning_rate": 4.978597145610037e-07, "loss": 0.0001, "reward": 1.7035714983940125, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7125000394880772, "rewards/format_reward_func": 0.9910714328289032, "step": 9508 }, { "completion_length": 194.54465103149414, "epoch": 1.5944926442851755, "grad_norm": 0.30787715412318145, "kl": 0.1240234375, "learning_rate": 4.97858147968368e-07, "loss": 0.0001, "reward": 1.7732143476605415, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7776785977184772, "rewards/format_reward_func": 0.9955357164144516, "step": 9510 }, { "completion_length": 205.63840198516846, "epoch": 1.5948279475250429, "grad_norm": 0.22203363715006483, "kl": 0.12939453125, "learning_rate": 4.97856580805071e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714693367481, "rewards/format_reward_func": 1.0, "step": 9512 }, { "completion_length": 218.54465293884277, "epoch": 1.5951632507649105, "grad_norm": 0.3115238153228958, "kl": 0.197662353515625, "learning_rate": 4.978550130711163e-07, "loss": 0.0002, "reward": 1.7607143819332123, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.760714303702116, "rewards/format_reward_func": 1.0, "step": 9514 }, { "completion_length": 210.4062614440918, "epoch": 1.5954985540047781, "grad_norm": 0.33087151315019003, "kl": 0.162689208984375, "learning_rate": 4.978534447665072e-07, "loss": 0.0002, "reward": 1.76071435213089, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 9516 }, { "completion_length": 219.63840198516846, "epoch": 1.5958338572446458, "grad_norm": 0.3291601561579799, "kl": 0.1533203125, "learning_rate": 4.978518758912478e-07, "loss": 0.0002, "reward": 1.712500087916851, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.716964328661561, "rewards/format_reward_func": 0.9955357164144516, "step": 9518 }, { "completion_length": 217.66518688201904, "epoch": 1.5961691604845132, "grad_norm": 0.16251913228443396, "kl": 0.175567626953125, "learning_rate": 4.978503064453414e-07, "loss": 0.0002, "reward": 1.7875000536441803, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7919643074274063, "rewards/format_reward_func": 0.9955357164144516, "step": 9520 }, { "completion_length": 214.29911708831787, "epoch": 1.5965044637243806, "grad_norm": 0.275822424940639, "kl": 0.133331298828125, "learning_rate": 4.978487364287918e-07, "loss": 0.0001, "reward": 1.7589286491274834, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7633928880095482, "rewards/format_reward_func": 0.9955357164144516, "step": 9522 }, { "completion_length": 222.16965293884277, "epoch": 1.5968397669642482, "grad_norm": 0.28340261424616403, "kl": 0.2544708251953125, "learning_rate": 4.978471658416024e-07, "loss": 0.0003, "reward": 1.7392857819795609, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7482143118977547, "rewards/format_reward_func": 0.9910714328289032, "step": 9524 }, { "completion_length": 219.68304634094238, "epoch": 1.5971750702041159, "grad_norm": 0.3308031864705601, "kl": 0.1259765625, "learning_rate": 4.978455946837769e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 9526 }, { "completion_length": 228.6205472946167, "epoch": 1.5975103734439835, "grad_norm": 0.23764646592147184, "kl": 0.1822509765625, "learning_rate": 4.978440229553191e-07, "loss": 0.0002, "reward": 1.7535714954137802, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 9528 }, { "completion_length": 221.06697273254395, "epoch": 1.597845676683851, "grad_norm": 0.1900498694957338, "kl": 0.152618408203125, "learning_rate": 4.978424506562324e-07, "loss": 0.0002, "reward": 1.780357226729393, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7848214469850063, "rewards/format_reward_func": 0.9955357164144516, "step": 9530 }, { "completion_length": 238.60268783569336, "epoch": 1.5981809799237185, "grad_norm": 0.21449225533502828, "kl": 0.18243408203125, "learning_rate": 4.978408777865204e-07, "loss": 0.0002, "reward": 1.71428582072258, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7142857350409031, "rewards/format_reward_func": 1.0, "step": 9532 }, { "completion_length": 228.45536708831787, "epoch": 1.598516283163586, "grad_norm": 0.25287001539221815, "kl": 0.203948974609375, "learning_rate": 4.978393043461869e-07, "loss": 0.0002, "reward": 1.7785714864730835, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 9534 }, { "completion_length": 251.9598331451416, "epoch": 1.5988515864034536, "grad_norm": 0.35490299267432196, "kl": 0.30194091796875, "learning_rate": 4.978377303352353e-07, "loss": 0.0003, "reward": 1.705357238650322, "reward_std": 0.13384521286934614, "rewards/equation_reward_func": 0.7276785857975483, "rewards/format_reward_func": 0.977678582072258, "step": 9536 }, { "completion_length": 235.15179824829102, "epoch": 1.5991868896433212, "grad_norm": 0.2435142863699395, "kl": 0.14727783203125, "learning_rate": 4.978361557536696e-07, "loss": 0.0001, "reward": 1.7714286521077156, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7803571745753288, "rewards/format_reward_func": 0.9910714328289032, "step": 9538 }, { "completion_length": 230.4866180419922, "epoch": 1.5995221928831889, "grad_norm": 0.26605692272202913, "kl": 0.159637451171875, "learning_rate": 4.97834580601493e-07, "loss": 0.0002, "reward": 1.7589286342263222, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7633928917348385, "rewards/format_reward_func": 0.9955357164144516, "step": 9540 }, { "completion_length": 235.8571548461914, "epoch": 1.5998574961230563, "grad_norm": 0.18506222865561817, "kl": 0.141998291015625, "learning_rate": 4.978330048787092e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.8089286088943481, "rewards/format_reward_func": 0.9910714328289032, "step": 9542 }, { "completion_length": 225.2857255935669, "epoch": 1.6001927993629237, "grad_norm": 0.24497391090326853, "kl": 0.1688232421875, "learning_rate": 4.978314285853222e-07, "loss": 0.0002, "reward": 1.8107143267989159, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8107143044471741, "rewards/format_reward_func": 1.0, "step": 9544 }, { "completion_length": 229.04465293884277, "epoch": 1.6005281026027913, "grad_norm": 0.16757262472399967, "kl": 0.142120361328125, "learning_rate": 4.978298517213352e-07, "loss": 0.0001, "reward": 1.7142857909202576, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.7232143171131611, "rewards/format_reward_func": 0.9910714328289032, "step": 9546 }, { "completion_length": 230.67858219146729, "epoch": 1.600863405842659, "grad_norm": 0.32121427829377835, "kl": 0.168212890625, "learning_rate": 4.97828274286752e-07, "loss": 0.0002, "reward": 1.7035714983940125, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7035714648663998, "rewards/format_reward_func": 1.0, "step": 9548 }, { "completion_length": 242.65626049041748, "epoch": 1.6011987090825266, "grad_norm": 0.3786336398381342, "kl": 0.359466552734375, "learning_rate": 4.978266962815763e-07, "loss": 0.0004, "reward": 1.6892857998609543, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.6982143260538578, "rewards/format_reward_func": 0.9910714328289032, "step": 9550 }, { "completion_length": 218.7142972946167, "epoch": 1.6015340123223942, "grad_norm": 0.1542904347763142, "kl": 0.2550048828125, "learning_rate": 4.978251177058116e-07, "loss": 0.0003, "reward": 1.766071505844593, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7705357372760773, "rewards/format_reward_func": 0.9955357164144516, "step": 9552 }, { "completion_length": 220.08483219146729, "epoch": 1.6018693155622616, "grad_norm": 0.2862551303649256, "kl": 0.214569091796875, "learning_rate": 4.978235385594616e-07, "loss": 0.0002, "reward": 1.8107143640518188, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.8107142969965935, "rewards/format_reward_func": 1.0, "step": 9554 }, { "completion_length": 225.383939743042, "epoch": 1.602204618802129, "grad_norm": 0.24500226164542638, "kl": 0.258697509765625, "learning_rate": 4.9782195884253e-07, "loss": 0.0003, "reward": 1.7464286386966705, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286033064127, "rewards/format_reward_func": 1.0, "step": 9556 }, { "completion_length": 211.54018783569336, "epoch": 1.6025399220419967, "grad_norm": 0.2018758611186633, "kl": 0.230255126953125, "learning_rate": 4.978203785550203e-07, "loss": 0.0002, "reward": 1.8357143476605415, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8357143104076385, "rewards/format_reward_func": 1.0, "step": 9558 }, { "completion_length": 226.4553680419922, "epoch": 1.6028752252818643, "grad_norm": 0.22041226567845768, "kl": 0.21649169921875, "learning_rate": 4.978187976969361e-07, "loss": 0.0002, "reward": 1.7392857745289803, "reward_std": 0.03535533882677555, "rewards/equation_reward_func": 0.7482143305242062, "rewards/format_reward_func": 0.9910714328289032, "step": 9560 }, { "completion_length": 203.51340198516846, "epoch": 1.603210528521732, "grad_norm": 0.6848401584259823, "kl": 0.314666748046875, "learning_rate": 4.978172162682812e-07, "loss": 0.0003, "reward": 1.7642857804894447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857581377029, "rewards/format_reward_func": 1.0, "step": 9562 }, { "completion_length": 208.80804443359375, "epoch": 1.6035458317615994, "grad_norm": 0.23038082418106143, "kl": 0.174072265625, "learning_rate": 4.978156342690593e-07, "loss": 0.0002, "reward": 1.7946429252624512, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7991071678698063, "rewards/format_reward_func": 0.9955357164144516, "step": 9564 }, { "completion_length": 210.25893783569336, "epoch": 1.603881135001467, "grad_norm": 0.26692083769371217, "kl": 0.13800048828125, "learning_rate": 4.978140516992738e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 9566 }, { "completion_length": 209.26786613464355, "epoch": 1.6042164382413344, "grad_norm": 0.6875469688497401, "kl": 0.182403564453125, "learning_rate": 4.978124685589286e-07, "loss": 0.0002, "reward": 1.7714286595582962, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 9568 }, { "completion_length": 211.85268688201904, "epoch": 1.604551741481202, "grad_norm": 0.10928835035140934, "kl": 0.168670654296875, "learning_rate": 4.978108848480271e-07, "loss": 0.0002, "reward": 1.716071479022503, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7205357421189547, "rewards/format_reward_func": 0.9955357164144516, "step": 9570 }, { "completion_length": 206.12500953674316, "epoch": 1.6048870447210697, "grad_norm": 0.35867449197804246, "kl": 0.158905029296875, "learning_rate": 4.97809300566573e-07, "loss": 0.0002, "reward": 1.825000062584877, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8250000327825546, "rewards/format_reward_func": 1.0, "step": 9572 }, { "completion_length": 208.10268878936768, "epoch": 1.6052223479609373, "grad_norm": 0.3136847225169866, "kl": 0.259307861328125, "learning_rate": 4.978077157145702e-07, "loss": 0.0003, "reward": 1.7446429133415222, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7491071820259094, "rewards/format_reward_func": 0.9955357164144516, "step": 9574 }, { "completion_length": 194.73661422729492, "epoch": 1.6055576512008047, "grad_norm": 0.1617781791123457, "kl": 0.131927490234375, "learning_rate": 4.97806130292022e-07, "loss": 0.0001, "reward": 1.7089286595582962, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7133928909897804, "rewards/format_reward_func": 0.9955357164144516, "step": 9576 }, { "completion_length": 200.80358028411865, "epoch": 1.6058929544406721, "grad_norm": 0.5766524950768712, "kl": 0.162017822265625, "learning_rate": 4.978045442989323e-07, "loss": 0.0002, "reward": 1.775000087916851, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 9578 }, { "completion_length": 193.70090198516846, "epoch": 1.6062282576805398, "grad_norm": 0.33900780695246113, "kl": 0.133544921875, "learning_rate": 4.978029577353047e-07, "loss": 0.0001, "reward": 1.7714286521077156, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714285925030708, "rewards/format_reward_func": 1.0, "step": 9580 }, { "completion_length": 198.50893688201904, "epoch": 1.6065635609204074, "grad_norm": 0.31181010593599673, "kl": 0.1903076171875, "learning_rate": 4.978013706011427e-07, "loss": 0.0002, "reward": 1.7500000596046448, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000279396772, "rewards/format_reward_func": 1.0, "step": 9582 }, { "completion_length": 201.63840293884277, "epoch": 1.606898864160275, "grad_norm": 0.2769393664102841, "kl": 0.160369873046875, "learning_rate": 4.977997828964501e-07, "loss": 0.0002, "reward": 1.7928571850061417, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 9584 }, { "completion_length": 192.71875858306885, "epoch": 1.6072341674001425, "grad_norm": 0.3967558757175834, "kl": 0.1456298828125, "learning_rate": 4.977981946212305e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.8035714440047741, "rewards/format_reward_func": 1.0, "step": 9586 }, { "completion_length": 209.50893878936768, "epoch": 1.60756947064001, "grad_norm": 0.08294927589433111, "kl": 0.18817138671875, "learning_rate": 4.977966057754877e-07, "loss": 0.0002, "reward": 1.733928643167019, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7383928876370192, "rewards/format_reward_func": 0.9955357164144516, "step": 9588 }, { "completion_length": 208.34375953674316, "epoch": 1.6079047738798775, "grad_norm": 0.21126790987528263, "kl": 0.141693115234375, "learning_rate": 4.977950163592251e-07, "loss": 0.0001, "reward": 1.7214286401867867, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7214286215603352, "rewards/format_reward_func": 1.0, "step": 9590 }, { "completion_length": 222.32590198516846, "epoch": 1.6082400771197451, "grad_norm": 0.2547810669547315, "kl": 0.19598388671875, "learning_rate": 4.977934263724466e-07, "loss": 0.0002, "reward": 1.7285715118050575, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7285714671015739, "rewards/format_reward_func": 1.0, "step": 9592 }, { "completion_length": 208.25000858306885, "epoch": 1.6085753803596128, "grad_norm": 0.1419821972171379, "kl": 0.1297607421875, "learning_rate": 4.977918358151557e-07, "loss": 0.0001, "reward": 1.764285795390606, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 9594 }, { "completion_length": 199.52233123779297, "epoch": 1.6089106835994804, "grad_norm": 0.17384380850416808, "kl": 0.115386962890625, "learning_rate": 4.977902446873561e-07, "loss": 0.0001, "reward": 1.776785746216774, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7812500260770321, "rewards/format_reward_func": 0.9955357164144516, "step": 9596 }, { "completion_length": 201.48661613464355, "epoch": 1.6092459868393478, "grad_norm": 0.36774919060599237, "kl": 0.175018310546875, "learning_rate": 4.977886529890515e-07, "loss": 0.0002, "reward": 1.8000000417232513, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.800000037997961, "rewards/format_reward_func": 1.0, "step": 9598 }, { "completion_length": 213.88393878936768, "epoch": 1.6095812900792152, "grad_norm": 0.20051143266263496, "kl": 0.131591796875, "learning_rate": 4.977870607202456e-07, "loss": 0.0001, "reward": 1.7357143759727478, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143089175224, "rewards/format_reward_func": 1.0, "step": 9600 }, { "completion_length": 216.59822368621826, "epoch": 1.6099165933190829, "grad_norm": 0.29833393306666617, "kl": 0.210479736328125, "learning_rate": 4.977854678809419e-07, "loss": 0.0002, "reward": 1.7821429297327995, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428999304771, "rewards/format_reward_func": 1.0, "step": 9602 }, { "completion_length": 225.0178689956665, "epoch": 1.6102518965589505, "grad_norm": 0.2855325199172955, "kl": 0.134735107421875, "learning_rate": 4.977838744711443e-07, "loss": 0.0001, "reward": 1.7214286476373672, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.721428606659174, "rewards/format_reward_func": 1.0, "step": 9604 }, { "completion_length": 219.96429538726807, "epoch": 1.6105871997988181, "grad_norm": 0.1848297056822976, "kl": 0.1536865234375, "learning_rate": 4.977822804908562e-07, "loss": 0.0002, "reward": 1.7950893342494965, "reward_std": 0.02714784862473607, "rewards/equation_reward_func": 0.7964285928755999, "rewards/format_reward_func": 0.9986607171595097, "step": 9606 }, { "completion_length": 218.5178680419922, "epoch": 1.6109225030386856, "grad_norm": 0.12919435157945253, "kl": 0.184478759765625, "learning_rate": 4.977806859400816e-07, "loss": 0.0002, "reward": 1.8000000566244125, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 9608 }, { "completion_length": 238.8705472946167, "epoch": 1.6112578062785532, "grad_norm": 0.6828934653371714, "kl": 0.240020751953125, "learning_rate": 4.977790908188239e-07, "loss": 0.0002, "reward": 1.7178572118282318, "reward_std": 0.09596449136734009, "rewards/equation_reward_func": 0.7357143089175224, "rewards/format_reward_func": 0.9821428656578064, "step": 9610 }, { "completion_length": 222.44197463989258, "epoch": 1.6115931095184206, "grad_norm": 0.3959098496736238, "kl": 0.192840576171875, "learning_rate": 4.977774951270869e-07, "loss": 0.0002, "reward": 1.739285796880722, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857559025288, "rewards/format_reward_func": 1.0, "step": 9612 }, { "completion_length": 225.5491180419922, "epoch": 1.6119284127582882, "grad_norm": 0.13520841983062826, "kl": 0.23187255859375, "learning_rate": 4.977758988648742e-07, "loss": 0.0002, "reward": 1.7321429178118706, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7321428786963224, "rewards/format_reward_func": 1.0, "step": 9614 }, { "completion_length": 219.13840293884277, "epoch": 1.6122637159981559, "grad_norm": 0.8712622346968198, "kl": 0.259063720703125, "learning_rate": 4.977743020321896e-07, "loss": 0.0003, "reward": 1.789285771548748, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 9616 }, { "completion_length": 221.70983219146729, "epoch": 1.6125990192380235, "grad_norm": 0.3392027587240292, "kl": 0.131103515625, "learning_rate": 4.977727046290365e-07, "loss": 0.0001, "reward": 1.7375000640749931, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419643141329288, "rewards/format_reward_func": 0.9955357164144516, "step": 9618 }, { "completion_length": 222.52679634094238, "epoch": 1.612934322477891, "grad_norm": 0.29242676314966715, "kl": 0.13372802734375, "learning_rate": 4.977711066554189e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000447034836, "rewards/format_reward_func": 1.0, "step": 9620 }, { "completion_length": 231.80804634094238, "epoch": 1.6132696257177583, "grad_norm": 0.22772919308178813, "kl": 0.1455078125, "learning_rate": 4.977695081113403e-07, "loss": 0.0001, "reward": 1.7803572043776512, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7848214618861675, "rewards/format_reward_func": 0.9955357164144516, "step": 9622 }, { "completion_length": 215.53125953674316, "epoch": 1.613604928957626, "grad_norm": 0.24217956537165358, "kl": 0.1041412353515625, "learning_rate": 4.977679089968044e-07, "loss": 0.0001, "reward": 1.7678571864962578, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.767857164144516, "rewards/format_reward_func": 1.0, "step": 9624 }, { "completion_length": 227.67411613464355, "epoch": 1.6139402321974936, "grad_norm": 0.2160774633134719, "kl": 0.239593505859375, "learning_rate": 4.977663093118151e-07, "loss": 0.0002, "reward": 1.782142922282219, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 9626 }, { "completion_length": 230.6919755935669, "epoch": 1.6142755354373612, "grad_norm": 0.1998150112790896, "kl": 0.160888671875, "learning_rate": 4.977647090563757e-07, "loss": 0.0002, "reward": 1.7517857775092125, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7562500070780516, "rewards/format_reward_func": 0.9955357164144516, "step": 9628 }, { "completion_length": 228.01786613464355, "epoch": 1.6146108386772289, "grad_norm": 0.22424239037497734, "kl": 0.160491943359375, "learning_rate": 4.977631082304901e-07, "loss": 0.0002, "reward": 1.7714286372065544, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 9630 }, { "completion_length": 232.89733409881592, "epoch": 1.6149461419170963, "grad_norm": 0.3348732755377922, "kl": 0.160919189453125, "learning_rate": 4.97761506834162e-07, "loss": 0.0002, "reward": 1.7446429058909416, "reward_std": 0.08838834706693888, "rewards/equation_reward_func": 0.7580357380211353, "rewards/format_reward_func": 0.9866071492433548, "step": 9632 }, { "completion_length": 234.50447368621826, "epoch": 1.6152814451569637, "grad_norm": 0.1766412362968214, "kl": 0.18402099609375, "learning_rate": 4.97759904867395e-07, "loss": 0.0002, "reward": 1.7321429252624512, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428917348385, "rewards/format_reward_func": 1.0, "step": 9634 }, { "completion_length": 230.77233123779297, "epoch": 1.6156167483968313, "grad_norm": 0.2087278214446919, "kl": 0.19451904296875, "learning_rate": 4.977583023301929e-07, "loss": 0.0002, "reward": 1.7035715132951736, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7035714648663998, "rewards/format_reward_func": 1.0, "step": 9636 }, { "completion_length": 227.9375123977661, "epoch": 1.615952051636699, "grad_norm": 0.7674398765859399, "kl": 0.294891357421875, "learning_rate": 4.977566992225594e-07, "loss": 0.0003, "reward": 1.816071480512619, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.820535734295845, "rewards/format_reward_func": 0.9955357164144516, "step": 9638 }, { "completion_length": 231.5312614440918, "epoch": 1.6162873548765666, "grad_norm": 0.24075346358740746, "kl": 0.1507568359375, "learning_rate": 4.97755095544498e-07, "loss": 0.0002, "reward": 1.7892857789993286, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857305705547, "rewards/format_reward_func": 1.0, "step": 9640 }, { "completion_length": 237.66072463989258, "epoch": 1.616622658116434, "grad_norm": 0.2443153487966591, "kl": 0.151885986328125, "learning_rate": 4.977534912960124e-07, "loss": 0.0002, "reward": 1.698214367032051, "reward_std": 0.06313453428447247, "rewards/equation_reward_func": 0.7116071619093418, "rewards/format_reward_func": 0.9866071492433548, "step": 9642 }, { "completion_length": 236.14733028411865, "epoch": 1.6169579613563017, "grad_norm": 0.33337972372874886, "kl": 0.476715087890625, "learning_rate": 4.977518864771065e-07, "loss": 0.0005, "reward": 1.7375000938773155, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.741964302957058, "rewards/format_reward_func": 0.9955357164144516, "step": 9644 }, { "completion_length": 229.7812614440918, "epoch": 1.617293264596169, "grad_norm": 0.16053063780435226, "kl": 0.09991455078125, "learning_rate": 4.97750281087784e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 9646 }, { "completion_length": 234.77679920196533, "epoch": 1.6176285678360367, "grad_norm": 0.21157833209685314, "kl": 0.130218505859375, "learning_rate": 4.977486751280484e-07, "loss": 0.0001, "reward": 1.8035714700818062, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8035714589059353, "rewards/format_reward_func": 1.0, "step": 9648 }, { "completion_length": 234.44643878936768, "epoch": 1.6179638710759043, "grad_norm": 0.3738183027599362, "kl": 0.736419677734375, "learning_rate": 4.977470685979035e-07, "loss": 0.0007, "reward": 1.8035714700818062, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.8125000223517418, "rewards/format_reward_func": 0.9910714328289032, "step": 9650 }, { "completion_length": 231.8482265472412, "epoch": 1.618299174315772, "grad_norm": 0.1965934925777346, "kl": 0.094573974609375, "learning_rate": 4.97745461497353e-07, "loss": 0.0001, "reward": 1.841071493923664, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.8455357328057289, "rewards/format_reward_func": 0.9955357164144516, "step": 9652 }, { "completion_length": 231.94643783569336, "epoch": 1.6186344775556394, "grad_norm": 0.3680231256765474, "kl": 0.1709747314453125, "learning_rate": 4.977438538264006e-07, "loss": 0.0002, "reward": 1.8571428954601288, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8571428693830967, "rewards/format_reward_func": 1.0, "step": 9654 }, { "completion_length": 224.1741180419922, "epoch": 1.6189697807955068, "grad_norm": 0.32522419030659977, "kl": 0.321319580078125, "learning_rate": 4.9774224558505e-07, "loss": 0.0003, "reward": 1.7875000461935997, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7919643297791481, "rewards/format_reward_func": 0.9955357164144516, "step": 9656 }, { "completion_length": 227.69643783569336, "epoch": 1.6193050840353744, "grad_norm": 0.27102926902278984, "kl": 0.31005859375, "learning_rate": 4.977406367733049e-07, "loss": 0.0003, "reward": 1.7321429327130318, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.732142886146903, "rewards/format_reward_func": 1.0, "step": 9658 }, { "completion_length": 236.2634048461914, "epoch": 1.619640387275242, "grad_norm": 0.16385341685718283, "kl": 1.131195068359375, "learning_rate": 4.977390273911689e-07, "loss": 0.0011, "reward": 1.7357143759727478, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 9660 }, { "completion_length": 234.70983123779297, "epoch": 1.6199756905151097, "grad_norm": 0.28091750128202797, "kl": 1.291107177734375, "learning_rate": 4.97737417438646e-07, "loss": 0.0013, "reward": 1.6535715013742447, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.6625000461935997, "rewards/format_reward_func": 0.9910714328289032, "step": 9662 }, { "completion_length": 220.4017972946167, "epoch": 1.6203109937549771, "grad_norm": 0.33978137394630953, "kl": 0.361083984375, "learning_rate": 4.977358069157395e-07, "loss": 0.0004, "reward": 1.7714286372065544, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 9664 }, { "completion_length": 229.77233219146729, "epoch": 1.6206462969948447, "grad_norm": 0.6532627671050593, "kl": 0.758575439453125, "learning_rate": 4.977341958224535e-07, "loss": 0.0008, "reward": 1.725000075995922, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7339286003261805, "rewards/format_reward_func": 0.9910714328289032, "step": 9666 }, { "completion_length": 228.43751049041748, "epoch": 1.6209816002347122, "grad_norm": 0.18385989770126193, "kl": 0.35162353515625, "learning_rate": 4.977325841587914e-07, "loss": 0.0004, "reward": 1.7808036357164383, "reward_std": 0.037249373737722635, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 0.9986607171595097, "step": 9668 }, { "completion_length": 237.67411518096924, "epoch": 1.6213169034745798, "grad_norm": 0.24482785614371022, "kl": 0.98797607421875, "learning_rate": 4.97730971924757e-07, "loss": 0.001, "reward": 1.7500000447034836, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7589286230504513, "rewards/format_reward_func": 0.9910714328289032, "step": 9670 }, { "completion_length": 224.8750114440918, "epoch": 1.6216522067144474, "grad_norm": 0.4871330431732284, "kl": 0.494415283203125, "learning_rate": 4.977293591203542e-07, "loss": 0.0005, "reward": 1.80892863124609, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.813392885029316, "rewards/format_reward_func": 0.9955357164144516, "step": 9672 }, { "completion_length": 222.71875858306885, "epoch": 1.621987509954315, "grad_norm": 0.14262834413088796, "kl": 0.99700927734375, "learning_rate": 4.977277457455865e-07, "loss": 0.001, "reward": 1.725000075995922, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7250000424683094, "rewards/format_reward_func": 1.0, "step": 9674 }, { "completion_length": 223.46876049041748, "epoch": 1.6223228131941825, "grad_norm": 0.34430265186672526, "kl": 0.10821533203125, "learning_rate": 4.977261318004576e-07, "loss": 0.0001, "reward": 1.7214286476373672, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7214286103844643, "rewards/format_reward_func": 1.0, "step": 9676 }, { "completion_length": 232.05804634094238, "epoch": 1.62265811643405, "grad_norm": 0.20829203027319398, "kl": 0.208831787109375, "learning_rate": 4.977245172849714e-07, "loss": 0.0002, "reward": 1.764285795390606, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 9678 }, { "completion_length": 227.8616180419922, "epoch": 1.6229934196739175, "grad_norm": 0.2671142585139483, "kl": 0.1011962890625, "learning_rate": 4.977229021991315e-07, "loss": 0.0001, "reward": 1.7625000551342964, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643051922321, "rewards/format_reward_func": 0.9955357164144516, "step": 9680 }, { "completion_length": 230.9955472946167, "epoch": 1.6233287229137852, "grad_norm": 0.2495018380150175, "kl": 0.2869873046875, "learning_rate": 4.977212865429416e-07, "loss": 0.0003, "reward": 1.778571493923664, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 9682 }, { "completion_length": 219.17411613464355, "epoch": 1.6236640261536528, "grad_norm": 0.11882449677412509, "kl": 0.189483642578125, "learning_rate": 4.977196703164055e-07, "loss": 0.0002, "reward": 1.7446429282426834, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7491071633994579, "rewards/format_reward_func": 0.9955357164144516, "step": 9684 }, { "completion_length": 220.46429443359375, "epoch": 1.6239993293935204, "grad_norm": 0.1870383835898375, "kl": 0.11187744140625, "learning_rate": 4.977180535195268e-07, "loss": 0.0001, "reward": 1.7392857745289803, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7392857447266579, "rewards/format_reward_func": 1.0, "step": 9686 }, { "completion_length": 225.55358219146729, "epoch": 1.6243346326333878, "grad_norm": 0.19740042234481134, "kl": 0.11981201171875, "learning_rate": 4.977164361523093e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 9688 }, { "completion_length": 224.74554538726807, "epoch": 1.6246699358732553, "grad_norm": 0.2529255736949469, "kl": 0.167327880859375, "learning_rate": 4.977148182147567e-07, "loss": 0.0002, "reward": 1.789285771548748, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857268452644, "rewards/format_reward_func": 1.0, "step": 9690 }, { "completion_length": 231.6696538925171, "epoch": 1.625005239113123, "grad_norm": 0.1329825256542032, "kl": 0.29461669921875, "learning_rate": 4.977131997068729e-07, "loss": 0.0003, "reward": 1.7714286297559738, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 9692 }, { "completion_length": 218.3125114440918, "epoch": 1.6253405423529905, "grad_norm": 0.0972662980687718, "kl": 0.149688720703125, "learning_rate": 4.977115806286613e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 9694 }, { "completion_length": 217.821439743042, "epoch": 1.6256758455928582, "grad_norm": 0.3084129310617149, "kl": 0.125244140625, "learning_rate": 4.977099609801259e-07, "loss": 0.0001, "reward": 1.776785783469677, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.781250037252903, "rewards/format_reward_func": 0.9955357164144516, "step": 9696 }, { "completion_length": 221.05358123779297, "epoch": 1.6260111488327256, "grad_norm": 0.2154034072950012, "kl": 0.092742919921875, "learning_rate": 4.977083407612702e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 9698 }, { "completion_length": 222.59822368621826, "epoch": 1.626346452072593, "grad_norm": 0.1480951218735697, "kl": 0.094696044921875, "learning_rate": 4.977067199720981e-07, "loss": 0.0001, "reward": 1.771428644657135, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 9700 }, { "completion_length": 225.29465293884277, "epoch": 1.6266817553124606, "grad_norm": 0.25106638423487115, "kl": 0.103851318359375, "learning_rate": 4.977050986126134e-07, "loss": 0.0001, "reward": 1.764285795390606, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 9702 }, { "completion_length": 220.85268783569336, "epoch": 1.6270170585523283, "grad_norm": 0.2751909383121962, "kl": 0.1004638671875, "learning_rate": 4.977034766828195e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 1.0, "step": 9704 }, { "completion_length": 222.95983028411865, "epoch": 1.627352361792196, "grad_norm": 0.34442896985833393, "kl": 0.0963592529296875, "learning_rate": 4.977018541827206e-07, "loss": 0.0001, "reward": 1.7571429386734962, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7571428790688515, "rewards/format_reward_func": 1.0, "step": 9706 }, { "completion_length": 232.1071538925171, "epoch": 1.6276876650320635, "grad_norm": 0.1972980182956902, "kl": 0.1473541259765625, "learning_rate": 4.977002311123201e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 9708 }, { "completion_length": 232.39286613464355, "epoch": 1.628022968271931, "grad_norm": 0.18802488475716975, "kl": 0.1038055419921875, "learning_rate": 4.976986074716217e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 9710 }, { "completion_length": 227.821439743042, "epoch": 1.6283582715117983, "grad_norm": 0.14353477227332534, "kl": 0.1986083984375, "learning_rate": 4.976969832606295e-07, "loss": 0.0002, "reward": 1.776785783469677, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7812500186264515, "rewards/format_reward_func": 0.9955357164144516, "step": 9712 }, { "completion_length": 223.3616180419922, "epoch": 1.628693574751666, "grad_norm": 0.002946186467856951, "kl": 0.0805816650390625, "learning_rate": 4.976953584793469e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 1.0, "step": 9714 }, { "completion_length": 221.93750858306885, "epoch": 1.6290288779915336, "grad_norm": 0.2969509883521366, "kl": 0.1015625, "learning_rate": 4.976937331277777e-07, "loss": 0.0001, "reward": 1.8107143267989159, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8107143007218838, "rewards/format_reward_func": 1.0, "step": 9716 }, { "completion_length": 228.8884038925171, "epoch": 1.6293641812314013, "grad_norm": 0.18499841231582298, "kl": 0.1175689697265625, "learning_rate": 4.976921072059256e-07, "loss": 0.0001, "reward": 1.8000000417232513, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.800000037997961, "rewards/format_reward_func": 1.0, "step": 9718 }, { "completion_length": 235.00001049041748, "epoch": 1.6296994844712687, "grad_norm": 0.14542382371891566, "kl": 0.104278564453125, "learning_rate": 4.976904807137947e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428790688515, "rewards/format_reward_func": 1.0, "step": 9720 }, { "completion_length": 235.32143878936768, "epoch": 1.6300347877111363, "grad_norm": 0.23501990256504454, "kl": 0.1071014404296875, "learning_rate": 4.976888536513883e-07, "loss": 0.0001, "reward": 1.7464286535978317, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 9722 }, { "completion_length": 230.77232933044434, "epoch": 1.6303700909510037, "grad_norm": 0.22814759709441632, "kl": 0.094635009765625, "learning_rate": 4.976872260187104e-07, "loss": 0.0001, "reward": 1.7428572326898575, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7428571805357933, "rewards/format_reward_func": 1.0, "step": 9724 }, { "completion_length": 228.00447463989258, "epoch": 1.6307053941908713, "grad_norm": 0.31272582817111516, "kl": 0.10003662109375, "learning_rate": 4.976855978157646e-07, "loss": 0.0001, "reward": 1.741964340209961, "reward_std": 0.09217641782015562, "rewards/equation_reward_func": 0.7482143230736256, "rewards/format_reward_func": 0.9937500059604645, "step": 9726 }, { "completion_length": 225.5000114440918, "epoch": 1.631040697430739, "grad_norm": 0.23990043104025616, "kl": 0.333709716796875, "learning_rate": 4.976839690425547e-07, "loss": 0.0003, "reward": 1.725000075995922, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7250000387430191, "rewards/format_reward_func": 1.0, "step": 9728 }, { "completion_length": 226.41072368621826, "epoch": 1.6313760006706066, "grad_norm": 0.20449317919171378, "kl": 0.1821441650390625, "learning_rate": 4.976823396990845e-07, "loss": 0.0002, "reward": 1.7839286178350449, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7883928790688515, "rewards/format_reward_func": 0.9955357164144516, "step": 9730 }, { "completion_length": 224.50447273254395, "epoch": 1.631711303910474, "grad_norm": 0.13556935221852442, "kl": 0.160125732421875, "learning_rate": 4.976807097853577e-07, "loss": 0.0002, "reward": 1.7892857789993286, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 9732 }, { "completion_length": 229.99554634094238, "epoch": 1.6320466071503414, "grad_norm": 0.24666051030834293, "kl": 0.097808837890625, "learning_rate": 4.976790793013781e-07, "loss": 0.0001, "reward": 1.7339286357164383, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.738392885774374, "rewards/format_reward_func": 0.9955357164144516, "step": 9734 }, { "completion_length": 221.28572463989258, "epoch": 1.632381910390209, "grad_norm": 0.2695799750218758, "kl": 0.0812835693359375, "learning_rate": 4.976774482471494e-07, "loss": 0.0001, "reward": 1.7696429342031479, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7741071693599224, "rewards/format_reward_func": 0.9955357164144516, "step": 9736 }, { "completion_length": 226.5357255935669, "epoch": 1.6327172136300767, "grad_norm": 0.301846569929245, "kl": 0.2186279296875, "learning_rate": 4.976758166226755e-07, "loss": 0.0002, "reward": 1.741071492433548, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7455357499420643, "rewards/format_reward_func": 0.9955357164144516, "step": 9738 }, { "completion_length": 222.57590198516846, "epoch": 1.6330525168699443, "grad_norm": 0.17720910829776573, "kl": 0.168731689453125, "learning_rate": 4.9767418442796e-07, "loss": 0.0002, "reward": 1.7464286386966705, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 9740 }, { "completion_length": 227.6696548461914, "epoch": 1.6333878201098118, "grad_norm": 0.23504752245686839, "kl": 0.344696044921875, "learning_rate": 4.976725516630065e-07, "loss": 0.0003, "reward": 1.8250000551342964, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8250000216066837, "rewards/format_reward_func": 1.0, "step": 9742 }, { "completion_length": 219.54465293884277, "epoch": 1.6337231233496794, "grad_norm": 0.26755522942864296, "kl": 0.0870819091796875, "learning_rate": 4.976709183278192e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 9744 }, { "completion_length": 223.22768878936768, "epoch": 1.6340584265895468, "grad_norm": 0.3647382535741721, "kl": 0.13238525390625, "learning_rate": 4.976692844224014e-07, "loss": 0.0001, "reward": 1.7178572416305542, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7178571820259094, "rewards/format_reward_func": 1.0, "step": 9746 }, { "completion_length": 222.90179538726807, "epoch": 1.6343937298294144, "grad_norm": 0.23603635934814576, "kl": 0.084197998046875, "learning_rate": 4.976676499467573e-07, "loss": 0.0001, "reward": 1.7125000730156898, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7169642969965935, "rewards/format_reward_func": 0.9955357164144516, "step": 9748 }, { "completion_length": 223.4776906967163, "epoch": 1.634729033069282, "grad_norm": 0.17505501443511048, "kl": 0.0858306884765625, "learning_rate": 4.976660149008903e-07, "loss": 0.0001, "reward": 1.8107143342494965, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8107143118977547, "rewards/format_reward_func": 1.0, "step": 9750 }, { "completion_length": 226.9910831451416, "epoch": 1.6350643363091497, "grad_norm": 0.16053011603624132, "kl": 0.1113739013671875, "learning_rate": 4.976643792848043e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143245637417, "rewards/format_reward_func": 1.0, "step": 9752 }, { "completion_length": 229.87054634094238, "epoch": 1.6353996395490171, "grad_norm": 0.20286619977289153, "kl": 0.1265869140625, "learning_rate": 4.976627430985031e-07, "loss": 0.0001, "reward": 1.7232143357396126, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7276786062866449, "rewards/format_reward_func": 0.9955357164144516, "step": 9754 }, { "completion_length": 229.2366180419922, "epoch": 1.6357349427888845, "grad_norm": 0.2432832648509416, "kl": 0.539398193359375, "learning_rate": 4.976611063419906e-07, "loss": 0.0005, "reward": 1.7642857804894447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 9756 }, { "completion_length": 225.43304538726807, "epoch": 1.6360702460287522, "grad_norm": 0.23562147245432455, "kl": 0.093414306640625, "learning_rate": 4.976594690152702e-07, "loss": 0.0001, "reward": 1.742857187986374, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7517857514321804, "rewards/format_reward_func": 0.9910714328289032, "step": 9758 }, { "completion_length": 226.06697368621826, "epoch": 1.6364055492686198, "grad_norm": 0.2776689340708705, "kl": 0.14593505859375, "learning_rate": 4.976578311183459e-07, "loss": 0.0001, "reward": 1.7821429297327995, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 9760 }, { "completion_length": 229.14732933044434, "epoch": 1.6367408525084874, "grad_norm": 0.2487283953788348, "kl": 0.4114837646484375, "learning_rate": 4.976561926512215e-07, "loss": 0.0004, "reward": 1.7625000476837158, "reward_std": 0.09343910776078701, "rewards/equation_reward_func": 0.7669643219560385, "rewards/format_reward_func": 0.9955357164144516, "step": 9762 }, { "completion_length": 231.17411708831787, "epoch": 1.637076155748355, "grad_norm": 0.2730592489396775, "kl": 0.090362548828125, "learning_rate": 4.976545536139007e-07, "loss": 0.0001, "reward": 1.7250000685453415, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7250000387430191, "rewards/format_reward_func": 1.0, "step": 9764 }, { "completion_length": 224.81250858306885, "epoch": 1.6374114589882225, "grad_norm": 0.2095179396768771, "kl": 0.0867919921875, "learning_rate": 4.976529140063874e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.785714328289032, "rewards/format_reward_func": 1.0, "step": 9766 }, { "completion_length": 239.69197463989258, "epoch": 1.63774676222809, "grad_norm": 0.4736853723651829, "kl": 0.27679443359375, "learning_rate": 4.976512738286851e-07, "loss": 0.0003, "reward": 1.7535715028643608, "reward_std": 0.08586296532303095, "rewards/equation_reward_func": 0.7625000402331352, "rewards/format_reward_func": 0.9910714328289032, "step": 9768 }, { "completion_length": 231.71429634094238, "epoch": 1.6380820654679575, "grad_norm": 0.12591280961738255, "kl": 0.321502685546875, "learning_rate": 4.976496330807978e-07, "loss": 0.0003, "reward": 1.8214286118745804, "reward_std": 0.03030457627028227, "rewards/equation_reward_func": 0.8303571566939354, "rewards/format_reward_func": 0.9910714328289032, "step": 9770 }, { "completion_length": 231.53126049041748, "epoch": 1.6384173687078252, "grad_norm": 0.23487225655647087, "kl": 0.55450439453125, "learning_rate": 4.976479917627292e-07, "loss": 0.0006, "reward": 1.7683036401867867, "reward_std": 0.05492704268544912, "rewards/equation_reward_func": 0.7741071730852127, "rewards/format_reward_func": 0.9941964335739613, "step": 9772 }, { "completion_length": 226.21429634094238, "epoch": 1.6387526719476928, "grad_norm": 0.26357185990527765, "kl": 0.21807861328125, "learning_rate": 4.976463498744832e-07, "loss": 0.0002, "reward": 1.7232143729925156, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7276786081492901, "rewards/format_reward_func": 0.9955357164144516, "step": 9774 }, { "completion_length": 231.37501335144043, "epoch": 1.6390879751875602, "grad_norm": 0.2148319851533725, "kl": 0.1714019775390625, "learning_rate": 4.976447074160634e-07, "loss": 0.0002, "reward": 1.7946429327130318, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7991071604192257, "rewards/format_reward_func": 0.9955357164144516, "step": 9776 }, { "completion_length": 234.99108123779297, "epoch": 1.6394232784274279, "grad_norm": 0.0844546691876832, "kl": 0.27728271484375, "learning_rate": 4.976430643874737e-07, "loss": 0.0003, "reward": 1.7178572192788124, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7178571745753288, "rewards/format_reward_func": 1.0, "step": 9778 }, { "completion_length": 239.3303689956665, "epoch": 1.6397585816672953, "grad_norm": 0.35980099021948647, "kl": 0.845733642578125, "learning_rate": 4.976414207887178e-07, "loss": 0.0008, "reward": 1.7589286416769028, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7633928991854191, "rewards/format_reward_func": 0.9955357164144516, "step": 9780 }, { "completion_length": 231.68304634094238, "epoch": 1.640093884907163, "grad_norm": 0.16832908265496788, "kl": 0.0909423828125, "learning_rate": 4.976397766197996e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714440047741, "rewards/format_reward_func": 1.0, "step": 9782 }, { "completion_length": 231.8839406967163, "epoch": 1.6404291881470305, "grad_norm": 0.2077517323854166, "kl": 0.097900390625, "learning_rate": 4.976381318807228e-07, "loss": 0.0001, "reward": 1.7714286223053932, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714286111295223, "rewards/format_reward_func": 1.0, "step": 9784 }, { "completion_length": 235.29019165039062, "epoch": 1.6407644913868982, "grad_norm": 0.1573480194081336, "kl": 0.0998077392578125, "learning_rate": 4.976364865714911e-07, "loss": 0.0001, "reward": 1.8017857670783997, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.8062500394880772, "rewards/format_reward_func": 0.9955357164144516, "step": 9786 }, { "completion_length": 247.6250114440918, "epoch": 1.6410997946267656, "grad_norm": 0.24239820048925825, "kl": 0.5035400390625, "learning_rate": 4.976348406921085e-07, "loss": 0.0005, "reward": 1.7767857685685158, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7812500298023224, "rewards/format_reward_func": 0.9955357164144516, "step": 9788 }, { "completion_length": 246.48215579986572, "epoch": 1.641435097866633, "grad_norm": 0.2469812162663503, "kl": 0.15966796875, "learning_rate": 4.976331942425786e-07, "loss": 0.0002, "reward": 1.7553572207689285, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7598214596509933, "rewards/format_reward_func": 0.9955357164144516, "step": 9790 }, { "completion_length": 236.31251049041748, "epoch": 1.6417704011065006, "grad_norm": 0.18019076628164507, "kl": 0.4852447509765625, "learning_rate": 4.976315472229054e-07, "loss": 0.0005, "reward": 1.7321429327130318, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7321428880095482, "rewards/format_reward_func": 1.0, "step": 9792 }, { "completion_length": 232.93304634094238, "epoch": 1.6421057043463683, "grad_norm": 0.17304451156624187, "kl": 0.187408447265625, "learning_rate": 4.976298996330925e-07, "loss": 0.0002, "reward": 1.7767858058214188, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7812500335276127, "rewards/format_reward_func": 0.9955357164144516, "step": 9794 }, { "completion_length": 238.790189743042, "epoch": 1.642441007586236, "grad_norm": 0.7010478912700674, "kl": 1.2734222412109375, "learning_rate": 4.976282514731437e-07, "loss": 0.0013, "reward": 1.6839286386966705, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.6883929129689932, "rewards/format_reward_func": 0.9955357164144516, "step": 9796 }, { "completion_length": 254.2901906967163, "epoch": 1.6427763108261033, "grad_norm": 0.16518823323916085, "kl": 0.342803955078125, "learning_rate": 4.976266027430629e-07, "loss": 0.0003, "reward": 1.7464286386966705, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.746428607031703, "rewards/format_reward_func": 1.0, "step": 9798 }, { "completion_length": 248.65625953674316, "epoch": 1.643111614065971, "grad_norm": 0.2258722497635101, "kl": 0.151885986328125, "learning_rate": 4.976249534428539e-07, "loss": 0.0002, "reward": 1.7642857804894447, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7732143141329288, "rewards/format_reward_func": 0.9910714328289032, "step": 9800 }, { "completion_length": 241.86161994934082, "epoch": 1.6434469173058384, "grad_norm": 0.21150592179344968, "kl": 0.124969482421875, "learning_rate": 4.976233035725203e-07, "loss": 0.0001, "reward": 1.7767857983708382, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7812500186264515, "rewards/format_reward_func": 0.9955357164144516, "step": 9802 }, { "completion_length": 239.51786613464355, "epoch": 1.643782220545706, "grad_norm": 0.23342208363613637, "kl": 0.11627197265625, "learning_rate": 4.976216531320662e-07, "loss": 0.0001, "reward": 1.7035715207457542, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.712500024586916, "rewards/format_reward_func": 0.9910714328289032, "step": 9804 }, { "completion_length": 243.0803689956665, "epoch": 1.6441175237855736, "grad_norm": 0.14752362469685548, "kl": 0.207550048828125, "learning_rate": 4.976200021214952e-07, "loss": 0.0002, "reward": 1.7678572088479996, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7767857424914837, "rewards/format_reward_func": 0.9910714328289032, "step": 9806 }, { "completion_length": 238.94197368621826, "epoch": 1.6444528270254413, "grad_norm": 0.23430527663124873, "kl": 0.1428680419921875, "learning_rate": 4.97618350540811e-07, "loss": 0.0001, "reward": 1.7625000774860382, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643051922321, "rewards/format_reward_func": 0.9955357164144516, "step": 9808 }, { "completion_length": 239.4062614440918, "epoch": 1.6447881302653087, "grad_norm": 2.2863623227606498, "kl": 0.110076904296875, "learning_rate": 4.976166983900177e-07, "loss": 0.0001, "reward": 1.7339286655187607, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7383928783237934, "rewards/format_reward_func": 0.9955357164144516, "step": 9810 }, { "completion_length": 236.11608123779297, "epoch": 1.645123433505176, "grad_norm": 0.1577864150084241, "kl": 0.1080169677734375, "learning_rate": 4.976150456691189e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.035355339758098125, "rewards/equation_reward_func": 0.7696428932249546, "rewards/format_reward_func": 0.9910714328289032, "step": 9812 }, { "completion_length": 229.29465293884277, "epoch": 1.6454587367450437, "grad_norm": 0.23221787032821553, "kl": 0.15777587890625, "learning_rate": 4.976133923781186e-07, "loss": 0.0002, "reward": 1.7785714864730835, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7875000163912773, "rewards/format_reward_func": 0.9910714328289032, "step": 9814 }, { "completion_length": 246.70090579986572, "epoch": 1.6457940399849114, "grad_norm": 0.3261517870491476, "kl": 0.225372314453125, "learning_rate": 4.976117385170204e-07, "loss": 0.0002, "reward": 1.7928571924567223, "reward_std": 0.06060915347188711, "rewards/equation_reward_func": 0.8017857521772385, "rewards/format_reward_func": 0.9910714328289032, "step": 9816 }, { "completion_length": 224.31250858306885, "epoch": 1.646129343224779, "grad_norm": 0.4089467330222451, "kl": 0.14581298828125, "learning_rate": 4.97610084085828e-07, "loss": 0.0001, "reward": 1.8500000834465027, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8500000052154064, "rewards/format_reward_func": 1.0, "step": 9818 }, { "completion_length": 231.48661994934082, "epoch": 1.6464646464646466, "grad_norm": 0.24769592529913995, "kl": 0.213653564453125, "learning_rate": 4.976084290845455e-07, "loss": 0.0002, "reward": 1.7982143461704254, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.8116071671247482, "rewards/format_reward_func": 0.9866071492433548, "step": 9820 }, { "completion_length": 229.10715198516846, "epoch": 1.646799949704514, "grad_norm": 0.23369410087790052, "kl": 0.16387939453125, "learning_rate": 4.976067735131766e-07, "loss": 0.0002, "reward": 1.7517857626080513, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7562500312924385, "rewards/format_reward_func": 0.9955357164144516, "step": 9822 }, { "completion_length": 237.2142972946167, "epoch": 1.6471352529443815, "grad_norm": 0.16716198794533854, "kl": 0.13714599609375, "learning_rate": 4.976051173717251e-07, "loss": 0.0001, "reward": 1.7696429193019867, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.774107176810503, "rewards/format_reward_func": 0.9955357164144516, "step": 9824 }, { "completion_length": 235.5580472946167, "epoch": 1.647470556184249, "grad_norm": 0.26337107233516976, "kl": 0.1463623046875, "learning_rate": 4.976034606601948e-07, "loss": 0.0001, "reward": 1.8160714581608772, "reward_std": 0.07828682195395231, "rewards/equation_reward_func": 0.8294643089175224, "rewards/format_reward_func": 0.9866071492433548, "step": 9826 }, { "completion_length": 233.915189743042, "epoch": 1.6478058594241167, "grad_norm": 0.5915121442371221, "kl": 0.158599853515625, "learning_rate": 4.976018033785895e-07, "loss": 0.0002, "reward": 1.7642857879400253, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 9828 }, { "completion_length": 244.946439743042, "epoch": 1.6481411626639844, "grad_norm": 0.22913292433146903, "kl": 0.179351806640625, "learning_rate": 4.976001455269129e-07, "loss": 0.0002, "reward": 1.7357143685221672, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7446429021656513, "rewards/format_reward_func": 0.9910714328289032, "step": 9830 }, { "completion_length": 234.47322368621826, "epoch": 1.6484764659038518, "grad_norm": 0.20752086023371275, "kl": 0.12939453125, "learning_rate": 4.975984871051692e-07, "loss": 0.0001, "reward": 1.7803571969270706, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7848214469850063, "rewards/format_reward_func": 0.9955357164144516, "step": 9832 }, { "completion_length": 224.13393688201904, "epoch": 1.6488117691437192, "grad_norm": 0.19172764728090289, "kl": 0.1249542236328125, "learning_rate": 4.97596828113362e-07, "loss": 0.0001, "reward": 1.764285758137703, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7821428868919611, "rewards/format_reward_func": 0.9821428656578064, "step": 9834 }, { "completion_length": 231.31697463989258, "epoch": 1.6491470723835868, "grad_norm": 0.2749263834024471, "kl": 0.183258056640625, "learning_rate": 4.97595168551495e-07, "loss": 0.0002, "reward": 1.7928572073578835, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.8017857410013676, "rewards/format_reward_func": 0.9910714328289032, "step": 9836 }, { "completion_length": 238.1696548461914, "epoch": 1.6494823756234545, "grad_norm": 0.1818458948457541, "kl": 0.223663330078125, "learning_rate": 4.975935084195721e-07, "loss": 0.0002, "reward": 1.757142923772335, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7660714574158192, "rewards/format_reward_func": 0.9910714328289032, "step": 9838 }, { "completion_length": 240.446439743042, "epoch": 1.649817678863322, "grad_norm": 0.2215274701890657, "kl": 0.308563232421875, "learning_rate": 4.975918477175972e-07, "loss": 0.0003, "reward": 1.7482143342494965, "reward_std": 0.07828682195395231, "rewards/equation_reward_func": 0.7705357447266579, "rewards/format_reward_func": 0.977678582072258, "step": 9840 }, { "completion_length": 229.79911613464355, "epoch": 1.6501529821031897, "grad_norm": 0.162230038132875, "kl": 0.106109619140625, "learning_rate": 4.975901864455739e-07, "loss": 0.0001, "reward": 1.7946429178118706, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7991071790456772, "rewards/format_reward_func": 0.9955357164144516, "step": 9842 }, { "completion_length": 248.53126430511475, "epoch": 1.6504882853430571, "grad_norm": 0.32201021323163814, "kl": 0.21990966796875, "learning_rate": 4.975885246035064e-07, "loss": 0.0002, "reward": 1.7267857640981674, "reward_std": 0.10354063473641872, "rewards/equation_reward_func": 0.7491071671247482, "rewards/format_reward_func": 0.977678582072258, "step": 9844 }, { "completion_length": 246.05804824829102, "epoch": 1.6508235885829246, "grad_norm": 0.3693541503025461, "kl": 0.27056884765625, "learning_rate": 4.975868621913983e-07, "loss": 0.0003, "reward": 1.7410714775323868, "reward_std": 0.08333758544176817, "rewards/equation_reward_func": 0.7633928917348385, "rewards/format_reward_func": 0.977678582072258, "step": 9846 }, { "completion_length": 246.50447845458984, "epoch": 1.6511588918227922, "grad_norm": 0.7884187970758376, "kl": 0.19110107421875, "learning_rate": 4.975851992092533e-07, "loss": 0.0002, "reward": 1.6910715103149414, "reward_std": 0.08838834706693888, "rewards/equation_reward_func": 0.7223214618861675, "rewards/format_reward_func": 0.9687500074505806, "step": 9848 }, { "completion_length": 238.7410831451416, "epoch": 1.6514941950626598, "grad_norm": 0.256588565457127, "kl": 0.242462158203125, "learning_rate": 4.975835356570755e-07, "loss": 0.0002, "reward": 1.78035718947649, "reward_std": 0.07828682195395231, "rewards/equation_reward_func": 0.8026785962283611, "rewards/format_reward_func": 0.977678582072258, "step": 9850 }, { "completion_length": 237.477689743042, "epoch": 1.6518294983025275, "grad_norm": 0.22375889051073486, "kl": 0.1295928955078125, "learning_rate": 4.975818715348686e-07, "loss": 0.0001, "reward": 1.71785718947649, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7357143145054579, "rewards/format_reward_func": 0.9821428656578064, "step": 9852 }, { "completion_length": 221.727689743042, "epoch": 1.6521648015423949, "grad_norm": 0.2291120045991947, "kl": 0.1092376708984375, "learning_rate": 4.975802068426364e-07, "loss": 0.0001, "reward": 1.7625000774860382, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643312692642, "rewards/format_reward_func": 0.9955357164144516, "step": 9854 }, { "completion_length": 223.55804443359375, "epoch": 1.6525001047822625, "grad_norm": 0.26357104191843117, "kl": 0.112152099609375, "learning_rate": 4.975785415803828e-07, "loss": 0.0001, "reward": 1.7625000551342964, "reward_std": 0.0833375845104456, "rewards/equation_reward_func": 0.7758928947150707, "rewards/format_reward_func": 0.9866071492433548, "step": 9856 }, { "completion_length": 221.83483219146729, "epoch": 1.65283540802213, "grad_norm": 0.5731160641229195, "kl": 0.19775390625, "learning_rate": 4.975768757481115e-07, "loss": 0.0002, "reward": 1.7892857491970062, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.798214316368103, "rewards/format_reward_func": 0.9910714328289032, "step": 9858 }, { "completion_length": 229.1562623977661, "epoch": 1.6531707112619975, "grad_norm": 0.2752841675439393, "kl": 0.270050048828125, "learning_rate": 4.975752093458266e-07, "loss": 0.0003, "reward": 1.7553572058677673, "reward_std": 0.10354063659906387, "rewards/equation_reward_func": 0.7687500193715096, "rewards/format_reward_func": 0.9866071492433548, "step": 9860 }, { "completion_length": 213.55358123779297, "epoch": 1.6535060145018652, "grad_norm": 0.3404765282706294, "kl": 0.1291351318359375, "learning_rate": 4.975735423735316e-07, "loss": 0.0001, "reward": 1.801785759627819, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8062500320374966, "rewards/format_reward_func": 0.9955357164144516, "step": 9862 }, { "completion_length": 221.33483028411865, "epoch": 1.6538413177417328, "grad_norm": 0.19012431000183802, "kl": 0.151336669921875, "learning_rate": 4.975718748312306e-07, "loss": 0.0002, "reward": 1.7892857268452644, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7982143238186836, "rewards/format_reward_func": 0.9910714328289032, "step": 9864 }, { "completion_length": 222.90179634094238, "epoch": 1.6541766209816002, "grad_norm": 0.004469077198342189, "kl": 0.235565185546875, "learning_rate": 4.975702067189274e-07, "loss": 0.0002, "reward": 1.7500000596046448, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000447034836, "rewards/format_reward_func": 1.0, "step": 9866 }, { "completion_length": 221.87054252624512, "epoch": 1.6545119242214676, "grad_norm": 0.5435280581221478, "kl": 0.3485107421875, "learning_rate": 4.975685380366257e-07, "loss": 0.0003, "reward": 1.739285796880722, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7392857447266579, "rewards/format_reward_func": 1.0, "step": 9868 }, { "completion_length": 224.46429538726807, "epoch": 1.6548472274613353, "grad_norm": 0.20865656624230186, "kl": 0.588104248046875, "learning_rate": 4.975668687843295e-07, "loss": 0.0006, "reward": 1.7625000551342964, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643238186836, "rewards/format_reward_func": 0.9955357164144516, "step": 9870 }, { "completion_length": 219.540189743042, "epoch": 1.655182530701203, "grad_norm": 0.308684592760428, "kl": 0.479217529296875, "learning_rate": 4.975651989620425e-07, "loss": 0.0005, "reward": 1.7250000461935997, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7250000350177288, "rewards/format_reward_func": 1.0, "step": 9872 }, { "completion_length": 219.31697463989258, "epoch": 1.6555178339410705, "grad_norm": 0.21206860929059224, "kl": 0.5569000244140625, "learning_rate": 4.975635285697687e-07, "loss": 0.0006, "reward": 1.853571467101574, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8535714522004128, "rewards/format_reward_func": 1.0, "step": 9874 }, { "completion_length": 227.2321538925171, "epoch": 1.655853137180938, "grad_norm": 0.24630201716195602, "kl": 0.1265411376953125, "learning_rate": 4.975618576075119e-07, "loss": 0.0001, "reward": 1.7267858013510704, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7312500216066837, "rewards/format_reward_func": 0.9955357164144516, "step": 9876 }, { "completion_length": 219.62947368621826, "epoch": 1.6561884404208056, "grad_norm": 0.23565190769001568, "kl": 0.22161865234375, "learning_rate": 4.975601860752758e-07, "loss": 0.0002, "reward": 1.778571493923664, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.778571454808116, "rewards/format_reward_func": 1.0, "step": 9878 }, { "completion_length": 220.07590198516846, "epoch": 1.656523743660673, "grad_norm": 0.20121385950706266, "kl": 0.21588134765625, "learning_rate": 4.975585139730644e-07, "loss": 0.0002, "reward": 1.7892857640981674, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 9880 }, { "completion_length": 229.6696538925171, "epoch": 1.6568590469005406, "grad_norm": 0.30524107494236097, "kl": 0.0986175537109375, "learning_rate": 4.975568413008816e-07, "loss": 0.0001, "reward": 1.7892857566475868, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7892857491970062, "rewards/format_reward_func": 1.0, "step": 9882 }, { "completion_length": 224.22322273254395, "epoch": 1.6571943501404083, "grad_norm": 0.3024153219313286, "kl": 0.0926361083984375, "learning_rate": 4.97555168058731e-07, "loss": 0.0001, "reward": 1.7375000640749931, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7419643364846706, "rewards/format_reward_func": 0.9955357164144516, "step": 9884 }, { "completion_length": 218.5178689956665, "epoch": 1.657529653380276, "grad_norm": 0.2697534224150527, "kl": 0.094818115234375, "learning_rate": 4.975534942466168e-07, "loss": 0.0001, "reward": 1.7892857640981674, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857491970062, "rewards/format_reward_func": 1.0, "step": 9886 }, { "completion_length": 223.04911613464355, "epoch": 1.6578649566201433, "grad_norm": 0.1835474696257643, "kl": 0.1168365478515625, "learning_rate": 4.975518198645425e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 9888 }, { "completion_length": 222.79018878936768, "epoch": 1.6582002598600107, "grad_norm": 0.1563539322514873, "kl": 0.186553955078125, "learning_rate": 4.975501449125122e-07, "loss": 0.0002, "reward": 1.8107143193483353, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8107143193483353, "rewards/format_reward_func": 1.0, "step": 9890 }, { "completion_length": 224.33483219146729, "epoch": 1.6585355630998784, "grad_norm": 0.29091891567336037, "kl": 0.1243438720703125, "learning_rate": 4.975484693905298e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 9892 }, { "completion_length": 225.27233123779297, "epoch": 1.658870866339746, "grad_norm": 0.37451253496480214, "kl": 0.140167236328125, "learning_rate": 4.975467932985989e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7678571827709675, "rewards/format_reward_func": 1.0, "step": 9894 }, { "completion_length": 237.70536708831787, "epoch": 1.6592061695796136, "grad_norm": 0.268947938153285, "kl": 0.9615478515625, "learning_rate": 4.975451166367235e-07, "loss": 0.001, "reward": 1.8000000566244125, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 9896 }, { "completion_length": 233.5982255935669, "epoch": 1.6595414728194813, "grad_norm": 0.2700866753770015, "kl": 0.112548828125, "learning_rate": 4.975434394049075e-07, "loss": 0.0001, "reward": 1.810714341700077, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8107143118977547, "rewards/format_reward_func": 1.0, "step": 9898 }, { "completion_length": 233.76340198516846, "epoch": 1.6598767760593487, "grad_norm": 0.2781299469560648, "kl": 0.2715911865234375, "learning_rate": 4.975417616031547e-07, "loss": 0.0003, "reward": 1.7678572088479996, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7678571753203869, "rewards/format_reward_func": 1.0, "step": 9900 }, { "completion_length": 245.7500114440918, "epoch": 1.660212079299216, "grad_norm": 0.2604433695384531, "kl": 0.42327880859375, "learning_rate": 4.97540083231469e-07, "loss": 0.0004, "reward": 1.8285714760422707, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.8285714574158192, "rewards/format_reward_func": 1.0, "step": 9902 }, { "completion_length": 246.0937623977661, "epoch": 1.6605473825390837, "grad_norm": 0.523587038459449, "kl": 0.1160888671875, "learning_rate": 4.975384042898542e-07, "loss": 0.0001, "reward": 1.7660714983940125, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7705357484519482, "rewards/format_reward_func": 0.9955357164144516, "step": 9904 }, { "completion_length": 240.57590579986572, "epoch": 1.6608826857789514, "grad_norm": 0.2091887597400727, "kl": 0.1318206787109375, "learning_rate": 4.975367247783144e-07, "loss": 0.0001, "reward": 1.81428574770689, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857328057289, "rewards/format_reward_func": 1.0, "step": 9906 }, { "completion_length": 251.88840579986572, "epoch": 1.661217989018819, "grad_norm": 0.38583211577731064, "kl": 0.101470947265625, "learning_rate": 4.97535044696853e-07, "loss": 0.0001, "reward": 1.7196429446339607, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7241071797907352, "rewards/format_reward_func": 0.9955357164144516, "step": 9908 }, { "completion_length": 251.37947750091553, "epoch": 1.6615532922586864, "grad_norm": 0.3002960218342867, "kl": 0.4559783935546875, "learning_rate": 4.975333640454743e-07, "loss": 0.0005, "reward": 1.7357143610715866, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7357143238186836, "rewards/format_reward_func": 1.0, "step": 9910 }, { "completion_length": 240.1785831451416, "epoch": 1.661888595498554, "grad_norm": 0.23452481171864847, "kl": 0.275421142578125, "learning_rate": 4.975316828241821e-07, "loss": 0.0003, "reward": 1.8107143267989159, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8107143193483353, "rewards/format_reward_func": 1.0, "step": 9912 }, { "completion_length": 238.3839406967163, "epoch": 1.6622238987384215, "grad_norm": 0.2104834790867371, "kl": 0.110626220703125, "learning_rate": 4.9753000103298e-07, "loss": 0.0001, "reward": 1.7625000551342964, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643275439739, "rewards/format_reward_func": 0.9955357164144516, "step": 9914 }, { "completion_length": 252.7946548461914, "epoch": 1.662559201978289, "grad_norm": 0.2198794519534018, "kl": 0.5018157958984375, "learning_rate": 4.975283186718722e-07, "loss": 0.0005, "reward": 1.7125000730156898, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7169643174856901, "rewards/format_reward_func": 0.9955357164144516, "step": 9916 }, { "completion_length": 253.2544755935669, "epoch": 1.6628945052181567, "grad_norm": 0.6512800667382872, "kl": 1.954864501953125, "learning_rate": 4.975266357408623e-07, "loss": 0.002, "reward": 1.7464286163449287, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7553571872413158, "rewards/format_reward_func": 0.9910714328289032, "step": 9918 }, { "completion_length": 264.151798248291, "epoch": 1.6632298084580244, "grad_norm": 0.19630697068622924, "kl": 0.124298095703125, "learning_rate": 4.975249522399544e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714786499739, "rewards/format_reward_func": 1.0, "step": 9920 }, { "completion_length": 243.2455472946167, "epoch": 1.6635651116978918, "grad_norm": 0.3169098579351197, "kl": 0.1141357421875, "learning_rate": 4.975232681691523e-07, "loss": 0.0001, "reward": 1.7446429207921028, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7491071708500385, "rewards/format_reward_func": 0.9955357164144516, "step": 9922 }, { "completion_length": 247.95983219146729, "epoch": 1.6639004149377592, "grad_norm": 0.24182742790034367, "kl": 0.21661376953125, "learning_rate": 4.975215835284598e-07, "loss": 0.0002, "reward": 1.8000000640749931, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 9924 }, { "completion_length": 239.7009048461914, "epoch": 1.6642357181776268, "grad_norm": 0.007444658415270113, "kl": 0.1461181640625, "learning_rate": 4.975198983178808e-07, "loss": 0.0001, "reward": 1.7339286133646965, "reward_std": 0.0328299580141902, "rewards/equation_reward_func": 0.7383928932249546, "rewards/format_reward_func": 0.9955357164144516, "step": 9926 }, { "completion_length": 245.08036708831787, "epoch": 1.6645710214174945, "grad_norm": 0.18782957922663335, "kl": 0.115631103515625, "learning_rate": 4.975182125374193e-07, "loss": 0.0001, "reward": 1.7928571924567223, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571812808514, "rewards/format_reward_func": 1.0, "step": 9928 }, { "completion_length": 257.1517972946167, "epoch": 1.664906324657362, "grad_norm": 0.3034523369429072, "kl": 0.4335174560546875, "learning_rate": 4.975165261870791e-07, "loss": 0.0004, "reward": 1.778571479022503, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7785714417695999, "rewards/format_reward_func": 1.0, "step": 9930 }, { "completion_length": 257.20537281036377, "epoch": 1.6652416278972295, "grad_norm": 0.2824380825578585, "kl": 0.2369384765625, "learning_rate": 4.975148392668641e-07, "loss": 0.0002, "reward": 1.750000074505806, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.758928619325161, "rewards/format_reward_func": 0.9910714328289032, "step": 9932 }, { "completion_length": 245.7232265472412, "epoch": 1.6655769311370971, "grad_norm": 0.20963993568996167, "kl": 0.15716552734375, "learning_rate": 4.975131517767782e-07, "loss": 0.0002, "reward": 1.7785714864730835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 9934 }, { "completion_length": 254.8437623977661, "epoch": 1.6659122343769646, "grad_norm": 0.1708738271349833, "kl": 0.327911376953125, "learning_rate": 4.975114637168252e-07, "loss": 0.0003, "reward": 1.7428572103381157, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7428571805357933, "rewards/format_reward_func": 1.0, "step": 9936 }, { "completion_length": 251.62054634094238, "epoch": 1.6662475376168322, "grad_norm": 0.15899310756923962, "kl": 0.11993408203125, "learning_rate": 4.97509775087009e-07, "loss": 0.0001, "reward": 1.742857187986374, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 0.9821428656578064, "step": 9938 }, { "completion_length": 259.2500114440918, "epoch": 1.6665828408566998, "grad_norm": 0.25007766641668766, "kl": 0.113739013671875, "learning_rate": 4.975080858873336e-07, "loss": 0.0001, "reward": 1.744642935693264, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7491071671247482, "rewards/format_reward_func": 0.9955357164144516, "step": 9940 }, { "completion_length": 253.57144260406494, "epoch": 1.6669181440965675, "grad_norm": 0.29124021885748613, "kl": 0.125030517578125, "learning_rate": 4.975063961178027e-07, "loss": 0.0001, "reward": 1.7982143685221672, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8026785887777805, "rewards/format_reward_func": 0.9955357164144516, "step": 9942 }, { "completion_length": 250.4375114440918, "epoch": 1.6672534473364349, "grad_norm": 0.36358150917938814, "kl": 0.12762451171875, "learning_rate": 4.975047057784204e-07, "loss": 0.0001, "reward": 1.7303572222590446, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7348214611411095, "rewards/format_reward_func": 0.9955357164144516, "step": 9944 }, { "completion_length": 264.16965675354004, "epoch": 1.6675887505763023, "grad_norm": 0.27750091888726475, "kl": 0.11468505859375, "learning_rate": 4.975030148691905e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7410714589059353, "rewards/format_reward_func": 0.9910714328289032, "step": 9946 }, { "completion_length": 264.3125104904175, "epoch": 1.66792405381617, "grad_norm": 0.26333116652019245, "kl": 0.13232421875, "learning_rate": 4.975013233901169e-07, "loss": 0.0001, "reward": 1.7464286610484123, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 9948 }, { "completion_length": 251.47769260406494, "epoch": 1.6682593570560376, "grad_norm": 0.21635846864664374, "kl": 0.112518310546875, "learning_rate": 4.974996313412034e-07, "loss": 0.0001, "reward": 1.810714341700077, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.810714315623045, "rewards/format_reward_func": 1.0, "step": 9950 }, { "completion_length": 252.16072368621826, "epoch": 1.6685946602959052, "grad_norm": 0.1487337396106638, "kl": 0.1123046875, "learning_rate": 4.974979387224541e-07, "loss": 0.0001, "reward": 1.7178571969270706, "reward_std": 0.08586296625435352, "rewards/equation_reward_func": 0.7357143275439739, "rewards/format_reward_func": 0.9821428656578064, "step": 9952 }, { "completion_length": 262.29465675354004, "epoch": 1.6689299635357726, "grad_norm": 0.13238518388734732, "kl": 0.10601806640625, "learning_rate": 4.974962455338728e-07, "loss": 0.0001, "reward": 1.7946429029107094, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7991071715950966, "rewards/format_reward_func": 0.9955357164144516, "step": 9954 }, { "completion_length": 266.8928689956665, "epoch": 1.6692652667756402, "grad_norm": 0.1476093870816, "kl": 0.1008758544921875, "learning_rate": 4.974945517754633e-07, "loss": 0.0001, "reward": 1.7375000715255737, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7508928906172514, "rewards/format_reward_func": 0.9866071492433548, "step": 9956 }, { "completion_length": 252.73661994934082, "epoch": 1.6696005700155077, "grad_norm": 0.6375593922999553, "kl": 0.1016998291015625, "learning_rate": 4.974928574472296e-07, "loss": 0.0001, "reward": 1.7839286178350449, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7883928753435612, "rewards/format_reward_func": 0.9955357164144516, "step": 9958 }, { "completion_length": 262.15626335144043, "epoch": 1.6699358732553753, "grad_norm": 0.1283348204099573, "kl": 0.1146392822265625, "learning_rate": 4.974911625491755e-07, "loss": 0.0001, "reward": 1.7410714849829674, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7455357629805803, "rewards/format_reward_func": 0.9955357164144516, "step": 9960 }, { "completion_length": 264.4419775009155, "epoch": 1.670271176495243, "grad_norm": 0.15103147121960364, "kl": 0.107666015625, "learning_rate": 4.974894670813051e-07, "loss": 0.0001, "reward": 1.7785714715719223, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 9962 }, { "completion_length": 255.56697273254395, "epoch": 1.6706064797351106, "grad_norm": 0.4432214758021507, "kl": 0.110931396484375, "learning_rate": 4.974877710436222e-07, "loss": 0.0001, "reward": 1.7142857909202576, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.7321428917348385, "rewards/format_reward_func": 0.9821428656578064, "step": 9964 }, { "completion_length": 253.93751049041748, "epoch": 1.670941782974978, "grad_norm": 0.1251880983482693, "kl": 0.114776611328125, "learning_rate": 4.974860744361306e-07, "loss": 0.0001, "reward": 1.7428572177886963, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571619093418, "rewards/format_reward_func": 1.0, "step": 9966 }, { "completion_length": 252.72769451141357, "epoch": 1.6712770862148454, "grad_norm": 0.21358964680499068, "kl": 0.11474609375, "learning_rate": 4.974843772588343e-07, "loss": 0.0001, "reward": 1.7910714894533157, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7955357432365417, "rewards/format_reward_func": 0.9955357164144516, "step": 9968 }, { "completion_length": 250.03572177886963, "epoch": 1.671612389454713, "grad_norm": 0.17397521385257672, "kl": 0.102630615234375, "learning_rate": 4.974826795117371e-07, "loss": 0.0001, "reward": 1.733928605914116, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7383928969502449, "rewards/format_reward_func": 0.9955357164144516, "step": 9970 }, { "completion_length": 239.4910831451416, "epoch": 1.6719476926945807, "grad_norm": 0.29886153992095205, "kl": 0.106719970703125, "learning_rate": 4.974809811948432e-07, "loss": 0.0001, "reward": 1.7107143700122833, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7107143215835094, "rewards/format_reward_func": 1.0, "step": 9972 }, { "completion_length": 244.91072463989258, "epoch": 1.6722829959344483, "grad_norm": 0.2621951499898351, "kl": 0.11273193359375, "learning_rate": 4.974792823081563e-07, "loss": 0.0001, "reward": 1.785714365541935, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 9974 }, { "completion_length": 235.9375123977661, "epoch": 1.672618299174316, "grad_norm": 0.1754438312521652, "kl": 0.0966033935546875, "learning_rate": 4.974775828516803e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 9976 }, { "completion_length": 252.69644355773926, "epoch": 1.6729536024141833, "grad_norm": 0.19395800186925502, "kl": 0.168914794921875, "learning_rate": 4.974758828254192e-07, "loss": 0.0002, "reward": 1.8214286118745804, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8214285895228386, "rewards/format_reward_func": 1.0, "step": 9978 }, { "completion_length": 246.8973331451416, "epoch": 1.6732889056540508, "grad_norm": 0.2813224848583, "kl": 0.138519287109375, "learning_rate": 4.974741822293768e-07, "loss": 0.0001, "reward": 1.7750000804662704, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7839285880327225, "rewards/format_reward_func": 0.9910714328289032, "step": 9980 }, { "completion_length": 253.12054824829102, "epoch": 1.6736242088939184, "grad_norm": 0.14022046326230286, "kl": 0.106109619140625, "learning_rate": 4.97472481063557e-07, "loss": 0.0001, "reward": 1.746428668498993, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 9982 }, { "completion_length": 247.61608409881592, "epoch": 1.673959512133786, "grad_norm": 0.1908507372333823, "kl": 0.1064453125, "learning_rate": 4.974707793279638e-07, "loss": 0.0001, "reward": 1.7785714715719223, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 9984 }, { "completion_length": 258.3660831451416, "epoch": 1.6742948153736537, "grad_norm": 0.19304693534597053, "kl": 0.1163177490234375, "learning_rate": 4.974690770226012e-07, "loss": 0.0001, "reward": 1.6821429282426834, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.6821428928524256, "rewards/format_reward_func": 1.0, "step": 9986 }, { "completion_length": 246.45090579986572, "epoch": 1.674630118613521, "grad_norm": 0.1695020044930438, "kl": 0.12274169921875, "learning_rate": 4.974673741474731e-07, "loss": 0.0001, "reward": 1.7482143491506577, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7526785992085934, "rewards/format_reward_func": 0.9955357164144516, "step": 9988 }, { "completion_length": 241.7544765472412, "epoch": 1.6749654218533887, "grad_norm": 0.00708512969713439, "kl": 0.116058349609375, "learning_rate": 4.974656707025832e-07, "loss": 0.0001, "reward": 1.821428619325161, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.8214285932481289, "rewards/format_reward_func": 1.0, "step": 9990 }, { "completion_length": 252.5491189956665, "epoch": 1.6753007250932561, "grad_norm": 0.09447739873956, "kl": 0.15313720703125, "learning_rate": 4.974639666879356e-07, "loss": 0.0002, "reward": 1.7267857789993286, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7312500439584255, "rewards/format_reward_func": 0.9955357164144516, "step": 9992 }, { "completion_length": 239.4419765472412, "epoch": 1.6756360283331238, "grad_norm": 0.3231878362488016, "kl": 0.1316986083984375, "learning_rate": 4.974622621035342e-07, "loss": 0.0001, "reward": 1.7446429282426834, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7491071708500385, "rewards/format_reward_func": 0.9955357164144516, "step": 9994 }, { "completion_length": 241.5759038925171, "epoch": 1.6759713315729914, "grad_norm": 0.14058138538907927, "kl": 0.0970916748046875, "learning_rate": 4.97460556949383e-07, "loss": 0.0001, "reward": 1.74642863124609, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286163449287, "rewards/format_reward_func": 1.0, "step": 9996 }, { "completion_length": 253.74108505249023, "epoch": 1.676306634812859, "grad_norm": 0.22176562412261744, "kl": 0.138458251953125, "learning_rate": 4.974588512254858e-07, "loss": 0.0001, "reward": 1.8107143715023994, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8107143118977547, "rewards/format_reward_func": 1.0, "step": 9998 }, { "completion_length": 248.26340675354004, "epoch": 1.6766419380527264, "grad_norm": 0.2757893418813273, "kl": 0.1244964599609375, "learning_rate": 4.974571449318465e-07, "loss": 0.0001, "reward": 1.7642858028411865, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 10000 }, { "completion_length": 257.0134029388428, "epoch": 1.6769772412925938, "grad_norm": 0.686820231753818, "kl": 0.2299652099609375, "learning_rate": 4.974554380684692e-07, "loss": 0.0002, "reward": 1.7446429431438446, "reward_std": 0.09848987217992544, "rewards/equation_reward_func": 0.7580357417464256, "rewards/format_reward_func": 0.9866071492433548, "step": 10002 }, { "completion_length": 254.4687623977661, "epoch": 1.6773125445324615, "grad_norm": 0.3286740146450279, "kl": 0.186676025390625, "learning_rate": 4.974537306353577e-07, "loss": 0.0002, "reward": 1.798214353621006, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8026785887777805, "rewards/format_reward_func": 0.9955357164144516, "step": 10004 }, { "completion_length": 251.5714406967163, "epoch": 1.6776478477723291, "grad_norm": 0.13520624126222636, "kl": 0.3016204833984375, "learning_rate": 4.97452022632516e-07, "loss": 0.0003, "reward": 1.7482143491506577, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7526786103844643, "rewards/format_reward_func": 0.9955357164144516, "step": 10006 }, { "completion_length": 258.53125953674316, "epoch": 1.6779831510121967, "grad_norm": 0.15517144788447346, "kl": 0.12713623046875, "learning_rate": 4.97450314059948e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 10008 }, { "completion_length": 249.79465579986572, "epoch": 1.6783184542520642, "grad_norm": 0.25555380520299875, "kl": 0.12371826171875, "learning_rate": 4.974486049176575e-07, "loss": 0.0001, "reward": 1.7071429342031479, "reward_std": 0.06060915347188711, "rewards/equation_reward_func": 0.7160714641213417, "rewards/format_reward_func": 0.9910714328289032, "step": 10010 }, { "completion_length": 256.10268688201904, "epoch": 1.6786537574919318, "grad_norm": 0.11627685203612302, "kl": 0.11810302734375, "learning_rate": 4.974468952056487e-07, "loss": 0.0001, "reward": 1.7196429446339607, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7241071686148643, "rewards/format_reward_func": 0.9955357164144516, "step": 10012 }, { "completion_length": 253.6428680419922, "epoch": 1.6789890607317992, "grad_norm": 0.20586557717180662, "kl": 0.1288604736328125, "learning_rate": 4.974451849239253e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7589285895228386, "rewards/format_reward_func": 0.9910714328289032, "step": 10014 }, { "completion_length": 250.83483123779297, "epoch": 1.6793243639716668, "grad_norm": 0.1705164777603785, "kl": 0.10638427734375, "learning_rate": 4.974434740724915e-07, "loss": 0.0001, "reward": 1.751785784959793, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7562500331550837, "rewards/format_reward_func": 0.9955357164144516, "step": 10016 }, { "completion_length": 258.96875953674316, "epoch": 1.6796596672115345, "grad_norm": 0.12622978400779727, "kl": 0.180267333984375, "learning_rate": 4.974417626513509e-07, "loss": 0.0002, "reward": 1.8214286118745804, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8214286006987095, "rewards/format_reward_func": 1.0, "step": 10018 }, { "completion_length": 262.65626525878906, "epoch": 1.6799949704514021, "grad_norm": 0.260643559753684, "kl": 0.163970947265625, "learning_rate": 4.974400506605077e-07, "loss": 0.0002, "reward": 1.7375000938773155, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7419643178582191, "rewards/format_reward_func": 0.9955357164144516, "step": 10020 }, { "completion_length": 264.2321538925171, "epoch": 1.6803302736912695, "grad_norm": 0.22700129756442697, "kl": 0.22357177734375, "learning_rate": 4.974383380999657e-07, "loss": 0.0002, "reward": 1.7482143640518188, "reward_std": 0.042931484058499336, "rewards/equation_reward_func": 0.7616071701049805, "rewards/format_reward_func": 0.9866071492433548, "step": 10022 }, { "completion_length": 260.0178699493408, "epoch": 1.680665576931137, "grad_norm": 0.6076334386049436, "kl": 0.208984375, "learning_rate": 4.97436624969729e-07, "loss": 0.0002, "reward": 1.7589286342263222, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7723214589059353, "rewards/format_reward_func": 0.9866071492433548, "step": 10024 }, { "completion_length": 272.1696557998657, "epoch": 1.6810008801710046, "grad_norm": 0.1626425044654749, "kl": 0.129058837890625, "learning_rate": 4.974349112698014e-07, "loss": 0.0001, "reward": 1.7910714745521545, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.8044643066823483, "rewards/format_reward_func": 0.9866071492433548, "step": 10026 }, { "completion_length": 258.67412090301514, "epoch": 1.6813361834108722, "grad_norm": 0.2425983207701506, "kl": 0.142913818359375, "learning_rate": 4.974331970001869e-07, "loss": 0.0001, "reward": 1.791071504354477, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7955357395112514, "rewards/format_reward_func": 0.9955357164144516, "step": 10028 }, { "completion_length": 257.92412090301514, "epoch": 1.6816714866507398, "grad_norm": 0.21702457725067278, "kl": 0.1026611328125, "learning_rate": 4.974314821608894e-07, "loss": 0.0001, "reward": 1.7571429163217545, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7660714462399483, "rewards/format_reward_func": 0.9910714328289032, "step": 10030 }, { "completion_length": 248.46429634094238, "epoch": 1.6820067898906075, "grad_norm": 0.30085577116212653, "kl": 0.1477813720703125, "learning_rate": 4.974297667519129e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7625000290572643, "rewards/format_reward_func": 0.9910714328289032, "step": 10032 }, { "completion_length": 257.2589416503906, "epoch": 1.682342093130475, "grad_norm": 0.06563080695637653, "kl": 0.148345947265625, "learning_rate": 4.974280507732613e-07, "loss": 0.0001, "reward": 1.787500061094761, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7919643148779869, "rewards/format_reward_func": 0.9955357164144516, "step": 10034 }, { "completion_length": 261.8884048461914, "epoch": 1.6826773963703423, "grad_norm": 0.3108712427672666, "kl": 0.203338623046875, "learning_rate": 4.974263342249387e-07, "loss": 0.0002, "reward": 1.7535714879631996, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7625000160187483, "rewards/format_reward_func": 0.9910714328289032, "step": 10036 }, { "completion_length": 266.7187623977661, "epoch": 1.68301269961021, "grad_norm": 0.39533845320926486, "kl": 0.125885009765625, "learning_rate": 4.974246171069489e-07, "loss": 0.0001, "reward": 1.7000000700354576, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7089286036789417, "rewards/format_reward_func": 0.9910714328289032, "step": 10038 }, { "completion_length": 246.85715198516846, "epoch": 1.6833480028500776, "grad_norm": 0.24514446508009619, "kl": 0.11993408203125, "learning_rate": 4.974228994192959e-07, "loss": 0.0001, "reward": 1.7732143327593803, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7776786014437675, "rewards/format_reward_func": 0.9955357164144516, "step": 10040 }, { "completion_length": 266.3794765472412, "epoch": 1.6836833060899452, "grad_norm": 0.21864629994207177, "kl": 0.13507080078125, "learning_rate": 4.974211811619836e-07, "loss": 0.0001, "reward": 1.814285770058632, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857365310192, "rewards/format_reward_func": 1.0, "step": 10042 }, { "completion_length": 262.80358123779297, "epoch": 1.6840186093298126, "grad_norm": 0.17710018125664703, "kl": 0.136474609375, "learning_rate": 4.97419462335016e-07, "loss": 0.0001, "reward": 1.7357143461704254, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7446428909897804, "rewards/format_reward_func": 0.9910714328289032, "step": 10044 }, { "completion_length": 249.85715293884277, "epoch": 1.6843539125696803, "grad_norm": 0.32386004346591574, "kl": 0.110443115234375, "learning_rate": 4.974177429383971e-07, "loss": 0.0001, "reward": 1.800000049173832, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8000000230967999, "rewards/format_reward_func": 1.0, "step": 10046 }, { "completion_length": 260.33483505249023, "epoch": 1.6846892158095477, "grad_norm": 0.2952786587131419, "kl": 0.142913818359375, "learning_rate": 4.974160229721308e-07, "loss": 0.0001, "reward": 1.7303572073578835, "reward_std": 0.09848987124860287, "rewards/equation_reward_func": 0.7437500320374966, "rewards/format_reward_func": 0.9866071492433548, "step": 10048 }, { "completion_length": 264.16519260406494, "epoch": 1.6850245190494153, "grad_norm": 0.4340353985467339, "kl": 0.12249755859375, "learning_rate": 4.974143024362211e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 10050 }, { "completion_length": 245.21875953674316, "epoch": 1.685359822289283, "grad_norm": 0.41833085967800654, "kl": 0.1136474609375, "learning_rate": 4.974125813306719e-07, "loss": 0.0001, "reward": 1.6964286416769028, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7142857499420643, "rewards/format_reward_func": 0.9821428656578064, "step": 10052 }, { "completion_length": 252.24554634094238, "epoch": 1.6856951255291506, "grad_norm": 0.3466336130381244, "kl": 0.116119384765625, "learning_rate": 4.974108596554872e-07, "loss": 0.0001, "reward": 1.8142857775092125, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857514321804, "rewards/format_reward_func": 1.0, "step": 10054 }, { "completion_length": 252.09822368621826, "epoch": 1.686030428769018, "grad_norm": 0.2296487564634485, "kl": 0.110504150390625, "learning_rate": 4.97409137410671e-07, "loss": 0.0001, "reward": 1.7446429207921028, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7491071727126837, "rewards/format_reward_func": 0.9955357164144516, "step": 10056 }, { "completion_length": 244.94643783569336, "epoch": 1.6863657320088854, "grad_norm": 0.15632409057849708, "kl": 0.120025634765625, "learning_rate": 4.974074145962272e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 1.0, "step": 10058 }, { "completion_length": 243.4330472946167, "epoch": 1.686701035248753, "grad_norm": 0.2494711951314461, "kl": 0.131256103515625, "learning_rate": 4.974056912121599e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 10060 }, { "completion_length": 251.37054634094238, "epoch": 1.6870363384886207, "grad_norm": 0.23723346999630718, "kl": 0.111907958984375, "learning_rate": 4.974039672584729e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.757142897695303, "rewards/format_reward_func": 1.0, "step": 10062 }, { "completion_length": 263.2991189956665, "epoch": 1.6873716417284883, "grad_norm": 0.24109290815217851, "kl": 0.156463623046875, "learning_rate": 4.974022427351703e-07, "loss": 0.0002, "reward": 1.7267857864499092, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7312500402331352, "rewards/format_reward_func": 0.9955357164144516, "step": 10064 }, { "completion_length": 248.97322463989258, "epoch": 1.6877069449683557, "grad_norm": 0.24947769782804471, "kl": 0.12640380859375, "learning_rate": 4.974005176422559e-07, "loss": 0.0001, "reward": 1.7607143446803093, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 10066 }, { "completion_length": 245.2946538925171, "epoch": 1.6880422482082234, "grad_norm": 0.40610948081296316, "kl": 0.125946044921875, "learning_rate": 4.973987919797337e-07, "loss": 0.0001, "reward": 1.8107143640518188, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.8196428716182709, "rewards/format_reward_func": 0.9910714328289032, "step": 10068 }, { "completion_length": 257.37947273254395, "epoch": 1.6883775514480908, "grad_norm": 0.12776086661506092, "kl": 0.151214599609375, "learning_rate": 4.973970657476079e-07, "loss": 0.0002, "reward": 1.7875000685453415, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7919643074274063, "rewards/format_reward_func": 0.9955357164144516, "step": 10070 }, { "completion_length": 231.50893878936768, "epoch": 1.6887128546879584, "grad_norm": 0.1613835712573395, "kl": 0.0987548828125, "learning_rate": 4.973953389458824e-07, "loss": 0.0001, "reward": 1.8392857760190964, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.839285746216774, "rewards/format_reward_func": 1.0, "step": 10072 }, { "completion_length": 250.8750114440918, "epoch": 1.689048157927826, "grad_norm": 0.21526186107713036, "kl": 0.144287109375, "learning_rate": 4.97393611574561e-07, "loss": 0.0001, "reward": 1.7857143357396126, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 10074 }, { "completion_length": 242.64733505249023, "epoch": 1.6893834611676937, "grad_norm": 0.14668393885929962, "kl": 0.16693115234375, "learning_rate": 4.973918836336478e-07, "loss": 0.0002, "reward": 1.7892857566475868, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857305705547, "rewards/format_reward_func": 1.0, "step": 10076 }, { "completion_length": 251.165189743042, "epoch": 1.689718764407561, "grad_norm": 0.6243773833866528, "kl": 0.200439453125, "learning_rate": 4.973901551231467e-07, "loss": 0.0002, "reward": 1.7625000774860382, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643126428127, "rewards/format_reward_func": 0.9955357164144516, "step": 10078 }, { "completion_length": 243.98215293884277, "epoch": 1.6900540676474285, "grad_norm": 0.158549076583998, "kl": 0.1374359130859375, "learning_rate": 4.973884260430617e-07, "loss": 0.0001, "reward": 1.7625000551342964, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643200933933, "rewards/format_reward_func": 0.9955357164144516, "step": 10080 }, { "completion_length": 250.665189743042, "epoch": 1.6903893708872961, "grad_norm": 0.17691845376375445, "kl": 0.2376861572265625, "learning_rate": 4.97386696393397e-07, "loss": 0.0002, "reward": 1.8000000715255737, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000156462193, "rewards/format_reward_func": 1.0, "step": 10082 }, { "completion_length": 248.63840293884277, "epoch": 1.6907246741271638, "grad_norm": 0.20282061468046836, "kl": 0.1807861328125, "learning_rate": 4.973849661741563e-07, "loss": 0.0002, "reward": 1.7732143700122833, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7776786014437675, "rewards/format_reward_func": 0.9955357164144516, "step": 10084 }, { "completion_length": 244.196439743042, "epoch": 1.6910599773670314, "grad_norm": 0.28045689010751607, "kl": 0.71612548828125, "learning_rate": 4.973832353853436e-07, "loss": 0.0007, "reward": 1.7857143431901932, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143003493547, "rewards/format_reward_func": 1.0, "step": 10086 }, { "completion_length": 242.47322750091553, "epoch": 1.6913952806068988, "grad_norm": 0.3193025602899345, "kl": 0.498504638671875, "learning_rate": 4.973815040269631e-07, "loss": 0.0005, "reward": 1.7500000670552254, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 10088 }, { "completion_length": 246.7142972946167, "epoch": 1.6917305838467664, "grad_norm": 0.23974640616671034, "kl": 0.428253173828125, "learning_rate": 4.973797720990186e-07, "loss": 0.0004, "reward": 1.7857143506407738, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143059372902, "rewards/format_reward_func": 1.0, "step": 10090 }, { "completion_length": 264.4419775009155, "epoch": 1.6920658870866339, "grad_norm": 0.4536671668732646, "kl": 0.81256103515625, "learning_rate": 4.973780396015142e-07, "loss": 0.0008, "reward": 1.7321429178118706, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7321429029107094, "rewards/format_reward_func": 1.0, "step": 10092 }, { "completion_length": 245.4776906967163, "epoch": 1.6924011903265015, "grad_norm": 0.17562989852246272, "kl": 0.906951904296875, "learning_rate": 4.973763065344538e-07, "loss": 0.0009, "reward": 1.7232143729925156, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7276786081492901, "rewards/format_reward_func": 0.9955357164144516, "step": 10094 }, { "completion_length": 254.79018688201904, "epoch": 1.6927364935663691, "grad_norm": 0.07665879961083437, "kl": 0.70660400390625, "learning_rate": 4.973745728978413e-07, "loss": 0.0007, "reward": 1.7892857566475868, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 10096 }, { "completion_length": 245.40626049041748, "epoch": 1.6930717968062368, "grad_norm": 0.2503366730466342, "kl": 0.56756591796875, "learning_rate": 4.97372838691681e-07, "loss": 0.0006, "reward": 1.7589286491274834, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7633928805589676, "rewards/format_reward_func": 0.9955357164144516, "step": 10098 }, { "completion_length": 253.65626049041748, "epoch": 1.6934071000461042, "grad_norm": 0.6320229532136868, "kl": 1.1220703125, "learning_rate": 4.973711039159765e-07, "loss": 0.0011, "reward": 1.7776786237955093, "reward_std": 0.05177031829953194, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 0.9848214387893677, "step": 10100 }, { "completion_length": 245.08483219146729, "epoch": 1.6937424032859716, "grad_norm": 0.1919645569093751, "kl": 0.560791015625, "learning_rate": 4.973693685707322e-07, "loss": 0.0006, "reward": 1.7500000521540642, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 10102 }, { "completion_length": 240.29911518096924, "epoch": 1.6940777065258392, "grad_norm": 0.1526633101158587, "kl": 0.2593841552734375, "learning_rate": 4.973676326559518e-07, "loss": 0.0003, "reward": 1.7964286282658577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964285872876644, "rewards/format_reward_func": 1.0, "step": 10104 }, { "completion_length": 243.66518878936768, "epoch": 1.6944130097657069, "grad_norm": 0.19763174283560406, "kl": 0.2373809814453125, "learning_rate": 4.973658961716394e-07, "loss": 0.0002, "reward": 1.7839286476373672, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7883928753435612, "rewards/format_reward_func": 0.9955357164144516, "step": 10106 }, { "completion_length": 242.6651906967163, "epoch": 1.6947483130055745, "grad_norm": 0.18538824341398405, "kl": 0.131744384765625, "learning_rate": 4.973641591177991e-07, "loss": 0.0001, "reward": 1.8178572058677673, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.817857164889574, "rewards/format_reward_func": 1.0, "step": 10108 }, { "completion_length": 242.34822273254395, "epoch": 1.6950836162454421, "grad_norm": 0.21636593330112164, "kl": 0.161346435546875, "learning_rate": 4.973624214944347e-07, "loss": 0.0002, "reward": 1.7607143595814705, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7607143167406321, "rewards/format_reward_func": 1.0, "step": 10110 }, { "completion_length": 243.72322273254395, "epoch": 1.6954189194853095, "grad_norm": 0.4051968109433958, "kl": 0.2559814453125, "learning_rate": 4.973606833015503e-07, "loss": 0.0003, "reward": 1.7660714983940125, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7705357391387224, "rewards/format_reward_func": 0.9955357164144516, "step": 10112 }, { "completion_length": 250.1696548461914, "epoch": 1.695754222725177, "grad_norm": 0.1942327326972562, "kl": 0.23480224609375, "learning_rate": 4.973589445391497e-07, "loss": 0.0002, "reward": 1.778571493923664, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 10114 }, { "completion_length": 231.977689743042, "epoch": 1.6960895259650446, "grad_norm": 0.2900110471204609, "kl": 0.150970458984375, "learning_rate": 4.973572052072374e-07, "loss": 0.0002, "reward": 1.7642857655882835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857525497675, "rewards/format_reward_func": 1.0, "step": 10116 }, { "completion_length": 244.3303680419922, "epoch": 1.6964248292049122, "grad_norm": 0.23325186627488984, "kl": 0.206512451171875, "learning_rate": 4.973554653058169e-07, "loss": 0.0002, "reward": 1.7250000908970833, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7250000424683094, "rewards/format_reward_func": 1.0, "step": 10118 }, { "completion_length": 239.58483219146729, "epoch": 1.6967601324447799, "grad_norm": 0.1860929377205112, "kl": 0.1572265625, "learning_rate": 4.973537248348925e-07, "loss": 0.0002, "reward": 1.753571480512619, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714656114578, "rewards/format_reward_func": 1.0, "step": 10120 }, { "completion_length": 240.98661613464355, "epoch": 1.6970954356846473, "grad_norm": 0.8941528688423361, "kl": 0.22747802734375, "learning_rate": 4.973519837944681e-07, "loss": 0.0002, "reward": 1.7821429073810577, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821429036557674, "rewards/format_reward_func": 1.0, "step": 10122 }, { "completion_length": 253.20983123779297, "epoch": 1.697430738924515, "grad_norm": 0.46380905705084785, "kl": 0.25750732421875, "learning_rate": 4.973502421845476e-07, "loss": 0.0003, "reward": 1.710714377462864, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7196428775787354, "rewards/format_reward_func": 0.9910714328289032, "step": 10124 }, { "completion_length": 249.33037185668945, "epoch": 1.6977660421643823, "grad_norm": 0.15391532324689755, "kl": 0.18988037109375, "learning_rate": 4.973485000051354e-07, "loss": 0.0002, "reward": 1.769642911851406, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7741071693599224, "rewards/format_reward_func": 0.9955357164144516, "step": 10126 }, { "completion_length": 252.89287090301514, "epoch": 1.69810134540425, "grad_norm": 0.49648863518351344, "kl": 0.2744140625, "learning_rate": 4.973467572562351e-07, "loss": 0.0003, "reward": 1.7339286282658577, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7383929006755352, "rewards/format_reward_func": 0.9955357164144516, "step": 10128 }, { "completion_length": 260.56251335144043, "epoch": 1.6984366486441176, "grad_norm": 0.23758751631952763, "kl": 0.212677001953125, "learning_rate": 4.973450139378508e-07, "loss": 0.0002, "reward": 1.7392857819795609, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7482143230736256, "rewards/format_reward_func": 0.9910714328289032, "step": 10130 }, { "completion_length": 246.54465675354004, "epoch": 1.6987719518839852, "grad_norm": 0.008302879011098514, "kl": 0.142120361328125, "learning_rate": 4.973432700499866e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 10132 }, { "completion_length": 239.49107837677002, "epoch": 1.6991072551238526, "grad_norm": 0.1169948662184085, "kl": 0.385284423828125, "learning_rate": 4.973415255926466e-07, "loss": 0.0004, "reward": 1.7857143357396126, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143115252256, "rewards/format_reward_func": 1.0, "step": 10134 }, { "completion_length": 250.99108505249023, "epoch": 1.69944255836372, "grad_norm": 0.43755620630028064, "kl": 0.3654937744140625, "learning_rate": 4.973397805658345e-07, "loss": 0.0004, "reward": 1.7625000849366188, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669642977416515, "rewards/format_reward_func": 0.9955357164144516, "step": 10136 }, { "completion_length": 241.7544755935669, "epoch": 1.6997778616035877, "grad_norm": 0.1611913201639629, "kl": 0.258697509765625, "learning_rate": 4.973380349695547e-07, "loss": 0.0003, "reward": 1.7321429401636124, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7321428824216127, "rewards/format_reward_func": 1.0, "step": 10138 }, { "completion_length": 248.00447845458984, "epoch": 1.7001131648434553, "grad_norm": 0.3664824885281984, "kl": 0.603790283203125, "learning_rate": 4.973362888038109e-07, "loss": 0.0006, "reward": 1.7535714879631996, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714488476515, "rewards/format_reward_func": 1.0, "step": 10140 }, { "completion_length": 251.26787090301514, "epoch": 1.700448468083323, "grad_norm": 0.40636458209251985, "kl": 0.312957763671875, "learning_rate": 4.973345420686073e-07, "loss": 0.0003, "reward": 1.6928572207689285, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.6928571797907352, "rewards/format_reward_func": 1.0, "step": 10142 }, { "completion_length": 248.34376049041748, "epoch": 1.7007837713231904, "grad_norm": 0.33955868228708397, "kl": 1.279510498046875, "learning_rate": 4.973327947639478e-07, "loss": 0.0013, "reward": 1.7035714983940125, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7035714648663998, "rewards/format_reward_func": 1.0, "step": 10144 }, { "completion_length": 250.28572845458984, "epoch": 1.701119074563058, "grad_norm": 0.16777483507884186, "kl": 0.157379150390625, "learning_rate": 4.973310468898366e-07, "loss": 0.0002, "reward": 1.7642857804894447, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 10146 }, { "completion_length": 253.16965675354004, "epoch": 1.7014543778029254, "grad_norm": 0.19270567991823492, "kl": 0.84698486328125, "learning_rate": 4.973292984462777e-07, "loss": 0.0008, "reward": 1.7446429133415222, "reward_std": 0.06060915347188711, "rewards/equation_reward_func": 0.7571428958326578, "rewards/format_reward_func": 0.987500011920929, "step": 10148 }, { "completion_length": 246.55804443359375, "epoch": 1.701789681042793, "grad_norm": 0.24648732219059558, "kl": 0.365814208984375, "learning_rate": 4.973275494332749e-07, "loss": 0.0004, "reward": 1.7750000730156898, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 1.0, "step": 10150 }, { "completion_length": 262.4017972946167, "epoch": 1.7021249842826607, "grad_norm": 0.36717976358266147, "kl": 0.6148681640625, "learning_rate": 4.973257998508325e-07, "loss": 0.0006, "reward": 1.8000000566244125, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8000000156462193, "rewards/format_reward_func": 1.0, "step": 10152 }, { "completion_length": 241.30804634094238, "epoch": 1.7024602875225283, "grad_norm": 0.13942221880283598, "kl": 0.142333984375, "learning_rate": 4.973240496989543e-07, "loss": 0.0001, "reward": 1.7803571820259094, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7848214656114578, "rewards/format_reward_func": 0.9955357164144516, "step": 10154 }, { "completion_length": 239.52232933044434, "epoch": 1.7027955907623957, "grad_norm": 0.505066602090983, "kl": 0.241119384765625, "learning_rate": 4.973222989776446e-07, "loss": 0.0002, "reward": 1.7678572162985802, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571864962578, "rewards/format_reward_func": 1.0, "step": 10156 }, { "completion_length": 239.7009038925171, "epoch": 1.7031308940022631, "grad_norm": 0.2516769436746728, "kl": 0.176483154296875, "learning_rate": 4.973205476869072e-07, "loss": 0.0002, "reward": 1.7625000551342964, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7669643126428127, "rewards/format_reward_func": 0.9955357164144516, "step": 10158 }, { "completion_length": 249.70983028411865, "epoch": 1.7034661972421308, "grad_norm": 0.1202381463499803, "kl": 0.1517333984375, "learning_rate": 4.973187958267461e-07, "loss": 0.0002, "reward": 1.76071435213089, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7607143335044384, "rewards/format_reward_func": 1.0, "step": 10160 }, { "completion_length": 237.6026906967163, "epoch": 1.7038015004819984, "grad_norm": 0.15176271405999694, "kl": 0.1248779296875, "learning_rate": 4.973170433971655e-07, "loss": 0.0001, "reward": 1.789285771548748, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 10162 }, { "completion_length": 246.08483219146729, "epoch": 1.704136803721866, "grad_norm": 0.3686108597554467, "kl": 0.121490478515625, "learning_rate": 4.973152903981693e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.803571455180645, "rewards/format_reward_func": 1.0, "step": 10164 }, { "completion_length": 245.67858219146729, "epoch": 1.7044721069617337, "grad_norm": 0.22525795074098146, "kl": 0.11285400390625, "learning_rate": 4.973135368297617e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.8035714477300644, "rewards/format_reward_func": 1.0, "step": 10166 }, { "completion_length": 238.79911613464355, "epoch": 1.704807410201601, "grad_norm": 0.1792667048858632, "kl": 0.13275146484375, "learning_rate": 4.973117826919467e-07, "loss": 0.0001, "reward": 1.8142857551574707, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857439815998, "rewards/format_reward_func": 1.0, "step": 10168 }, { "completion_length": 237.5134038925171, "epoch": 1.7051427134414685, "grad_norm": 0.002530607373799404, "kl": 0.0972442626953125, "learning_rate": 4.973100279847281e-07, "loss": 0.0001, "reward": 1.8071428909897804, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.807142898440361, "rewards/format_reward_func": 1.0, "step": 10170 }, { "completion_length": 246.03126335144043, "epoch": 1.7054780166813361, "grad_norm": 0.3430750903754556, "kl": 0.111419677734375, "learning_rate": 4.973082727081103e-07, "loss": 0.0001, "reward": 1.7053572237491608, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7098214738070965, "rewards/format_reward_func": 0.9955357164144516, "step": 10172 }, { "completion_length": 245.3437623977661, "epoch": 1.7058133199212038, "grad_norm": 0.30599076791001545, "kl": 0.1102294921875, "learning_rate": 4.97306516862097e-07, "loss": 0.0001, "reward": 1.832142911851406, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8321428783237934, "rewards/format_reward_func": 1.0, "step": 10174 }, { "completion_length": 243.66518878936768, "epoch": 1.7061486231610714, "grad_norm": 0.3329256720540618, "kl": 0.143310546875, "learning_rate": 4.973047604466925e-07, "loss": 0.0001, "reward": 1.7482143640518188, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526785954833031, "rewards/format_reward_func": 0.9955357164144516, "step": 10176 }, { "completion_length": 233.24554443359375, "epoch": 1.7064839264009388, "grad_norm": 0.2505718550948968, "kl": 0.121307373046875, "learning_rate": 4.973030034619007e-07, "loss": 0.0001, "reward": 1.7214286550879478, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7214286122471094, "rewards/format_reward_func": 1.0, "step": 10178 }, { "completion_length": 240.8259048461914, "epoch": 1.7068192296408065, "grad_norm": 0.3175999733034826, "kl": 0.1077423095703125, "learning_rate": 4.973012459077257e-07, "loss": 0.0001, "reward": 1.70000009983778, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7000000346451998, "rewards/format_reward_func": 1.0, "step": 10180 }, { "completion_length": 238.7589406967163, "epoch": 1.7071545328806739, "grad_norm": 0.8609321651175816, "kl": 0.1385498046875, "learning_rate": 4.972994877841715e-07, "loss": 0.0001, "reward": 1.7767857611179352, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7812500409781933, "rewards/format_reward_func": 0.9955357164144516, "step": 10182 }, { "completion_length": 242.0625114440918, "epoch": 1.7074898361205415, "grad_norm": 0.3140538824392083, "kl": 0.120025634765625, "learning_rate": 4.972977290912423e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 10184 }, { "completion_length": 240.6696538925171, "epoch": 1.7078251393604091, "grad_norm": 0.16024782471908663, "kl": 0.1128692626953125, "learning_rate": 4.97295969828942e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714440047741, "rewards/format_reward_func": 1.0, "step": 10186 }, { "completion_length": 237.9241189956665, "epoch": 1.7081604426002768, "grad_norm": 0.20369444105406395, "kl": 0.1204833984375, "learning_rate": 4.972942099972746e-07, "loss": 0.0001, "reward": 1.825000062584877, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.825000025331974, "rewards/format_reward_func": 1.0, "step": 10188 }, { "completion_length": 237.3214406967163, "epoch": 1.7084957458401442, "grad_norm": 0.46196819682538787, "kl": 0.1105804443359375, "learning_rate": 4.972924495962443e-07, "loss": 0.0001, "reward": 1.8250000476837158, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8250000216066837, "rewards/format_reward_func": 1.0, "step": 10190 }, { "completion_length": 247.79465293884277, "epoch": 1.7088310490800116, "grad_norm": 0.17530326999013046, "kl": 0.09844970703125, "learning_rate": 4.97290688625855e-07, "loss": 0.0001, "reward": 1.7357143610715866, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7446428835391998, "rewards/format_reward_func": 0.9910714328289032, "step": 10192 }, { "completion_length": 239.915189743042, "epoch": 1.7091663523198792, "grad_norm": 0.17669363274091412, "kl": 0.10247802734375, "learning_rate": 4.97288927086111e-07, "loss": 0.0001, "reward": 1.8000000268220901, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 10194 }, { "completion_length": 251.86608219146729, "epoch": 1.7095016555597469, "grad_norm": 0.1184864862329785, "kl": 0.111236572265625, "learning_rate": 4.972871649770162e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 10196 }, { "completion_length": 249.87947845458984, "epoch": 1.7098369587996145, "grad_norm": 0.3240525754200581, "kl": 0.1515350341796875, "learning_rate": 4.972854022985746e-07, "loss": 0.0002, "reward": 1.753571517765522, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714507102966, "rewards/format_reward_func": 1.0, "step": 10198 }, { "completion_length": 258.5848331451416, "epoch": 1.710172262039482, "grad_norm": 0.13925645977761708, "kl": 0.135101318359375, "learning_rate": 4.972836390507902e-07, "loss": 0.0001, "reward": 1.7285715267062187, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.7375000268220901, "rewards/format_reward_func": 0.9910714328289032, "step": 10200 }, { "completion_length": 255.58483219146729, "epoch": 1.7105075652793496, "grad_norm": 0.3032562982546393, "kl": 0.102783203125, "learning_rate": 4.972818752336674e-07, "loss": 0.0001, "reward": 1.7928571924567223, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7928571775555611, "rewards/format_reward_func": 1.0, "step": 10202 }, { "completion_length": 257.7321557998657, "epoch": 1.710842868519217, "grad_norm": 0.3941826817499621, "kl": 0.14019775390625, "learning_rate": 4.972801108472099e-07, "loss": 0.0001, "reward": 1.7321429252624512, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7410714626312256, "rewards/format_reward_func": 0.9910714328289032, "step": 10204 }, { "completion_length": 265.51340675354004, "epoch": 1.7111781717590846, "grad_norm": 0.25116486602831917, "kl": 0.133148193359375, "learning_rate": 4.97278345891422e-07, "loss": 0.0001, "reward": 1.798214316368103, "reward_std": 0.06313453149050474, "rewards/equation_reward_func": 0.8026785962283611, "rewards/format_reward_func": 0.9955357164144516, "step": 10206 }, { "completion_length": 265.3839416503906, "epoch": 1.7115134749989522, "grad_norm": 0.14768036461804573, "kl": 0.12890625, "learning_rate": 4.972765803663076e-07, "loss": 0.0001, "reward": 1.7267857939004898, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7312500290572643, "rewards/format_reward_func": 0.9955357164144516, "step": 10208 }, { "completion_length": 254.28572845458984, "epoch": 1.7118487782388199, "grad_norm": 0.18517748549085836, "kl": 0.101348876953125, "learning_rate": 4.972748142718708e-07, "loss": 0.0001, "reward": 1.789285771548748, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857305705547, "rewards/format_reward_func": 1.0, "step": 10210 }, { "completion_length": 248.27233219146729, "epoch": 1.7121840814786873, "grad_norm": 0.16112355740952278, "kl": 0.117645263671875, "learning_rate": 4.972730476081157e-07, "loss": 0.0001, "reward": 1.7571429461240768, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 10212 }, { "completion_length": 249.39733600616455, "epoch": 1.7125193847185547, "grad_norm": 0.27793530860374993, "kl": 0.2154541015625, "learning_rate": 4.972712803750464e-07, "loss": 0.0002, "reward": 1.8196429163217545, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8241071775555611, "rewards/format_reward_func": 0.9955357164144516, "step": 10214 }, { "completion_length": 255.77233409881592, "epoch": 1.7128546879584223, "grad_norm": 0.17317650979675972, "kl": 0.1054840087890625, "learning_rate": 4.972695125726669e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 10216 }, { "completion_length": 253.52233123779297, "epoch": 1.71318999119829, "grad_norm": 0.22910347888793958, "kl": 0.133819580078125, "learning_rate": 4.972677442009813e-07, "loss": 0.0001, "reward": 1.7785715088248253, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 10218 }, { "completion_length": 256.03126335144043, "epoch": 1.7135252944381576, "grad_norm": 0.1251190478169731, "kl": 0.1388702392578125, "learning_rate": 4.972659752599937e-07, "loss": 0.0001, "reward": 1.8482143357396126, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.852678582072258, "rewards/format_reward_func": 0.9955357164144516, "step": 10220 }, { "completion_length": 263.6428699493408, "epoch": 1.713860597678025, "grad_norm": 0.1527141001062361, "kl": 0.1422882080078125, "learning_rate": 4.972642057497082e-07, "loss": 0.0001, "reward": 1.7696429193019867, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7741071656346321, "rewards/format_reward_func": 0.9955357164144516, "step": 10222 }, { "completion_length": 267.70983123779297, "epoch": 1.7141959009178926, "grad_norm": 0.2617393189987282, "kl": 0.106292724609375, "learning_rate": 4.972624356701287e-07, "loss": 0.0001, "reward": 1.733928643167019, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7383928932249546, "rewards/format_reward_func": 0.9955357164144516, "step": 10224 }, { "completion_length": 259.7410831451416, "epoch": 1.71453120415776, "grad_norm": 0.2660914466620459, "kl": 0.13519287109375, "learning_rate": 4.972606650212595e-07, "loss": 0.0001, "reward": 1.7767857760190964, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7812500149011612, "rewards/format_reward_func": 0.9955357164144516, "step": 10226 }, { "completion_length": 262.5357255935669, "epoch": 1.7148665073976277, "grad_norm": 0.09373059801058264, "kl": 0.147003173828125, "learning_rate": 4.972588938031045e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 10228 }, { "completion_length": 269.6651906967163, "epoch": 1.7152018106374953, "grad_norm": 0.09178558498879472, "kl": 0.278045654296875, "learning_rate": 4.972571220156679e-07, "loss": 0.0003, "reward": 1.79464291036129, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7991071529686451, "rewards/format_reward_func": 0.9955357164144516, "step": 10230 }, { "completion_length": 273.5357255935669, "epoch": 1.715537113877363, "grad_norm": 0.20699127352252952, "kl": 0.10955810546875, "learning_rate": 4.972553496589536e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000242143869, "rewards/format_reward_func": 1.0, "step": 10232 }, { "completion_length": 255.79911994934082, "epoch": 1.7158724171172304, "grad_norm": 0.16328593761963217, "kl": 0.16827392578125, "learning_rate": 4.97253576732966e-07, "loss": 0.0002, "reward": 1.7714286521077156, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714286111295223, "rewards/format_reward_func": 1.0, "step": 10234 }, { "completion_length": 251.2812623977661, "epoch": 1.7162077203570978, "grad_norm": 0.15051227767423914, "kl": 0.2393951416015625, "learning_rate": 4.972518032377088e-07, "loss": 0.0002, "reward": 1.7500000819563866, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 10236 }, { "completion_length": 266.82143783569336, "epoch": 1.7165430235969654, "grad_norm": 0.5221781685923255, "kl": 0.2209320068359375, "learning_rate": 4.972500291731865e-07, "loss": 0.0002, "reward": 1.7973214834928513, "reward_std": 0.07449874933809042, "rewards/equation_reward_func": 0.8080357369035482, "rewards/format_reward_func": 0.9892857223749161, "step": 10238 }, { "completion_length": 256.92858505249023, "epoch": 1.716878326836833, "grad_norm": 0.1457588564956292, "kl": 0.137115478515625, "learning_rate": 4.972482545394028e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571753203869, "rewards/format_reward_func": 1.0, "step": 10240 }, { "completion_length": 261.8526916503906, "epoch": 1.7172136300767007, "grad_norm": 0.1387693804521752, "kl": 0.1348876953125, "learning_rate": 4.972464793363619e-07, "loss": 0.0001, "reward": 1.760714314877987, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.760714340955019, "rewards/format_reward_func": 1.0, "step": 10242 }, { "completion_length": 267.48662281036377, "epoch": 1.7175489333165683, "grad_norm": 0.45388286887329304, "kl": 0.276153564453125, "learning_rate": 4.972447035640681e-07, "loss": 0.0003, "reward": 1.7482143864035606, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7526786029338837, "rewards/format_reward_func": 0.9955357164144516, "step": 10244 }, { "completion_length": 256.8303689956665, "epoch": 1.7178842365564357, "grad_norm": 0.26309543690645204, "kl": 0.109283447265625, "learning_rate": 4.972429272225252e-07, "loss": 0.0001, "reward": 1.7892857789993286, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 1.0, "step": 10246 }, { "completion_length": 256.8884048461914, "epoch": 1.7182195397963032, "grad_norm": 0.5253921470745806, "kl": 0.2630615234375, "learning_rate": 4.972411503117374e-07, "loss": 0.0003, "reward": 1.7035714983940125, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7125000301748514, "rewards/format_reward_func": 0.9910714328289032, "step": 10248 }, { "completion_length": 261.7857255935669, "epoch": 1.7185548430361708, "grad_norm": 0.1953770623697014, "kl": 0.12847900390625, "learning_rate": 4.972393728317089e-07, "loss": 0.0001, "reward": 1.7410714998841286, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7455357518047094, "rewards/format_reward_func": 0.9955357164144516, "step": 10250 }, { "completion_length": 259.8348321914673, "epoch": 1.7188901462760384, "grad_norm": 0.1585139958072636, "kl": 0.1113128662109375, "learning_rate": 4.972375947824437e-07, "loss": 0.0001, "reward": 1.7642857506871223, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857600003481, "rewards/format_reward_func": 1.0, "step": 10252 }, { "completion_length": 251.85269260406494, "epoch": 1.719225449515906, "grad_norm": 0.21025823635972254, "kl": 0.142669677734375, "learning_rate": 4.972358161639458e-07, "loss": 0.0001, "reward": 1.7714286223053932, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714286111295223, "rewards/format_reward_func": 1.0, "step": 10254 }, { "completion_length": 246.71429538726807, "epoch": 1.7195607527557735, "grad_norm": 0.21935861333026402, "kl": 0.0960845947265625, "learning_rate": 4.972340369762193e-07, "loss": 0.0001, "reward": 1.7821428999304771, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 10256 }, { "completion_length": 249.4509038925171, "epoch": 1.719896055995641, "grad_norm": 0.19696143606941816, "kl": 0.1159515380859375, "learning_rate": 4.972322572192686e-07, "loss": 0.0001, "reward": 1.785714328289032, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 10258 }, { "completion_length": 253.4955472946167, "epoch": 1.7202313592355085, "grad_norm": 0.2697506218673313, "kl": 0.1621856689453125, "learning_rate": 4.972304768930973e-07, "loss": 0.0002, "reward": 1.7750000581145287, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 10260 }, { "completion_length": 248.9821548461914, "epoch": 1.7205666624753762, "grad_norm": 0.15170022565279304, "kl": 0.102203369140625, "learning_rate": 4.9722869599771e-07, "loss": 0.0001, "reward": 1.8250000402331352, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8250000290572643, "rewards/format_reward_func": 1.0, "step": 10262 }, { "completion_length": 241.9017972946167, "epoch": 1.7209019657152438, "grad_norm": 0.19211303932289808, "kl": 0.105621337890625, "learning_rate": 4.972269145331106e-07, "loss": 0.0001, "reward": 1.7982143387198448, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.8026785925030708, "rewards/format_reward_func": 0.9955357164144516, "step": 10264 }, { "completion_length": 254.39733505249023, "epoch": 1.7212372689551114, "grad_norm": 0.013160530116678611, "kl": 0.108795166015625, "learning_rate": 4.972251324993031e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7991071790456772, "rewards/format_reward_func": 0.9955357164144516, "step": 10266 }, { "completion_length": 242.98661708831787, "epoch": 1.7215725721949788, "grad_norm": 0.15062515796873527, "kl": 0.085784912109375, "learning_rate": 4.972233498962917e-07, "loss": 0.0001, "reward": 1.7785714715719223, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 10268 }, { "completion_length": 252.28572463989258, "epoch": 1.7219078754348462, "grad_norm": 0.3019032739127625, "kl": 0.0868988037109375, "learning_rate": 4.972215667240805e-07, "loss": 0.0001, "reward": 1.7357143685221672, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.735714316368103, "rewards/format_reward_func": 1.0, "step": 10270 }, { "completion_length": 248.76340675354004, "epoch": 1.7222431786747139, "grad_norm": 0.17839336544905793, "kl": 0.0845184326171875, "learning_rate": 4.972197829826735e-07, "loss": 0.0001, "reward": 1.8053571581840515, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8098214566707611, "rewards/format_reward_func": 0.9955357164144516, "step": 10272 }, { "completion_length": 250.40179824829102, "epoch": 1.7225784819145815, "grad_norm": 0.4455969725900186, "kl": 0.11224365234375, "learning_rate": 4.97217998672075e-07, "loss": 0.0001, "reward": 1.758928619325161, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7633928935974836, "rewards/format_reward_func": 0.9955357164144516, "step": 10274 }, { "completion_length": 251.383939743042, "epoch": 1.7229137851544492, "grad_norm": 0.16785836779638325, "kl": 0.0938873291015625, "learning_rate": 4.972162137922888e-07, "loss": 0.0001, "reward": 1.7946429029107094, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7991071660071611, "rewards/format_reward_func": 0.9955357164144516, "step": 10276 }, { "completion_length": 243.18304538726807, "epoch": 1.7232490883943166, "grad_norm": 0.23542771845399904, "kl": 0.0963134765625, "learning_rate": 4.972144283433194e-07, "loss": 0.0001, "reward": 1.7785715162754059, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 10278 }, { "completion_length": 248.352689743042, "epoch": 1.7235843916341842, "grad_norm": 0.23453639466499993, "kl": 0.09661865234375, "learning_rate": 4.972126423251708e-07, "loss": 0.0001, "reward": 1.7825893387198448, "reward_std": 0.06502856989391148, "rewards/equation_reward_func": 0.7883929014205933, "rewards/format_reward_func": 0.9941964335739613, "step": 10280 }, { "completion_length": 249.73215675354004, "epoch": 1.7239196948740516, "grad_norm": 0.1695213510329949, "kl": 0.114288330078125, "learning_rate": 4.972108557378469e-07, "loss": 0.0001, "reward": 1.7901786267757416, "reward_std": 0.044194172602146864, "rewards/equation_reward_func": 0.7919643223285675, "rewards/format_reward_func": 0.9982142895460129, "step": 10282 }, { "completion_length": 247.0312623977661, "epoch": 1.7242549981139192, "grad_norm": 0.20108784450563666, "kl": 0.107269287109375, "learning_rate": 4.972090685813519e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 10284 }, { "completion_length": 248.40626525878906, "epoch": 1.7245903013537869, "grad_norm": 0.17376718166315555, "kl": 0.098052978515625, "learning_rate": 4.972072808556901e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 10286 }, { "completion_length": 253.43304920196533, "epoch": 1.7249256045936545, "grad_norm": 0.20889097778803672, "kl": 0.1026458740234375, "learning_rate": 4.972054925608654e-07, "loss": 0.0001, "reward": 1.7678571864962578, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7678571790456772, "rewards/format_reward_func": 1.0, "step": 10288 }, { "completion_length": 251.602689743042, "epoch": 1.725260907833522, "grad_norm": 0.1601895959215239, "kl": 0.0942535400390625, "learning_rate": 4.972037036968821e-07, "loss": 0.0001, "reward": 1.778571479022503, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 10290 }, { "completion_length": 252.18304634094238, "epoch": 1.7255962110733893, "grad_norm": 0.18190231757421382, "kl": 0.1245269775390625, "learning_rate": 4.972019142637442e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 10292 }, { "completion_length": 249.75894260406494, "epoch": 1.725931514313257, "grad_norm": 0.17078338524343523, "kl": 0.0865631103515625, "learning_rate": 4.972001242614558e-07, "loss": 0.0001, "reward": 1.773214340209961, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7776785884052515, "rewards/format_reward_func": 0.9955357164144516, "step": 10294 }, { "completion_length": 244.34375953674316, "epoch": 1.7262668175531246, "grad_norm": 0.2378874275442031, "kl": 0.1673583984375, "learning_rate": 4.97198333690021e-07, "loss": 0.0002, "reward": 1.769642896950245, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7741071879863739, "rewards/format_reward_func": 0.9955357164144516, "step": 10296 }, { "completion_length": 249.7098331451416, "epoch": 1.7266021207929922, "grad_norm": 0.21160423128606037, "kl": 0.120391845703125, "learning_rate": 4.971965425494439e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 10298 }, { "completion_length": 253.62947177886963, "epoch": 1.7269374240328599, "grad_norm": 0.20583500077418201, "kl": 0.18536376953125, "learning_rate": 4.97194750839729e-07, "loss": 0.0002, "reward": 1.7464286461472511, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286126196384, "rewards/format_reward_func": 1.0, "step": 10300 }, { "completion_length": 235.7901906967163, "epoch": 1.7272727272727273, "grad_norm": 0.3967885953207802, "kl": 0.1246337890625, "learning_rate": 4.971929585608799e-07, "loss": 0.0001, "reward": 1.7928571924567223, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571812808514, "rewards/format_reward_func": 1.0, "step": 10302 }, { "completion_length": 240.97768688201904, "epoch": 1.7276080305125947, "grad_norm": 0.46283659163232105, "kl": 0.155731201171875, "learning_rate": 4.97191165712901e-07, "loss": 0.0002, "reward": 1.742857240140438, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7428571693599224, "rewards/format_reward_func": 1.0, "step": 10304 }, { "completion_length": 241.07590675354004, "epoch": 1.7279433337524623, "grad_norm": 0.2614746911479949, "kl": 0.0865325927734375, "learning_rate": 4.971893722957964e-07, "loss": 0.0001, "reward": 1.8017857521772385, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.8062500171363354, "rewards/format_reward_func": 0.9955357164144516, "step": 10306 }, { "completion_length": 230.19197463989258, "epoch": 1.72827863699233, "grad_norm": 0.18608370494522042, "kl": 0.1041259765625, "learning_rate": 4.971875783095702e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.803571455180645, "rewards/format_reward_func": 1.0, "step": 10308 }, { "completion_length": 240.24108505249023, "epoch": 1.7286139402321976, "grad_norm": 0.26414739512639407, "kl": 0.112152099609375, "learning_rate": 4.971857837542266e-07, "loss": 0.0001, "reward": 1.8035714849829674, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8035714589059353, "rewards/format_reward_func": 1.0, "step": 10310 }, { "completion_length": 241.75893878936768, "epoch": 1.728949243472065, "grad_norm": 0.2030729173493549, "kl": 0.12939453125, "learning_rate": 4.971839886297697e-07, "loss": 0.0001, "reward": 1.7785714715719223, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.778571454808116, "rewards/format_reward_func": 1.0, "step": 10312 }, { "completion_length": 229.4955472946167, "epoch": 1.7292845467119327, "grad_norm": 0.2002807067146816, "kl": 0.106903076171875, "learning_rate": 4.971821929362035e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 10314 }, { "completion_length": 241.39733409881592, "epoch": 1.7296198499518, "grad_norm": 0.22584604105987863, "kl": 0.0989837646484375, "learning_rate": 4.971803966735322e-07, "loss": 0.0001, "reward": 1.739285796880722, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857410013676, "rewards/format_reward_func": 1.0, "step": 10316 }, { "completion_length": 245.0357265472412, "epoch": 1.7299551531916677, "grad_norm": 0.16363201642424538, "kl": 0.09918212890625, "learning_rate": 4.971785998417601e-07, "loss": 0.0001, "reward": 1.7732143327593803, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7776786014437675, "rewards/format_reward_func": 0.9955357164144516, "step": 10318 }, { "completion_length": 241.36608219146729, "epoch": 1.7302904564315353, "grad_norm": 0.11868823441642207, "kl": 0.119659423828125, "learning_rate": 4.971768024408912e-07, "loss": 0.0001, "reward": 1.7589286342263222, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7633928842842579, "rewards/format_reward_func": 0.9955357164144516, "step": 10320 }, { "completion_length": 246.12947368621826, "epoch": 1.730625759671403, "grad_norm": 0.4974318341941413, "kl": 0.11785888671875, "learning_rate": 4.971750044709296e-07, "loss": 0.0001, "reward": 1.7357143312692642, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 10322 }, { "completion_length": 243.7142972946167, "epoch": 1.7309610629112704, "grad_norm": 0.22373018764174843, "kl": 0.10406494140625, "learning_rate": 4.971732059318796e-07, "loss": 0.0001, "reward": 1.7678571939468384, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 1.0, "step": 10324 }, { "completion_length": 244.84822750091553, "epoch": 1.7312963661511378, "grad_norm": 0.3080666523543961, "kl": 0.157073974609375, "learning_rate": 4.971714068237452e-07, "loss": 0.0002, "reward": 1.7892857566475868, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 10326 }, { "completion_length": 247.28572750091553, "epoch": 1.7316316693910054, "grad_norm": 0.194548889979958, "kl": 0.18048095703125, "learning_rate": 4.971696071465305e-07, "loss": 0.0002, "reward": 1.7517857775092125, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7562500387430191, "rewards/format_reward_func": 0.9955357164144516, "step": 10328 }, { "completion_length": 255.8348331451416, "epoch": 1.731966972630873, "grad_norm": 0.10122060964439665, "kl": 0.1008148193359375, "learning_rate": 4.971678069002398e-07, "loss": 0.0001, "reward": 1.8250000476837158, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.825000025331974, "rewards/format_reward_func": 1.0, "step": 10330 }, { "completion_length": 258.7410821914673, "epoch": 1.7323022758707407, "grad_norm": 0.6716880283698804, "kl": 0.2233734130859375, "learning_rate": 4.971660060848772e-07, "loss": 0.0002, "reward": 1.8125000521540642, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8169643171131611, "rewards/format_reward_func": 0.9955357164144516, "step": 10332 }, { "completion_length": 260.14733505249023, "epoch": 1.7326375791106081, "grad_norm": 0.10600836730768504, "kl": 0.189788818359375, "learning_rate": 4.971642047004466e-07, "loss": 0.0002, "reward": 1.7250000685453415, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7250000294297934, "rewards/format_reward_func": 1.0, "step": 10334 }, { "completion_length": 253.477689743042, "epoch": 1.7329728823504758, "grad_norm": 0.13879610996945366, "kl": 0.160552978515625, "learning_rate": 4.971624027469526e-07, "loss": 0.0002, "reward": 1.7696429193019867, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7741071730852127, "rewards/format_reward_func": 0.9955357164144516, "step": 10336 }, { "completion_length": 257.62055015563965, "epoch": 1.7333081855903432, "grad_norm": 0.21084404479090876, "kl": 0.20538330078125, "learning_rate": 4.97160600224399e-07, "loss": 0.0002, "reward": 1.7928571999073029, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 1.0, "step": 10338 }, { "completion_length": 250.45090675354004, "epoch": 1.7336434888302108, "grad_norm": 0.26688576816393683, "kl": 0.146881103515625, "learning_rate": 4.971587971327901e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.782142873853445, "rewards/format_reward_func": 1.0, "step": 10340 }, { "completion_length": 256.0759038925171, "epoch": 1.7339787920700784, "grad_norm": 0.113620714880813, "kl": 0.16156005859375, "learning_rate": 4.9715699347213e-07, "loss": 0.0002, "reward": 1.7857143431901932, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 10342 }, { "completion_length": 257.41072273254395, "epoch": 1.734314095309946, "grad_norm": 0.2530098783662599, "kl": 0.1668701171875, "learning_rate": 4.971551892424228e-07, "loss": 0.0002, "reward": 1.7357143387198448, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.735714316368103, "rewards/format_reward_func": 1.0, "step": 10344 }, { "completion_length": 258.1651916503906, "epoch": 1.7346493985498135, "grad_norm": 0.13072198172795174, "kl": 0.095489501953125, "learning_rate": 4.971533844436728e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 10346 }, { "completion_length": 265.5535840988159, "epoch": 1.734984701789681, "grad_norm": 0.13941893460031515, "kl": 0.114532470703125, "learning_rate": 4.97151579075884e-07, "loss": 0.0001, "reward": 1.7267858013510704, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7312500383704901, "rewards/format_reward_func": 0.9955357164144516, "step": 10348 }, { "completion_length": 268.7009029388428, "epoch": 1.7353200050295485, "grad_norm": 0.28658059038787587, "kl": 0.103271484375, "learning_rate": 4.971497731390607e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 10350 }, { "completion_length": 259.558048248291, "epoch": 1.7356553082694162, "grad_norm": 0.1752486314212871, "kl": 0.12921142578125, "learning_rate": 4.971479666332069e-07, "loss": 0.0001, "reward": 1.7732143476605415, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.777678593993187, "rewards/format_reward_func": 0.9955357164144516, "step": 10352 }, { "completion_length": 275.13393783569336, "epoch": 1.7359906115092838, "grad_norm": 0.1740133493170833, "kl": 0.183502197265625, "learning_rate": 4.971461595583269e-07, "loss": 0.0002, "reward": 1.7375000566244125, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7419643104076385, "rewards/format_reward_func": 0.9955357164144516, "step": 10354 }, { "completion_length": 276.25447845458984, "epoch": 1.7363259147491512, "grad_norm": 0.4997503302559109, "kl": 0.151702880859375, "learning_rate": 4.971443519144248e-07, "loss": 0.0002, "reward": 1.758928619325161, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.763392886146903, "rewards/format_reward_func": 0.9955357164144516, "step": 10356 }, { "completion_length": 268.64287090301514, "epoch": 1.7366612179890188, "grad_norm": 0.20603458803696528, "kl": 0.166961669921875, "learning_rate": 4.971425437015048e-07, "loss": 0.0002, "reward": 1.8267857804894447, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8312500268220901, "rewards/format_reward_func": 0.9955357164144516, "step": 10358 }, { "completion_length": 269.7678680419922, "epoch": 1.7369965212288863, "grad_norm": 0.1892884256617599, "kl": 0.1112518310546875, "learning_rate": 4.97140734919571e-07, "loss": 0.0001, "reward": 1.7821429371833801, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7910714484751225, "rewards/format_reward_func": 0.9910714328289032, "step": 10360 }, { "completion_length": 267.5357265472412, "epoch": 1.737331824468754, "grad_norm": 0.21339591862795737, "kl": 0.096923828125, "learning_rate": 4.971389255686275e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.08586296625435352, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 0.9821428656578064, "step": 10362 }, { "completion_length": 266.5714406967163, "epoch": 1.7376671277086215, "grad_norm": 0.15824648300184477, "kl": 0.1436004638671875, "learning_rate": 4.971371156486786e-07, "loss": 0.0001, "reward": 1.8250000402331352, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8250000365078449, "rewards/format_reward_func": 1.0, "step": 10364 }, { "completion_length": 266.89733600616455, "epoch": 1.7380024309484892, "grad_norm": 0.3613629986828252, "kl": 0.1967926025390625, "learning_rate": 4.971353051597285e-07, "loss": 0.0002, "reward": 1.791071504354477, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7955357320606709, "rewards/format_reward_func": 0.9955357164144516, "step": 10366 }, { "completion_length": 260.7232265472412, "epoch": 1.7383377341883566, "grad_norm": 0.002761877959319243, "kl": 0.1133575439453125, "learning_rate": 4.971334941017813e-07, "loss": 0.0001, "reward": 1.7589285969734192, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7633929047733545, "rewards/format_reward_func": 0.9955357164144516, "step": 10368 }, { "completion_length": 267.3794765472412, "epoch": 1.738673037428224, "grad_norm": 0.12438441265646877, "kl": 0.15130615234375, "learning_rate": 4.971316824748412e-07, "loss": 0.0002, "reward": 1.7464286535978317, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 10370 }, { "completion_length": 271.1116189956665, "epoch": 1.7390083406680916, "grad_norm": 0.25002979116110813, "kl": 0.125457763671875, "learning_rate": 4.971298702789123e-07, "loss": 0.0001, "reward": 1.7839286401867867, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7883928790688515, "rewards/format_reward_func": 0.9955357164144516, "step": 10372 }, { "completion_length": 275.9107246398926, "epoch": 1.7393436439079593, "grad_norm": 0.20122847648749992, "kl": 0.1316986083984375, "learning_rate": 4.971280575139988e-07, "loss": 0.0001, "reward": 1.7553572058677673, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.759821455925703, "rewards/format_reward_func": 0.9955357164144516, "step": 10374 }, { "completion_length": 271.5223340988159, "epoch": 1.739678947147827, "grad_norm": 0.14811595712044243, "kl": 0.1340484619140625, "learning_rate": 4.971262441801048e-07, "loss": 0.0001, "reward": 1.7482143491506577, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.752678606659174, "rewards/format_reward_func": 0.9955357164144516, "step": 10376 }, { "completion_length": 284.4330425262451, "epoch": 1.7400142503876945, "grad_norm": 0.22086490716191035, "kl": 0.118499755859375, "learning_rate": 4.971244302772346e-07, "loss": 0.0001, "reward": 1.787500061094761, "reward_std": 0.07828682195395231, "rewards/equation_reward_func": 0.8008928783237934, "rewards/format_reward_func": 0.9866071492433548, "step": 10378 }, { "completion_length": 267.0089416503906, "epoch": 1.740349553627562, "grad_norm": 0.057972474967720546, "kl": 0.1278839111328125, "learning_rate": 4.971226158053923e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7991071753203869, "rewards/format_reward_func": 0.9955357164144516, "step": 10380 }, { "completion_length": 282.9732275009155, "epoch": 1.7406848568674294, "grad_norm": 0.295008429979625, "kl": 0.219482421875, "learning_rate": 4.971208007645823e-07, "loss": 0.0002, "reward": 1.728571504354477, "reward_std": 0.09091372787952423, "rewards/equation_reward_func": 0.7375000305473804, "rewards/format_reward_func": 0.9910714328289032, "step": 10382 }, { "completion_length": 270.80804920196533, "epoch": 1.741020160107297, "grad_norm": 0.22125537829038144, "kl": 0.129486083984375, "learning_rate": 4.971189851548084e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7732143215835094, "rewards/format_reward_func": 0.9910714328289032, "step": 10384 }, { "completion_length": 285.18304538726807, "epoch": 1.7413554633471646, "grad_norm": 0.5638466687686062, "kl": 0.160888671875, "learning_rate": 4.971171689760751e-07, "loss": 0.0002, "reward": 1.7625000402331352, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643200933933, "rewards/format_reward_func": 0.9955357164144516, "step": 10386 }, { "completion_length": 275.16965103149414, "epoch": 1.7416907665870323, "grad_norm": 0.2000542526716108, "kl": 0.141265869140625, "learning_rate": 4.971153522283864e-07, "loss": 0.0001, "reward": 1.7714286521077156, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.7803571708500385, "rewards/format_reward_func": 0.9910714328289032, "step": 10388 }, { "completion_length": 273.29465675354004, "epoch": 1.7420260698268997, "grad_norm": 0.19184259176887247, "kl": 0.140655517578125, "learning_rate": 4.971135349117465e-07, "loss": 0.0001, "reward": 1.7428572326898575, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7517857402563095, "rewards/format_reward_func": 0.9910714328289032, "step": 10390 }, { "completion_length": 267.9821538925171, "epoch": 1.7423613730667673, "grad_norm": 0.11815547024679297, "kl": 0.15802001953125, "learning_rate": 4.971117170261596e-07, "loss": 0.0002, "reward": 1.7714286372065544, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 10392 }, { "completion_length": 272.5803699493408, "epoch": 1.7426966763066347, "grad_norm": 0.24361023479353522, "kl": 0.267608642578125, "learning_rate": 4.9710989857163e-07, "loss": 0.0003, "reward": 1.7142858058214188, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7142857424914837, "rewards/format_reward_func": 1.0, "step": 10394 }, { "completion_length": 258.5982275009155, "epoch": 1.7430319795465024, "grad_norm": 0.24585877829799133, "kl": 0.13165283203125, "learning_rate": 4.971080795481618e-07, "loss": 0.0001, "reward": 1.8285714909434319, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8285714462399483, "rewards/format_reward_func": 1.0, "step": 10396 }, { "completion_length": 266.8125123977661, "epoch": 1.74336728278637, "grad_norm": 0.2220689272461111, "kl": 0.150238037109375, "learning_rate": 4.971062599557591e-07, "loss": 0.0002, "reward": 1.76071435213089, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 10398 }, { "completion_length": 260.52679920196533, "epoch": 1.7437025860262376, "grad_norm": 0.19939037132635878, "kl": 0.109222412109375, "learning_rate": 4.971044397944261e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143260538578, "rewards/format_reward_func": 1.0, "step": 10400 }, { "completion_length": 272.54911613464355, "epoch": 1.744037889266105, "grad_norm": 0.19888458583738894, "kl": 0.2293853759765625, "learning_rate": 4.971026190641672e-07, "loss": 0.0002, "reward": 1.7321429252624512, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428991854191, "rewards/format_reward_func": 1.0, "step": 10402 }, { "completion_length": 273.9732275009155, "epoch": 1.7443731925059724, "grad_norm": 0.27151938888768473, "kl": 0.178863525390625, "learning_rate": 4.971007977649864e-07, "loss": 0.0002, "reward": 1.785714365541935, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 10404 }, { "completion_length": 266.08930110931396, "epoch": 1.74470849574584, "grad_norm": 0.31683103034099785, "kl": 0.1129150390625, "learning_rate": 4.97098975896888e-07, "loss": 0.0001, "reward": 1.7892857789993286, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 1.0, "step": 10406 }, { "completion_length": 267.5089406967163, "epoch": 1.7450437989857077, "grad_norm": 0.13888860982697956, "kl": 0.1087799072265625, "learning_rate": 4.970971534598761e-07, "loss": 0.0001, "reward": 1.767857201397419, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7767857424914837, "rewards/format_reward_func": 0.9910714328289032, "step": 10408 }, { "completion_length": 263.92411708831787, "epoch": 1.7453791022255754, "grad_norm": 0.27825671143278985, "kl": 0.143402099609375, "learning_rate": 4.970953304539549e-07, "loss": 0.0001, "reward": 1.7571429461240768, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428753435612, "rewards/format_reward_func": 1.0, "step": 10410 }, { "completion_length": 273.5178699493408, "epoch": 1.7457144054654428, "grad_norm": 0.08538777497485277, "kl": 0.151153564453125, "learning_rate": 4.970935068791286e-07, "loss": 0.0002, "reward": 1.7410714775323868, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7455357443541288, "rewards/format_reward_func": 0.9955357164144516, "step": 10412 }, { "completion_length": 270.0982275009155, "epoch": 1.7460497087053104, "grad_norm": 0.5390369677057072, "kl": 0.193695068359375, "learning_rate": 4.970916827354016e-07, "loss": 0.0002, "reward": 1.7732143476605415, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7866071835160255, "rewards/format_reward_func": 0.9866071492433548, "step": 10414 }, { "completion_length": 261.7366199493408, "epoch": 1.7463850119451778, "grad_norm": 0.18971974736151406, "kl": 0.1143035888671875, "learning_rate": 4.970898580227778e-07, "loss": 0.0001, "reward": 1.8285714611411095, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8285714648663998, "rewards/format_reward_func": 1.0, "step": 10416 }, { "completion_length": 285.3214406967163, "epoch": 1.7467203151850454, "grad_norm": 0.3431656777052588, "kl": 0.220428466796875, "learning_rate": 4.970880327412616e-07, "loss": 0.0002, "reward": 1.7875000685453415, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7919643148779869, "rewards/format_reward_func": 0.9955357164144516, "step": 10418 }, { "completion_length": 276.5134029388428, "epoch": 1.747055618424913, "grad_norm": 0.14949047704968912, "kl": 0.2712860107421875, "learning_rate": 4.97086206890857e-07, "loss": 0.0003, "reward": 1.7553572058677673, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214484751225, "rewards/format_reward_func": 0.9955357164144516, "step": 10420 }, { "completion_length": 265.5669755935669, "epoch": 1.7473909216647807, "grad_norm": 0.20053087897408742, "kl": 0.1323699951171875, "learning_rate": 4.970843804715684e-07, "loss": 0.0001, "reward": 1.7910714969038963, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.795535746961832, "rewards/format_reward_func": 0.9955357164144516, "step": 10422 }, { "completion_length": 281.4196557998657, "epoch": 1.7477262249046481, "grad_norm": 0.34421321644086805, "kl": 0.195068359375, "learning_rate": 4.970825534834e-07, "loss": 0.0002, "reward": 1.7285715118050575, "reward_std": 0.12121830135583878, "rewards/equation_reward_func": 0.728571455925703, "rewards/format_reward_func": 1.0, "step": 10424 }, { "completion_length": 272.6919765472412, "epoch": 1.7480615281445155, "grad_norm": 0.3231745079674326, "kl": 0.154296875, "learning_rate": 4.970807259263559e-07, "loss": 0.0002, "reward": 1.7678572088479996, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571753203869, "rewards/format_reward_func": 1.0, "step": 10426 }, { "completion_length": 279.2589464187622, "epoch": 1.7483968313843832, "grad_norm": 0.17148280698992707, "kl": 0.165374755859375, "learning_rate": 4.970788978004404e-07, "loss": 0.0002, "reward": 1.7607143595814705, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143260538578, "rewards/format_reward_func": 1.0, "step": 10428 }, { "completion_length": 275.589298248291, "epoch": 1.7487321346242508, "grad_norm": 0.25112206633806494, "kl": 0.146392822265625, "learning_rate": 4.970770691056577e-07, "loss": 0.0001, "reward": 1.7053572237491608, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7098214644938707, "rewards/format_reward_func": 0.9955357164144516, "step": 10430 }, { "completion_length": 265.3437614440918, "epoch": 1.7490674378641184, "grad_norm": 0.17369490573789476, "kl": 0.12432861328125, "learning_rate": 4.97075239842012e-07, "loss": 0.0001, "reward": 1.7821429073810577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 10432 }, { "completion_length": 273.96429443359375, "epoch": 1.749402741103986, "grad_norm": 0.24760782391253822, "kl": 0.1802978515625, "learning_rate": 4.970734100095073e-07, "loss": 0.0002, "reward": 1.7446429207921028, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7580357417464256, "rewards/format_reward_func": 0.9866071492433548, "step": 10434 }, { "completion_length": 272.47322845458984, "epoch": 1.7497380443438535, "grad_norm": 0.20643982792458254, "kl": 0.1326141357421875, "learning_rate": 4.970715796081482e-07, "loss": 0.0001, "reward": 1.817857176065445, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.817857164889574, "rewards/format_reward_func": 1.0, "step": 10436 }, { "completion_length": 276.3794746398926, "epoch": 1.750073347583721, "grad_norm": 0.10986143055110638, "kl": 0.2333984375, "learning_rate": 4.970697486379386e-07, "loss": 0.0002, "reward": 1.7446429207921028, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7491071745753288, "rewards/format_reward_func": 0.9955357164144516, "step": 10438 }, { "completion_length": 257.4509029388428, "epoch": 1.7504086508235885, "grad_norm": 0.19597224889941217, "kl": 0.110137939453125, "learning_rate": 4.970679170988829e-07, "loss": 0.0001, "reward": 1.771428644657135, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285850524902, "rewards/format_reward_func": 1.0, "step": 10440 }, { "completion_length": 261.3794746398926, "epoch": 1.7507439540634562, "grad_norm": 0.32525695998592424, "kl": 0.164581298828125, "learning_rate": 4.970660849909852e-07, "loss": 0.0002, "reward": 1.7642857879400253, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 10442 }, { "completion_length": 264.62500858306885, "epoch": 1.7510792573033238, "grad_norm": 0.2660366832229654, "kl": 0.09619140625, "learning_rate": 4.970642523142498e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 10444 }, { "completion_length": 271.4866180419922, "epoch": 1.7514145605431912, "grad_norm": 0.2229115237298831, "kl": 0.159698486328125, "learning_rate": 4.970624190686808e-07, "loss": 0.0002, "reward": 1.773214340209961, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7776786126196384, "rewards/format_reward_func": 0.9955357164144516, "step": 10446 }, { "completion_length": 255.22322750091553, "epoch": 1.7517498637830589, "grad_norm": 0.34719989862558487, "kl": 0.1978912353515625, "learning_rate": 4.970605852542826e-07, "loss": 0.0002, "reward": 1.8125000447034836, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.8169643171131611, "rewards/format_reward_func": 0.9955357164144516, "step": 10448 }, { "completion_length": 264.40626335144043, "epoch": 1.7520851670229263, "grad_norm": 0.19374510192625774, "kl": 0.1017913818359375, "learning_rate": 4.970587508710593e-07, "loss": 0.0001, "reward": 1.74642863124609, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464285995811224, "rewards/format_reward_func": 1.0, "step": 10450 }, { "completion_length": 267.5491199493408, "epoch": 1.752420470262794, "grad_norm": 0.18141824950577376, "kl": 0.22296142578125, "learning_rate": 4.970569159190152e-07, "loss": 0.0002, "reward": 1.742857202887535, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571842610836, "rewards/format_reward_func": 1.0, "step": 10452 }, { "completion_length": 257.7500123977661, "epoch": 1.7527557735026615, "grad_norm": 0.00973250312477692, "kl": 0.120849609375, "learning_rate": 4.970550803981544e-07, "loss": 0.0001, "reward": 1.7250000685453415, "reward_std": 0.005050762556493282, "rewards/equation_reward_func": 0.7250000536441803, "rewards/format_reward_func": 1.0, "step": 10454 }, { "completion_length": 264.75447845458984, "epoch": 1.7530910767425292, "grad_norm": 0.16646004320135122, "kl": 0.0950927734375, "learning_rate": 4.970532443084812e-07, "loss": 0.0001, "reward": 1.7553571984171867, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7589286118745804, "rewards/format_reward_func": 0.9964285716414452, "step": 10456 }, { "completion_length": 272.44197368621826, "epoch": 1.7534263799823966, "grad_norm": 0.2769682286353471, "kl": 0.12298583984375, "learning_rate": 4.970514076499999e-07, "loss": 0.0001, "reward": 1.7678571939468384, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.776785746216774, "rewards/format_reward_func": 0.9910714328289032, "step": 10458 }, { "completion_length": 259.90626430511475, "epoch": 1.753761683222264, "grad_norm": 0.20317105516113643, "kl": 0.0985107421875, "learning_rate": 4.970495704227146e-07, "loss": 0.0001, "reward": 1.7928572222590446, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 10460 }, { "completion_length": 267.4732275009155, "epoch": 1.7540969864621316, "grad_norm": 0.28731729589505295, "kl": 0.119537353515625, "learning_rate": 4.970477326266297e-07, "loss": 0.0001, "reward": 1.7178572490811348, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7178571820259094, "rewards/format_reward_func": 1.0, "step": 10462 }, { "completion_length": 258.66072940826416, "epoch": 1.7544322897019993, "grad_norm": 0.37070153761318986, "kl": 0.100830078125, "learning_rate": 4.970458942617493e-07, "loss": 0.0001, "reward": 1.796428620815277, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 10464 }, { "completion_length": 271.5000123977661, "epoch": 1.754767592941867, "grad_norm": 0.20976854165769124, "kl": 0.156036376953125, "learning_rate": 4.970440553280776e-07, "loss": 0.0002, "reward": 1.7732143551111221, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7776785865426064, "rewards/format_reward_func": 0.9955357164144516, "step": 10466 }, { "completion_length": 261.25893783569336, "epoch": 1.7551028961817343, "grad_norm": 0.3999970736225111, "kl": 0.1058502197265625, "learning_rate": 4.970422158256188e-07, "loss": 0.0001, "reward": 1.762500062584877, "reward_std": 0.09343910776078701, "rewards/equation_reward_func": 0.7669643014669418, "rewards/format_reward_func": 0.9955357164144516, "step": 10468 }, { "completion_length": 270.8616189956665, "epoch": 1.755438199421602, "grad_norm": 0.2932392086982285, "kl": 0.102630615234375, "learning_rate": 4.970403757543773e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8035714626312256, "rewards/format_reward_func": 1.0, "step": 10470 }, { "completion_length": 258.6919755935669, "epoch": 1.7557735026614694, "grad_norm": 0.7825140351594014, "kl": 0.227874755859375, "learning_rate": 4.970385351143573e-07, "loss": 0.0002, "reward": 1.8178571835160255, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8178571574389935, "rewards/format_reward_func": 1.0, "step": 10472 }, { "completion_length": 265.7009048461914, "epoch": 1.756108805901337, "grad_norm": 0.23065535913563237, "kl": 0.1201171875, "learning_rate": 4.97036693905563e-07, "loss": 0.0001, "reward": 1.800000049173832, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8000000230967999, "rewards/format_reward_func": 1.0, "step": 10474 }, { "completion_length": 255.95536708831787, "epoch": 1.7564441091412046, "grad_norm": 0.35248789588895424, "kl": 0.3164215087890625, "learning_rate": 4.970348521279986e-07, "loss": 0.0003, "reward": 1.8142857626080513, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.823214303702116, "rewards/format_reward_func": 0.9910714328289032, "step": 10476 }, { "completion_length": 261.8125123977661, "epoch": 1.7567794123810723, "grad_norm": 0.19963963290351514, "kl": 0.1441192626953125, "learning_rate": 4.970330097816683e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 0.9821428656578064, "step": 10478 }, { "completion_length": 270.3884057998657, "epoch": 1.7571147156209397, "grad_norm": 0.23056821333387442, "kl": 0.110260009765625, "learning_rate": 4.970311668665766e-07, "loss": 0.0001, "reward": 1.764285758137703, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 10480 }, { "completion_length": 266.4776916503906, "epoch": 1.757450018860807, "grad_norm": 0.1639649340695387, "kl": 0.137908935546875, "learning_rate": 4.970293233827274e-07, "loss": 0.0001, "reward": 1.6678572073578835, "reward_std": 0.03535533882677555, "rewards/equation_reward_func": 0.676785746589303, "rewards/format_reward_func": 0.9910714328289032, "step": 10482 }, { "completion_length": 257.2053699493408, "epoch": 1.7577853221006747, "grad_norm": 0.20303344762139539, "kl": 0.107940673828125, "learning_rate": 4.970274793301252e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.04545686487108469, "rewards/equation_reward_func": 0.7767857424914837, "rewards/format_reward_func": 0.9910714328289032, "step": 10484 }, { "completion_length": 274.4018020629883, "epoch": 1.7581206253405424, "grad_norm": 0.261607313743246, "kl": 0.113311767578125, "learning_rate": 4.970256347087741e-07, "loss": 0.0001, "reward": 1.789285771548748, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 1.0, "step": 10486 }, { "completion_length": 258.8750114440918, "epoch": 1.75845592858041, "grad_norm": 0.48522185192261696, "kl": 0.1284637451171875, "learning_rate": 4.970237895186784e-07, "loss": 0.0001, "reward": 1.8339286223053932, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.8383928686380386, "rewards/format_reward_func": 0.9955357164144516, "step": 10488 }, { "completion_length": 273.1071529388428, "epoch": 1.7587912318202774, "grad_norm": 0.18348579583019425, "kl": 0.12353515625, "learning_rate": 4.970219437598423e-07, "loss": 0.0001, "reward": 1.7428572177886963, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.751785721629858, "rewards/format_reward_func": 0.9910714328289032, "step": 10490 }, { "completion_length": 270.8884057998657, "epoch": 1.759126535060145, "grad_norm": 0.3575486729727749, "kl": 0.1207275390625, "learning_rate": 4.970200974322702e-07, "loss": 0.0001, "reward": 1.767857201397419, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7767857499420643, "rewards/format_reward_func": 0.9910714328289032, "step": 10492 }, { "completion_length": 269.8750114440918, "epoch": 1.7594618383000125, "grad_norm": 0.2593350581842222, "kl": 0.160858154296875, "learning_rate": 4.970182505359662e-07, "loss": 0.0002, "reward": 1.7571429088711739, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 10494 }, { "completion_length": 266.58929920196533, "epoch": 1.75979714153988, "grad_norm": 0.2131207192247197, "kl": 0.10052490234375, "learning_rate": 4.970164030709346e-07, "loss": 0.0001, "reward": 1.7625000774860382, "reward_std": 0.06313453428447247, "rewards/equation_reward_func": 0.7758928798139095, "rewards/format_reward_func": 0.9866071492433548, "step": 10496 }, { "completion_length": 262.8928680419922, "epoch": 1.7601324447797477, "grad_norm": 0.23990261996700582, "kl": 0.1071319580078125, "learning_rate": 4.970145550371797e-07, "loss": 0.0001, "reward": 1.7303572222590446, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7348214536905289, "rewards/format_reward_func": 0.9955357164144516, "step": 10498 }, { "completion_length": 265.95090770721436, "epoch": 1.7604677480196154, "grad_norm": 0.28258834064510424, "kl": 0.177490234375, "learning_rate": 4.970127064347056e-07, "loss": 0.0002, "reward": 1.7017857879400253, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7062500268220901, "rewards/format_reward_func": 0.9955357164144516, "step": 10500 }, { "completion_length": 255.3259038925171, "epoch": 1.7608030512594828, "grad_norm": 0.25810737442998816, "kl": 0.099578857421875, "learning_rate": 4.970108572635168e-07, "loss": 0.0001, "reward": 1.825000062584877, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8250000141561031, "rewards/format_reward_func": 1.0, "step": 10502 }, { "completion_length": 255.51787185668945, "epoch": 1.7611383544993502, "grad_norm": 0.309821970825553, "kl": 0.150054931640625, "learning_rate": 4.970090075236173e-07, "loss": 0.0002, "reward": 1.7303572073578835, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7437500394880772, "rewards/format_reward_func": 0.9866071492433548, "step": 10504 }, { "completion_length": 260.08929920196533, "epoch": 1.7614736577392178, "grad_norm": 0.24711570040427921, "kl": 0.1146240234375, "learning_rate": 4.970071572150116e-07, "loss": 0.0001, "reward": 1.7982143238186836, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.8026785999536514, "rewards/format_reward_func": 0.9955357164144516, "step": 10506 }, { "completion_length": 260.5982265472412, "epoch": 1.7618089609790855, "grad_norm": 0.18349956303515857, "kl": 0.11907958984375, "learning_rate": 4.970053063377037e-07, "loss": 0.0001, "reward": 1.7285714969038963, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7285714633762836, "rewards/format_reward_func": 1.0, "step": 10508 }, { "completion_length": 245.54911994934082, "epoch": 1.762144264218953, "grad_norm": 0.07936488872801638, "kl": 0.09381103515625, "learning_rate": 4.97003454891698e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 10510 }, { "completion_length": 255.3884048461914, "epoch": 1.7624795674588207, "grad_norm": 0.09023603463700747, "kl": 0.0997314453125, "learning_rate": 4.970016028769989e-07, "loss": 0.0001, "reward": 1.7625000476837158, "reward_std": 0.022728432901203632, "rewards/equation_reward_func": 0.7669643312692642, "rewards/format_reward_func": 0.9955357164144516, "step": 10512 }, { "completion_length": 251.54465675354004, "epoch": 1.7628148706986881, "grad_norm": 0.14167783274190454, "kl": 0.120635986328125, "learning_rate": 4.969997502936105e-07, "loss": 0.0001, "reward": 1.7375000789761543, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7419643178582191, "rewards/format_reward_func": 0.9955357164144516, "step": 10514 }, { "completion_length": 238.62054634094238, "epoch": 1.7631501739385556, "grad_norm": 0.21653483508274973, "kl": 0.100006103515625, "learning_rate": 4.96997897141537e-07, "loss": 0.0001, "reward": 1.8071429282426834, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8071428947150707, "rewards/format_reward_func": 1.0, "step": 10516 }, { "completion_length": 244.4732265472412, "epoch": 1.7634854771784232, "grad_norm": 0.21315796766163686, "kl": 0.09234619140625, "learning_rate": 4.969960434207828e-07, "loss": 0.0001, "reward": 1.7383929342031479, "reward_std": 0.056821079924702644, "rewards/equation_reward_func": 0.7446428798139095, "rewards/format_reward_func": 0.9937500059604645, "step": 10518 }, { "completion_length": 247.1517972946167, "epoch": 1.7638207804182908, "grad_norm": 0.18540650627396144, "kl": 0.097076416015625, "learning_rate": 4.969941891313522e-07, "loss": 0.0001, "reward": 1.848214328289032, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8526785857975483, "rewards/format_reward_func": 0.9955357164144516, "step": 10520 }, { "completion_length": 245.6741180419922, "epoch": 1.7641560836581585, "grad_norm": 0.16238503100743001, "kl": 0.101806640625, "learning_rate": 4.969923342732493e-07, "loss": 0.0001, "reward": 1.7982143387198448, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.8026786036789417, "rewards/format_reward_func": 0.9955357164144516, "step": 10522 }, { "completion_length": 251.41072750091553, "epoch": 1.7644913868980259, "grad_norm": 0.435907843732035, "kl": 0.09576416015625, "learning_rate": 4.969904788464786e-07, "loss": 0.0001, "reward": 1.7392858266830444, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857410013676, "rewards/format_reward_func": 1.0, "step": 10524 }, { "completion_length": 253.42411708831787, "epoch": 1.7648266901378935, "grad_norm": 0.18620437703841827, "kl": 0.0916748046875, "learning_rate": 4.969886228510442e-07, "loss": 0.0001, "reward": 1.7285714894533157, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7375000268220901, "rewards/format_reward_func": 0.9910714328289032, "step": 10526 }, { "completion_length": 251.94197845458984, "epoch": 1.765161993377761, "grad_norm": 0.2205986553207691, "kl": 0.0987396240234375, "learning_rate": 4.969867662869503e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 10528 }, { "completion_length": 246.54019165039062, "epoch": 1.7654972966176286, "grad_norm": 0.1830755227045685, "kl": 0.120941162109375, "learning_rate": 4.969849091542014e-07, "loss": 0.0001, "reward": 1.8000000640749931, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000193715096, "rewards/format_reward_func": 1.0, "step": 10530 }, { "completion_length": 240.3125123977661, "epoch": 1.7658325998574962, "grad_norm": 0.15024188183011833, "kl": 0.1271514892578125, "learning_rate": 4.969830514528016e-07, "loss": 0.0001, "reward": 1.7607143372297287, "reward_std": 0.03535533882677555, "rewards/equation_reward_func": 0.7696428839117289, "rewards/format_reward_func": 0.9910714328289032, "step": 10532 }, { "completion_length": 238.25447368621826, "epoch": 1.7661679030973638, "grad_norm": 0.149959763615365, "kl": 0.091094970703125, "learning_rate": 4.969811931827552e-07, "loss": 0.0001, "reward": 1.7857143357396126, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 10534 }, { "completion_length": 246.3928689956665, "epoch": 1.7665032063372312, "grad_norm": 0.3019943513178325, "kl": 0.098297119140625, "learning_rate": 4.969793343440666e-07, "loss": 0.0001, "reward": 1.8035714998841286, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 10536 }, { "completion_length": 238.12500858306885, "epoch": 1.7668385095770986, "grad_norm": 0.3349463351982298, "kl": 0.09002685546875, "learning_rate": 4.969774749367401e-07, "loss": 0.0001, "reward": 1.7446429282426834, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7491071745753288, "rewards/format_reward_func": 0.9955357164144516, "step": 10538 }, { "completion_length": 232.12054538726807, "epoch": 1.7671738128169663, "grad_norm": 0.36886859954996376, "kl": 0.0915985107421875, "learning_rate": 4.969756149607796e-07, "loss": 0.0001, "reward": 1.7714286223053932, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 10540 }, { "completion_length": 242.53572750091553, "epoch": 1.767509116056834, "grad_norm": 0.2734354801264316, "kl": 0.1054229736328125, "learning_rate": 4.969737544161899e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857357859612, "rewards/format_reward_func": 1.0, "step": 10542 }, { "completion_length": 246.2500114440918, "epoch": 1.7678444192967016, "grad_norm": 0.004447381062977461, "kl": 0.0968017578125, "learning_rate": 4.96971893302975e-07, "loss": 0.0001, "reward": 1.6928572058677673, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.6928571686148643, "rewards/format_reward_func": 1.0, "step": 10544 }, { "completion_length": 237.5000123977661, "epoch": 1.768179722536569, "grad_norm": 0.1948318054506246, "kl": 0.115692138671875, "learning_rate": 4.969700316211392e-07, "loss": 0.0001, "reward": 1.814285784959793, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857402563095, "rewards/format_reward_func": 1.0, "step": 10546 }, { "completion_length": 239.33929634094238, "epoch": 1.7685150257764366, "grad_norm": 0.3184589691220786, "kl": 0.099212646484375, "learning_rate": 4.969681693706868e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 10548 }, { "completion_length": 235.21429538726807, "epoch": 1.768850329016304, "grad_norm": 0.20532648968982828, "kl": 0.129791259765625, "learning_rate": 4.969663065516222e-07, "loss": 0.0001, "reward": 1.7803572341799736, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7839286029338837, "rewards/format_reward_func": 0.9964285716414452, "step": 10550 }, { "completion_length": 231.64286708831787, "epoch": 1.7691856322561716, "grad_norm": 0.24991697110021083, "kl": 0.096435546875, "learning_rate": 4.969644431639495e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 10552 }, { "completion_length": 234.38393878936768, "epoch": 1.7695209354960393, "grad_norm": 0.21260438942552592, "kl": 0.11773681640625, "learning_rate": 4.969625792076731e-07, "loss": 0.0001, "reward": 1.692857213318348, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.6928571686148643, "rewards/format_reward_func": 1.0, "step": 10554 }, { "completion_length": 238.30358123779297, "epoch": 1.769856238735907, "grad_norm": 0.1314157473470005, "kl": 0.099853515625, "learning_rate": 4.969607146827972e-07, "loss": 0.0001, "reward": 1.789285771548748, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 10556 }, { "completion_length": 234.6071548461914, "epoch": 1.7701915419757743, "grad_norm": 0.12255947847610604, "kl": 0.11346435546875, "learning_rate": 4.969588495893263e-07, "loss": 0.0001, "reward": 1.8017857670783997, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8053571842610836, "rewards/format_reward_func": 0.9964285716414452, "step": 10558 }, { "completion_length": 232.83483028411865, "epoch": 1.7705268452156417, "grad_norm": 0.3855074347280174, "kl": 0.141265869140625, "learning_rate": 4.969569839272645e-07, "loss": 0.0001, "reward": 1.7089286670088768, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7125000320374966, "rewards/format_reward_func": 0.9964285790920258, "step": 10560 }, { "completion_length": 233.3303680419922, "epoch": 1.7708621484555094, "grad_norm": 0.18710436318418036, "kl": 0.1106109619140625, "learning_rate": 4.969551176966162e-07, "loss": 0.0001, "reward": 1.7928572297096252, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 1.0, "step": 10562 }, { "completion_length": 237.5669765472412, "epoch": 1.771197451695377, "grad_norm": 0.2694915854598506, "kl": 0.1116485595703125, "learning_rate": 4.969532508973856e-07, "loss": 0.0001, "reward": 1.8071429058909416, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428909897804, "rewards/format_reward_func": 1.0, "step": 10564 }, { "completion_length": 249.62054634094238, "epoch": 1.7715327549352446, "grad_norm": 0.18554608403376768, "kl": 0.2052001953125, "learning_rate": 4.969513835295771e-07, "loss": 0.0002, "reward": 1.767857201397419, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571790456772, "rewards/format_reward_func": 1.0, "step": 10566 }, { "completion_length": 248.68751049041748, "epoch": 1.7718680581751123, "grad_norm": 0.14565025528136757, "kl": 0.138702392578125, "learning_rate": 4.96949515593195e-07, "loss": 0.0001, "reward": 1.732142947614193, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7321428805589676, "rewards/format_reward_func": 1.0, "step": 10568 }, { "completion_length": 260.7232275009155, "epoch": 1.7722033614149797, "grad_norm": 0.2558570624218233, "kl": 0.375213623046875, "learning_rate": 4.969476470882435e-07, "loss": 0.0004, "reward": 1.7821428999304771, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428868919611, "rewards/format_reward_func": 1.0, "step": 10570 }, { "completion_length": 255.16072463989258, "epoch": 1.772538664654847, "grad_norm": 0.2710318747661508, "kl": 0.5247802734375, "learning_rate": 4.969457780147268e-07, "loss": 0.0005, "reward": 1.717857226729393, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7178571671247482, "rewards/format_reward_func": 1.0, "step": 10572 }, { "completion_length": 239.89733219146729, "epoch": 1.7728739678947147, "grad_norm": 0.1223392200473689, "kl": 0.10552978515625, "learning_rate": 4.969439083726496e-07, "loss": 0.0001, "reward": 1.8357143327593803, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8357143178582191, "rewards/format_reward_func": 1.0, "step": 10574 }, { "completion_length": 257.7053689956665, "epoch": 1.7732092711345824, "grad_norm": 0.3783049950951984, "kl": 0.526641845703125, "learning_rate": 4.969420381620158e-07, "loss": 0.0005, "reward": 1.7339286506175995, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7473214641213417, "rewards/format_reward_func": 0.9866071492433548, "step": 10576 }, { "completion_length": 256.7142972946167, "epoch": 1.77354457437445, "grad_norm": 0.23260212557444576, "kl": 0.114898681640625, "learning_rate": 4.9694016738283e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 10578 }, { "completion_length": 254.59376335144043, "epoch": 1.7738798776143174, "grad_norm": 0.31166389144220735, "kl": 0.380523681640625, "learning_rate": 4.969382960350962e-07, "loss": 0.0004, "reward": 1.7821429073810577, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 10580 }, { "completion_length": 270.8705472946167, "epoch": 1.774215180854185, "grad_norm": 0.16185259134789332, "kl": 0.2837677001953125, "learning_rate": 4.969364241188191e-07, "loss": 0.0003, "reward": 1.6928572282195091, "reward_std": 0.08081220462918282, "rewards/equation_reward_func": 0.7196428887546062, "rewards/format_reward_func": 0.9732142984867096, "step": 10582 }, { "completion_length": 262.33036708831787, "epoch": 1.7745504840940525, "grad_norm": 0.20630636229383023, "kl": 0.138214111328125, "learning_rate": 4.969345516340026e-07, "loss": 0.0001, "reward": 1.7732143551111221, "reward_std": 0.10354063473641872, "rewards/equation_reward_func": 0.8044643104076385, "rewards/format_reward_func": 0.9687500149011612, "step": 10584 }, { "completion_length": 266.24108505249023, "epoch": 1.77488578733392, "grad_norm": 0.22700971231892747, "kl": 0.325653076171875, "learning_rate": 4.969326785806513e-07, "loss": 0.0003, "reward": 1.7053571864962578, "reward_std": 0.123743686825037, "rewards/equation_reward_func": 0.736607177183032, "rewards/format_reward_func": 0.9687500149011612, "step": 10586 }, { "completion_length": 255.2634048461914, "epoch": 1.7752210905737877, "grad_norm": 0.31500744912161194, "kl": 0.1110076904296875, "learning_rate": 4.969308049587694e-07, "loss": 0.0001, "reward": 1.7071429044008255, "reward_std": 0.08081220369786024, "rewards/equation_reward_func": 0.7250000312924385, "rewards/format_reward_func": 0.9821428656578064, "step": 10588 }, { "completion_length": 269.6607275009155, "epoch": 1.7755563938136554, "grad_norm": 0.29098020918540685, "kl": 0.1125640869140625, "learning_rate": 4.969289307683612e-07, "loss": 0.0001, "reward": 1.698214367032051, "reward_std": 0.10354063659906387, "rewards/equation_reward_func": 0.7205357439815998, "rewards/format_reward_func": 0.977678582072258, "step": 10590 }, { "completion_length": 266.28125953674316, "epoch": 1.7758916970535228, "grad_norm": 0.3319292014716968, "kl": 0.608734130859375, "learning_rate": 4.969270560094311e-07, "loss": 0.0006, "reward": 1.6875000596046448, "reward_std": 0.1085914010182023, "rewards/equation_reward_func": 0.7187500447034836, "rewards/format_reward_func": 0.9687500149011612, "step": 10592 }, { "completion_length": 261.5401945114136, "epoch": 1.7762270002933902, "grad_norm": 0.26305753900705553, "kl": 0.69952392578125, "learning_rate": 4.969251806819834e-07, "loss": 0.0007, "reward": 1.7392857745289803, "reward_std": 0.10606601648032665, "rewards/equation_reward_func": 0.7660714574158192, "rewards/format_reward_func": 0.9732142984867096, "step": 10594 }, { "completion_length": 256.352689743042, "epoch": 1.7765623035332578, "grad_norm": 0.11624276670860509, "kl": 0.237396240234375, "learning_rate": 4.969233047860223e-07, "loss": 0.0002, "reward": 1.73214291036129, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7491071708500385, "rewards/format_reward_func": 0.9830357208848, "step": 10596 }, { "completion_length": 266.1116180419922, "epoch": 1.7768976067731255, "grad_norm": 0.24888262657285953, "kl": 0.37750244140625, "learning_rate": 4.969214283215523e-07, "loss": 0.0004, "reward": 1.700000062584877, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7089285850524902, "rewards/format_reward_func": 0.9910714328289032, "step": 10598 }, { "completion_length": 255.25894165039062, "epoch": 1.777232910012993, "grad_norm": 0.3873078676136931, "kl": 0.159027099609375, "learning_rate": 4.969195512885775e-07, "loss": 0.0002, "reward": 1.757142923772335, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428753435612, "rewards/format_reward_func": 1.0, "step": 10600 }, { "completion_length": 258.6294775009155, "epoch": 1.7775682132528605, "grad_norm": 0.16038748989821341, "kl": 0.148468017578125, "learning_rate": 4.969176736871024e-07, "loss": 0.0001, "reward": 1.7142858132719994, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7142857499420643, "rewards/format_reward_func": 1.0, "step": 10602 }, { "completion_length": 252.9553680419922, "epoch": 1.7779035164927282, "grad_norm": 0.2520951190407464, "kl": 0.157440185546875, "learning_rate": 4.969157955171313e-07, "loss": 0.0002, "reward": 1.7000000476837158, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.708928607404232, "rewards/format_reward_func": 0.9910714328289032, "step": 10604 }, { "completion_length": 247.06697463989258, "epoch": 1.7782388197325956, "grad_norm": 0.26364201619762373, "kl": 0.3067626953125, "learning_rate": 4.969139167786684e-07, "loss": 0.0003, "reward": 1.7928572222590446, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 10606 }, { "completion_length": 245.12947463989258, "epoch": 1.7785741229724632, "grad_norm": 0.1645764154368642, "kl": 0.153106689453125, "learning_rate": 4.969120374717182e-07, "loss": 0.0002, "reward": 1.7678572162985802, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571790456772, "rewards/format_reward_func": 1.0, "step": 10608 }, { "completion_length": 237.33929538726807, "epoch": 1.7789094262123308, "grad_norm": 0.16426994973474854, "kl": 0.1124267578125, "learning_rate": 4.969101575962849e-07, "loss": 0.0001, "reward": 1.8178572058677673, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8178571611642838, "rewards/format_reward_func": 1.0, "step": 10610 }, { "completion_length": 247.6964406967163, "epoch": 1.7792447294521985, "grad_norm": 0.182551150654352, "kl": 0.1142730712890625, "learning_rate": 4.969082771523728e-07, "loss": 0.0001, "reward": 1.7982143461704254, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8026785962283611, "rewards/format_reward_func": 0.9955357164144516, "step": 10612 }, { "completion_length": 247.01786518096924, "epoch": 1.7795800326920659, "grad_norm": 0.23539622917480219, "kl": 0.1351470947265625, "learning_rate": 4.969063961399865e-07, "loss": 0.0001, "reward": 1.7866071984171867, "reward_std": 0.049244935624301434, "rewards/equation_reward_func": 0.7883928902447224, "rewards/format_reward_func": 0.9982142895460129, "step": 10614 }, { "completion_length": 255.25893878936768, "epoch": 1.7799153359319333, "grad_norm": 0.20134669385634055, "kl": 0.119110107421875, "learning_rate": 4.9690451455913e-07, "loss": 0.0001, "reward": 1.7660714983940125, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7705357372760773, "rewards/format_reward_func": 0.9955357164144516, "step": 10616 }, { "completion_length": 240.79911708831787, "epoch": 1.780250639171801, "grad_norm": 0.07603412774145471, "kl": 0.10906982421875, "learning_rate": 4.969026324098076e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 10618 }, { "completion_length": 249.82590293884277, "epoch": 1.7805859424116686, "grad_norm": 0.26972691648450575, "kl": 0.1138153076171875, "learning_rate": 4.96900749692024e-07, "loss": 0.0001, "reward": 1.7776786237955093, "reward_std": 0.07197336852550507, "rewards/equation_reward_func": 0.7839285917580128, "rewards/format_reward_func": 0.9937500059604645, "step": 10620 }, { "completion_length": 239.44197750091553, "epoch": 1.7809212456515362, "grad_norm": 0.2305440981797223, "kl": 0.130126953125, "learning_rate": 4.968988664057834e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 10622 }, { "completion_length": 243.85268878936768, "epoch": 1.7812565488914036, "grad_norm": 0.26768835156722426, "kl": 0.092803955078125, "learning_rate": 4.968969825510899e-07, "loss": 0.0001, "reward": 1.7964286357164383, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 10624 }, { "completion_length": 247.4196548461914, "epoch": 1.7815918521312712, "grad_norm": 0.23568997635818786, "kl": 0.155609130859375, "learning_rate": 4.968950981279481e-07, "loss": 0.0002, "reward": 1.7553572058677673, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7598214708268642, "rewards/format_reward_func": 0.9955357164144516, "step": 10626 }, { "completion_length": 248.5937614440918, "epoch": 1.7819271553711387, "grad_norm": 0.2304402395430419, "kl": 0.101806640625, "learning_rate": 4.968932131363621e-07, "loss": 0.0001, "reward": 1.775000050663948, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7839285954833031, "rewards/format_reward_func": 0.9910714328289032, "step": 10628 }, { "completion_length": 252.67858505249023, "epoch": 1.7822624586110063, "grad_norm": 0.14773177459543393, "kl": 0.088470458984375, "learning_rate": 4.968913275763365e-07, "loss": 0.0001, "reward": 1.753571480512619, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.762500025331974, "rewards/format_reward_func": 0.9910714328289032, "step": 10630 }, { "completion_length": 241.102689743042, "epoch": 1.782597761850874, "grad_norm": 0.23951759989016289, "kl": 0.0987548828125, "learning_rate": 4.968894414478756e-07, "loss": 0.0001, "reward": 1.817857213318348, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8178571686148643, "rewards/format_reward_func": 1.0, "step": 10632 }, { "completion_length": 259.4732246398926, "epoch": 1.7829330650907416, "grad_norm": 0.26787557518060745, "kl": 0.1324462890625, "learning_rate": 4.968875547509836e-07, "loss": 0.0001, "reward": 1.7696429267525673, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7741071619093418, "rewards/format_reward_func": 0.9955357164144516, "step": 10634 }, { "completion_length": 251.00447750091553, "epoch": 1.783268368330609, "grad_norm": 0.5559621875801689, "kl": 0.1373291015625, "learning_rate": 4.968856674856648e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 10636 }, { "completion_length": 244.40626049041748, "epoch": 1.7836036715704764, "grad_norm": 0.2907232267990311, "kl": 0.132568359375, "learning_rate": 4.968837796519238e-07, "loss": 0.0001, "reward": 1.7214286401867867, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7214285992085934, "rewards/format_reward_func": 1.0, "step": 10638 }, { "completion_length": 245.6741180419922, "epoch": 1.783938974810344, "grad_norm": 0.2367535675360497, "kl": 0.134552001953125, "learning_rate": 4.968818912497647e-07, "loss": 0.0001, "reward": 1.778571479022503, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7875000275671482, "rewards/format_reward_func": 0.9910714328289032, "step": 10640 }, { "completion_length": 249.16072463989258, "epoch": 1.7842742780502117, "grad_norm": 0.279259478311908, "kl": 0.3076171875, "learning_rate": 4.968800022791921e-07, "loss": 0.0003, "reward": 1.751785770058632, "reward_std": 0.0328299580141902, "rewards/equation_reward_func": 0.7651786021888256, "rewards/format_reward_func": 0.9866071492433548, "step": 10642 }, { "completion_length": 244.25447463989258, "epoch": 1.7846095812900793, "grad_norm": 0.30117573873766484, "kl": 0.3115234375, "learning_rate": 4.9687811274021e-07, "loss": 0.0003, "reward": 1.7196429371833801, "reward_std": 0.1035406356677413, "rewards/equation_reward_func": 0.7330357506871223, "rewards/format_reward_func": 0.9866071492433548, "step": 10644 }, { "completion_length": 247.9375123977661, "epoch": 1.784944884529947, "grad_norm": 0.31154505385018966, "kl": 0.31414794921875, "learning_rate": 4.968762226328231e-07, "loss": 0.0003, "reward": 1.7392857819795609, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7482143118977547, "rewards/format_reward_func": 0.9910714328289032, "step": 10646 }, { "completion_length": 252.08929634094238, "epoch": 1.7852801877698143, "grad_norm": 0.16945113473111542, "kl": 0.0981597900390625, "learning_rate": 4.968743319570354e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714399069548, "rewards/format_reward_func": 1.0, "step": 10648 }, { "completion_length": 240.6160831451416, "epoch": 1.7856154910096818, "grad_norm": 0.22287418212263907, "kl": 0.100494384765625, "learning_rate": 4.968724407128516e-07, "loss": 0.0001, "reward": 1.8160714879631996, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8196428790688515, "rewards/format_reward_func": 0.9964285716414452, "step": 10650 }, { "completion_length": 235.75893878936768, "epoch": 1.7859507942495494, "grad_norm": 0.1826597093835376, "kl": 0.0981597900390625, "learning_rate": 4.968705489002759e-07, "loss": 0.0001, "reward": 1.751785770058632, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7562500387430191, "rewards/format_reward_func": 0.9955357164144516, "step": 10652 }, { "completion_length": 238.5044755935669, "epoch": 1.786286097489417, "grad_norm": 0.1955615982514768, "kl": 0.101715087890625, "learning_rate": 4.968686565193127e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 10654 }, { "completion_length": 247.62054443359375, "epoch": 1.7866214007292847, "grad_norm": 0.20413985650724134, "kl": 0.120880126953125, "learning_rate": 4.968667635699662e-07, "loss": 0.0001, "reward": 1.8000000640749931, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.808928593993187, "rewards/format_reward_func": 0.9910714328289032, "step": 10656 }, { "completion_length": 247.2232255935669, "epoch": 1.786956703969152, "grad_norm": 0.2791594220333788, "kl": 0.272247314453125, "learning_rate": 4.968648700522411e-07, "loss": 0.0003, "reward": 1.6928571984171867, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.6928571853786707, "rewards/format_reward_func": 1.0, "step": 10658 }, { "completion_length": 234.5089406967163, "epoch": 1.7872920072090197, "grad_norm": 0.3520679227334401, "kl": 0.1893463134765625, "learning_rate": 4.968629759661414e-07, "loss": 0.0002, "reward": 1.7607143595814705, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 10660 }, { "completion_length": 242.85715293884277, "epoch": 1.7876273104488871, "grad_norm": 0.22464576239652737, "kl": 0.2414398193359375, "learning_rate": 4.968610813116716e-07, "loss": 0.0002, "reward": 1.7678572162985802, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7767857387661934, "rewards/format_reward_func": 0.9910714328289032, "step": 10662 }, { "completion_length": 238.91072463989258, "epoch": 1.7879626136887548, "grad_norm": 0.2595509564574555, "kl": 0.10626220703125, "learning_rate": 4.96859186088836e-07, "loss": 0.0001, "reward": 1.735714353621006, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7357143238186836, "rewards/format_reward_func": 1.0, "step": 10664 }, { "completion_length": 231.3705472946167, "epoch": 1.7882979169286224, "grad_norm": 0.5999381979692516, "kl": 0.176300048828125, "learning_rate": 4.968572902976392e-07, "loss": 0.0002, "reward": 1.8178571835160255, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8178571611642838, "rewards/format_reward_func": 1.0, "step": 10666 }, { "completion_length": 243.08482933044434, "epoch": 1.78863322016849, "grad_norm": 0.45039498742021106, "kl": 0.152374267578125, "learning_rate": 4.968553939380852e-07, "loss": 0.0002, "reward": 1.7696429416537285, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7741071581840515, "rewards/format_reward_func": 0.9955357164144516, "step": 10668 }, { "completion_length": 238.62055015563965, "epoch": 1.7889685234083574, "grad_norm": 0.31604306983363795, "kl": 0.114898681640625, "learning_rate": 4.968534970101786e-07, "loss": 0.0001, "reward": 1.8392857685685158, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8392857275903225, "rewards/format_reward_func": 1.0, "step": 10670 }, { "completion_length": 239.0000123977661, "epoch": 1.7893038266482248, "grad_norm": 0.1465661738385793, "kl": 0.151458740234375, "learning_rate": 4.968515995139237e-07, "loss": 0.0002, "reward": 1.803571492433548, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8035714440047741, "rewards/format_reward_func": 1.0, "step": 10672 }, { "completion_length": 244.06697750091553, "epoch": 1.7896391298880925, "grad_norm": 0.29623229382166644, "kl": 0.254180908203125, "learning_rate": 4.968497014493251e-07, "loss": 0.0003, "reward": 1.7482143267989159, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7616071812808514, "rewards/format_reward_func": 0.9866071492433548, "step": 10674 }, { "completion_length": 245.59376335144043, "epoch": 1.7899744331279601, "grad_norm": 0.26622090051005864, "kl": 0.1448974609375, "learning_rate": 4.968478028163867e-07, "loss": 0.0001, "reward": 1.7625000774860382, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7669643089175224, "rewards/format_reward_func": 0.9955357164144516, "step": 10676 }, { "completion_length": 246.6607255935669, "epoch": 1.7903097363678278, "grad_norm": 0.16048621493513235, "kl": 0.18072509765625, "learning_rate": 4.968459036151132e-07, "loss": 0.0002, "reward": 1.76071435213089, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143241912127, "rewards/format_reward_func": 1.0, "step": 10678 }, { "completion_length": 240.69197368621826, "epoch": 1.7906450396076952, "grad_norm": 0.1796921365565492, "kl": 0.12127685546875, "learning_rate": 4.96844003845509e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000469386578, "rewards/format_reward_func": 1.0, "step": 10680 }, { "completion_length": 227.19643688201904, "epoch": 1.7909803428475628, "grad_norm": 0.14189694182681398, "kl": 0.176177978515625, "learning_rate": 4.968421035075784e-07, "loss": 0.0002, "reward": 1.7946429252624512, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7991071790456772, "rewards/format_reward_func": 0.9955357164144516, "step": 10682 }, { "completion_length": 240.73661518096924, "epoch": 1.7913156460874302, "grad_norm": 0.2688246315021042, "kl": 0.352508544921875, "learning_rate": 4.968402026013256e-07, "loss": 0.0004, "reward": 1.7517858073115349, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7562500275671482, "rewards/format_reward_func": 0.9955357164144516, "step": 10684 }, { "completion_length": 234.1294755935669, "epoch": 1.7916509493272978, "grad_norm": 0.3868588654613052, "kl": 0.184417724609375, "learning_rate": 4.968383011267553e-07, "loss": 0.0002, "reward": 1.7785715088248253, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 10686 }, { "completion_length": 235.66518878936768, "epoch": 1.7919862525671655, "grad_norm": 0.23331563541205555, "kl": 0.129638671875, "learning_rate": 4.968363990838716e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 10688 }, { "completion_length": 239.1384048461914, "epoch": 1.7923215558070331, "grad_norm": 0.14848342113524915, "kl": 0.100128173828125, "learning_rate": 4.96834496472679e-07, "loss": 0.0001, "reward": 1.8107143640518188, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8107143044471741, "rewards/format_reward_func": 1.0, "step": 10690 }, { "completion_length": 235.27679347991943, "epoch": 1.7926568590469005, "grad_norm": 0.12301443116228022, "kl": 0.1092529296875, "learning_rate": 4.968325932931819e-07, "loss": 0.0001, "reward": 1.7821428924798965, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428831666708, "rewards/format_reward_func": 1.0, "step": 10692 }, { "completion_length": 245.8437623977661, "epoch": 1.792992162286768, "grad_norm": 0.24794920535729859, "kl": 0.235076904296875, "learning_rate": 4.968306895453846e-07, "loss": 0.0002, "reward": 1.7142857983708382, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7142857424914837, "rewards/format_reward_func": 1.0, "step": 10694 }, { "completion_length": 239.08036613464355, "epoch": 1.7933274655266356, "grad_norm": 0.7665548328413366, "kl": 0.161376953125, "learning_rate": 4.968287852292916e-07, "loss": 0.0002, "reward": 1.7857143506407738, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 10696 }, { "completion_length": 240.67858600616455, "epoch": 1.7936627687665032, "grad_norm": 0.26873292722443687, "kl": 0.1585845947265625, "learning_rate": 4.968268803449072e-07, "loss": 0.0002, "reward": 1.7642857879400253, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857357859612, "rewards/format_reward_func": 1.0, "step": 10698 }, { "completion_length": 239.7053689956665, "epoch": 1.7939980720063708, "grad_norm": 0.36024769526209427, "kl": 0.099151611328125, "learning_rate": 4.968249748922358e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 10700 }, { "completion_length": 235.33037090301514, "epoch": 1.7943333752462385, "grad_norm": 0.1891792212945995, "kl": 0.097320556640625, "learning_rate": 4.968230688712818e-07, "loss": 0.0001, "reward": 1.721428669989109, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.721428606659174, "rewards/format_reward_func": 1.0, "step": 10702 }, { "completion_length": 223.64733123779297, "epoch": 1.794668678486106, "grad_norm": 0.18664824171925037, "kl": 0.09637451171875, "learning_rate": 4.968211622820495e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 10704 }, { "completion_length": 230.7187614440918, "epoch": 1.7950039817259733, "grad_norm": 0.19383546116090142, "kl": 0.1127166748046875, "learning_rate": 4.968192551245435e-07, "loss": 0.0001, "reward": 1.7892857640981674, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857529222965, "rewards/format_reward_func": 1.0, "step": 10706 }, { "completion_length": 231.93750953674316, "epoch": 1.795339284965841, "grad_norm": 0.2077794241490124, "kl": 0.093048095703125, "learning_rate": 4.968173473987681e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 10708 }, { "completion_length": 234.94643783569336, "epoch": 1.7956745882057086, "grad_norm": 0.21449086730659028, "kl": 0.096038818359375, "learning_rate": 4.968154391047274e-07, "loss": 0.0001, "reward": 1.7714286595582962, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714285925030708, "rewards/format_reward_func": 1.0, "step": 10710 }, { "completion_length": 238.0491180419922, "epoch": 1.7960098914455762, "grad_norm": 0.29946112981399303, "kl": 0.09326171875, "learning_rate": 4.968135302424262e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 10712 }, { "completion_length": 236.37054634094238, "epoch": 1.7963451946854436, "grad_norm": 0.11589060979527094, "kl": 0.091156005859375, "learning_rate": 4.968116208118688e-07, "loss": 0.0001, "reward": 1.8080357685685158, "reward_std": 0.059346460737288, "rewards/equation_reward_func": 0.8098214641213417, "rewards/format_reward_func": 0.9982142895460129, "step": 10714 }, { "completion_length": 245.41965198516846, "epoch": 1.796680497925311, "grad_norm": 0.5277318509362663, "kl": 0.11151123046875, "learning_rate": 4.968097108130595e-07, "loss": 0.0001, "reward": 1.7250000983476639, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7250000424683094, "rewards/format_reward_func": 1.0, "step": 10716 }, { "completion_length": 237.00447368621826, "epoch": 1.7970158011651787, "grad_norm": 0.4631048577308088, "kl": 0.155487060546875, "learning_rate": 4.968078002460027e-07, "loss": 0.0002, "reward": 1.8000000566244125, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8000000193715096, "rewards/format_reward_func": 1.0, "step": 10718 }, { "completion_length": 232.86161613464355, "epoch": 1.7973511044050463, "grad_norm": 0.31052716637182964, "kl": 0.121002197265625, "learning_rate": 4.968058891107029e-07, "loss": 0.0001, "reward": 1.7794643267989159, "reward_std": 0.049244935624301434, "rewards/equation_reward_func": 0.7812500279396772, "rewards/format_reward_func": 0.9982142895460129, "step": 10720 }, { "completion_length": 240.9241180419922, "epoch": 1.797686407644914, "grad_norm": 0.49598330005641567, "kl": 0.12677001953125, "learning_rate": 4.968039774071644e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 10722 }, { "completion_length": 236.0134048461914, "epoch": 1.7980217108847816, "grad_norm": 0.40670055538330085, "kl": 0.1128997802734375, "learning_rate": 4.968020651353916e-07, "loss": 0.0001, "reward": 1.739285796880722, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7392857372760773, "rewards/format_reward_func": 1.0, "step": 10724 }, { "completion_length": 230.78572463989258, "epoch": 1.798357014124649, "grad_norm": 0.2347570156213548, "kl": 0.13226318359375, "learning_rate": 4.968001522953889e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8000000342726707, "rewards/format_reward_func": 1.0, "step": 10726 }, { "completion_length": 238.46429634094238, "epoch": 1.7986923173645164, "grad_norm": 0.15634411925731584, "kl": 0.101654052734375, "learning_rate": 4.967982388871608e-07, "loss": 0.0001, "reward": 1.7535714879631996, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 10728 }, { "completion_length": 240.1785831451416, "epoch": 1.799027620604384, "grad_norm": 0.1919579346758296, "kl": 0.098297119140625, "learning_rate": 4.967963249107117e-07, "loss": 0.0001, "reward": 1.7714286297559738, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714286129921675, "rewards/format_reward_func": 1.0, "step": 10730 }, { "completion_length": 236.01340293884277, "epoch": 1.7993629238442517, "grad_norm": 0.19750935368932737, "kl": 0.104766845703125, "learning_rate": 4.967944103660458e-07, "loss": 0.0001, "reward": 1.778571479022503, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 10732 }, { "completion_length": 242.94197463989258, "epoch": 1.7996982270841193, "grad_norm": 0.18550836957480152, "kl": 0.106964111328125, "learning_rate": 4.967924952531678e-07, "loss": 0.0001, "reward": 1.753571480512619, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714730620384, "rewards/format_reward_func": 1.0, "step": 10734 }, { "completion_length": 239.72769165039062, "epoch": 1.8000335303239867, "grad_norm": 0.14716396585075947, "kl": 0.11041259765625, "learning_rate": 4.96790579572082e-07, "loss": 0.0001, "reward": 1.7535714879631996, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7535714562982321, "rewards/format_reward_func": 1.0, "step": 10736 }, { "completion_length": 234.71429634094238, "epoch": 1.8003688335638544, "grad_norm": 0.1392211187955793, "kl": 0.102142333984375, "learning_rate": 4.967886633227927e-07, "loss": 0.0001, "reward": 1.7250000685453415, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7250000275671482, "rewards/format_reward_func": 1.0, "step": 10738 }, { "completion_length": 236.9375114440918, "epoch": 1.8007041368037218, "grad_norm": 0.2994866979627325, "kl": 0.105682373046875, "learning_rate": 4.967867465053044e-07, "loss": 0.0001, "reward": 1.7839286401867867, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7883928865194321, "rewards/format_reward_func": 0.9955357164144516, "step": 10740 }, { "completion_length": 234.30804634094238, "epoch": 1.8010394400435894, "grad_norm": 0.18439435986178013, "kl": 0.102508544921875, "learning_rate": 4.967848291196216e-07, "loss": 0.0001, "reward": 1.796428620815277, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 10742 }, { "completion_length": 244.1160831451416, "epoch": 1.801374743283457, "grad_norm": 0.19176630269728612, "kl": 0.114044189453125, "learning_rate": 4.967829111657485e-07, "loss": 0.0001, "reward": 1.7071429267525673, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7071428839117289, "rewards/format_reward_func": 1.0, "step": 10744 }, { "completion_length": 249.7500123977661, "epoch": 1.8017100465233247, "grad_norm": 0.3886127234059232, "kl": 0.1446533203125, "learning_rate": 4.967809926436897e-07, "loss": 0.0001, "reward": 1.7732143327593803, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7776786126196384, "rewards/format_reward_func": 0.9955357164144516, "step": 10746 }, { "completion_length": 240.1875114440918, "epoch": 1.802045349763192, "grad_norm": 0.25582454254449444, "kl": 0.12982177734375, "learning_rate": 4.967790735534495e-07, "loss": 0.0001, "reward": 1.7428572103381157, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 10748 }, { "completion_length": 243.196439743042, "epoch": 1.8023806530030595, "grad_norm": 0.1093102806927311, "kl": 0.12841796875, "learning_rate": 4.967771538950325e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143022119999, "rewards/format_reward_func": 1.0, "step": 10750 }, { "completion_length": 233.93304538726807, "epoch": 1.8027159562429271, "grad_norm": 0.2041405389472402, "kl": 0.097808837890625, "learning_rate": 4.967752336684428e-07, "loss": 0.0001, "reward": 1.8178572058677673, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8178571686148643, "rewards/format_reward_func": 1.0, "step": 10752 }, { "completion_length": 243.42857933044434, "epoch": 1.8030512594827948, "grad_norm": 0.31358890129485795, "kl": 0.1039886474609375, "learning_rate": 4.967733128736852e-07, "loss": 0.0001, "reward": 1.7696429044008255, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7830357551574707, "rewards/format_reward_func": 0.9866071492433548, "step": 10754 }, { "completion_length": 252.57144165039062, "epoch": 1.8033865627226624, "grad_norm": 0.5249165360425068, "kl": 0.2020263671875, "learning_rate": 4.967713915107639e-07, "loss": 0.0002, "reward": 1.7062500938773155, "reward_std": 0.06187184248119593, "rewards/equation_reward_func": 0.7125000283122063, "rewards/format_reward_func": 0.9937500059604645, "step": 10756 }, { "completion_length": 249.23662090301514, "epoch": 1.8037218659625298, "grad_norm": 0.0033794405458902822, "kl": 0.105987548828125, "learning_rate": 4.967694695796833e-07, "loss": 0.0001, "reward": 1.7357143387198448, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7357143145054579, "rewards/format_reward_func": 1.0, "step": 10758 }, { "completion_length": 249.14733219146729, "epoch": 1.8040571692023974, "grad_norm": 0.25247943380170706, "kl": 0.135498046875, "learning_rate": 4.967675470804479e-07, "loss": 0.0001, "reward": 1.7482143491506577, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.752678606659174, "rewards/format_reward_func": 0.9955357164144516, "step": 10760 }, { "completion_length": 258.09822940826416, "epoch": 1.8043924724422649, "grad_norm": 0.32874928742714987, "kl": 0.146820068359375, "learning_rate": 4.967656240130621e-07, "loss": 0.0001, "reward": 1.7107143849134445, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7107143104076385, "rewards/format_reward_func": 1.0, "step": 10762 }, { "completion_length": 245.71876335144043, "epoch": 1.8047277756821325, "grad_norm": 0.2009743767668466, "kl": 0.3307342529296875, "learning_rate": 4.967637003775303e-07, "loss": 0.0003, "reward": 1.7857143506407738, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143245637417, "rewards/format_reward_func": 1.0, "step": 10764 }, { "completion_length": 254.3928689956665, "epoch": 1.8050630789220001, "grad_norm": 0.24327630118252214, "kl": 0.181610107421875, "learning_rate": 4.967617761738571e-07, "loss": 0.0002, "reward": 1.7000000849366188, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7000000346451998, "rewards/format_reward_func": 1.0, "step": 10766 }, { "completion_length": 247.15626049041748, "epoch": 1.8053983821618678, "grad_norm": 0.4902753874075953, "kl": 0.193634033203125, "learning_rate": 4.967598514020467e-07, "loss": 0.0002, "reward": 1.7750000581145287, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 10768 }, { "completion_length": 251.9866180419922, "epoch": 1.8057336854017352, "grad_norm": 0.22101468467873372, "kl": 0.237457275390625, "learning_rate": 4.967579260621036e-07, "loss": 0.0002, "reward": 1.7589286416769028, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7633928954601288, "rewards/format_reward_func": 0.9955357164144516, "step": 10770 }, { "completion_length": 262.8035840988159, "epoch": 1.8060689886416026, "grad_norm": 0.38687489836976613, "kl": 0.288330078125, "learning_rate": 4.967560001540324e-07, "loss": 0.0003, "reward": 1.7178572118282318, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.71785718947649, "rewards/format_reward_func": 1.0, "step": 10772 }, { "completion_length": 258.84375858306885, "epoch": 1.8064042918814702, "grad_norm": 0.1725018185461603, "kl": 0.2459564208984375, "learning_rate": 4.967540736778373e-07, "loss": 0.0002, "reward": 1.7500000968575478, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 10774 }, { "completion_length": 245.78126525878906, "epoch": 1.8067395951213379, "grad_norm": 0.12436038886983099, "kl": 0.120635986328125, "learning_rate": 4.967521466335228e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.7660714536905289, "rewards/format_reward_func": 0.9910714328289032, "step": 10776 }, { "completion_length": 259.9955472946167, "epoch": 1.8070748983612055, "grad_norm": 0.3770273477501551, "kl": 0.53118896484375, "learning_rate": 4.967502190210934e-07, "loss": 0.0005, "reward": 1.7482143715023994, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.752678606659174, "rewards/format_reward_func": 0.9955357164144516, "step": 10778 }, { "completion_length": 242.696439743042, "epoch": 1.8074102016010731, "grad_norm": 1.098628616791553, "kl": 0.73077392578125, "learning_rate": 4.967482908405536e-07, "loss": 0.0007, "reward": 1.7785715013742447, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 10780 }, { "completion_length": 244.02233123779297, "epoch": 1.8077455048409405, "grad_norm": 0.3056318524063688, "kl": 0.1494140625, "learning_rate": 4.967463620919077e-07, "loss": 0.0001, "reward": 1.7035715132951736, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7035714611411095, "rewards/format_reward_func": 1.0, "step": 10782 }, { "completion_length": 251.45536708831787, "epoch": 1.808080808080808, "grad_norm": 0.2017605345416473, "kl": 0.2183837890625, "learning_rate": 4.967444327751601e-07, "loss": 0.0002, "reward": 1.74642863124609, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7553571723401546, "rewards/format_reward_func": 0.9910714328289032, "step": 10784 }, { "completion_length": 246.1384038925171, "epoch": 1.8084161113206756, "grad_norm": 0.17238070059866234, "kl": 0.24993896484375, "learning_rate": 4.967425028903153e-07, "loss": 0.0002, "reward": 1.7892857789993286, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857436090708, "rewards/format_reward_func": 1.0, "step": 10786 }, { "completion_length": 247.71429538726807, "epoch": 1.8087514145605432, "grad_norm": 0.21629355159149774, "kl": 0.41064453125, "learning_rate": 4.96740572437378e-07, "loss": 0.0004, "reward": 1.7392857894301414, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857484519482, "rewards/format_reward_func": 1.0, "step": 10788 }, { "completion_length": 242.66965579986572, "epoch": 1.8090867178004109, "grad_norm": 0.3175852033098951, "kl": 0.1038665771484375, "learning_rate": 4.967386414163522e-07, "loss": 0.0001, "reward": 1.8196429461240768, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8241071701049805, "rewards/format_reward_func": 0.9955357164144516, "step": 10790 }, { "completion_length": 255.8884038925171, "epoch": 1.8094220210402783, "grad_norm": 0.32156099273712735, "kl": 0.1937713623046875, "learning_rate": 4.967367098272427e-07, "loss": 0.0002, "reward": 1.7410715073347092, "reward_std": 0.09343910962343216, "rewards/equation_reward_func": 0.754464328289032, "rewards/format_reward_func": 0.9866071492433548, "step": 10792 }, { "completion_length": 255.883939743042, "epoch": 1.809757324280146, "grad_norm": 0.0992087667618495, "kl": 0.1259918212890625, "learning_rate": 4.967347776700538e-07, "loss": 0.0001, "reward": 1.6910714656114578, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7133928891271353, "rewards/format_reward_func": 0.977678582072258, "step": 10794 }, { "completion_length": 247.7053680419922, "epoch": 1.8100926275200133, "grad_norm": 0.18729247886434083, "kl": 0.1168670654296875, "learning_rate": 4.967328449447898e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.787500012665987, "rewards/format_reward_func": 0.9910714328289032, "step": 10796 }, { "completion_length": 260.98215675354004, "epoch": 1.810427930759881, "grad_norm": 0.21310321098668664, "kl": 0.1923980712890625, "learning_rate": 4.967309116514555e-07, "loss": 0.0002, "reward": 1.7964286133646965, "reward_std": 0.035355339758098125, "rewards/equation_reward_func": 0.8053571805357933, "rewards/format_reward_func": 0.9910714328289032, "step": 10798 }, { "completion_length": 256.29911708831787, "epoch": 1.8107632339997486, "grad_norm": 0.3038643309134824, "kl": 0.1059722900390625, "learning_rate": 4.967289777900551e-07, "loss": 0.0001, "reward": 1.6625000908970833, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.6758928969502449, "rewards/format_reward_func": 0.9866071492433548, "step": 10800 }, { "completion_length": 240.25001049041748, "epoch": 1.8110985372396162, "grad_norm": 0.283155134206445, "kl": 0.132720947265625, "learning_rate": 4.96727043360593e-07, "loss": 0.0001, "reward": 1.7196429520845413, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7241071723401546, "rewards/format_reward_func": 0.9955357164144516, "step": 10802 }, { "completion_length": 237.68750953674316, "epoch": 1.8114338404794836, "grad_norm": 0.08161599208536394, "kl": 0.1114654541015625, "learning_rate": 4.967251083630739e-07, "loss": 0.0001, "reward": 1.785714328289032, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143245637417, "rewards/format_reward_func": 1.0, "step": 10804 }, { "completion_length": 246.34822273254395, "epoch": 1.811769143719351, "grad_norm": 0.23240967627608058, "kl": 0.122589111328125, "learning_rate": 4.967231727975021e-07, "loss": 0.0001, "reward": 1.762500062584877, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.766964316368103, "rewards/format_reward_func": 0.9955357164144516, "step": 10806 }, { "completion_length": 248.69197463989258, "epoch": 1.8121044469592187, "grad_norm": 0.1302638213649644, "kl": 0.102691650390625, "learning_rate": 4.967212366638821e-07, "loss": 0.0001, "reward": 1.7464286163449287, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7553571723401546, "rewards/format_reward_func": 0.9910714328289032, "step": 10808 }, { "completion_length": 249.09822463989258, "epoch": 1.8124397501990863, "grad_norm": 0.309884045882955, "kl": 0.0946807861328125, "learning_rate": 4.967192999622183e-07, "loss": 0.0001, "reward": 1.7375000789761543, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7419643215835094, "rewards/format_reward_func": 0.9955357164144516, "step": 10810 }, { "completion_length": 233.42858028411865, "epoch": 1.812775053438954, "grad_norm": 0.23595094134065409, "kl": 0.10211181640625, "learning_rate": 4.967173626925152e-07, "loss": 0.0001, "reward": 1.7589286118745804, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7633928954601288, "rewards/format_reward_func": 0.9955357164144516, "step": 10812 }, { "completion_length": 245.852689743042, "epoch": 1.8131103566788214, "grad_norm": 0.32348892322344996, "kl": 0.143402099609375, "learning_rate": 4.967154248547773e-07, "loss": 0.0001, "reward": 1.751785770058632, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500312924385, "rewards/format_reward_func": 0.9955357164144516, "step": 10814 }, { "completion_length": 227.3035831451416, "epoch": 1.813445659918689, "grad_norm": 0.07325437364817967, "kl": 0.0951690673828125, "learning_rate": 4.967134864490089e-07, "loss": 0.0001, "reward": 1.782142885029316, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7821428943425417, "rewards/format_reward_func": 1.0, "step": 10816 }, { "completion_length": 239.6428689956665, "epoch": 1.8137809631585564, "grad_norm": 0.25369394576949483, "kl": 0.090789794921875, "learning_rate": 4.967115474752146e-07, "loss": 0.0001, "reward": 1.821428619325161, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8214285932481289, "rewards/format_reward_func": 1.0, "step": 10818 }, { "completion_length": 241.89733219146729, "epoch": 1.814116266398424, "grad_norm": 0.18867217775157238, "kl": 0.130615234375, "learning_rate": 4.967096079333989e-07, "loss": 0.0001, "reward": 1.7892857864499092, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 10820 }, { "completion_length": 235.8750123977661, "epoch": 1.8144515696382917, "grad_norm": 0.21150017673269703, "kl": 0.095611572265625, "learning_rate": 4.967076678235662e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 10822 }, { "completion_length": 251.977689743042, "epoch": 1.8147868728781593, "grad_norm": 0.13915586483642634, "kl": 0.124359130859375, "learning_rate": 4.96705727145721e-07, "loss": 0.0001, "reward": 1.783928632736206, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7875000275671482, "rewards/format_reward_func": 0.9964285716414452, "step": 10824 }, { "completion_length": 236.93304634094238, "epoch": 1.8151221761180267, "grad_norm": 0.32561266503298675, "kl": 0.105865478515625, "learning_rate": 4.967037858998677e-07, "loss": 0.0001, "reward": 1.8178571835160255, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.817857164889574, "rewards/format_reward_func": 1.0, "step": 10826 }, { "completion_length": 252.33929634094238, "epoch": 1.8154574793578941, "grad_norm": 0.28224839859070017, "kl": 0.1246337890625, "learning_rate": 4.967018440860109e-07, "loss": 0.0001, "reward": 1.8142857775092125, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.8232143074274063, "rewards/format_reward_func": 0.9910714328289032, "step": 10828 }, { "completion_length": 253.1205472946167, "epoch": 1.8157927825977618, "grad_norm": 0.15807833293193896, "kl": 0.157958984375, "learning_rate": 4.966999017041549e-07, "loss": 0.0002, "reward": 1.7892857789993286, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857473343611, "rewards/format_reward_func": 1.0, "step": 10830 }, { "completion_length": 251.11608505249023, "epoch": 1.8161280858376294, "grad_norm": 0.20844670553261815, "kl": 0.195587158203125, "learning_rate": 4.966979587543043e-07, "loss": 0.0002, "reward": 1.735714353621006, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143256813288, "rewards/format_reward_func": 1.0, "step": 10832 }, { "completion_length": 250.04465198516846, "epoch": 1.816463389077497, "grad_norm": 0.22690609755391183, "kl": 0.124908447265625, "learning_rate": 4.966960152364635e-07, "loss": 0.0001, "reward": 1.7410714700818062, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7455357387661934, "rewards/format_reward_func": 0.9955357164144516, "step": 10834 }, { "completion_length": 255.36608409881592, "epoch": 1.8167986923173647, "grad_norm": 0.1446434148430334, "kl": 0.151519775390625, "learning_rate": 4.96694071150637e-07, "loss": 0.0002, "reward": 1.7446429207921028, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7580357454717159, "rewards/format_reward_func": 0.9866071492433548, "step": 10836 }, { "completion_length": 250.4866189956665, "epoch": 1.817133995557232, "grad_norm": 0.47846980714272636, "kl": 0.1998748779296875, "learning_rate": 4.966921264968293e-07, "loss": 0.0002, "reward": 1.6732143759727478, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.6776786055415869, "rewards/format_reward_func": 0.9955357164144516, "step": 10838 }, { "completion_length": 243.34375953674316, "epoch": 1.8174692987970995, "grad_norm": 0.13234759313529285, "kl": 0.109649658203125, "learning_rate": 4.966901812750448e-07, "loss": 0.0001, "reward": 1.8428571745753288, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8428571745753288, "rewards/format_reward_func": 1.0, "step": 10840 }, { "completion_length": 242.13394165039062, "epoch": 1.8178046020369671, "grad_norm": 0.13118584337901576, "kl": 0.1727294921875, "learning_rate": 4.966882354852882e-07, "loss": 0.0002, "reward": 1.7589286267757416, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7633928954601288, "rewards/format_reward_func": 0.9955357164144516, "step": 10842 }, { "completion_length": 254.74554634094238, "epoch": 1.8181399052768348, "grad_norm": 0.19629673610097528, "kl": 0.15948486328125, "learning_rate": 4.966862891275637e-07, "loss": 0.0002, "reward": 1.7750000357627869, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7839286141097546, "rewards/format_reward_func": 0.9910714328289032, "step": 10844 }, { "completion_length": 255.2276906967163, "epoch": 1.8184752085167024, "grad_norm": 0.21825418465981786, "kl": 0.15118408203125, "learning_rate": 4.966843422018758e-07, "loss": 0.0002, "reward": 1.771428644657135, "reward_std": 0.10101525392383337, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 0.9821428656578064, "step": 10846 }, { "completion_length": 246.93304920196533, "epoch": 1.8188105117565698, "grad_norm": 0.14504716918157162, "kl": 0.1495361328125, "learning_rate": 4.966823947082292e-07, "loss": 0.0001, "reward": 1.7732143625617027, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7776785977184772, "rewards/format_reward_func": 0.9955357164144516, "step": 10848 }, { "completion_length": 253.6205472946167, "epoch": 1.8191458149964372, "grad_norm": 0.20655397846657847, "kl": 0.129791259765625, "learning_rate": 4.966804466466282e-07, "loss": 0.0001, "reward": 1.735714353621006, "reward_std": 0.10101525392383337, "rewards/equation_reward_func": 0.7535714693367481, "rewards/format_reward_func": 0.9821428656578064, "step": 10850 }, { "completion_length": 251.24554347991943, "epoch": 1.8194811182363049, "grad_norm": 0.1738275679980491, "kl": 0.1940460205078125, "learning_rate": 4.966784980170774e-07, "loss": 0.0002, "reward": 1.7714286223053932, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.7892857305705547, "rewards/format_reward_func": 0.9821428656578064, "step": 10852 }, { "completion_length": 247.00447750091553, "epoch": 1.8198164214761725, "grad_norm": 0.20985469649492897, "kl": 1.001861572265625, "learning_rate": 4.966765488195812e-07, "loss": 0.001, "reward": 1.753571480512619, "reward_std": 0.08586296532303095, "rewards/equation_reward_func": 0.7803571671247482, "rewards/format_reward_func": 0.9732142984867096, "step": 10854 }, { "completion_length": 236.7812623977661, "epoch": 1.8201517247160401, "grad_norm": 0.11620140241283157, "kl": 0.25567626953125, "learning_rate": 4.966745990541442e-07, "loss": 0.0003, "reward": 1.7553572058677673, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7687500305473804, "rewards/format_reward_func": 0.9866071492433548, "step": 10856 }, { "completion_length": 253.74555015563965, "epoch": 1.8204870279559078, "grad_norm": 0.1833107759586828, "kl": 0.108062744140625, "learning_rate": 4.966726487207708e-07, "loss": 0.0001, "reward": 1.7071429267525673, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7250000350177288, "rewards/format_reward_func": 0.9821428656578064, "step": 10858 }, { "completion_length": 239.290189743042, "epoch": 1.8208223311957752, "grad_norm": 0.23300359377419902, "kl": 0.562469482421875, "learning_rate": 4.966706978194655e-07, "loss": 0.0006, "reward": 1.7089286297559738, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7133928816765547, "rewards/format_reward_func": 0.9955357164144516, "step": 10860 }, { "completion_length": 236.2634038925171, "epoch": 1.8211576344356426, "grad_norm": 0.18866839307399952, "kl": 0.4730224609375, "learning_rate": 4.966687463502327e-07, "loss": 0.0005, "reward": 1.7660714760422707, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7705357298254967, "rewards/format_reward_func": 0.9955357164144516, "step": 10862 }, { "completion_length": 229.13393783569336, "epoch": 1.8214929376755102, "grad_norm": 0.10210684142020995, "kl": 0.330596923828125, "learning_rate": 4.966667943130771e-07, "loss": 0.0003, "reward": 1.7607143595814705, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 10864 }, { "completion_length": 234.0446538925171, "epoch": 1.8218282409153779, "grad_norm": 0.23129147813982034, "kl": 0.1144256591796875, "learning_rate": 4.966648417080031e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 10866 }, { "completion_length": 235.9107255935669, "epoch": 1.8221635441552455, "grad_norm": 0.1554049848557028, "kl": 0.1997222900390625, "learning_rate": 4.96662888535015e-07, "loss": 0.0002, "reward": 1.7857143506407738, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 10868 }, { "completion_length": 235.03572463989258, "epoch": 1.822498847395113, "grad_norm": 0.40407649067600887, "kl": 0.53131103515625, "learning_rate": 4.966609347941176e-07, "loss": 0.0005, "reward": 1.8035714700818062, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8035714589059353, "rewards/format_reward_func": 1.0, "step": 10870 }, { "completion_length": 233.65179824829102, "epoch": 1.8228341506349806, "grad_norm": 0.13115162629134425, "kl": 0.3810577392578125, "learning_rate": 4.966589804853153e-07, "loss": 0.0004, "reward": 1.7642857804894447, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857581377029, "rewards/format_reward_func": 1.0, "step": 10872 }, { "completion_length": 224.61161708831787, "epoch": 1.823169453874848, "grad_norm": 0.22174956953423863, "kl": 0.0928497314453125, "learning_rate": 4.966570256086126e-07, "loss": 0.0001, "reward": 1.7928572222590446, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 10874 }, { "completion_length": 231.61608123779297, "epoch": 1.8235047571147156, "grad_norm": 0.2175888324277477, "kl": 0.1910400390625, "learning_rate": 4.966550701640139e-07, "loss": 0.0002, "reward": 1.7464286610484123, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 10876 }, { "completion_length": 224.2098331451416, "epoch": 1.8238400603545832, "grad_norm": 0.13761918610296675, "kl": 0.294647216796875, "learning_rate": 4.966531141515237e-07, "loss": 0.0003, "reward": 1.7464286163449287, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7464286144822836, "rewards/format_reward_func": 1.0, "step": 10878 }, { "completion_length": 219.59375858306885, "epoch": 1.8241753635944509, "grad_norm": 0.12025355892629688, "kl": 0.126495361328125, "learning_rate": 4.966511575711467e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143223285675, "rewards/format_reward_func": 1.0, "step": 10880 }, { "completion_length": 228.77233219146729, "epoch": 1.8245106668343183, "grad_norm": 0.3027116876913923, "kl": 0.84716796875, "learning_rate": 4.966492004228872e-07, "loss": 0.0008, "reward": 1.7678572088479996, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 1.0, "step": 10882 }, { "completion_length": 220.33929538726807, "epoch": 1.8248459700741857, "grad_norm": 0.15113626205446923, "kl": 0.207672119140625, "learning_rate": 4.966472427067499e-07, "loss": 0.0002, "reward": 1.8571428954601288, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8571428768336773, "rewards/format_reward_func": 1.0, "step": 10884 }, { "completion_length": 223.70536613464355, "epoch": 1.8251812733140533, "grad_norm": 0.30749767893684915, "kl": 0.09979248046875, "learning_rate": 4.966452844227391e-07, "loss": 0.0001, "reward": 1.796428643167019, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7964286059141159, "rewards/format_reward_func": 1.0, "step": 10886 }, { "completion_length": 225.04018878936768, "epoch": 1.825516576553921, "grad_norm": 0.2152092830261441, "kl": 0.13427734375, "learning_rate": 4.966433255708594e-07, "loss": 0.0001, "reward": 1.728571504354477, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7285714522004128, "rewards/format_reward_func": 1.0, "step": 10888 }, { "completion_length": 237.9241189956665, "epoch": 1.8258518797937886, "grad_norm": 0.11807239100575599, "kl": 0.154876708984375, "learning_rate": 4.966413661511154e-07, "loss": 0.0002, "reward": 1.7250000461935997, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.725000036880374, "rewards/format_reward_func": 1.0, "step": 10890 }, { "completion_length": 228.87501049041748, "epoch": 1.826187183033656, "grad_norm": 0.273143116307382, "kl": 0.3214111328125, "learning_rate": 4.966394061635115e-07, "loss": 0.0003, "reward": 1.7428572103381157, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7428571805357933, "rewards/format_reward_func": 1.0, "step": 10892 }, { "completion_length": 223.47322368621826, "epoch": 1.8265224862735236, "grad_norm": 0.1521072199861038, "kl": 0.11309814453125, "learning_rate": 4.966374456080522e-07, "loss": 0.0001, "reward": 1.735714353621006, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.735714316368103, "rewards/format_reward_func": 1.0, "step": 10894 }, { "completion_length": 237.30358123779297, "epoch": 1.826857789513391, "grad_norm": 0.6139851648931479, "kl": 0.197723388671875, "learning_rate": 4.966354844847421e-07, "loss": 0.0002, "reward": 1.778571479022503, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 10896 }, { "completion_length": 238.2946548461914, "epoch": 1.8271930927532587, "grad_norm": 0.29583958632372603, "kl": 0.115325927734375, "learning_rate": 4.966335227935856e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571604192257, "rewards/format_reward_func": 1.0, "step": 10898 }, { "completion_length": 225.09822368621826, "epoch": 1.8275283959931263, "grad_norm": 0.24015298347241795, "kl": 0.225799560546875, "learning_rate": 4.966315605345873e-07, "loss": 0.0002, "reward": 1.850000038743019, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8500000089406967, "rewards/format_reward_func": 1.0, "step": 10900 }, { "completion_length": 235.571439743042, "epoch": 1.827863699232994, "grad_norm": 0.2932754055945961, "kl": 0.11669921875, "learning_rate": 4.966295977077518e-07, "loss": 0.0001, "reward": 1.7410715073347092, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.745535746216774, "rewards/format_reward_func": 0.9955357164144516, "step": 10902 }, { "completion_length": 238.17411613464355, "epoch": 1.8281990024728614, "grad_norm": 0.26385235317977035, "kl": 0.1941680908203125, "learning_rate": 4.966276343130835e-07, "loss": 0.0002, "reward": 1.7142858132719994, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7142857536673546, "rewards/format_reward_func": 1.0, "step": 10904 }, { "completion_length": 234.19643783569336, "epoch": 1.8285343057127288, "grad_norm": 0.3444873572004797, "kl": 0.1352691650390625, "learning_rate": 4.966256703505869e-07, "loss": 0.0001, "reward": 1.8178571835160255, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.817857176065445, "rewards/format_reward_func": 1.0, "step": 10906 }, { "completion_length": 247.98215198516846, "epoch": 1.8288696089525964, "grad_norm": 0.2911970609785864, "kl": 0.11505126953125, "learning_rate": 4.966237058202665e-07, "loss": 0.0001, "reward": 1.7178572192788124, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7267857529222965, "rewards/format_reward_func": 0.9910714328289032, "step": 10908 }, { "completion_length": 239.40179538726807, "epoch": 1.829204912192464, "grad_norm": 0.25664083609430993, "kl": 0.21173095703125, "learning_rate": 4.96621740722127e-07, "loss": 0.0002, "reward": 1.7535714730620384, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714562982321, "rewards/format_reward_func": 1.0, "step": 10910 }, { "completion_length": 233.74554634094238, "epoch": 1.8295402154323317, "grad_norm": 0.2616752508689885, "kl": 0.0946807861328125, "learning_rate": 4.966197750561728e-07, "loss": 0.0001, "reward": 1.7839286178350449, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 10912 }, { "completion_length": 234.1294755935669, "epoch": 1.8298755186721993, "grad_norm": 0.2995033876494516, "kl": 0.1689453125, "learning_rate": 4.966178088224084e-07, "loss": 0.0002, "reward": 1.7892857789993286, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 10914 }, { "completion_length": 242.03126525878906, "epoch": 1.8302108219120667, "grad_norm": 0.31543069402561186, "kl": 0.10711669921875, "learning_rate": 4.966158420208383e-07, "loss": 0.0001, "reward": 1.7232143506407738, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7276785969734192, "rewards/format_reward_func": 0.9955357164144516, "step": 10916 }, { "completion_length": 219.94197273254395, "epoch": 1.8305461251519342, "grad_norm": 0.21660990217758797, "kl": 0.1119232177734375, "learning_rate": 4.966138746514672e-07, "loss": 0.0001, "reward": 1.8107143342494965, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107143118977547, "rewards/format_reward_func": 1.0, "step": 10918 }, { "completion_length": 227.89733123779297, "epoch": 1.8308814283918018, "grad_norm": 0.16069491284800438, "kl": 0.1295623779296875, "learning_rate": 4.966119067142995e-07, "loss": 0.0001, "reward": 1.7714286148548126, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7803571783006191, "rewards/format_reward_func": 0.9910714328289032, "step": 10920 }, { "completion_length": 238.59376049041748, "epoch": 1.8312167316316694, "grad_norm": 0.18957200729810142, "kl": 0.117218017578125, "learning_rate": 4.966099382093397e-07, "loss": 0.0001, "reward": 1.7482143640518188, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526785992085934, "rewards/format_reward_func": 0.9955357164144516, "step": 10922 }, { "completion_length": 232.71875858306885, "epoch": 1.831552034871537, "grad_norm": 0.12322576113924998, "kl": 0.1805267333984375, "learning_rate": 4.966079691365925e-07, "loss": 0.0002, "reward": 1.7803571969270706, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7848214544355869, "rewards/format_reward_func": 0.9955357164144516, "step": 10924 }, { "completion_length": 233.70983028411865, "epoch": 1.8318873381114045, "grad_norm": 0.2518008327942554, "kl": 0.1314697265625, "learning_rate": 4.966059994960622e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 10926 }, { "completion_length": 247.36608123779297, "epoch": 1.832222641351272, "grad_norm": 0.3195704223914685, "kl": 0.25115966796875, "learning_rate": 4.966040292877534e-07, "loss": 0.0003, "reward": 1.7607143446803093, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7696428894996643, "rewards/format_reward_func": 0.9910714328289032, "step": 10928 }, { "completion_length": 232.74554538726807, "epoch": 1.8325579445911395, "grad_norm": 0.10482360688190698, "kl": 0.15802001953125, "learning_rate": 4.966020585116709e-07, "loss": 0.0002, "reward": 1.767857201397419, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571734577417, "rewards/format_reward_func": 1.0, "step": 10930 }, { "completion_length": 230.0937623977661, "epoch": 1.8328932478310072, "grad_norm": 0.3141847682191237, "kl": 0.16552734375, "learning_rate": 4.966000871678189e-07, "loss": 0.0002, "reward": 1.757142923772335, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 10932 }, { "completion_length": 232.38393878936768, "epoch": 1.8332285510708748, "grad_norm": 0.16685600890208938, "kl": 0.35284423828125, "learning_rate": 4.965981152562021e-07, "loss": 0.0004, "reward": 1.7446429058909416, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7491071783006191, "rewards/format_reward_func": 0.9955357164144516, "step": 10934 }, { "completion_length": 240.52679634094238, "epoch": 1.8335638543107424, "grad_norm": 0.5557902127047117, "kl": 0.145477294921875, "learning_rate": 4.96596142776825e-07, "loss": 0.0001, "reward": 1.758928619325161, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.763392873108387, "rewards/format_reward_func": 0.9955357164144516, "step": 10936 }, { "completion_length": 227.29911708831787, "epoch": 1.8338991575506098, "grad_norm": 0.25553049632062863, "kl": 0.1214599609375, "learning_rate": 4.965941697296922e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.757142897695303, "rewards/format_reward_func": 1.0, "step": 10938 }, { "completion_length": 244.99554538726807, "epoch": 1.8342344607904773, "grad_norm": 0.36957619283406395, "kl": 0.170867919921875, "learning_rate": 4.965921961148081e-07, "loss": 0.0002, "reward": 1.7339286506175995, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7464285884052515, "rewards/format_reward_func": 0.9875000044703484, "step": 10940 }, { "completion_length": 235.54465675354004, "epoch": 1.8345697640303449, "grad_norm": 0.5029314215918635, "kl": 0.261199951171875, "learning_rate": 4.965902219321773e-07, "loss": 0.0003, "reward": 1.8000000715255737, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000193715096, "rewards/format_reward_func": 1.0, "step": 10942 }, { "completion_length": 234.7009038925171, "epoch": 1.8349050672702125, "grad_norm": 0.19529034614225257, "kl": 0.1197967529296875, "learning_rate": 4.965882471818045e-07, "loss": 0.0001, "reward": 1.7428571954369545, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7517857477068901, "rewards/format_reward_func": 0.9910714328289032, "step": 10944 }, { "completion_length": 234.1696538925171, "epoch": 1.8352403705100802, "grad_norm": 0.21069830505407974, "kl": 0.131500244140625, "learning_rate": 4.965862718636941e-07, "loss": 0.0001, "reward": 1.7125000581145287, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7258928883820772, "rewards/format_reward_func": 0.9866071492433548, "step": 10946 }, { "completion_length": 230.1964406967163, "epoch": 1.8355756737499476, "grad_norm": 0.33352966239595755, "kl": 0.177215576171875, "learning_rate": 4.965842959778505e-07, "loss": 0.0002, "reward": 1.7482143342494965, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7526786029338837, "rewards/format_reward_func": 0.9955357164144516, "step": 10948 }, { "completion_length": 233.35715198516846, "epoch": 1.8359109769898152, "grad_norm": 0.26718184673680234, "kl": 0.15020751953125, "learning_rate": 4.965823195242786e-07, "loss": 0.0002, "reward": 1.7732143625617027, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.777678593993187, "rewards/format_reward_func": 0.9955357164144516, "step": 10950 }, { "completion_length": 228.6741189956665, "epoch": 1.8362462802296826, "grad_norm": 0.6468786012472365, "kl": 0.130218505859375, "learning_rate": 4.965803425029828e-07, "loss": 0.0001, "reward": 1.751785784959793, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7562500331550837, "rewards/format_reward_func": 0.9955357164144516, "step": 10952 }, { "completion_length": 217.52679538726807, "epoch": 1.8365815834695502, "grad_norm": 0.20986412344697003, "kl": 0.132232666015625, "learning_rate": 4.965783649139675e-07, "loss": 0.0001, "reward": 1.787500038743019, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7919643186032772, "rewards/format_reward_func": 0.9955357164144516, "step": 10954 }, { "completion_length": 227.6696538925171, "epoch": 1.8369168867094179, "grad_norm": 0.13116619544808694, "kl": 0.1138916015625, "learning_rate": 4.965763867572375e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 10956 }, { "completion_length": 218.23661708831787, "epoch": 1.8372521899492855, "grad_norm": 0.20983926729483543, "kl": 0.1107177734375, "learning_rate": 4.965744080327972e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571626543999, "rewards/format_reward_func": 1.0, "step": 10958 }, { "completion_length": 223.20090293884277, "epoch": 1.837587493189153, "grad_norm": 0.3545627872806163, "kl": 0.10614013671875, "learning_rate": 4.965724287406512e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000394880772, "rewards/format_reward_func": 1.0, "step": 10960 }, { "completion_length": 224.86608123779297, "epoch": 1.8379227964290203, "grad_norm": 0.2105883357527289, "kl": 0.1058807373046875, "learning_rate": 4.96570448880804e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7589286044239998, "rewards/format_reward_func": 0.9910714328289032, "step": 10962 }, { "completion_length": 229.571439743042, "epoch": 1.838258099668888, "grad_norm": 0.22364455871043729, "kl": 0.12042236328125, "learning_rate": 4.965684684532603e-07, "loss": 0.0001, "reward": 1.7468750923871994, "reward_std": 0.037249374436214566, "rewards/equation_reward_func": 0.7517857421189547, "rewards/format_reward_func": 0.9950892888009548, "step": 10964 }, { "completion_length": 224.60715293884277, "epoch": 1.8385934029087556, "grad_norm": 0.26191034704334365, "kl": 0.14447021484375, "learning_rate": 4.965664874580244e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.755357164889574, "rewards/format_reward_func": 0.9910714328289032, "step": 10966 }, { "completion_length": 225.0803680419922, "epoch": 1.8389287061486232, "grad_norm": 0.3470244049399262, "kl": 0.118988037109375, "learning_rate": 4.965645058951011e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 10968 }, { "completion_length": 223.38840293884277, "epoch": 1.8392640093884907, "grad_norm": 0.3880943554691539, "kl": 0.1046142578125, "learning_rate": 4.965625237644949e-07, "loss": 0.0001, "reward": 1.7428572326898575, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 10970 }, { "completion_length": 221.9375114440918, "epoch": 1.8395993126283583, "grad_norm": 0.5691958115794947, "kl": 0.1212158203125, "learning_rate": 4.965605410662104e-07, "loss": 0.0001, "reward": 1.766071505844593, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7705357372760773, "rewards/format_reward_func": 0.9955357164144516, "step": 10972 }, { "completion_length": 222.00893878936768, "epoch": 1.8399346158682257, "grad_norm": 0.17615148433837938, "kl": 0.101470947265625, "learning_rate": 4.965585578002521e-07, "loss": 0.0001, "reward": 1.8000000342726707, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 10974 }, { "completion_length": 242.60268878936768, "epoch": 1.8402699191080933, "grad_norm": 0.18370374417663582, "kl": 0.130126953125, "learning_rate": 4.965565739666245e-07, "loss": 0.0001, "reward": 1.755357213318348, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7598214484751225, "rewards/format_reward_func": 0.9955357164144516, "step": 10976 }, { "completion_length": 233.42411613464355, "epoch": 1.840605222347961, "grad_norm": 0.3542768180228526, "kl": 0.112030029296875, "learning_rate": 4.965545895653324e-07, "loss": 0.0001, "reward": 1.7607143744826317, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7696428783237934, "rewards/format_reward_func": 0.9910714328289032, "step": 10978 }, { "completion_length": 234.0491180419922, "epoch": 1.8409405255878286, "grad_norm": 0.21181866329362126, "kl": 0.11077880859375, "learning_rate": 4.965526045963801e-07, "loss": 0.0001, "reward": 1.7535715326666832, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7625000197440386, "rewards/format_reward_func": 0.9910714328289032, "step": 10980 }, { "completion_length": 235.08483505249023, "epoch": 1.841275828827696, "grad_norm": 0.14119896069516588, "kl": 0.11920166015625, "learning_rate": 4.965506190597723e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 10982 }, { "completion_length": 228.2991180419922, "epoch": 1.8416111320675634, "grad_norm": 0.394193532340823, "kl": 0.11993408203125, "learning_rate": 4.965486329555136e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000447034836, "rewards/format_reward_func": 1.0, "step": 10984 }, { "completion_length": 229.94643878936768, "epoch": 1.841946435307431, "grad_norm": 0.22609019385205742, "kl": 0.100433349609375, "learning_rate": 4.965466462836085e-07, "loss": 0.0001, "reward": 1.8000000715255737, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000156462193, "rewards/format_reward_func": 1.0, "step": 10986 }, { "completion_length": 237.7276906967163, "epoch": 1.8422817385472987, "grad_norm": 0.3759695986030544, "kl": 0.1258544921875, "learning_rate": 4.965446590440616e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.8035714589059353, "rewards/format_reward_func": 1.0, "step": 10988 }, { "completion_length": 217.1428680419922, "epoch": 1.8426170417871663, "grad_norm": 0.2734768653982204, "kl": 0.109344482421875, "learning_rate": 4.965426712368776e-07, "loss": 0.0001, "reward": 1.7500000521540642, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7500000447034836, "rewards/format_reward_func": 1.0, "step": 10990 }, { "completion_length": 228.47322368621826, "epoch": 1.842952345027034, "grad_norm": 0.30183089643210215, "kl": 0.117431640625, "learning_rate": 4.965406828620607e-07, "loss": 0.0001, "reward": 1.730357214808464, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7348214518278837, "rewards/format_reward_func": 0.9955357164144516, "step": 10992 }, { "completion_length": 227.852689743042, "epoch": 1.8432876482669014, "grad_norm": 0.16928201748186822, "kl": 0.099700927734375, "learning_rate": 4.96538693919616e-07, "loss": 0.0001, "reward": 1.7714286148548126, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714286111295223, "rewards/format_reward_func": 1.0, "step": 10994 }, { "completion_length": 224.49108123779297, "epoch": 1.8436229515067688, "grad_norm": 0.2128443559967921, "kl": 0.099517822265625, "learning_rate": 4.965367044095477e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 10996 }, { "completion_length": 232.45090293884277, "epoch": 1.8439582547466364, "grad_norm": 0.35638057676443574, "kl": 0.159698486328125, "learning_rate": 4.965347143318605e-07, "loss": 0.0002, "reward": 1.7821429297327995, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 10998 }, { "completion_length": 220.62054634094238, "epoch": 1.844293557986504, "grad_norm": 0.2383076241088169, "kl": 0.09564208984375, "learning_rate": 4.96532723686559e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464285958558321, "rewards/format_reward_func": 1.0, "step": 11000 }, { "completion_length": 238.58036994934082, "epoch": 1.8446288612263717, "grad_norm": 0.3791546490491425, "kl": 0.20074462890625, "learning_rate": 4.965307324736477e-07, "loss": 0.0002, "reward": 1.8285714909434319, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.8285714481025934, "rewards/format_reward_func": 1.0, "step": 11002 }, { "completion_length": 223.78126049041748, "epoch": 1.8449641644662391, "grad_norm": 0.16352363771720657, "kl": 0.114044189453125, "learning_rate": 4.965287406931313e-07, "loss": 0.0001, "reward": 1.796428620815277, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7964286059141159, "rewards/format_reward_func": 1.0, "step": 11004 }, { "completion_length": 227.56251049041748, "epoch": 1.8452994677061068, "grad_norm": 0.3131439397412478, "kl": 0.1202392578125, "learning_rate": 4.965267483450144e-07, "loss": 0.0001, "reward": 1.7357143610715866, "reward_std": 0.08081220090389252, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 11006 }, { "completion_length": 234.16518878936768, "epoch": 1.8456347709459742, "grad_norm": 0.20428976348291153, "kl": 0.10565185546875, "learning_rate": 4.965247554293014e-07, "loss": 0.0001, "reward": 1.8607143238186836, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8607143051922321, "rewards/format_reward_func": 1.0, "step": 11008 }, { "completion_length": 222.62054538726807, "epoch": 1.8459700741858418, "grad_norm": 0.27048745939962315, "kl": 0.1072845458984375, "learning_rate": 4.965227619459971e-07, "loss": 0.0001, "reward": 1.7821429371833801, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 11010 }, { "completion_length": 221.56251049041748, "epoch": 1.8463053774257094, "grad_norm": 0.255154030721768, "kl": 0.090850830078125, "learning_rate": 4.96520767895106e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7964285835623741, "rewards/format_reward_func": 1.0, "step": 11012 }, { "completion_length": 215.44197368621826, "epoch": 1.846640680665577, "grad_norm": 0.2318606194787304, "kl": 0.101898193359375, "learning_rate": 4.965187732766326e-07, "loss": 0.0001, "reward": 1.8035714849829674, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8035714589059353, "rewards/format_reward_func": 1.0, "step": 11014 }, { "completion_length": 221.33929538726807, "epoch": 1.8469759839054445, "grad_norm": 0.14313873384667694, "kl": 0.1305389404296875, "learning_rate": 4.965167780905817e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714656114578, "rewards/format_reward_func": 1.0, "step": 11016 }, { "completion_length": 231.19197845458984, "epoch": 1.847311287145312, "grad_norm": 0.22620464844274196, "kl": 0.10772705078125, "learning_rate": 4.965147823369576e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428842842579, "rewards/format_reward_func": 1.0, "step": 11018 }, { "completion_length": 225.977689743042, "epoch": 1.8476465903851795, "grad_norm": 0.19280302891875264, "kl": 0.0964508056640625, "learning_rate": 4.965127860157652e-07, "loss": 0.0001, "reward": 1.76517865806818, "reward_std": 0.059346460737288, "rewards/equation_reward_func": 0.766964316368103, "rewards/format_reward_func": 0.9982142895460129, "step": 11020 }, { "completion_length": 232.98215198516846, "epoch": 1.8479818936250472, "grad_norm": 0.2722219674141182, "kl": 0.118072509765625, "learning_rate": 4.96510789127009e-07, "loss": 0.0001, "reward": 1.6964286640286446, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.6964286100119352, "rewards/format_reward_func": 1.0, "step": 11022 }, { "completion_length": 225.9107255935669, "epoch": 1.8483171968649148, "grad_norm": 0.25389911395651005, "kl": 0.100128173828125, "learning_rate": 4.965087916706934e-07, "loss": 0.0001, "reward": 1.7464286610484123, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 11024 }, { "completion_length": 221.84822368621826, "epoch": 1.8486525001047822, "grad_norm": 0.21637961265921582, "kl": 0.102935791015625, "learning_rate": 4.965067936468234e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571775555611, "rewards/format_reward_func": 1.0, "step": 11026 }, { "completion_length": 230.5803680419922, "epoch": 1.8489878033446498, "grad_norm": 0.18228930297643892, "kl": 0.1324462890625, "learning_rate": 4.965047950554032e-07, "loss": 0.0001, "reward": 1.725000075995922, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7250000312924385, "rewards/format_reward_func": 1.0, "step": 11028 }, { "completion_length": 223.92858123779297, "epoch": 1.8493231065845173, "grad_norm": 0.23163988946281985, "kl": 0.0888214111328125, "learning_rate": 4.965027958964376e-07, "loss": 0.0001, "reward": 1.703571505844593, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7035714611411095, "rewards/format_reward_func": 1.0, "step": 11030 }, { "completion_length": 234.64733219146729, "epoch": 1.849658409824385, "grad_norm": 0.15882551249432497, "kl": 0.11004638671875, "learning_rate": 4.965007961699312e-07, "loss": 0.0001, "reward": 1.787500038743019, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7919643111526966, "rewards/format_reward_func": 0.9955357164144516, "step": 11032 }, { "completion_length": 234.52679634094238, "epoch": 1.8499937130642525, "grad_norm": 0.16316429098513455, "kl": 0.10003662109375, "learning_rate": 4.964987958758885e-07, "loss": 0.0001, "reward": 1.760714367032051, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 11034 }, { "completion_length": 235.0267972946167, "epoch": 1.8503290163041202, "grad_norm": 0.1258603120735231, "kl": 0.24822998046875, "learning_rate": 4.964967950143143e-07, "loss": 0.0002, "reward": 1.733928620815277, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7383928894996643, "rewards/format_reward_func": 0.9955357164144516, "step": 11036 }, { "completion_length": 227.22768688201904, "epoch": 1.8506643195439876, "grad_norm": 0.28442494129277424, "kl": 0.114990234375, "learning_rate": 4.964947935852129e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.07071067858487368, "rewards/equation_reward_func": 0.7589286081492901, "rewards/format_reward_func": 0.9910714328289032, "step": 11038 }, { "completion_length": 229.85268878936768, "epoch": 1.850999622783855, "grad_norm": 0.3542580625263232, "kl": 0.116607666015625, "learning_rate": 4.964927915885893e-07, "loss": 0.0001, "reward": 1.703571505844593, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7035714723169804, "rewards/format_reward_func": 1.0, "step": 11040 }, { "completion_length": 231.94643878936768, "epoch": 1.8513349260237226, "grad_norm": 0.2923367114733701, "kl": 0.1073455810546875, "learning_rate": 4.964907890244478e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7991071753203869, "rewards/format_reward_func": 0.9955357164144516, "step": 11042 }, { "completion_length": 225.80804252624512, "epoch": 1.8516702292635903, "grad_norm": 0.2327532429020024, "kl": 0.1014404296875, "learning_rate": 4.964887858927931e-07, "loss": 0.0001, "reward": 1.775000050663948, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7839286103844643, "rewards/format_reward_func": 0.9910714328289032, "step": 11044 }, { "completion_length": 228.22768783569336, "epoch": 1.852005532503458, "grad_norm": 0.277353104211083, "kl": 0.1070098876953125, "learning_rate": 4.964867821936298e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 11046 }, { "completion_length": 226.15179634094238, "epoch": 1.8523408357433255, "grad_norm": 0.23078546163027888, "kl": 0.116180419921875, "learning_rate": 4.964847779269625e-07, "loss": 0.0001, "reward": 1.771428644657135, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 11048 }, { "completion_length": 227.08036708831787, "epoch": 1.852676138983193, "grad_norm": 0.2231158849058702, "kl": 0.099700927734375, "learning_rate": 4.96482773092796e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857581377029, "rewards/format_reward_func": 1.0, "step": 11050 }, { "completion_length": 227.75893783569336, "epoch": 1.8530114422230604, "grad_norm": 0.140503338610388, "kl": 0.134063720703125, "learning_rate": 4.964807676911347e-07, "loss": 0.0001, "reward": 1.7000000849366188, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7000000327825546, "rewards/format_reward_func": 1.0, "step": 11052 }, { "completion_length": 226.5759038925171, "epoch": 1.853346745462928, "grad_norm": 0.2436791254328514, "kl": 0.125732421875, "learning_rate": 4.964787617219832e-07, "loss": 0.0001, "reward": 1.8214286044239998, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8214285932481289, "rewards/format_reward_func": 1.0, "step": 11054 }, { "completion_length": 224.81697368621826, "epoch": 1.8536820487027956, "grad_norm": 0.2266395282483944, "kl": 0.10595703125, "learning_rate": 4.964767551853463e-07, "loss": 0.0001, "reward": 1.751785784959793, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7562500238418579, "rewards/format_reward_func": 0.9955357164144516, "step": 11056 }, { "completion_length": 218.30804634094238, "epoch": 1.8540173519426633, "grad_norm": 0.12301408490279202, "kl": 0.106903076171875, "learning_rate": 4.964747480812285e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 11058 }, { "completion_length": 225.54911613464355, "epoch": 1.8543526551825307, "grad_norm": 0.2668418387078583, "kl": 0.118621826171875, "learning_rate": 4.964727404096344e-07, "loss": 0.0001, "reward": 1.6875000819563866, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.6919643320143223, "rewards/format_reward_func": 0.9955357164144516, "step": 11060 }, { "completion_length": 220.75893592834473, "epoch": 1.8546879584223983, "grad_norm": 0.0035520221745497823, "kl": 0.1208953857421875, "learning_rate": 4.964707321705687e-07, "loss": 0.0001, "reward": 1.7357143610715866, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 11062 }, { "completion_length": 237.15179443359375, "epoch": 1.8550232616622657, "grad_norm": 0.36594927961505463, "kl": 0.11773681640625, "learning_rate": 4.96468723364036e-07, "loss": 0.0001, "reward": 1.796428643167019, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 11064 }, { "completion_length": 222.36608028411865, "epoch": 1.8553585649021334, "grad_norm": 0.30599537635174073, "kl": 0.106658935546875, "learning_rate": 4.964667139900409e-07, "loss": 0.0001, "reward": 1.785714365541935, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 11066 }, { "completion_length": 223.11161708831787, "epoch": 1.855693868142001, "grad_norm": 0.47049750051047357, "kl": 0.12823486328125, "learning_rate": 4.96464704048588e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 11068 }, { "completion_length": 215.1741180419922, "epoch": 1.8560291713818686, "grad_norm": 0.2115054002297252, "kl": 0.0959320068359375, "learning_rate": 4.964626935396821e-07, "loss": 0.0001, "reward": 1.7892857491970062, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857473343611, "rewards/format_reward_func": 1.0, "step": 11070 }, { "completion_length": 217.03572273254395, "epoch": 1.856364474621736, "grad_norm": 0.18145256750998845, "kl": 0.108856201171875, "learning_rate": 4.964606824633276e-07, "loss": 0.0001, "reward": 1.760714367032051, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 11072 }, { "completion_length": 221.37500953674316, "epoch": 1.8566997778616035, "grad_norm": 0.20762713551014986, "kl": 0.108001708984375, "learning_rate": 4.964586708195292e-07, "loss": 0.0001, "reward": 1.8285714760422707, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8285714536905289, "rewards/format_reward_func": 1.0, "step": 11074 }, { "completion_length": 228.83483219146729, "epoch": 1.857035081101471, "grad_norm": 0.18659784904839863, "kl": 0.14788818359375, "learning_rate": 4.964566586082916e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 11076 }, { "completion_length": 222.52233028411865, "epoch": 1.8573703843413387, "grad_norm": 0.19205268624150945, "kl": 0.10675048828125, "learning_rate": 4.964546458296194e-07, "loss": 0.0001, "reward": 1.8214286416769028, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8214285969734192, "rewards/format_reward_func": 1.0, "step": 11078 }, { "completion_length": 221.46429634094238, "epoch": 1.8577056875812064, "grad_norm": 0.2383659915228919, "kl": 0.12945556640625, "learning_rate": 4.964526324835172e-07, "loss": 0.0001, "reward": 1.732142947614193, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7321428935974836, "rewards/format_reward_func": 1.0, "step": 11080 }, { "completion_length": 211.19643878936768, "epoch": 1.8580409908210738, "grad_norm": 0.11302266977949367, "kl": 0.1028900146484375, "learning_rate": 4.964506185699897e-07, "loss": 0.0001, "reward": 1.8178571835160255, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.817857176065445, "rewards/format_reward_func": 1.0, "step": 11082 }, { "completion_length": 223.49554538726807, "epoch": 1.8583762940609414, "grad_norm": 0.23475124330163089, "kl": 0.14801025390625, "learning_rate": 4.964486040890415e-07, "loss": 0.0001, "reward": 1.710714377462864, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7107143215835094, "rewards/format_reward_func": 1.0, "step": 11084 }, { "completion_length": 219.42411708831787, "epoch": 1.8587115973008088, "grad_norm": 0.32645925916745405, "kl": 0.200714111328125, "learning_rate": 4.964465890406773e-07, "loss": 0.0002, "reward": 1.8071429133415222, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428909897804, "rewards/format_reward_func": 1.0, "step": 11086 }, { "completion_length": 223.35268878936768, "epoch": 1.8590469005406765, "grad_norm": 0.2379439074253615, "kl": 0.1185760498046875, "learning_rate": 4.964445734249015e-07, "loss": 0.0001, "reward": 1.78035718947649, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7848214656114578, "rewards/format_reward_func": 0.9955357164144516, "step": 11088 }, { "completion_length": 218.3526906967163, "epoch": 1.859382203780544, "grad_norm": 0.490915657263521, "kl": 0.137847900390625, "learning_rate": 4.96442557241719e-07, "loss": 0.0001, "reward": 1.721428669989109, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.721428606659174, "rewards/format_reward_func": 1.0, "step": 11090 }, { "completion_length": 219.01786518096924, "epoch": 1.8597175070204117, "grad_norm": 0.10672365885385886, "kl": 0.186767578125, "learning_rate": 4.964405404911344e-07, "loss": 0.0002, "reward": 1.7196429297327995, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7241071797907352, "rewards/format_reward_func": 0.9955357164144516, "step": 11092 }, { "completion_length": 216.41072368621826, "epoch": 1.8600528102602791, "grad_norm": 0.3824340210957931, "kl": 0.21832275390625, "learning_rate": 4.964385231731523e-07, "loss": 0.0002, "reward": 1.7464286461472511, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 11094 }, { "completion_length": 223.16072463989258, "epoch": 1.8603881135001465, "grad_norm": 0.23764974494911736, "kl": 0.187652587890625, "learning_rate": 4.964365052877773e-07, "loss": 0.0002, "reward": 1.7571429312229156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 11096 }, { "completion_length": 236.05358409881592, "epoch": 1.8607234167400142, "grad_norm": 0.2853067876926874, "kl": 0.215728759765625, "learning_rate": 4.96434486835014e-07, "loss": 0.0002, "reward": 1.7089286744594574, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.713392898440361, "rewards/format_reward_func": 0.9955357164144516, "step": 11098 }, { "completion_length": 223.25001049041748, "epoch": 1.8610587199798818, "grad_norm": 0.3091993117408902, "kl": 0.2955322265625, "learning_rate": 4.964324678148674e-07, "loss": 0.0003, "reward": 1.725000075995922, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7250000275671482, "rewards/format_reward_func": 1.0, "step": 11100 }, { "completion_length": 215.76340293884277, "epoch": 1.8613940232197494, "grad_norm": 0.2468966823450781, "kl": 0.138336181640625, "learning_rate": 4.964304482273417e-07, "loss": 0.0001, "reward": 1.8142857626080513, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8142857290804386, "rewards/format_reward_func": 1.0, "step": 11102 }, { "completion_length": 222.14286613464355, "epoch": 1.8617293264596169, "grad_norm": 0.14444154096411288, "kl": 0.2125244140625, "learning_rate": 4.964284280724418e-07, "loss": 0.0002, "reward": 1.7392857745289803, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857372760773, "rewards/format_reward_func": 1.0, "step": 11104 }, { "completion_length": 212.61608028411865, "epoch": 1.8620646296994845, "grad_norm": 0.2764355620324343, "kl": 0.18096923828125, "learning_rate": 4.964264073501723e-07, "loss": 0.0002, "reward": 1.73214291036129, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7321428880095482, "rewards/format_reward_func": 1.0, "step": 11106 }, { "completion_length": 226.53572368621826, "epoch": 1.862399932939352, "grad_norm": 0.13167498736808136, "kl": 0.2672882080078125, "learning_rate": 4.964243860605378e-07, "loss": 0.0003, "reward": 1.8000000715255737, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 11108 }, { "completion_length": 218.80804538726807, "epoch": 1.8627352361792195, "grad_norm": 0.45637270918773215, "kl": 0.367919921875, "learning_rate": 4.96422364203543e-07, "loss": 0.0004, "reward": 1.8285714760422707, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8285714499652386, "rewards/format_reward_func": 1.0, "step": 11110 }, { "completion_length": 221.94643783569336, "epoch": 1.8630705394190872, "grad_norm": 0.24640437904082058, "kl": 0.381561279296875, "learning_rate": 4.964203417791926e-07, "loss": 0.0004, "reward": 1.7607143595814705, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7687500230967999, "rewards/format_reward_func": 0.9919642955064774, "step": 11112 }, { "completion_length": 222.49554538726807, "epoch": 1.8634058426589548, "grad_norm": 0.08996637829383648, "kl": 0.1153564453125, "learning_rate": 4.964183187874912e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7321428954601288, "rewards/format_reward_func": 1.0, "step": 11114 }, { "completion_length": 210.2009038925171, "epoch": 1.8637411458988222, "grad_norm": 0.30682680867562007, "kl": 0.525390625, "learning_rate": 4.964162952284435e-07, "loss": 0.0005, "reward": 1.7892857789993286, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857529222965, "rewards/format_reward_func": 1.0, "step": 11116 }, { "completion_length": 219.20983123779297, "epoch": 1.8640764491386896, "grad_norm": 0.21939131000569564, "kl": 0.161834716796875, "learning_rate": 4.964142711020539e-07, "loss": 0.0002, "reward": 1.7696429342031479, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7741071693599224, "rewards/format_reward_func": 0.9955357164144516, "step": 11118 }, { "completion_length": 220.9196548461914, "epoch": 1.8644117523785573, "grad_norm": 0.24876917172333088, "kl": 0.163299560546875, "learning_rate": 4.964122464083275e-07, "loss": 0.0002, "reward": 1.7357143610715866, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 11120 }, { "completion_length": 216.77233123779297, "epoch": 1.864747055618425, "grad_norm": 0.23915510555329103, "kl": 0.4630126953125, "learning_rate": 4.964102211472687e-07, "loss": 0.0005, "reward": 1.725000075995922, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7250000387430191, "rewards/format_reward_func": 1.0, "step": 11122 }, { "completion_length": 221.33929538726807, "epoch": 1.8650823588582925, "grad_norm": 0.24977953166575798, "kl": 0.19873046875, "learning_rate": 4.964081953188822e-07, "loss": 0.0002, "reward": 1.8071429058909416, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428760886192, "rewards/format_reward_func": 1.0, "step": 11124 }, { "completion_length": 226.56250762939453, "epoch": 1.8654176620981602, "grad_norm": 0.4095663196737921, "kl": 0.487213134765625, "learning_rate": 4.964061689231727e-07, "loss": 0.0005, "reward": 1.7750000581145287, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000171363354, "rewards/format_reward_func": 1.0, "step": 11126 }, { "completion_length": 228.93750858306885, "epoch": 1.8657529653380276, "grad_norm": 0.30583252427411733, "kl": 0.337738037109375, "learning_rate": 4.964041419601448e-07, "loss": 0.0003, "reward": 1.7250000685453415, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7250000238418579, "rewards/format_reward_func": 1.0, "step": 11128 }, { "completion_length": 224.008939743042, "epoch": 1.866088268577895, "grad_norm": 0.13443705049530696, "kl": 0.214447021484375, "learning_rate": 4.964021144298032e-07, "loss": 0.0002, "reward": 1.7375000640749931, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7419643141329288, "rewards/format_reward_func": 0.9955357164144516, "step": 11130 }, { "completion_length": 229.95536708831787, "epoch": 1.8664235718177626, "grad_norm": 0.6325187220459428, "kl": 1.204498291015625, "learning_rate": 4.964000863321526e-07, "loss": 0.0012, "reward": 1.7160715088248253, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7205357551574707, "rewards/format_reward_func": 0.9955357164144516, "step": 11132 }, { "completion_length": 231.96429634094238, "epoch": 1.8667588750576303, "grad_norm": 0.11808471962014658, "kl": 0.1728515625, "learning_rate": 4.963980576671977e-07, "loss": 0.0002, "reward": 1.7857143431901932, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143059372902, "rewards/format_reward_func": 1.0, "step": 11134 }, { "completion_length": 233.20090293884277, "epoch": 1.867094178297498, "grad_norm": 0.17947428004964994, "kl": 0.18927001953125, "learning_rate": 4.96396028434943e-07, "loss": 0.0002, "reward": 1.7571429014205933, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 11136 }, { "completion_length": 234.05804920196533, "epoch": 1.8674294815373653, "grad_norm": 0.23106300919668743, "kl": 0.202972412109375, "learning_rate": 4.963939986353934e-07, "loss": 0.0002, "reward": 1.765178643167019, "reward_std": 0.07954951096326113, "rewards/equation_reward_func": 0.7669643089175224, "rewards/format_reward_func": 0.9982142895460129, "step": 11138 }, { "completion_length": 228.23661708831787, "epoch": 1.867764784777233, "grad_norm": 0.2550263269785493, "kl": 0.206146240234375, "learning_rate": 4.963919682685533e-07, "loss": 0.0002, "reward": 1.8214286416769028, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.821428582072258, "rewards/format_reward_func": 1.0, "step": 11140 }, { "completion_length": 228.85715198516846, "epoch": 1.8681000880171004, "grad_norm": 0.17487852501349144, "kl": 0.12628173828125, "learning_rate": 4.963899373344276e-07, "loss": 0.0001, "reward": 1.8607143312692642, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8607143051922321, "rewards/format_reward_func": 1.0, "step": 11142 }, { "completion_length": 234.2991180419922, "epoch": 1.868435391256968, "grad_norm": 0.1105321170059028, "kl": 0.12158203125, "learning_rate": 4.963879058330209e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571660071611, "rewards/format_reward_func": 1.0, "step": 11144 }, { "completion_length": 234.05804538726807, "epoch": 1.8687706944968356, "grad_norm": 0.14043168729058803, "kl": 0.199127197265625, "learning_rate": 4.963858737643379e-07, "loss": 0.0002, "reward": 1.7964286357164383, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7964286059141159, "rewards/format_reward_func": 1.0, "step": 11146 }, { "completion_length": 227.93750762939453, "epoch": 1.8691059977367033, "grad_norm": 0.23978752547022278, "kl": 0.2164306640625, "learning_rate": 4.963838411283834e-07, "loss": 0.0002, "reward": 1.7035715207457542, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7035714685916901, "rewards/format_reward_func": 1.0, "step": 11148 }, { "completion_length": 239.415189743042, "epoch": 1.8694413009765707, "grad_norm": 0.17226054792818743, "kl": 0.137786865234375, "learning_rate": 4.963818079251618e-07, "loss": 0.0001, "reward": 1.746428668498993, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.746428593993187, "rewards/format_reward_func": 1.0, "step": 11150 }, { "completion_length": 234.7366189956665, "epoch": 1.869776604216438, "grad_norm": 0.563115423768281, "kl": 0.161102294921875, "learning_rate": 4.963797741546779e-07, "loss": 0.0002, "reward": 1.7535715028643608, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.753571443259716, "rewards/format_reward_func": 1.0, "step": 11152 }, { "completion_length": 231.68750762939453, "epoch": 1.8701119074563057, "grad_norm": 0.13526243051831918, "kl": 0.117584228515625, "learning_rate": 4.963777398169365e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571902215481, "rewards/format_reward_func": 1.0, "step": 11154 }, { "completion_length": 230.14286708831787, "epoch": 1.8704472106961734, "grad_norm": 0.39458122341108914, "kl": 0.1586151123046875, "learning_rate": 4.963757049119421e-07, "loss": 0.0002, "reward": 1.7982143387198448, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.802678607404232, "rewards/format_reward_func": 0.9955357164144516, "step": 11156 }, { "completion_length": 232.852689743042, "epoch": 1.870782513936041, "grad_norm": 0.2544117423378524, "kl": 0.13641357421875, "learning_rate": 4.963736694396996e-07, "loss": 0.0001, "reward": 1.7857143357396126, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 11158 }, { "completion_length": 230.69643878936768, "epoch": 1.8711178171759084, "grad_norm": 0.2988204347526544, "kl": 0.113555908203125, "learning_rate": 4.963716334002135e-07, "loss": 0.0001, "reward": 1.7589286416769028, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7633928954601288, "rewards/format_reward_func": 0.9955357164144516, "step": 11160 }, { "completion_length": 244.852689743042, "epoch": 1.871453120415776, "grad_norm": 0.2732977459117658, "kl": 0.110443115234375, "learning_rate": 4.963695967934886e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714525729418, "rewards/format_reward_func": 1.0, "step": 11162 }, { "completion_length": 231.44643783569336, "epoch": 1.8717884236556435, "grad_norm": 0.20912566072607522, "kl": 0.106414794921875, "learning_rate": 4.963675596195295e-07, "loss": 0.0001, "reward": 1.7089286521077156, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7133928909897804, "rewards/format_reward_func": 0.9955357164144516, "step": 11164 }, { "completion_length": 234.3437614440918, "epoch": 1.872123726895511, "grad_norm": 0.4023109968185387, "kl": 0.12664794921875, "learning_rate": 4.963655218783409e-07, "loss": 0.0001, "reward": 1.7339286282658577, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.738392885774374, "rewards/format_reward_func": 0.9955357164144516, "step": 11166 }, { "completion_length": 235.37054824829102, "epoch": 1.8724590301353787, "grad_norm": 0.2784812271855098, "kl": 0.12969970703125, "learning_rate": 4.963634835699275e-07, "loss": 0.0001, "reward": 1.7482143566012383, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526785992085934, "rewards/format_reward_func": 0.9955357164144516, "step": 11168 }, { "completion_length": 235.15179538726807, "epoch": 1.8727943333752464, "grad_norm": 0.17483775044451044, "kl": 0.113037109375, "learning_rate": 4.963614446942941e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 11170 }, { "completion_length": 233.7410831451416, "epoch": 1.8731296366151138, "grad_norm": 0.12216847250099579, "kl": 0.0994415283203125, "learning_rate": 4.963594052514453e-07, "loss": 0.0001, "reward": 1.8071429282426834, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8071428835391998, "rewards/format_reward_func": 1.0, "step": 11172 }, { "completion_length": 230.18304634094238, "epoch": 1.8734649398549812, "grad_norm": 0.4168596205302882, "kl": 0.11090087890625, "learning_rate": 4.963573652413858e-07, "loss": 0.0001, "reward": 1.766071505844593, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7705357521772385, "rewards/format_reward_func": 0.9955357164144516, "step": 11174 }, { "completion_length": 229.29911613464355, "epoch": 1.8738002430948488, "grad_norm": 0.2791899452311946, "kl": 0.118316650390625, "learning_rate": 4.963553246641203e-07, "loss": 0.0001, "reward": 1.8357143625617027, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.8357143104076385, "rewards/format_reward_func": 1.0, "step": 11176 }, { "completion_length": 231.80358028411865, "epoch": 1.8741355463347165, "grad_norm": 0.16582448671376948, "kl": 0.12030029296875, "learning_rate": 4.963532835196534e-07, "loss": 0.0001, "reward": 1.807142898440361, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.807142898440361, "rewards/format_reward_func": 1.0, "step": 11178 }, { "completion_length": 234.4509048461914, "epoch": 1.874470849574584, "grad_norm": 0.2595491378811056, "kl": 0.122222900390625, "learning_rate": 4.9635124180799e-07, "loss": 0.0001, "reward": 1.7892857864499092, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857231199741, "rewards/format_reward_func": 1.0, "step": 11180 }, { "completion_length": 240.40179634094238, "epoch": 1.8748061528144517, "grad_norm": 0.14194016819276536, "kl": 0.11871337890625, "learning_rate": 4.963491995291347e-07, "loss": 0.0001, "reward": 1.7875000461935997, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.791964303702116, "rewards/format_reward_func": 0.9955357164144516, "step": 11182 }, { "completion_length": 232.55804538726807, "epoch": 1.8751414560543191, "grad_norm": 0.12892934407277282, "kl": 0.11798095703125, "learning_rate": 4.963471566830922e-07, "loss": 0.0001, "reward": 1.7553572207689285, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7598214745521545, "rewards/format_reward_func": 0.9955357164144516, "step": 11184 }, { "completion_length": 235.81251049041748, "epoch": 1.8754767592941866, "grad_norm": 0.25378944592714786, "kl": 0.11944580078125, "learning_rate": 4.963451132698672e-07, "loss": 0.0001, "reward": 1.8357143253087997, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8357142992317677, "rewards/format_reward_func": 1.0, "step": 11186 }, { "completion_length": 250.5669755935669, "epoch": 1.8758120625340542, "grad_norm": 0.615027955265174, "kl": 0.1882781982421875, "learning_rate": 4.963430692894644e-07, "loss": 0.0002, "reward": 1.7375000938773155, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7419643066823483, "rewards/format_reward_func": 0.9955357164144516, "step": 11188 }, { "completion_length": 244.79465293884277, "epoch": 1.8761473657739218, "grad_norm": 0.2411759958789673, "kl": 0.1248626708984375, "learning_rate": 4.963410247418886e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000417232513, "rewards/format_reward_func": 1.0, "step": 11190 }, { "completion_length": 244.06250953674316, "epoch": 1.8764826690137895, "grad_norm": 0.3607120042957956, "kl": 0.274566650390625, "learning_rate": 4.963389796271443e-07, "loss": 0.0003, "reward": 1.6946429535746574, "reward_std": 0.09848987031728029, "rewards/equation_reward_func": 0.6991071961820126, "rewards/format_reward_func": 0.9955357164144516, "step": 11192 }, { "completion_length": 241.02680015563965, "epoch": 1.8768179722536569, "grad_norm": 0.179436973418576, "kl": 0.150360107421875, "learning_rate": 4.963369339452363e-07, "loss": 0.0002, "reward": 1.773214340209961, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7776785902678967, "rewards/format_reward_func": 0.9955357164144516, "step": 11194 }, { "completion_length": 249.25447463989258, "epoch": 1.8771532754935245, "grad_norm": 0.22366962597111079, "kl": 0.215087890625, "learning_rate": 4.963348876961695e-07, "loss": 0.0002, "reward": 1.7660714909434319, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7705357447266579, "rewards/format_reward_func": 0.9955357164144516, "step": 11196 }, { "completion_length": 239.75893783569336, "epoch": 1.877488578733392, "grad_norm": 0.5282950834770903, "kl": 0.2510986328125, "learning_rate": 4.963328408799484e-07, "loss": 0.0003, "reward": 1.757142923772335, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 11198 }, { "completion_length": 244.62500858306885, "epoch": 1.8778238819732596, "grad_norm": 0.17760628742076728, "kl": 0.205841064453125, "learning_rate": 4.963307934965777e-07, "loss": 0.0002, "reward": 1.789285771548748, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7982143275439739, "rewards/format_reward_func": 0.9910714328289032, "step": 11200 }, { "completion_length": 248.48215579986572, "epoch": 1.8781591852131272, "grad_norm": 0.19333171248821832, "kl": 0.146392822265625, "learning_rate": 4.963287455460622e-07, "loss": 0.0001, "reward": 1.7821428999304771, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 11202 }, { "completion_length": 246.0000123977661, "epoch": 1.8784944884529948, "grad_norm": 0.26767903257350883, "kl": 0.162445068359375, "learning_rate": 4.963266970284067e-07, "loss": 0.0002, "reward": 1.7267857789993286, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.731250025331974, "rewards/format_reward_func": 0.9955357164144516, "step": 11204 }, { "completion_length": 249.42858219146729, "epoch": 1.8788297916928622, "grad_norm": 0.15949540283927346, "kl": 0.176422119140625, "learning_rate": 4.963246479436157e-07, "loss": 0.0002, "reward": 1.8000000566244125, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000342726707, "rewards/format_reward_func": 1.0, "step": 11206 }, { "completion_length": 235.9687614440918, "epoch": 1.8791650949327297, "grad_norm": 0.22833940214050732, "kl": 0.18597412109375, "learning_rate": 4.96322598291694e-07, "loss": 0.0002, "reward": 1.7892857640981674, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857305705547, "rewards/format_reward_func": 1.0, "step": 11208 }, { "completion_length": 252.52233695983887, "epoch": 1.8795003981725973, "grad_norm": 0.16628686527578662, "kl": 0.2346954345703125, "learning_rate": 4.963205480726465e-07, "loss": 0.0002, "reward": 1.7964285984635353, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 11210 }, { "completion_length": 240.1741180419922, "epoch": 1.879835701412465, "grad_norm": 0.3176503045231156, "kl": 0.28228759765625, "learning_rate": 4.963184972864776e-07, "loss": 0.0003, "reward": 1.7607143819332123, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 11212 }, { "completion_length": 243.00001525878906, "epoch": 1.8801710046523326, "grad_norm": 0.17669946946339002, "kl": 0.12799072265625, "learning_rate": 4.963164459331924e-07, "loss": 0.0001, "reward": 1.7875000312924385, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7919643241912127, "rewards/format_reward_func": 0.9955357164144516, "step": 11214 }, { "completion_length": 241.74108219146729, "epoch": 1.8805063078922, "grad_norm": 0.11810350754256321, "kl": 0.107452392578125, "learning_rate": 4.963143940127953e-07, "loss": 0.0001, "reward": 1.7892857864499092, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857491970062, "rewards/format_reward_func": 1.0, "step": 11216 }, { "completion_length": 239.05804443359375, "epoch": 1.8808416111320676, "grad_norm": 0.24622105000391542, "kl": 0.14434814453125, "learning_rate": 4.963123415252911e-07, "loss": 0.0001, "reward": 1.730357214808464, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7348214611411095, "rewards/format_reward_func": 0.9955357164144516, "step": 11218 }, { "completion_length": 247.7544765472412, "epoch": 1.881176914371935, "grad_norm": 0.20972882425954445, "kl": 0.20135498046875, "learning_rate": 4.963102884706845e-07, "loss": 0.0002, "reward": 1.7428572103381157, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7428571693599224, "rewards/format_reward_func": 1.0, "step": 11220 }, { "completion_length": 232.7142972946167, "epoch": 1.8815122176118027, "grad_norm": 0.2750278904808387, "kl": 0.192230224609375, "learning_rate": 4.963082348489804e-07, "loss": 0.0002, "reward": 1.7642857655882835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 11222 }, { "completion_length": 242.1250123977661, "epoch": 1.8818475208516703, "grad_norm": 0.3315015096714337, "kl": 0.336578369140625, "learning_rate": 4.963061806601835e-07, "loss": 0.0003, "reward": 1.7642857730388641, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 11224 }, { "completion_length": 234.46429443359375, "epoch": 1.882182824091538, "grad_norm": 0.09096484363912223, "kl": 0.1370697021484375, "learning_rate": 4.963041259042984e-07, "loss": 0.0001, "reward": 1.7589286491274834, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7633928805589676, "rewards/format_reward_func": 0.9955357164144516, "step": 11226 }, { "completion_length": 230.73215198516846, "epoch": 1.8825181273314053, "grad_norm": 0.16681827582170852, "kl": 0.13641357421875, "learning_rate": 4.963020705813297e-07, "loss": 0.0001, "reward": 1.835714340209961, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8357142992317677, "rewards/format_reward_func": 1.0, "step": 11228 }, { "completion_length": 234.8303680419922, "epoch": 1.8828534305712727, "grad_norm": 0.37642315561234024, "kl": 0.120574951171875, "learning_rate": 4.963000146912825e-07, "loss": 0.0001, "reward": 1.7714286297559738, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714286111295223, "rewards/format_reward_func": 1.0, "step": 11230 }, { "completion_length": 240.70983219146729, "epoch": 1.8831887338111404, "grad_norm": 0.23413257547900723, "kl": 0.123199462890625, "learning_rate": 4.962979582341613e-07, "loss": 0.0001, "reward": 1.8071429282426834, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8071428760886192, "rewards/format_reward_func": 1.0, "step": 11232 }, { "completion_length": 236.7098331451416, "epoch": 1.883524037051008, "grad_norm": 0.2838174612146134, "kl": 0.137664794921875, "learning_rate": 4.962959012099709e-07, "loss": 0.0001, "reward": 1.7571429386734962, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 11234 }, { "completion_length": 237.9821548461914, "epoch": 1.8838593402908757, "grad_norm": 0.3923008916019813, "kl": 0.157470703125, "learning_rate": 4.96293843618716e-07, "loss": 0.0002, "reward": 1.7750000581145287, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 1.0, "step": 11236 }, { "completion_length": 238.98215293884277, "epoch": 1.884194643530743, "grad_norm": 0.2385679361089355, "kl": 0.1591796875, "learning_rate": 4.962917854604013e-07, "loss": 0.0002, "reward": 1.7642857655882835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857581377029, "rewards/format_reward_func": 1.0, "step": 11238 }, { "completion_length": 235.0759048461914, "epoch": 1.8845299467706107, "grad_norm": 0.14434117277451863, "kl": 0.12469482421875, "learning_rate": 4.962897267350316e-07, "loss": 0.0001, "reward": 1.7357143685221672, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7357143256813288, "rewards/format_reward_func": 1.0, "step": 11240 }, { "completion_length": 223.95090293884277, "epoch": 1.884865250010478, "grad_norm": 0.03234523914231118, "kl": 0.12738037109375, "learning_rate": 4.962876674426116e-07, "loss": 0.0001, "reward": 1.8107143342494965, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107143267989159, "rewards/format_reward_func": 1.0, "step": 11242 }, { "completion_length": 244.4196538925171, "epoch": 1.8852005532503457, "grad_norm": 0.3593552515735288, "kl": 0.238555908203125, "learning_rate": 4.962856075831462e-07, "loss": 0.0002, "reward": 1.6785715222358704, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.6785714607685804, "rewards/format_reward_func": 1.0, "step": 11244 }, { "completion_length": 246.20537090301514, "epoch": 1.8855358564902134, "grad_norm": 0.37697301198383576, "kl": 0.193267822265625, "learning_rate": 4.962835471566399e-07, "loss": 0.0002, "reward": 1.7839286252856255, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7883928716182709, "rewards/format_reward_func": 0.9955357164144516, "step": 11246 }, { "completion_length": 248.55804824829102, "epoch": 1.885871159730081, "grad_norm": 0.23996368764434198, "kl": 0.1488037109375, "learning_rate": 4.962814861630977e-07, "loss": 0.0001, "reward": 1.7375000938773155, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7419643066823483, "rewards/format_reward_func": 0.9955357164144516, "step": 11248 }, { "completion_length": 253.0000114440918, "epoch": 1.8862064629699484, "grad_norm": 0.18681771723186258, "kl": 0.12933349609375, "learning_rate": 4.96279424602524e-07, "loss": 0.0001, "reward": 1.7517857924103737, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7562500238418579, "rewards/format_reward_func": 0.9955357164144516, "step": 11250 }, { "completion_length": 238.5446548461914, "epoch": 1.8865417662098158, "grad_norm": 0.10783026267363756, "kl": 0.18572998046875, "learning_rate": 4.962773624749239e-07, "loss": 0.0002, "reward": 1.796428620815277, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 11252 }, { "completion_length": 229.48215293884277, "epoch": 1.8868770694496835, "grad_norm": 0.19649646385163824, "kl": 0.105621337890625, "learning_rate": 4.96275299780302e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 11254 }, { "completion_length": 248.9866189956665, "epoch": 1.887212372689551, "grad_norm": 0.28251356223769303, "kl": 0.43621826171875, "learning_rate": 4.96273236518663e-07, "loss": 0.0004, "reward": 1.7767857685685158, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.781250037252903, "rewards/format_reward_func": 0.9955357164144516, "step": 11256 }, { "completion_length": 229.65179634094238, "epoch": 1.8875476759294187, "grad_norm": 0.09993092682238044, "kl": 0.1275787353515625, "learning_rate": 4.962711726900117e-07, "loss": 0.0001, "reward": 1.7857143357396126, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 11258 }, { "completion_length": 235.91518878936768, "epoch": 1.8878829791692864, "grad_norm": 0.2383186917676842, "kl": 0.11273193359375, "learning_rate": 4.962691082943528e-07, "loss": 0.0001, "reward": 1.7482143491506577, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7526785843074322, "rewards/format_reward_func": 0.9955357164144516, "step": 11260 }, { "completion_length": 240.62947368621826, "epoch": 1.8882182824091538, "grad_norm": 0.12872696701242822, "kl": 0.2894744873046875, "learning_rate": 4.962670433316912e-07, "loss": 0.0003, "reward": 1.7660714834928513, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9955357164144516, "step": 11262 }, { "completion_length": 237.12054634094238, "epoch": 1.8885535856490212, "grad_norm": 0.2637527234686633, "kl": 0.201141357421875, "learning_rate": 4.962649778020316e-07, "loss": 0.0002, "reward": 1.7535715103149414, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714637488127, "rewards/format_reward_func": 1.0, "step": 11264 }, { "completion_length": 241.26340198516846, "epoch": 1.8888888888888888, "grad_norm": 0.19499182893264433, "kl": 0.1104736328125, "learning_rate": 4.962629117053786e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 11266 }, { "completion_length": 247.0312623977661, "epoch": 1.8892241921287565, "grad_norm": 0.34806357799507837, "kl": 0.128662109375, "learning_rate": 4.962608450417371e-07, "loss": 0.0001, "reward": 1.6964286416769028, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.6964286100119352, "rewards/format_reward_func": 1.0, "step": 11268 }, { "completion_length": 243.08037090301514, "epoch": 1.889559495368624, "grad_norm": 0.13210932993568505, "kl": 0.141876220703125, "learning_rate": 4.962587778111119e-07, "loss": 0.0001, "reward": 1.832142896950245, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8321428745985031, "rewards/format_reward_func": 1.0, "step": 11270 }, { "completion_length": 247.37947845458984, "epoch": 1.8898947986084915, "grad_norm": 0.37825347713712715, "kl": 0.171356201171875, "learning_rate": 4.962567100135075e-07, "loss": 0.0002, "reward": 1.7642857804894447, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7642857544124126, "rewards/format_reward_func": 1.0, "step": 11272 }, { "completion_length": 250.8884038925171, "epoch": 1.8902301018483592, "grad_norm": 0.25807449253969955, "kl": 0.112640380859375, "learning_rate": 4.962546416489289e-07, "loss": 0.0001, "reward": 1.7589286491274834, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7633928954601288, "rewards/format_reward_func": 0.9955357164144516, "step": 11274 }, { "completion_length": 252.50894165039062, "epoch": 1.8905654050882266, "grad_norm": 0.3161104987993201, "kl": 0.10699462890625, "learning_rate": 4.962525727173809e-07, "loss": 0.0001, "reward": 1.746428668498993, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 11276 }, { "completion_length": 250.54912090301514, "epoch": 1.8909007083280942, "grad_norm": 0.26533155739866404, "kl": 0.1735382080078125, "learning_rate": 4.962505032188682e-07, "loss": 0.0002, "reward": 1.725000075995922, "reward_std": 0.05555838719010353, "rewards/equation_reward_func": 0.7250000331550837, "rewards/format_reward_func": 1.0, "step": 11278 }, { "completion_length": 252.9866180419922, "epoch": 1.8912360115679618, "grad_norm": 0.269657416281771, "kl": 0.264495849609375, "learning_rate": 4.962484331533955e-07, "loss": 0.0003, "reward": 1.7000000849366188, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7000000365078449, "rewards/format_reward_func": 1.0, "step": 11280 }, { "completion_length": 255.80358600616455, "epoch": 1.8915713148078295, "grad_norm": 0.11395970195478738, "kl": 0.197479248046875, "learning_rate": 4.962463625209676e-07, "loss": 0.0002, "reward": 1.773214340209961, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7776786051690578, "rewards/format_reward_func": 0.9955357164144516, "step": 11282 }, { "completion_length": 252.80804634094238, "epoch": 1.8919066180476969, "grad_norm": 0.20015288301172346, "kl": 0.2366180419921875, "learning_rate": 4.962442913215892e-07, "loss": 0.0002, "reward": 1.7607143446803093, "reward_std": 0.03535533882677555, "rewards/equation_reward_func": 0.7696428839117289, "rewards/format_reward_func": 0.9910714328289032, "step": 11284 }, { "completion_length": 265.46429920196533, "epoch": 1.8922419212875643, "grad_norm": 0.3071617414978232, "kl": 0.1317138671875, "learning_rate": 4.962422195552652e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7767857313156128, "rewards/format_reward_func": 0.9910714328289032, "step": 11286 }, { "completion_length": 264.34822845458984, "epoch": 1.892577224527432, "grad_norm": 0.5844285027634494, "kl": 0.6663818359375, "learning_rate": 4.962401472220004e-07, "loss": 0.0007, "reward": 1.7375000640749931, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419643178582191, "rewards/format_reward_func": 0.9955357164144516, "step": 11288 }, { "completion_length": 257.7276906967163, "epoch": 1.8929125277672996, "grad_norm": 0.18094547642963446, "kl": 0.4002685546875, "learning_rate": 4.962380743217994e-07, "loss": 0.0004, "reward": 1.7250000983476639, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7250000275671482, "rewards/format_reward_func": 1.0, "step": 11290 }, { "completion_length": 256.72322845458984, "epoch": 1.8932478310071672, "grad_norm": 0.31669459769976144, "kl": 0.608367919921875, "learning_rate": 4.96236000854667e-07, "loss": 0.0006, "reward": 1.757142923772335, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428772062063, "rewards/format_reward_func": 1.0, "step": 11292 }, { "completion_length": 249.0491189956665, "epoch": 1.8935831342470346, "grad_norm": 0.23547687690667424, "kl": 0.344940185546875, "learning_rate": 4.962339268206081e-07, "loss": 0.0003, "reward": 1.7875000536441803, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7919643074274063, "rewards/format_reward_func": 0.9955357164144516, "step": 11294 }, { "completion_length": 250.0669755935669, "epoch": 1.8939184374869023, "grad_norm": 0.44210786853010475, "kl": 0.396240234375, "learning_rate": 4.962318522196274e-07, "loss": 0.0004, "reward": 1.7125000953674316, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7169643193483353, "rewards/format_reward_func": 0.9955357164144516, "step": 11296 }, { "completion_length": 264.73215198516846, "epoch": 1.8942537407267697, "grad_norm": 0.2360487677899644, "kl": 0.343963623046875, "learning_rate": 4.962297770517296e-07, "loss": 0.0003, "reward": 1.801785759627819, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.8151785954833031, "rewards/format_reward_func": 0.9866071492433548, "step": 11298 }, { "completion_length": 253.4285831451416, "epoch": 1.8945890439666373, "grad_norm": 0.19174211335333957, "kl": 0.182464599609375, "learning_rate": 4.962277013169197e-07, "loss": 0.0002, "reward": 1.7089286521077156, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7133928909897804, "rewards/format_reward_func": 0.9955357164144516, "step": 11300 }, { "completion_length": 262.34376335144043, "epoch": 1.894924347206505, "grad_norm": 0.4497420045476892, "kl": 0.34619140625, "learning_rate": 4.962256250152022e-07, "loss": 0.0003, "reward": 1.6500000730156898, "reward_std": 0.07071067858487368, "rewards/equation_reward_func": 0.6678571738302708, "rewards/format_reward_func": 0.9821428656578064, "step": 11302 }, { "completion_length": 266.2901906967163, "epoch": 1.8952596504463726, "grad_norm": 0.19696851735656268, "kl": 0.706573486328125, "learning_rate": 4.962235481465821e-07, "loss": 0.0007, "reward": 1.7504464909434319, "reward_std": 0.0700793326832354, "rewards/equation_reward_func": 0.7651786096394062, "rewards/format_reward_func": 0.9852678664028645, "step": 11304 }, { "completion_length": 257.4821538925171, "epoch": 1.89559495368624, "grad_norm": 0.2895527404607826, "kl": 0.30169677734375, "learning_rate": 4.962214707110641e-07, "loss": 0.0003, "reward": 1.7000000849366188, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.708928607404232, "rewards/format_reward_func": 0.9910714328289032, "step": 11306 }, { "completion_length": 245.58929824829102, "epoch": 1.8959302569261074, "grad_norm": 0.3727980145309879, "kl": 1.33642578125, "learning_rate": 4.96219392708653e-07, "loss": 0.0013, "reward": 1.7535715103149414, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7625000365078449, "rewards/format_reward_func": 0.9910714328289032, "step": 11308 }, { "completion_length": 251.3259048461914, "epoch": 1.896265560165975, "grad_norm": 0.08608763982420042, "kl": 1.362823486328125, "learning_rate": 4.962173141393535e-07, "loss": 0.0014, "reward": 1.801785759627819, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.8062500320374966, "rewards/format_reward_func": 0.9955357164144516, "step": 11310 }, { "completion_length": 251.4330472946167, "epoch": 1.8966008634058427, "grad_norm": 0.2428648181228268, "kl": 0.157745361328125, "learning_rate": 4.962152350031704e-07, "loss": 0.0002, "reward": 1.7303572297096252, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7348214462399483, "rewards/format_reward_func": 0.9955357164144516, "step": 11312 }, { "completion_length": 260.3080472946167, "epoch": 1.8969361666457103, "grad_norm": 0.2209283834755941, "kl": 0.304290771484375, "learning_rate": 4.962131553001086e-07, "loss": 0.0003, "reward": 1.7446429133415222, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7491071820259094, "rewards/format_reward_func": 0.9955357164144516, "step": 11314 }, { "completion_length": 248.6696548461914, "epoch": 1.897271469885578, "grad_norm": 0.159877997058151, "kl": 0.5746612548828125, "learning_rate": 4.962110750301729e-07, "loss": 0.0006, "reward": 1.759375050663948, "reward_std": 0.017046324210241437, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 0.9986607171595097, "step": 11316 }, { "completion_length": 260.1384038925171, "epoch": 1.8976067731254453, "grad_norm": 0.17812440090711842, "kl": 0.610321044921875, "learning_rate": 4.96208994193368e-07, "loss": 0.0006, "reward": 1.7660714983940125, "reward_std": 0.07828682288527489, "rewards/equation_reward_func": 0.7794643212109804, "rewards/format_reward_func": 0.9866071492433548, "step": 11318 }, { "completion_length": 248.71875858306885, "epoch": 1.8979420763653128, "grad_norm": 0.22698536374050526, "kl": 0.59393310546875, "learning_rate": 4.962069127896987e-07, "loss": 0.0006, "reward": 1.841071456670761, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.8455357290804386, "rewards/format_reward_func": 0.9955357164144516, "step": 11320 }, { "completion_length": 253.8259038925171, "epoch": 1.8982773796051804, "grad_norm": 0.21589363012833795, "kl": 0.1286773681640625, "learning_rate": 4.962048308191698e-07, "loss": 0.0001, "reward": 1.7660714983940125, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7705357484519482, "rewards/format_reward_func": 0.9955357164144516, "step": 11322 }, { "completion_length": 256.321439743042, "epoch": 1.898612682845048, "grad_norm": 0.13809367925350216, "kl": 1.128448486328125, "learning_rate": 4.96202748281786e-07, "loss": 0.0011, "reward": 1.7625000551342964, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643051922321, "rewards/format_reward_func": 0.9955357164144516, "step": 11324 }, { "completion_length": 255.62054538726807, "epoch": 1.8989479860849157, "grad_norm": 0.1571339857944827, "kl": 0.396942138671875, "learning_rate": 4.962006651775522e-07, "loss": 0.0004, "reward": 1.7535714954137802, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714730620384, "rewards/format_reward_func": 1.0, "step": 11326 }, { "completion_length": 263.977689743042, "epoch": 1.899283289324783, "grad_norm": 0.3801393073801637, "kl": 0.1540679931640625, "learning_rate": 4.961985815064732e-07, "loss": 0.0002, "reward": 1.7803572043776512, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7848214581608772, "rewards/format_reward_func": 0.9955357164144516, "step": 11328 }, { "completion_length": 259.214298248291, "epoch": 1.8996185925646507, "grad_norm": 0.007573744361931633, "kl": 0.76910400390625, "learning_rate": 4.961964972685539e-07, "loss": 0.0008, "reward": 1.78035718947649, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7848214544355869, "rewards/format_reward_func": 0.9955357164144516, "step": 11330 }, { "completion_length": 266.71875953674316, "epoch": 1.8999538958045181, "grad_norm": 0.18791643353272014, "kl": 0.676361083984375, "learning_rate": 4.961944124637989e-07, "loss": 0.0007, "reward": 1.723214365541935, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7276786081492901, "rewards/format_reward_func": 0.9955357164144516, "step": 11332 }, { "completion_length": 257.7366199493408, "epoch": 1.9002891990443858, "grad_norm": 0.20574448512918195, "kl": 0.811431884765625, "learning_rate": 4.96192327092213e-07, "loss": 0.0008, "reward": 1.730357214808464, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7348214723169804, "rewards/format_reward_func": 0.9955357164144516, "step": 11334 }, { "completion_length": 266.5669765472412, "epoch": 1.9006245022842534, "grad_norm": 0.12088067703855519, "kl": 0.2629852294921875, "learning_rate": 4.961902411538013e-07, "loss": 0.0003, "reward": 1.739285759627819, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7482143305242062, "rewards/format_reward_func": 0.9910714328289032, "step": 11336 }, { "completion_length": 275.45537090301514, "epoch": 1.900959805524121, "grad_norm": 0.19524772069955354, "kl": 0.3109130859375, "learning_rate": 4.961881546485682e-07, "loss": 0.0003, "reward": 1.7625000849366188, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7669643126428127, "rewards/format_reward_func": 0.9955357164144516, "step": 11338 }, { "completion_length": 271.44644355773926, "epoch": 1.9012951087639884, "grad_norm": 0.1591140863477574, "kl": 0.1329803466796875, "learning_rate": 4.961860675765188e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.769642885774374, "rewards/format_reward_func": 0.9910714328289032, "step": 11340 }, { "completion_length": 278.3035821914673, "epoch": 1.9016304120038559, "grad_norm": 0.2581112163246211, "kl": 0.2618408203125, "learning_rate": 4.961839799376576e-07, "loss": 0.0003, "reward": 1.8392857685685158, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8392857275903225, "rewards/format_reward_func": 1.0, "step": 11342 }, { "completion_length": 263.99554920196533, "epoch": 1.9019657152437235, "grad_norm": 0.18510317852809857, "kl": 0.1702880859375, "learning_rate": 4.961818917319897e-07, "loss": 0.0002, "reward": 1.7267857939004898, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7312500365078449, "rewards/format_reward_func": 0.9955357164144516, "step": 11344 }, { "completion_length": 265.2634048461914, "epoch": 1.9023010184835911, "grad_norm": 0.1454532985720656, "kl": 0.371673583984375, "learning_rate": 4.961798029595199e-07, "loss": 0.0004, "reward": 1.76071435213089, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143130153418, "rewards/format_reward_func": 1.0, "step": 11346 }, { "completion_length": 271.0044765472412, "epoch": 1.9026363217234588, "grad_norm": 0.11359424624294441, "kl": 0.22344970703125, "learning_rate": 4.961777136202528e-07, "loss": 0.0002, "reward": 1.7410714998841286, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7455357424914837, "rewards/format_reward_func": 0.9955357164144516, "step": 11348 }, { "completion_length": 269.70536708831787, "epoch": 1.9029716249633262, "grad_norm": 0.14220671747259186, "kl": 0.181732177734375, "learning_rate": 4.961756237141934e-07, "loss": 0.0002, "reward": 1.728571504354477, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7285714633762836, "rewards/format_reward_func": 1.0, "step": 11350 }, { "completion_length": 269.3705530166626, "epoch": 1.9033069282031938, "grad_norm": 0.1606879487831985, "kl": 0.114715576171875, "learning_rate": 4.961735332413465e-07, "loss": 0.0001, "reward": 1.751785770058632, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7562500443309546, "rewards/format_reward_func": 0.9955357164144516, "step": 11352 }, { "completion_length": 269.37054538726807, "epoch": 1.9036422314430612, "grad_norm": 0.3928058113985592, "kl": 0.1796112060546875, "learning_rate": 4.961714422017167e-07, "loss": 0.0002, "reward": 1.7678571790456772, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 1.0, "step": 11354 }, { "completion_length": 267.2500123977661, "epoch": 1.9039775346829289, "grad_norm": 0.10755558405850703, "kl": 0.14447021484375, "learning_rate": 4.96169350595309e-07, "loss": 0.0001, "reward": 1.8464286103844643, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8464285954833031, "rewards/format_reward_func": 1.0, "step": 11356 }, { "completion_length": 264.2812623977661, "epoch": 1.9043128379227965, "grad_norm": 0.28424929901275836, "kl": 0.12335205078125, "learning_rate": 4.961672584221282e-07, "loss": 0.0001, "reward": 1.8000000640749931, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.8089285865426064, "rewards/format_reward_func": 0.9910714328289032, "step": 11358 }, { "completion_length": 264.68751525878906, "epoch": 1.9046481411626641, "grad_norm": 0.14374849264193768, "kl": 0.1146240234375, "learning_rate": 4.961651656821791e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 1.0, "step": 11360 }, { "completion_length": 273.4866180419922, "epoch": 1.9049834444025315, "grad_norm": 0.227732210229808, "kl": 0.21929931640625, "learning_rate": 4.961630723754666e-07, "loss": 0.0002, "reward": 1.7267857640981674, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7312500309199095, "rewards/format_reward_func": 0.9955357164144516, "step": 11362 }, { "completion_length": 270.5803689956665, "epoch": 1.905318747642399, "grad_norm": 0.17363189254935005, "kl": 0.14501953125, "learning_rate": 4.961609785019954e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 11364 }, { "completion_length": 271.4330472946167, "epoch": 1.9056540508822666, "grad_norm": 0.23595968121958819, "kl": 0.207855224609375, "learning_rate": 4.961588840617703e-07, "loss": 0.0002, "reward": 1.7142857983708382, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7232143171131611, "rewards/format_reward_func": 0.9910714328289032, "step": 11366 }, { "completion_length": 264.21876430511475, "epoch": 1.9059893541221342, "grad_norm": 0.14596728600368417, "kl": 0.11456298828125, "learning_rate": 4.961567890547962e-07, "loss": 0.0001, "reward": 1.8017857521772385, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.8062500208616257, "rewards/format_reward_func": 0.9955357164144516, "step": 11368 }, { "completion_length": 270.11162090301514, "epoch": 1.9063246573620019, "grad_norm": 0.23307965611049494, "kl": 0.19744873046875, "learning_rate": 4.96154693481078e-07, "loss": 0.0002, "reward": 1.7053572162985802, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.709821468219161, "rewards/format_reward_func": 0.9955357164144516, "step": 11370 }, { "completion_length": 265.9509029388428, "epoch": 1.9066599606018693, "grad_norm": 0.32637737542984435, "kl": 0.22430419921875, "learning_rate": 4.961525973406203e-07, "loss": 0.0002, "reward": 1.8000000640749931, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.8089285865426064, "rewards/format_reward_func": 0.9910714328289032, "step": 11372 }, { "completion_length": 261.34376335144043, "epoch": 1.906995263841737, "grad_norm": 0.2971004762919588, "kl": 0.1528472900390625, "learning_rate": 4.961505006334281e-07, "loss": 0.0002, "reward": 1.8000000566244125, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.8089285790920258, "rewards/format_reward_func": 0.9910714328289032, "step": 11374 }, { "completion_length": 266.3169775009155, "epoch": 1.9073305670816043, "grad_norm": 0.0754146216793177, "kl": 0.145721435546875, "learning_rate": 4.961484033595061e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7660714499652386, "rewards/format_reward_func": 0.9910714328289032, "step": 11376 }, { "completion_length": 264.87055110931396, "epoch": 1.907665870321472, "grad_norm": 0.14408567338438105, "kl": 0.123016357421875, "learning_rate": 4.961463055188593e-07, "loss": 0.0001, "reward": 1.7500000521540642, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7500000428408384, "rewards/format_reward_func": 1.0, "step": 11378 }, { "completion_length": 260.95983505249023, "epoch": 1.9080011735613396, "grad_norm": 0.21066553363979487, "kl": 0.201416015625, "learning_rate": 4.961442071114925e-07, "loss": 0.0002, "reward": 1.796428605914116, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964286040514708, "rewards/format_reward_func": 1.0, "step": 11380 }, { "completion_length": 262.84822845458984, "epoch": 1.9083364768012072, "grad_norm": 0.25315756086610974, "kl": 0.1490478515625, "learning_rate": 4.961421081374104e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7946428880095482, "rewards/format_reward_func": 0.9910714328289032, "step": 11382 }, { "completion_length": 263.6250114440918, "epoch": 1.9086717800410746, "grad_norm": 0.2453006777873615, "kl": 0.224761962890625, "learning_rate": 4.961400085966179e-07, "loss": 0.0002, "reward": 1.7285714894533157, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.7375000324100256, "rewards/format_reward_func": 0.9910714328289032, "step": 11384 }, { "completion_length": 259.40179538726807, "epoch": 1.909007083280942, "grad_norm": 0.39350043991505274, "kl": 0.12457275390625, "learning_rate": 4.961379084891199e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7991071790456772, "rewards/format_reward_func": 0.9955357164144516, "step": 11386 }, { "completion_length": 270.33930015563965, "epoch": 1.9093423865208097, "grad_norm": 0.2779319904231321, "kl": 0.175750732421875, "learning_rate": 4.961358078149211e-07, "loss": 0.0002, "reward": 1.735714353621006, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7446428835391998, "rewards/format_reward_func": 0.9910714328289032, "step": 11388 }, { "completion_length": 257.08036708831787, "epoch": 1.9096776897606773, "grad_norm": 0.8968730072056478, "kl": 0.12542724609375, "learning_rate": 4.961337065740263e-07, "loss": 0.0001, "reward": 1.8107143267989159, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.8196428827941418, "rewards/format_reward_func": 0.9910714328289032, "step": 11390 }, { "completion_length": 258.45090770721436, "epoch": 1.910012993000545, "grad_norm": 0.273157945393228, "kl": 0.130096435546875, "learning_rate": 4.961316047664406e-07, "loss": 0.0001, "reward": 1.7589286342263222, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7723214477300644, "rewards/format_reward_func": 0.9866071492433548, "step": 11392 }, { "completion_length": 263.4241180419922, "epoch": 1.9103482962404126, "grad_norm": 0.13818252928656122, "kl": 0.154266357421875, "learning_rate": 4.961295023921687e-07, "loss": 0.0002, "reward": 1.7714286297559738, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 11394 }, { "completion_length": 265.8214406967163, "epoch": 1.91068359948028, "grad_norm": 0.27011759053789397, "kl": 0.226043701171875, "learning_rate": 4.961273994512154e-07, "loss": 0.0002, "reward": 1.7500000670552254, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7500000447034836, "rewards/format_reward_func": 1.0, "step": 11396 }, { "completion_length": 250.55358409881592, "epoch": 1.9110189027201474, "grad_norm": 0.6885092609424, "kl": 0.2010955810546875, "learning_rate": 4.961252959435856e-07, "loss": 0.0002, "reward": 1.7464286610484123, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7553571779280901, "rewards/format_reward_func": 0.9910714328289032, "step": 11398 }, { "completion_length": 268.08929443359375, "epoch": 1.911354205960015, "grad_norm": 0.2028304503555617, "kl": 0.15380859375, "learning_rate": 4.961231918692839e-07, "loss": 0.0002, "reward": 1.7964286282658577, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7964286096394062, "rewards/format_reward_func": 1.0, "step": 11400 }, { "completion_length": 268.0803680419922, "epoch": 1.9116895091998827, "grad_norm": 0.8293108782393289, "kl": 0.17572021484375, "learning_rate": 4.961210872283157e-07, "loss": 0.0002, "reward": 1.7678572237491608, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7767857313156128, "rewards/format_reward_func": 0.9910714328289032, "step": 11402 }, { "completion_length": 264.0044765472412, "epoch": 1.9120248124397503, "grad_norm": 0.3747041193333945, "kl": 0.146636962890625, "learning_rate": 4.961189820206852e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7803571745753288, "rewards/format_reward_func": 0.9910714328289032, "step": 11404 }, { "completion_length": 278.9464387893677, "epoch": 1.9123601156796177, "grad_norm": 0.13538982335741828, "kl": 0.15130615234375, "learning_rate": 4.961168762463978e-07, "loss": 0.0002, "reward": 1.774107202887535, "reward_std": 0.056821079924702644, "rewards/equation_reward_func": 0.7803571783006191, "rewards/format_reward_func": 0.9937500059604645, "step": 11406 }, { "completion_length": 266.1651906967163, "epoch": 1.9126954189194854, "grad_norm": 0.8431514704194816, "kl": 0.159820556640625, "learning_rate": 4.961147699054579e-07, "loss": 0.0002, "reward": 1.757142923772335, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7660714611411095, "rewards/format_reward_func": 0.9910714328289032, "step": 11408 }, { "completion_length": 277.15180110931396, "epoch": 1.9130307221593528, "grad_norm": 1.5579412370998411, "kl": 0.156585693359375, "learning_rate": 4.961126629978707e-07, "loss": 0.0002, "reward": 1.7482143342494965, "reward_std": 0.0732360603287816, "rewards/equation_reward_func": 0.7705357447266579, "rewards/format_reward_func": 0.9776785783469677, "step": 11410 }, { "completion_length": 285.42858505249023, "epoch": 1.9133660253992204, "grad_norm": 0.41450893660091553, "kl": 0.1715240478515625, "learning_rate": 4.961105555236408e-07, "loss": 0.0002, "reward": 1.7928572073578835, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.8017857484519482, "rewards/format_reward_func": 0.9910714328289032, "step": 11412 }, { "completion_length": 274.0803689956665, "epoch": 1.913701328639088, "grad_norm": 0.24263481536803763, "kl": 0.14422607421875, "learning_rate": 4.961084474827731e-07, "loss": 0.0001, "reward": 1.7446429133415222, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7580357491970062, "rewards/format_reward_func": 0.9866071492433548, "step": 11414 }, { "completion_length": 280.65179443359375, "epoch": 1.9140366318789557, "grad_norm": 1.6763596841012791, "kl": 2.72528076171875, "learning_rate": 4.961063388752726e-07, "loss": 0.0027, "reward": 1.6982143819332123, "reward_std": 0.11364216078072786, "rewards/equation_reward_func": 0.7383928745985031, "rewards/format_reward_func": 0.9598214477300644, "step": 11416 }, { "completion_length": 286.12054538726807, "epoch": 1.914371935118823, "grad_norm": 0.6866111926368569, "kl": 0.192596435546875, "learning_rate": 4.961042297011441e-07, "loss": 0.0002, "reward": 1.6571429297327995, "reward_std": 0.09091372787952423, "rewards/equation_reward_func": 0.6928571723401546, "rewards/format_reward_func": 0.9642857275903225, "step": 11418 }, { "completion_length": 268.3839387893677, "epoch": 1.9147072383586905, "grad_norm": 0.2820457558191292, "kl": 0.11151123046875, "learning_rate": 4.961021199603923e-07, "loss": 0.0001, "reward": 1.769642911851406, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7830357402563095, "rewards/format_reward_func": 0.9866071492433548, "step": 11420 }, { "completion_length": 270.7812623977661, "epoch": 1.9150425415985581, "grad_norm": 0.13665542251726445, "kl": 0.1313629150390625, "learning_rate": 4.961000096530222e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.04545686487108469, "rewards/equation_reward_func": 0.7839285954833031, "rewards/format_reward_func": 0.9910714328289032, "step": 11422 }, { "completion_length": 268.5669765472412, "epoch": 1.9153778448384258, "grad_norm": 0.39813810964567975, "kl": 0.13531494140625, "learning_rate": 4.960978987790386e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571428809314966, "rewards/format_reward_func": 1.0, "step": 11424 }, { "completion_length": 251.665189743042, "epoch": 1.9157131480782934, "grad_norm": 0.10712173856582168, "kl": 0.0943603515625, "learning_rate": 4.960957873384465e-07, "loss": 0.0001, "reward": 1.8392857536673546, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8392857164144516, "rewards/format_reward_func": 1.0, "step": 11426 }, { "completion_length": 255.4062623977661, "epoch": 1.9160484513181608, "grad_norm": 0.14126711549786453, "kl": 0.1662445068359375, "learning_rate": 4.960936753312506e-07, "loss": 0.0002, "reward": 1.807142898440361, "reward_std": 0.04040610231459141, "rewards/equation_reward_func": 0.8160714581608772, "rewards/format_reward_func": 0.9910714328289032, "step": 11428 }, { "completion_length": 248.08037185668945, "epoch": 1.9163837545580285, "grad_norm": 0.3138137135372601, "kl": 0.113006591796875, "learning_rate": 4.960915627574558e-07, "loss": 0.0001, "reward": 1.7303572297096252, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.7348214536905289, "rewards/format_reward_func": 0.9955357164144516, "step": 11430 }, { "completion_length": 247.3169765472412, "epoch": 1.9167190577978959, "grad_norm": 0.04930311204874395, "kl": 0.1185302734375, "learning_rate": 4.96089449617067e-07, "loss": 0.0001, "reward": 1.785714328289032, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143245637417, "rewards/format_reward_func": 1.0, "step": 11432 }, { "completion_length": 253.94197368621826, "epoch": 1.9170543610377635, "grad_norm": 0.34253288139320753, "kl": 0.177154541015625, "learning_rate": 4.96087335910089e-07, "loss": 0.0002, "reward": 1.760714329779148, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 11434 }, { "completion_length": 248.2812614440918, "epoch": 1.9173896642776311, "grad_norm": 0.15253380622826307, "kl": 0.120941162109375, "learning_rate": 4.960852216365268e-07, "loss": 0.0001, "reward": 1.7642857655882835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 11436 }, { "completion_length": 259.6741180419922, "epoch": 1.9177249675174988, "grad_norm": 0.16595620257856564, "kl": 0.131561279296875, "learning_rate": 4.960831067963851e-07, "loss": 0.0001, "reward": 1.7392857670783997, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.748214315623045, "rewards/format_reward_func": 0.9910714328289032, "step": 11438 }, { "completion_length": 242.8125114440918, "epoch": 1.9180602707573662, "grad_norm": 0.13210774905736541, "kl": 0.104461669921875, "learning_rate": 4.960809913896689e-07, "loss": 0.0001, "reward": 1.7571429014205933, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 11440 }, { "completion_length": 250.49554920196533, "epoch": 1.9183955739972336, "grad_norm": 0.004442574314263273, "kl": 0.148345947265625, "learning_rate": 4.960788754163829e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7821428962051868, "rewards/format_reward_func": 1.0, "step": 11442 }, { "completion_length": 235.95090103149414, "epoch": 1.9187308772371012, "grad_norm": 0.13640183603938238, "kl": 0.125457763671875, "learning_rate": 4.960767588765322e-07, "loss": 0.0001, "reward": 1.7464286610484123, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 11444 }, { "completion_length": 242.54911613464355, "epoch": 1.9190661804769689, "grad_norm": 0.3164099307381995, "kl": 0.1171875, "learning_rate": 4.960746417701215e-07, "loss": 0.0001, "reward": 1.8607143312692642, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8607143051922321, "rewards/format_reward_func": 1.0, "step": 11446 }, { "completion_length": 258.3526906967163, "epoch": 1.9194014837168365, "grad_norm": 0.24606226713899274, "kl": 0.09759521484375, "learning_rate": 4.960725240971558e-07, "loss": 0.0001, "reward": 1.7446429133415222, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7491071633994579, "rewards/format_reward_func": 0.9955357164144516, "step": 11448 }, { "completion_length": 249.64733219146729, "epoch": 1.9197367869567041, "grad_norm": 0.21903366414071526, "kl": 0.117919921875, "learning_rate": 4.960704058576399e-07, "loss": 0.0001, "reward": 1.789285771548748, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857566475868, "rewards/format_reward_func": 1.0, "step": 11450 }, { "completion_length": 250.1428680419922, "epoch": 1.9200720901965715, "grad_norm": 0.11739844578988058, "kl": 0.15765380859375, "learning_rate": 4.960682870515786e-07, "loss": 0.0002, "reward": 1.700000062584877, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7000000365078449, "rewards/format_reward_func": 1.0, "step": 11452 }, { "completion_length": 252.415189743042, "epoch": 1.920407393436439, "grad_norm": 0.23504962226583487, "kl": 0.130767822265625, "learning_rate": 4.96066167678977e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.8035714589059353, "rewards/format_reward_func": 1.0, "step": 11454 }, { "completion_length": 250.7500114440918, "epoch": 1.9207426966763066, "grad_norm": 0.17987810570866028, "kl": 0.1189422607421875, "learning_rate": 4.960640477398398e-07, "loss": 0.0001, "reward": 1.735714390873909, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 11456 }, { "completion_length": 251.9062614440918, "epoch": 1.9210779999161742, "grad_norm": 0.5340453361696048, "kl": 0.1201019287109375, "learning_rate": 4.96061927234172e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.757142897695303, "rewards/format_reward_func": 1.0, "step": 11458 }, { "completion_length": 257.8660840988159, "epoch": 1.9214133031560419, "grad_norm": 0.06368641751747849, "kl": 0.11151123046875, "learning_rate": 4.960598061619782e-07, "loss": 0.0001, "reward": 1.826785758137703, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.8401785865426064, "rewards/format_reward_func": 0.9866071492433548, "step": 11460 }, { "completion_length": 260.2812614440918, "epoch": 1.9217486063959093, "grad_norm": 0.20438380505560297, "kl": 0.2633209228515625, "learning_rate": 4.960576845232637e-07, "loss": 0.0003, "reward": 1.7714286297559738, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 11462 }, { "completion_length": 272.7500123977661, "epoch": 1.922083909635777, "grad_norm": 0.4689894616913192, "kl": 0.259124755859375, "learning_rate": 4.960555623180331e-07, "loss": 0.0003, "reward": 1.712500087916851, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7169643212109804, "rewards/format_reward_func": 0.9955357164144516, "step": 11464 }, { "completion_length": 262.7098321914673, "epoch": 1.9224192128756443, "grad_norm": 0.198931744948235, "kl": 0.163665771484375, "learning_rate": 4.960534395462913e-07, "loss": 0.0002, "reward": 1.7660714909434319, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9955357164144516, "step": 11466 }, { "completion_length": 262.0714406967163, "epoch": 1.922754516115512, "grad_norm": 0.16364250738570876, "kl": 0.1119537353515625, "learning_rate": 4.960513162080434e-07, "loss": 0.0001, "reward": 1.7392857819795609, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857540398836, "rewards/format_reward_func": 1.0, "step": 11468 }, { "completion_length": 254.85715007781982, "epoch": 1.9230898193553796, "grad_norm": 0.17039801150252207, "kl": 0.227020263671875, "learning_rate": 4.96049192303294e-07, "loss": 0.0002, "reward": 1.7178572341799736, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7178571671247482, "rewards/format_reward_func": 1.0, "step": 11470 }, { "completion_length": 248.5312614440918, "epoch": 1.9234251225952472, "grad_norm": 0.2090763004651719, "kl": 0.159637451171875, "learning_rate": 4.960470678320482e-07, "loss": 0.0002, "reward": 1.7642857655882835, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7732143104076385, "rewards/format_reward_func": 0.9910714328289032, "step": 11472 }, { "completion_length": 260.42412090301514, "epoch": 1.9237604258351146, "grad_norm": 0.23593082654275854, "kl": 0.112701416015625, "learning_rate": 4.960449427943108e-07, "loss": 0.0001, "reward": 1.778571479022503, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714752972126, "rewards/format_reward_func": 1.0, "step": 11474 }, { "completion_length": 275.6696548461914, "epoch": 1.924095729074982, "grad_norm": 0.20935209515541264, "kl": 0.1348876953125, "learning_rate": 4.960428171900868e-07, "loss": 0.0001, "reward": 1.7446429133415222, "reward_std": 0.06818529684096575, "rewards/equation_reward_func": 0.7580357491970062, "rewards/format_reward_func": 0.9866071492433548, "step": 11476 }, { "completion_length": 262.495548248291, "epoch": 1.9244310323148497, "grad_norm": 0.21949802358359427, "kl": 0.119415283203125, "learning_rate": 4.960406910193809e-07, "loss": 0.0001, "reward": 1.7875000461935997, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7919643148779869, "rewards/format_reward_func": 0.9955357164144516, "step": 11478 }, { "completion_length": 269.5134038925171, "epoch": 1.9247663355547173, "grad_norm": 0.15969970131972752, "kl": 0.154083251953125, "learning_rate": 4.960385642821982e-07, "loss": 0.0002, "reward": 1.8035714775323868, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.803571455180645, "rewards/format_reward_func": 1.0, "step": 11480 }, { "completion_length": 271.7321557998657, "epoch": 1.925101638794585, "grad_norm": 0.22003888136350389, "kl": 0.22259521484375, "learning_rate": 4.960364369785433e-07, "loss": 0.0002, "reward": 1.760714329779148, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.769642885774374, "rewards/format_reward_func": 0.9910714328289032, "step": 11482 }, { "completion_length": 273.16965770721436, "epoch": 1.9254369420344524, "grad_norm": 0.4368958131528556, "kl": 0.192474365234375, "learning_rate": 4.960343091084215e-07, "loss": 0.0002, "reward": 1.7357143610715866, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7446428947150707, "rewards/format_reward_func": 0.9910714328289032, "step": 11484 }, { "completion_length": 275.43750953674316, "epoch": 1.92577224527432, "grad_norm": 0.1728484515807659, "kl": 0.2697906494140625, "learning_rate": 4.960321806718375e-07, "loss": 0.0003, "reward": 1.8178571835160255, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8178571723401546, "rewards/format_reward_func": 1.0, "step": 11486 }, { "completion_length": 276.66965675354004, "epoch": 1.9261075485141874, "grad_norm": 0.5261602344874771, "kl": 0.2050933837890625, "learning_rate": 4.960300516687961e-07, "loss": 0.0002, "reward": 1.7750000581145287, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7928571626543999, "rewards/format_reward_func": 0.9821428656578064, "step": 11488 }, { "completion_length": 269.5848340988159, "epoch": 1.926442851754055, "grad_norm": 0.19601843506053898, "kl": 0.2016143798828125, "learning_rate": 4.960279220993023e-07, "loss": 0.0002, "reward": 1.7053572311997414, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7098214700818062, "rewards/format_reward_func": 0.9955357164144516, "step": 11490 }, { "completion_length": 270.68751335144043, "epoch": 1.9267781549939227, "grad_norm": 0.17990204744519162, "kl": 0.17333984375, "learning_rate": 4.960257919633611e-07, "loss": 0.0002, "reward": 1.7857143133878708, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7946428991854191, "rewards/format_reward_func": 0.9910714328289032, "step": 11492 }, { "completion_length": 262.0982265472412, "epoch": 1.9271134582337903, "grad_norm": 0.27930705227496755, "kl": 0.2054595947265625, "learning_rate": 4.960236612609773e-07, "loss": 0.0002, "reward": 1.733928643167019, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7383928876370192, "rewards/format_reward_func": 0.9955357164144516, "step": 11494 }, { "completion_length": 263.8571557998657, "epoch": 1.9274487614736577, "grad_norm": 0.23885652033263852, "kl": 0.25677490234375, "learning_rate": 4.960215299921557e-07, "loss": 0.0003, "reward": 1.7821429148316383, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 11496 }, { "completion_length": 261.25447273254395, "epoch": 1.9277840647135251, "grad_norm": 0.11426829648277192, "kl": 0.13201904296875, "learning_rate": 4.960193981569014e-07, "loss": 0.0001, "reward": 1.773214340209961, "reward_std": 0.017677669413387775, "rewards/equation_reward_func": 0.7776786088943481, "rewards/format_reward_func": 0.9955357164144516, "step": 11498 }, { "completion_length": 257.1428689956665, "epoch": 1.9281193679533928, "grad_norm": 0.14128657839377187, "kl": 0.23486328125, "learning_rate": 4.960172657552192e-07, "loss": 0.0002, "reward": 1.7535715028643608, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 11500 }, { "completion_length": 269.8214387893677, "epoch": 1.9284546711932604, "grad_norm": 0.04353958717156185, "kl": 0.460693359375, "learning_rate": 4.960151327871141e-07, "loss": 0.0005, "reward": 1.7785714864730835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 11502 }, { "completion_length": 252.66072750091553, "epoch": 1.928789974433128, "grad_norm": 0.15968991693381757, "kl": 0.16265869140625, "learning_rate": 4.960129992525909e-07, "loss": 0.0002, "reward": 1.76607146859169, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7705357577651739, "rewards/format_reward_func": 0.9955357164144516, "step": 11504 }, { "completion_length": 263.5044775009155, "epoch": 1.9291252776729955, "grad_norm": 0.2532534123405873, "kl": 0.210540771484375, "learning_rate": 4.960108651516545e-07, "loss": 0.0002, "reward": 1.7571429535746574, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428827941418, "rewards/format_reward_func": 1.0, "step": 11506 }, { "completion_length": 267.4821548461914, "epoch": 1.929460580912863, "grad_norm": 0.14032272575220447, "kl": 0.331695556640625, "learning_rate": 4.960087304843099e-07, "loss": 0.0003, "reward": 1.8178571835160255, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8178571611642838, "rewards/format_reward_func": 1.0, "step": 11508 }, { "completion_length": 254.3035831451416, "epoch": 1.9297958841527305, "grad_norm": 0.14579727584127364, "kl": 0.116943359375, "learning_rate": 4.96006595250562e-07, "loss": 0.0001, "reward": 1.8250000402331352, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.825000025331974, "rewards/format_reward_func": 1.0, "step": 11510 }, { "completion_length": 260.56251335144043, "epoch": 1.9301311873925981, "grad_norm": 0.2295536831446175, "kl": 0.098663330078125, "learning_rate": 4.960044594504158e-07, "loss": 0.0001, "reward": 1.79464291036129, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7991071678698063, "rewards/format_reward_func": 0.9955357164144516, "step": 11512 }, { "completion_length": 272.0000104904175, "epoch": 1.9304664906324658, "grad_norm": 0.1013315839791849, "kl": 0.131591796875, "learning_rate": 4.96002323083876e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 11514 }, { "completion_length": 267.3169775009155, "epoch": 1.9308017938723334, "grad_norm": 0.2789793491517081, "kl": 0.112640380859375, "learning_rate": 4.960001861509477e-07, "loss": 0.0001, "reward": 1.744642935693264, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7491071708500385, "rewards/format_reward_func": 0.9955357164144516, "step": 11516 }, { "completion_length": 262.5625114440918, "epoch": 1.9311370971122008, "grad_norm": 0.23336605649958678, "kl": 0.1778564453125, "learning_rate": 4.959980486516358e-07, "loss": 0.0002, "reward": 1.725000061094761, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7250000387430191, "rewards/format_reward_func": 1.0, "step": 11518 }, { "completion_length": 275.1562662124634, "epoch": 1.9314724003520682, "grad_norm": 0.19566252414623606, "kl": 0.120880126953125, "learning_rate": 4.959959105859451e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714656114578, "rewards/format_reward_func": 1.0, "step": 11520 }, { "completion_length": 273.3035821914673, "epoch": 1.9318077035919359, "grad_norm": 0.26468819573733504, "kl": 0.131500244140625, "learning_rate": 4.959937719538806e-07, "loss": 0.0001, "reward": 1.728571504354477, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7375000342726707, "rewards/format_reward_func": 0.9910714328289032, "step": 11522 }, { "completion_length": 278.30804920196533, "epoch": 1.9321430068318035, "grad_norm": 0.16803667829067487, "kl": 0.1395416259765625, "learning_rate": 4.959916327554473e-07, "loss": 0.0001, "reward": 1.7107143476605415, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7107143253087997, "rewards/format_reward_func": 1.0, "step": 11524 }, { "completion_length": 282.00000953674316, "epoch": 1.9324783100716711, "grad_norm": 0.436746719978823, "kl": 0.212310791015625, "learning_rate": 4.959894929906499e-07, "loss": 0.0002, "reward": 1.7464286535978317, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 11526 }, { "completion_length": 274.15180110931396, "epoch": 1.9328136133115388, "grad_norm": 0.15089911493730374, "kl": 0.23822021484375, "learning_rate": 4.959873526594937e-07, "loss": 0.0002, "reward": 1.7196429148316383, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7241071872413158, "rewards/format_reward_func": 0.9955357164144516, "step": 11528 }, { "completion_length": 268.16519355773926, "epoch": 1.9331489165514062, "grad_norm": 0.24695749023732816, "kl": 0.290924072265625, "learning_rate": 4.959852117619834e-07, "loss": 0.0003, "reward": 1.7410714849829674, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7455357518047094, "rewards/format_reward_func": 0.9955357164144516, "step": 11530 }, { "completion_length": 267.61608505249023, "epoch": 1.9334842197912736, "grad_norm": 0.15763417332067145, "kl": 0.22369384765625, "learning_rate": 4.959830702981237e-07, "loss": 0.0002, "reward": 1.7392857894301414, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7392857298254967, "rewards/format_reward_func": 1.0, "step": 11532 }, { "completion_length": 281.9687623977661, "epoch": 1.9338195230311412, "grad_norm": 0.22574997557198315, "kl": 0.16107177734375, "learning_rate": 4.9598092826792e-07, "loss": 0.0002, "reward": 1.737500086426735, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7419643141329288, "rewards/format_reward_func": 0.9955357164144516, "step": 11534 }, { "completion_length": 272.75447845458984, "epoch": 1.9341548262710089, "grad_norm": 0.20676245907015658, "kl": 0.257080078125, "learning_rate": 4.95978785671377e-07, "loss": 0.0003, "reward": 1.7642858028411865, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 11536 }, { "completion_length": 273.80358695983887, "epoch": 1.9344901295108765, "grad_norm": 0.11482451783793798, "kl": 0.14068603515625, "learning_rate": 4.959766425084995e-07, "loss": 0.0001, "reward": 1.776785746216774, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7901786006987095, "rewards/format_reward_func": 0.9866071492433548, "step": 11538 }, { "completion_length": 283.5178699493408, "epoch": 1.934825432750744, "grad_norm": 0.42962985392694314, "kl": 0.11175537109375, "learning_rate": 4.959744987792926e-07, "loss": 0.0001, "reward": 1.816071480512619, "reward_std": 0.07828682009130716, "rewards/equation_reward_func": 0.8205357491970062, "rewards/format_reward_func": 0.9955357164144516, "step": 11540 }, { "completion_length": 262.7500114440918, "epoch": 1.9351607359906116, "grad_norm": 0.26195593090194097, "kl": 0.10430908203125, "learning_rate": 4.959723544837611e-07, "loss": 0.0001, "reward": 1.8785714581608772, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8785714469850063, "rewards/format_reward_func": 1.0, "step": 11542 }, { "completion_length": 265.85269260406494, "epoch": 1.935496039230479, "grad_norm": 0.22136712208141024, "kl": 0.103271484375, "learning_rate": 4.959702096219103e-07, "loss": 0.0001, "reward": 1.8178571835160255, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.8267857432365417, "rewards/format_reward_func": 0.9910714328289032, "step": 11544 }, { "completion_length": 276.78572845458984, "epoch": 1.9358313424703466, "grad_norm": 0.3523477652786812, "kl": 0.125274658203125, "learning_rate": 4.959680641937447e-07, "loss": 0.0001, "reward": 1.7428572326898575, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 11546 }, { "completion_length": 276.51340675354004, "epoch": 1.9361666457102142, "grad_norm": 0.12306280651202639, "kl": 0.116790771484375, "learning_rate": 4.959659181992695e-07, "loss": 0.0001, "reward": 1.7892857566475868, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7982143126428127, "rewards/format_reward_func": 0.9910714328289032, "step": 11548 }, { "completion_length": 268.9509048461914, "epoch": 1.9365019489500819, "grad_norm": 0.1913487796137698, "kl": 0.123565673828125, "learning_rate": 4.959637716384895e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 11550 }, { "completion_length": 271.3392972946167, "epoch": 1.9368372521899493, "grad_norm": 0.14629526246249022, "kl": 0.1084747314453125, "learning_rate": 4.959616245114097e-07, "loss": 0.0001, "reward": 1.7821429297327995, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 11552 }, { "completion_length": 279.4196548461914, "epoch": 1.9371725554298167, "grad_norm": 0.09694533133624259, "kl": 0.129730224609375, "learning_rate": 4.95959476818035e-07, "loss": 0.0001, "reward": 1.7660714760422707, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7705357521772385, "rewards/format_reward_func": 0.9955357164144516, "step": 11554 }, { "completion_length": 271.45983123779297, "epoch": 1.9375078586696843, "grad_norm": 0.2217887064929058, "kl": 0.11297607421875, "learning_rate": 4.959573285583706e-07, "loss": 0.0001, "reward": 1.8071429133415222, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8071428686380386, "rewards/format_reward_func": 1.0, "step": 11556 }, { "completion_length": 272.4732275009155, "epoch": 1.937843161909552, "grad_norm": 0.23558904806238534, "kl": 0.132354736328125, "learning_rate": 4.959551797324211e-07, "loss": 0.0001, "reward": 1.796428620815277, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7964286059141159, "rewards/format_reward_func": 1.0, "step": 11558 }, { "completion_length": 266.68305110931396, "epoch": 1.9381784651494196, "grad_norm": 0.22151610735532148, "kl": 0.1597137451171875, "learning_rate": 4.959530303401915e-07, "loss": 0.0002, "reward": 1.8169643357396126, "reward_std": 0.05682107899338007, "rewards/equation_reward_func": 0.8232143074274063, "rewards/format_reward_func": 0.9937500059604645, "step": 11560 }, { "completion_length": 267.44644355773926, "epoch": 1.938513768389287, "grad_norm": 0.1748234891363921, "kl": 0.1086578369140625, "learning_rate": 4.959508803816868e-07, "loss": 0.0001, "reward": 1.7589286342263222, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7633928842842579, "rewards/format_reward_func": 0.9955357164144516, "step": 11562 }, { "completion_length": 284.33930015563965, "epoch": 1.9388490716291547, "grad_norm": 0.189758110151058, "kl": 0.1158447265625, "learning_rate": 4.959487298569121e-07, "loss": 0.0001, "reward": 1.7875000536441803, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.8008928745985031, "rewards/format_reward_func": 0.9866071492433548, "step": 11564 }, { "completion_length": 273.1964416503906, "epoch": 1.939184374869022, "grad_norm": 0.22127814925587067, "kl": 0.1219329833984375, "learning_rate": 4.959465787658723e-07, "loss": 0.0001, "reward": 1.7464286237955093, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7553571723401546, "rewards/format_reward_func": 0.9910714328289032, "step": 11566 }, { "completion_length": 270.5446538925171, "epoch": 1.9395196781088897, "grad_norm": 0.2793978109461439, "kl": 0.2030029296875, "learning_rate": 4.959444271085723e-07, "loss": 0.0002, "reward": 1.7214286029338837, "reward_std": 0.0656599160283804, "rewards/equation_reward_func": 0.7482143081724644, "rewards/format_reward_func": 0.9732142984867096, "step": 11568 }, { "completion_length": 273.8259086608887, "epoch": 1.9398549813487573, "grad_norm": 0.550426955461273, "kl": 0.2730712890625, "learning_rate": 4.959422748850168e-07, "loss": 0.0003, "reward": 1.7321429327130318, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7410714421421289, "rewards/format_reward_func": 0.9910714328289032, "step": 11570 }, { "completion_length": 277.6205472946167, "epoch": 1.940190284588625, "grad_norm": 0.19508560241066944, "kl": 0.1059417724609375, "learning_rate": 4.959401220952112e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7910714522004128, "rewards/format_reward_func": 0.9910714328289032, "step": 11572 }, { "completion_length": 277.46429443359375, "epoch": 1.9405255878284924, "grad_norm": 0.44544147580496724, "kl": 0.1553955078125, "learning_rate": 4.959379687391602e-07, "loss": 0.0002, "reward": 1.803571492433548, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.803571455180645, "rewards/format_reward_func": 1.0, "step": 11574 }, { "completion_length": 272.8750123977661, "epoch": 1.9408608910683598, "grad_norm": 0.20959783360587844, "kl": 0.17010498046875, "learning_rate": 4.959358148168687e-07, "loss": 0.0002, "reward": 1.7678572088479996, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7767857536673546, "rewards/format_reward_func": 0.9910714328289032, "step": 11576 }, { "completion_length": 267.2009057998657, "epoch": 1.9411961943082274, "grad_norm": 0.23572829651922414, "kl": 0.221099853515625, "learning_rate": 4.95933660328342e-07, "loss": 0.0002, "reward": 1.8000000715255737, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.8089285977184772, "rewards/format_reward_func": 0.9910714328289032, "step": 11578 }, { "completion_length": 276.2991180419922, "epoch": 1.941531497548095, "grad_norm": 0.1705477545589577, "kl": 0.1273193359375, "learning_rate": 4.959315052735846e-07, "loss": 0.0001, "reward": 1.7767858058214188, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7812500409781933, "rewards/format_reward_func": 0.9955357164144516, "step": 11580 }, { "completion_length": 261.55358600616455, "epoch": 1.9418668007879627, "grad_norm": 0.4312032446580683, "kl": 0.151031494140625, "learning_rate": 4.959293496526018e-07, "loss": 0.0002, "reward": 1.7875000536441803, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7919643111526966, "rewards/format_reward_func": 0.9955357164144516, "step": 11582 }, { "completion_length": 269.62054443359375, "epoch": 1.9422021040278303, "grad_norm": 0.17382506989411386, "kl": 0.1229248046875, "learning_rate": 4.959271934653985e-07, "loss": 0.0001, "reward": 1.7821429297327995, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 11584 }, { "completion_length": 271.15179538726807, "epoch": 1.9425374072676977, "grad_norm": 0.22970344709163384, "kl": 0.3343658447265625, "learning_rate": 4.959250367119795e-07, "loss": 0.0003, "reward": 1.73214291036129, "reward_std": 0.10606601648032665, "rewards/equation_reward_func": 0.7500000204890966, "rewards/format_reward_func": 0.9821428656578064, "step": 11586 }, { "completion_length": 277.8303680419922, "epoch": 1.9428727105075652, "grad_norm": 0.24053185195149798, "kl": 0.14495849609375, "learning_rate": 4.9592287939235e-07, "loss": 0.0001, "reward": 1.7357143387198448, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7446428835391998, "rewards/format_reward_func": 0.9910714328289032, "step": 11588 }, { "completion_length": 265.2723340988159, "epoch": 1.9432080137474328, "grad_norm": 0.2155865915403086, "kl": 0.13531494140625, "learning_rate": 4.959207215065148e-07, "loss": 0.0001, "reward": 1.7232143729925156, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7276785895228386, "rewards/format_reward_func": 0.9955357164144516, "step": 11590 }, { "completion_length": 263.0491199493408, "epoch": 1.9435433169873004, "grad_norm": 0.19655043391107452, "kl": 0.6219482421875, "learning_rate": 4.959185630544788e-07, "loss": 0.0006, "reward": 1.760714367032051, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 11592 }, { "completion_length": 264.5535831451416, "epoch": 1.943878620227168, "grad_norm": 0.32884497758459014, "kl": 0.231201171875, "learning_rate": 4.959164040362473e-07, "loss": 0.0002, "reward": 1.7375000789761543, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7419643327593803, "rewards/format_reward_func": 0.9955357164144516, "step": 11594 }, { "completion_length": 263.308048248291, "epoch": 1.9442139234670355, "grad_norm": 0.12803094395008807, "kl": 0.25775146484375, "learning_rate": 4.959142444518249e-07, "loss": 0.0003, "reward": 1.7196429371833801, "reward_std": 0.06313453428447247, "rewards/equation_reward_func": 0.7330357320606709, "rewards/format_reward_func": 0.9866071492433548, "step": 11596 }, { "completion_length": 263.7901906967163, "epoch": 1.944549226706903, "grad_norm": 0.2162849149079578, "kl": 0.13641357421875, "learning_rate": 4.959120843012168e-07, "loss": 0.0001, "reward": 1.8196428939700127, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.8241071589291096, "rewards/format_reward_func": 0.9955357164144516, "step": 11598 }, { "completion_length": 253.55804538726807, "epoch": 1.9448845299467705, "grad_norm": 0.27344404414924084, "kl": 0.5733642578125, "learning_rate": 4.959099235844278e-07, "loss": 0.0006, "reward": 1.7982143461704254, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.8026785999536514, "rewards/format_reward_func": 0.9955357164144516, "step": 11600 }, { "completion_length": 255.9464406967163, "epoch": 1.9452198331866382, "grad_norm": 0.19967799496439764, "kl": 0.13580322265625, "learning_rate": 4.959077623014631e-07, "loss": 0.0001, "reward": 1.742857202887535, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7428571805357933, "rewards/format_reward_func": 1.0, "step": 11602 }, { "completion_length": 255.94197750091553, "epoch": 1.9455551364265058, "grad_norm": 0.21747384010928997, "kl": 0.6826171875, "learning_rate": 4.959056004523275e-07, "loss": 0.0007, "reward": 1.7642857506871223, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7732143178582191, "rewards/format_reward_func": 0.9910714328289032, "step": 11604 }, { "completion_length": 252.40179920196533, "epoch": 1.9458904396663734, "grad_norm": 0.13116440361104098, "kl": 0.253143310546875, "learning_rate": 4.959034380370261e-07, "loss": 0.0003, "reward": 1.7142857983708382, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7142857424914837, "rewards/format_reward_func": 1.0, "step": 11606 }, { "completion_length": 258.94643783569336, "epoch": 1.9462257429062408, "grad_norm": 0.17006587057823047, "kl": 0.4481964111328125, "learning_rate": 4.959012750555638e-07, "loss": 0.0004, "reward": 1.7553572058677673, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7598214708268642, "rewards/format_reward_func": 0.9955357164144516, "step": 11608 }, { "completion_length": 251.8482255935669, "epoch": 1.9465610461461083, "grad_norm": 0.17390186294610058, "kl": 0.245513916015625, "learning_rate": 4.958991115079455e-07, "loss": 0.0002, "reward": 1.7785714864730835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 11610 }, { "completion_length": 252.4509048461914, "epoch": 1.9468963493859759, "grad_norm": 0.13355463954568156, "kl": 0.18939208984375, "learning_rate": 4.958969473941763e-07, "loss": 0.0002, "reward": 1.6750000789761543, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.6750000305473804, "rewards/format_reward_func": 1.0, "step": 11612 }, { "completion_length": 263.87055015563965, "epoch": 1.9472316526258435, "grad_norm": 0.30431565753795486, "kl": 0.47369384765625, "learning_rate": 4.958947827142612e-07, "loss": 0.0005, "reward": 1.7303571999073029, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7348214630037546, "rewards/format_reward_func": 0.9955357164144516, "step": 11614 }, { "completion_length": 260.1562614440918, "epoch": 1.9475669558657112, "grad_norm": 0.2853115135319929, "kl": 0.405059814453125, "learning_rate": 4.958926174682052e-07, "loss": 0.0004, "reward": 1.7303572222590446, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.743750024586916, "rewards/format_reward_func": 0.9866071492433548, "step": 11616 }, { "completion_length": 252.3392972946167, "epoch": 1.9479022591055786, "grad_norm": 0.15608169259880164, "kl": 0.25421142578125, "learning_rate": 4.958904516560132e-07, "loss": 0.0003, "reward": 1.771428607404232, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714286148548126, "rewards/format_reward_func": 1.0, "step": 11618 }, { "completion_length": 247.1339406967163, "epoch": 1.9482375623454462, "grad_norm": 0.17523372472153195, "kl": 0.204986572265625, "learning_rate": 4.958882852776901e-07, "loss": 0.0002, "reward": 1.7196429297327995, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7241071816533804, "rewards/format_reward_func": 0.9955357164144516, "step": 11620 }, { "completion_length": 241.5134048461914, "epoch": 1.9485728655853136, "grad_norm": 0.2508027085559755, "kl": 0.18426513671875, "learning_rate": 4.958861183332411e-07, "loss": 0.0002, "reward": 1.7321429252624512, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428824216127, "rewards/format_reward_func": 1.0, "step": 11622 }, { "completion_length": 240.7187614440918, "epoch": 1.9489081688251813, "grad_norm": 0.19603250128713348, "kl": 0.172637939453125, "learning_rate": 4.95883950822671e-07, "loss": 0.0002, "reward": 1.8178571835160255, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8178571630269289, "rewards/format_reward_func": 1.0, "step": 11624 }, { "completion_length": 245.19197368621826, "epoch": 1.9492434720650489, "grad_norm": 0.32498217250259853, "kl": 0.6026763916015625, "learning_rate": 4.95881782745985e-07, "loss": 0.0006, "reward": 1.7982143610715866, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8026785850524902, "rewards/format_reward_func": 0.9955357164144516, "step": 11626 }, { "completion_length": 244.44643878936768, "epoch": 1.9495787753049165, "grad_norm": 0.08157777296903064, "kl": 0.193695068359375, "learning_rate": 4.958796141031878e-07, "loss": 0.0002, "reward": 1.775000087916851, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 11628 }, { "completion_length": 246.67858409881592, "epoch": 1.949914078544784, "grad_norm": 0.24126868290716924, "kl": 0.1125946044921875, "learning_rate": 4.958774448942847e-07, "loss": 0.0001, "reward": 1.7857143357396126, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 11630 }, { "completion_length": 250.96429634094238, "epoch": 1.9502493817846513, "grad_norm": 0.058257893302917085, "kl": 0.1483154296875, "learning_rate": 4.958752751192805e-07, "loss": 0.0001, "reward": 1.7982143461704254, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.8026785999536514, "rewards/format_reward_func": 0.9955357164144516, "step": 11632 }, { "completion_length": 262.72768783569336, "epoch": 1.950584685024519, "grad_norm": 0.19758306784761007, "kl": 3.353057861328125, "learning_rate": 4.958731047781803e-07, "loss": 0.0034, "reward": 1.7053572162985802, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.718750037252903, "rewards/format_reward_func": 0.9866071492433548, "step": 11634 }, { "completion_length": 250.64287185668945, "epoch": 1.9509199882643866, "grad_norm": 0.26712866657750123, "kl": 0.165252685546875, "learning_rate": 4.958709338709889e-07, "loss": 0.0002, "reward": 1.730357214808464, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7348214685916901, "rewards/format_reward_func": 0.9955357164144516, "step": 11636 }, { "completion_length": 261.1651916503906, "epoch": 1.9512552915042543, "grad_norm": 0.19153741964710466, "kl": 1.4565277099609375, "learning_rate": 4.958687623977117e-07, "loss": 0.0015, "reward": 1.767857201397419, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.776785746216774, "rewards/format_reward_func": 0.9910714328289032, "step": 11638 }, { "completion_length": 258.1116199493408, "epoch": 1.9515905947441217, "grad_norm": 0.1725781835403997, "kl": 0.469635009765625, "learning_rate": 4.958665903583533e-07, "loss": 0.0005, "reward": 1.8053571954369545, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8098214603960514, "rewards/format_reward_func": 0.9955357164144516, "step": 11640 }, { "completion_length": 266.96876335144043, "epoch": 1.9519258979839893, "grad_norm": 0.3686855380317879, "kl": 0.53802490234375, "learning_rate": 4.958644177529189e-07, "loss": 0.0005, "reward": 1.7535715028643608, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 0.9821428656578064, "step": 11642 }, { "completion_length": 244.67858123779297, "epoch": 1.9522612012238567, "grad_norm": 0.08202584755809787, "kl": 0.1743621826171875, "learning_rate": 4.958622445814133e-07, "loss": 0.0002, "reward": 1.8017857670783997, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.8062500320374966, "rewards/format_reward_func": 0.9955357164144516, "step": 11644 }, { "completion_length": 253.4553680419922, "epoch": 1.9525965044637243, "grad_norm": 0.28201809123002675, "kl": 0.7519073486328125, "learning_rate": 4.958600708438418e-07, "loss": 0.0008, "reward": 1.7910714894533157, "reward_std": 0.07323605939745903, "rewards/equation_reward_func": 0.8044643066823483, "rewards/format_reward_func": 0.9866071492433548, "step": 11646 }, { "completion_length": 258.1875104904175, "epoch": 1.952931807703592, "grad_norm": 0.45200193199739935, "kl": 1.96221923828125, "learning_rate": 4.958578965402093e-07, "loss": 0.002, "reward": 1.742857187986374, "reward_std": 0.03535533882677555, "rewards/equation_reward_func": 0.7607143074274063, "rewards/format_reward_func": 0.9821428619325161, "step": 11648 }, { "completion_length": 256.8973321914673, "epoch": 1.9532671109434596, "grad_norm": 0.22485174876966724, "kl": 1.655181884765625, "learning_rate": 4.958557216705207e-07, "loss": 0.0017, "reward": 1.7232143580913544, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7366071715950966, "rewards/format_reward_func": 0.9866071492433548, "step": 11650 }, { "completion_length": 252.80358409881592, "epoch": 1.953602414183327, "grad_norm": 0.3054215109505365, "kl": 0.42816162109375, "learning_rate": 4.95853546234781e-07, "loss": 0.0004, "reward": 1.7571429014205933, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7660714574158192, "rewards/format_reward_func": 0.9910714328289032, "step": 11652 }, { "completion_length": 279.90180110931396, "epoch": 1.9539377174231944, "grad_norm": 0.13204873486477667, "kl": 0.4475860595703125, "learning_rate": 4.958513702329953e-07, "loss": 0.0004, "reward": 1.7357143312692642, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7535714469850063, "rewards/format_reward_func": 0.9821428656578064, "step": 11654 }, { "completion_length": 260.99555015563965, "epoch": 1.954273020663062, "grad_norm": 0.2930258195465054, "kl": 0.1341094970703125, "learning_rate": 4.958491936651687e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.09091372694820166, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 0.9821428656578064, "step": 11656 }, { "completion_length": 250.63840293884277, "epoch": 1.9546083239029297, "grad_norm": 0.1968308325811053, "kl": 0.443206787109375, "learning_rate": 4.95847016531306e-07, "loss": 0.0004, "reward": 1.7857143506407738, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7946428842842579, "rewards/format_reward_func": 0.9910714328289032, "step": 11658 }, { "completion_length": 254.2053689956665, "epoch": 1.9549436271427973, "grad_norm": 0.21657157782427686, "kl": 0.255615234375, "learning_rate": 4.958448388314124e-07, "loss": 0.0003, "reward": 1.725000075995922, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7428571619093418, "rewards/format_reward_func": 0.9821428656578064, "step": 11660 }, { "completion_length": 259.3526906967163, "epoch": 1.955278930382665, "grad_norm": 0.1188084083595016, "kl": 0.2137908935546875, "learning_rate": 4.958426605654928e-07, "loss": 0.0002, "reward": 1.8000000342726707, "reward_std": 0.05050762835890055, "rewards/equation_reward_func": 0.808928607031703, "rewards/format_reward_func": 0.9910714328289032, "step": 11662 }, { "completion_length": 250.0803680419922, "epoch": 1.9556142336225324, "grad_norm": 0.45858462645071885, "kl": 0.361175537109375, "learning_rate": 4.958404817335522e-07, "loss": 0.0004, "reward": 1.7071429342031479, "reward_std": 0.0909137288108468, "rewards/equation_reward_func": 0.7339286170899868, "rewards/format_reward_func": 0.9732142984867096, "step": 11664 }, { "completion_length": 238.53126049041748, "epoch": 1.9559495368623998, "grad_norm": 0.16654047526987906, "kl": 0.10797119140625, "learning_rate": 4.958383023355957e-07, "loss": 0.0001, "reward": 1.7553572058677673, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.768750037997961, "rewards/format_reward_func": 0.9866071492433548, "step": 11666 }, { "completion_length": 247.0223331451416, "epoch": 1.9562848401022674, "grad_norm": 0.21473295137029416, "kl": 0.17388916015625, "learning_rate": 4.958361223716282e-07, "loss": 0.0002, "reward": 1.728571504354477, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7285714615136385, "rewards/format_reward_func": 1.0, "step": 11668 }, { "completion_length": 236.5491180419922, "epoch": 1.956620143342135, "grad_norm": 0.1944749028128767, "kl": 0.14154052734375, "learning_rate": 4.95833941841655e-07, "loss": 0.0001, "reward": 1.744642935693264, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7491071708500385, "rewards/format_reward_func": 0.9955357164144516, "step": 11670 }, { "completion_length": 237.00893878936768, "epoch": 1.9569554465820027, "grad_norm": 0.07876787095959105, "kl": 0.208038330078125, "learning_rate": 4.958317607456807e-07, "loss": 0.0002, "reward": 1.7517857775092125, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7562500275671482, "rewards/format_reward_func": 0.9955357164144516, "step": 11672 }, { "completion_length": 245.40625953674316, "epoch": 1.9572907498218701, "grad_norm": 0.11458006446040274, "kl": 0.266632080078125, "learning_rate": 4.958295790837106e-07, "loss": 0.0003, "reward": 1.719642922282219, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7330357432365417, "rewards/format_reward_func": 0.9866071492433548, "step": 11674 }, { "completion_length": 238.54018783569336, "epoch": 1.9576260530617378, "grad_norm": 0.3717038714285689, "kl": 0.412078857421875, "learning_rate": 4.958273968557497e-07, "loss": 0.0004, "reward": 1.726785771548748, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.740178607404232, "rewards/format_reward_func": 0.9866071492433548, "step": 11676 }, { "completion_length": 231.80358028411865, "epoch": 1.9579613563016052, "grad_norm": 0.20329722813535972, "kl": 0.27691650390625, "learning_rate": 4.95825214061803e-07, "loss": 0.0003, "reward": 1.746428668498993, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 11678 }, { "completion_length": 233.9241189956665, "epoch": 1.9582966595414728, "grad_norm": 0.12961171035557645, "kl": 0.254791259765625, "learning_rate": 4.958230307018755e-07, "loss": 0.0003, "reward": 1.7196429371833801, "reward_std": 0.0530330091714859, "rewards/equation_reward_func": 0.7330357432365417, "rewards/format_reward_func": 0.9866071492433548, "step": 11680 }, { "completion_length": 231.82143878936768, "epoch": 1.9586319627813404, "grad_norm": 0.2286560781940869, "kl": 0.150177001953125, "learning_rate": 4.958208467759722e-07, "loss": 0.0002, "reward": 1.6982143595814705, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.702678607776761, "rewards/format_reward_func": 0.9955357164144516, "step": 11682 }, { "completion_length": 234.32590293884277, "epoch": 1.958967266021208, "grad_norm": 0.1759078841403251, "kl": 0.170379638671875, "learning_rate": 4.958186622840982e-07, "loss": 0.0002, "reward": 1.7500000670552254, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000204890966, "rewards/format_reward_func": 1.0, "step": 11684 }, { "completion_length": 228.33483505249023, "epoch": 1.9593025692610755, "grad_norm": 0.25031878839867766, "kl": 0.27630615234375, "learning_rate": 4.958164772262584e-07, "loss": 0.0003, "reward": 1.7571429312229156, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571428846567869, "rewards/format_reward_func": 1.0, "step": 11686 }, { "completion_length": 221.42411613464355, "epoch": 1.959637872500943, "grad_norm": 0.2143225747696588, "kl": 0.216766357421875, "learning_rate": 4.95814291602458e-07, "loss": 0.0002, "reward": 1.832142911851406, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8321428894996643, "rewards/format_reward_func": 1.0, "step": 11688 }, { "completion_length": 223.12500953674316, "epoch": 1.9599731757408105, "grad_norm": 0.11656490838104483, "kl": 0.2996826171875, "learning_rate": 4.95812105412702e-07, "loss": 0.0003, "reward": 1.7857143431901932, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143059372902, "rewards/format_reward_func": 1.0, "step": 11690 }, { "completion_length": 222.45090579986572, "epoch": 1.9603084789806782, "grad_norm": 0.20553337543261221, "kl": 0.117279052734375, "learning_rate": 4.958099186569953e-07, "loss": 0.0001, "reward": 1.7303571850061417, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.734821455553174, "rewards/format_reward_func": 0.9955357164144516, "step": 11692 }, { "completion_length": 223.79465293884277, "epoch": 1.9606437822205458, "grad_norm": 0.09753404057868746, "kl": 0.360626220703125, "learning_rate": 4.958077313353432e-07, "loss": 0.0004, "reward": 1.7500000447034836, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 11694 }, { "completion_length": 216.57590293884277, "epoch": 1.9609790854604132, "grad_norm": 0.159109331582923, "kl": 0.179901123046875, "learning_rate": 4.958055434477504e-07, "loss": 0.0002, "reward": 1.796428620815277, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 11696 }, { "completion_length": 231.446439743042, "epoch": 1.9613143887002809, "grad_norm": 0.2844154985682583, "kl": 0.23492431640625, "learning_rate": 4.958033549942222e-07, "loss": 0.0002, "reward": 1.825000062584877, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8250000141561031, "rewards/format_reward_func": 1.0, "step": 11698 }, { "completion_length": 219.102689743042, "epoch": 1.9616496919401483, "grad_norm": 0.28877537376523416, "kl": 0.134918212890625, "learning_rate": 4.958011659747635e-07, "loss": 0.0001, "reward": 1.7142857909202576, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.714285746216774, "rewards/format_reward_func": 1.0, "step": 11700 }, { "completion_length": 226.44643783569336, "epoch": 1.961984995180016, "grad_norm": 0.42817238326910484, "kl": 0.4429931640625, "learning_rate": 4.957989763893793e-07, "loss": 0.0004, "reward": 1.7750000581145287, "reward_std": 0.045456862077116966, "rewards/equation_reward_func": 0.7750000152736902, "rewards/format_reward_func": 1.0, "step": 11702 }, { "completion_length": 220.68304538726807, "epoch": 1.9623202984198835, "grad_norm": 0.09978868725868749, "kl": 0.122467041015625, "learning_rate": 4.957967862380749e-07, "loss": 0.0001, "reward": 1.739285796880722, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857410013676, "rewards/format_reward_func": 1.0, "step": 11704 }, { "completion_length": 226.65625762939453, "epoch": 1.9626556016597512, "grad_norm": 0.11651728537059637, "kl": 0.1053619384765625, "learning_rate": 4.95794595520855e-07, "loss": 0.0001, "reward": 1.800000049173832, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 11706 }, { "completion_length": 224.50893783569336, "epoch": 1.9629909048996186, "grad_norm": 0.2761169766878691, "kl": 0.214324951171875, "learning_rate": 4.957924042377248e-07, "loss": 0.0002, "reward": 1.753571480512619, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 11708 }, { "completion_length": 225.88393592834473, "epoch": 1.963326208139486, "grad_norm": 0.19632426609063403, "kl": 0.237152099609375, "learning_rate": 4.957902123886895e-07, "loss": 0.0002, "reward": 1.7857143506407738, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 11710 }, { "completion_length": 213.43304634094238, "epoch": 1.9636615113793536, "grad_norm": 0.18476990547780098, "kl": 0.12530517578125, "learning_rate": 4.957880199737539e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714622586966, "rewards/format_reward_func": 1.0, "step": 11712 }, { "completion_length": 220.67858123779297, "epoch": 1.9639968146192213, "grad_norm": 0.20954613469586167, "kl": 0.3189544677734375, "learning_rate": 4.957858269929232e-07, "loss": 0.0003, "reward": 1.8035714849829674, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 11714 }, { "completion_length": 230.14733123779297, "epoch": 1.964332117859089, "grad_norm": 0.18032852907919253, "kl": 0.217559814453125, "learning_rate": 4.957836334462024e-07, "loss": 0.0002, "reward": 1.7500000596046448, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000391155481, "rewards/format_reward_func": 1.0, "step": 11716 }, { "completion_length": 247.68304920196533, "epoch": 1.9646674210989565, "grad_norm": 0.2596767575265767, "kl": 0.3243408203125, "learning_rate": 4.957814393335964e-07, "loss": 0.0003, "reward": 1.7017857879400253, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.706250037997961, "rewards/format_reward_func": 0.9955357164144516, "step": 11718 }, { "completion_length": 223.7321538925171, "epoch": 1.965002724338824, "grad_norm": 0.17295896277556655, "kl": 0.103607177734375, "learning_rate": 4.957792446551105e-07, "loss": 0.0001, "reward": 1.787500061094761, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7919643186032772, "rewards/format_reward_func": 0.9955357164144516, "step": 11720 }, { "completion_length": 244.13393878936768, "epoch": 1.9653380275786914, "grad_norm": 0.12359927873014928, "kl": 0.1367950439453125, "learning_rate": 4.957770494107496e-07, "loss": 0.0001, "reward": 1.7714286223053932, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 11722 }, { "completion_length": 238.42858409881592, "epoch": 1.965673330818559, "grad_norm": 0.2447817104012549, "kl": 0.10650634765625, "learning_rate": 4.957748536005189e-07, "loss": 0.0001, "reward": 1.7732143476605415, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7776786014437675, "rewards/format_reward_func": 0.9955357164144516, "step": 11724 }, { "completion_length": 244.68304824829102, "epoch": 1.9660086340584266, "grad_norm": 0.25763781640983036, "kl": 0.107635498046875, "learning_rate": 4.957726572244233e-07, "loss": 0.0001, "reward": 1.7375000640749931, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419643234461546, "rewards/format_reward_func": 0.9955357164144516, "step": 11726 }, { "completion_length": 256.0982275009155, "epoch": 1.9663439372982943, "grad_norm": 0.17076517137928504, "kl": 0.104156494140625, "learning_rate": 4.95770460282468e-07, "loss": 0.0001, "reward": 1.7803571969270706, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7848214656114578, "rewards/format_reward_func": 0.9955357164144516, "step": 11728 }, { "completion_length": 250.0982255935669, "epoch": 1.9666792405381617, "grad_norm": 0.28394623429905763, "kl": 0.131378173828125, "learning_rate": 4.957682627746578e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7589286044239998, "rewards/format_reward_func": 0.9910714328289032, "step": 11730 }, { "completion_length": 245.6473331451416, "epoch": 1.967014543778029, "grad_norm": 0.25643459126742696, "kl": 0.1122894287109375, "learning_rate": 4.957660647009981e-07, "loss": 0.0001, "reward": 1.7875000685453415, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7919643074274063, "rewards/format_reward_func": 0.9955357164144516, "step": 11732 }, { "completion_length": 244.6428680419922, "epoch": 1.9673498470178967, "grad_norm": 0.18632279817106337, "kl": 0.1124420166015625, "learning_rate": 4.957638660614938e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857544124126, "rewards/format_reward_func": 1.0, "step": 11734 }, { "completion_length": 256.9151916503906, "epoch": 1.9676851502577644, "grad_norm": 0.22765448085255816, "kl": 0.1229400634765625, "learning_rate": 4.957616668561498e-07, "loss": 0.0001, "reward": 1.7446429282426834, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7491071857511997, "rewards/format_reward_func": 0.9955357164144516, "step": 11736 }, { "completion_length": 246.9017972946167, "epoch": 1.968020453497632, "grad_norm": 0.6592077745590738, "kl": 0.13031005859375, "learning_rate": 4.957594670849715e-07, "loss": 0.0001, "reward": 1.8107143342494965, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.8196428827941418, "rewards/format_reward_func": 0.9910714328289032, "step": 11738 }, { "completion_length": 255.5178689956665, "epoch": 1.9683557567374996, "grad_norm": 0.26862816593064337, "kl": 0.117919921875, "learning_rate": 4.957572667479637e-07, "loss": 0.0001, "reward": 1.7303572073578835, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7348214536905289, "rewards/format_reward_func": 0.9955357164144516, "step": 11740 }, { "completion_length": 246.37947368621826, "epoch": 1.968691059977367, "grad_norm": 0.2317197207042673, "kl": 0.115509033203125, "learning_rate": 4.957550658451315e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 11742 }, { "completion_length": 257.39286708831787, "epoch": 1.9690263632172345, "grad_norm": 0.1188256513109012, "kl": 0.1306610107421875, "learning_rate": 4.957528643764801e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 11744 }, { "completion_length": 254.32144260406494, "epoch": 1.969361666457102, "grad_norm": 0.13580848106302223, "kl": 0.150390625, "learning_rate": 4.957506623420145e-07, "loss": 0.0002, "reward": 1.7446429133415222, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7580357491970062, "rewards/format_reward_func": 0.9866071492433548, "step": 11746 }, { "completion_length": 249.18304824829102, "epoch": 1.9696969696969697, "grad_norm": 0.17056617952178113, "kl": 0.143218994140625, "learning_rate": 4.957484597417398e-07, "loss": 0.0001, "reward": 1.773214340209961, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7866071611642838, "rewards/format_reward_func": 0.9866071455180645, "step": 11748 }, { "completion_length": 257.696439743042, "epoch": 1.9700322729368374, "grad_norm": 0.05453721168213884, "kl": 0.112518310546875, "learning_rate": 4.957462565756609e-07, "loss": 0.0001, "reward": 1.7767857760190964, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7812500298023224, "rewards/format_reward_func": 0.9955357164144516, "step": 11750 }, { "completion_length": 258.0759057998657, "epoch": 1.9703675761767048, "grad_norm": 0.1605862489015661, "kl": 0.10565185546875, "learning_rate": 4.95744052843783e-07, "loss": 0.0001, "reward": 1.798214353621006, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8026786111295223, "rewards/format_reward_func": 0.9955357164144516, "step": 11752 }, { "completion_length": 254.1607265472412, "epoch": 1.9707028794165724, "grad_norm": 0.19646260246464867, "kl": 0.1075439453125, "learning_rate": 4.957418485461112e-07, "loss": 0.0001, "reward": 1.7464286461472511, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 11754 }, { "completion_length": 252.89733505249023, "epoch": 1.9710381826564398, "grad_norm": 0.3499983244450021, "kl": 0.2943115234375, "learning_rate": 4.957396436826506e-07, "loss": 0.0003, "reward": 1.8098214641213417, "reward_std": 0.056821079924702644, "rewards/equation_reward_func": 0.816071443259716, "rewards/format_reward_func": 0.9937500059604645, "step": 11756 }, { "completion_length": 257.3348340988159, "epoch": 1.9713734858963075, "grad_norm": 0.21213137164595536, "kl": 0.149383544921875, "learning_rate": 4.957374382534063e-07, "loss": 0.0001, "reward": 1.7482143491506577, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526786085218191, "rewards/format_reward_func": 0.9955357164144516, "step": 11758 }, { "completion_length": 254.6785831451416, "epoch": 1.971708789136175, "grad_norm": 0.16187445714713122, "kl": 0.0967864990234375, "learning_rate": 4.957352322583831e-07, "loss": 0.0001, "reward": 1.8214286267757416, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8214285895228386, "rewards/format_reward_func": 1.0, "step": 11760 }, { "completion_length": 255.33929538726807, "epoch": 1.9720440923760427, "grad_norm": 0.17196609458920475, "kl": 0.127349853515625, "learning_rate": 4.957330256975865e-07, "loss": 0.0001, "reward": 1.7446429431438446, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.749107176437974, "rewards/format_reward_func": 0.9955357164144516, "step": 11762 }, { "completion_length": 248.13840579986572, "epoch": 1.9723793956159101, "grad_norm": 0.29006952980499107, "kl": 0.0870361328125, "learning_rate": 4.957308185710212e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.785714328289032, "rewards/format_reward_func": 1.0, "step": 11764 }, { "completion_length": 249.27679634094238, "epoch": 1.9727146988557775, "grad_norm": 0.10785375158854305, "kl": 0.157928466796875, "learning_rate": 4.957286108786925e-07, "loss": 0.0002, "reward": 1.8107143342494965, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8107143193483353, "rewards/format_reward_func": 1.0, "step": 11766 }, { "completion_length": 247.71429920196533, "epoch": 1.9730500020956452, "grad_norm": 0.20807208197803714, "kl": 0.229705810546875, "learning_rate": 4.957264026206054e-07, "loss": 0.0002, "reward": 1.8000000789761543, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 11768 }, { "completion_length": 246.93304920196533, "epoch": 1.9733853053355128, "grad_norm": 0.17862243170744227, "kl": 0.180389404296875, "learning_rate": 4.957241937967651e-07, "loss": 0.0002, "reward": 1.8089286237955093, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8133928701281548, "rewards/format_reward_func": 0.9955357164144516, "step": 11770 }, { "completion_length": 252.46876049041748, "epoch": 1.9737206085753805, "grad_norm": 0.22827912919286747, "kl": 0.3739166259765625, "learning_rate": 4.957219844071765e-07, "loss": 0.0004, "reward": 1.7250000908970833, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7250000350177288, "rewards/format_reward_func": 1.0, "step": 11772 }, { "completion_length": 236.30358409881592, "epoch": 1.9740559118152479, "grad_norm": 0.18547557891243543, "kl": 0.1163330078125, "learning_rate": 4.957197744518449e-07, "loss": 0.0001, "reward": 1.7892857789993286, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857529222965, "rewards/format_reward_func": 1.0, "step": 11774 }, { "completion_length": 251.08483409881592, "epoch": 1.9743912150551155, "grad_norm": 0.5494083041560429, "kl": 0.337066650390625, "learning_rate": 4.957175639307751e-07, "loss": 0.0003, "reward": 1.7678572237491608, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7678571753203869, "rewards/format_reward_func": 1.0, "step": 11776 }, { "completion_length": 252.4866189956665, "epoch": 1.974726518294983, "grad_norm": 0.1988050122697657, "kl": 0.1005859375, "learning_rate": 4.957153528439725e-07, "loss": 0.0001, "reward": 1.7571429386734962, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.757142897695303, "rewards/format_reward_func": 1.0, "step": 11778 }, { "completion_length": 250.71429634094238, "epoch": 1.9750618215348505, "grad_norm": 0.17271351151488437, "kl": 0.090301513671875, "learning_rate": 4.95713141191442e-07, "loss": 0.0001, "reward": 1.7142857983708382, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7142857406288385, "rewards/format_reward_func": 1.0, "step": 11780 }, { "completion_length": 247.45983123779297, "epoch": 1.9753971247747182, "grad_norm": 0.1843118865578671, "kl": 0.389495849609375, "learning_rate": 4.957109289731888e-07, "loss": 0.0004, "reward": 1.7821429371833801, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 11782 }, { "completion_length": 236.5982255935669, "epoch": 1.9757324280145858, "grad_norm": 0.21773081688597665, "kl": 0.3225250244140625, "learning_rate": 4.957087161892179e-07, "loss": 0.0003, "reward": 1.7750000655651093, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 11784 }, { "completion_length": 241.3973331451416, "epoch": 1.9760677312544532, "grad_norm": 0.26868741544036107, "kl": 0.7315216064453125, "learning_rate": 4.957065028395344e-07, "loss": 0.0007, "reward": 1.8214285969734192, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8214285969734192, "rewards/format_reward_func": 1.0, "step": 11786 }, { "completion_length": 248.35715579986572, "epoch": 1.9764030344943206, "grad_norm": 0.20328298548363283, "kl": 0.1676788330078125, "learning_rate": 4.957042889241435e-07, "loss": 0.0002, "reward": 1.7750000655651093, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000394880772, "rewards/format_reward_func": 1.0, "step": 11788 }, { "completion_length": 243.7544755935669, "epoch": 1.9767383377341883, "grad_norm": 0.1907004951600171, "kl": 0.1032867431640625, "learning_rate": 4.957020744430502e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 11790 }, { "completion_length": 233.78572368621826, "epoch": 1.977073640974056, "grad_norm": 0.2596566984727189, "kl": 0.09918212890625, "learning_rate": 4.956998593962596e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 11792 }, { "completion_length": 236.5089406967163, "epoch": 1.9774089442139235, "grad_norm": 0.30107402027910873, "kl": 0.442413330078125, "learning_rate": 4.956976437837768e-07, "loss": 0.0004, "reward": 1.8000000789761543, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8000000193715096, "rewards/format_reward_func": 1.0, "step": 11794 }, { "completion_length": 242.1071538925171, "epoch": 1.9777442474537912, "grad_norm": 0.22727364686394377, "kl": 0.29437255859375, "learning_rate": 4.95695427605607e-07, "loss": 0.0003, "reward": 1.803571492433548, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 11796 }, { "completion_length": 243.19643878936768, "epoch": 1.9780795506936586, "grad_norm": 0.25835029992767844, "kl": 0.1036529541015625, "learning_rate": 4.956932108617552e-07, "loss": 0.0001, "reward": 1.760714367032051, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 11798 }, { "completion_length": 243.7991180419922, "epoch": 1.978414853933526, "grad_norm": 0.20062119433370976, "kl": 0.1900177001953125, "learning_rate": 4.956909935522265e-07, "loss": 0.0002, "reward": 1.74642863124609, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464285902678967, "rewards/format_reward_func": 1.0, "step": 11800 }, { "completion_length": 237.56251049041748, "epoch": 1.9787501571733936, "grad_norm": 0.19331615368744728, "kl": 0.183807373046875, "learning_rate": 4.95688775677026e-07, "loss": 0.0002, "reward": 1.7892857939004898, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 11802 }, { "completion_length": 243.19643878936768, "epoch": 1.9790854604132613, "grad_norm": 0.26249176823337705, "kl": 0.14044189453125, "learning_rate": 4.956865572361589e-07, "loss": 0.0001, "reward": 1.7000000849366188, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7000000309199095, "rewards/format_reward_func": 1.0, "step": 11804 }, { "completion_length": 231.95090198516846, "epoch": 1.979420763653129, "grad_norm": 0.11336370989884989, "kl": 0.0840301513671875, "learning_rate": 4.956843382296303e-07, "loss": 0.0001, "reward": 1.82857146859169, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8285714462399483, "rewards/format_reward_func": 1.0, "step": 11806 }, { "completion_length": 234.89733123779297, "epoch": 1.9797560668929963, "grad_norm": 0.16502533057972615, "kl": 0.1882171630859375, "learning_rate": 4.956821186574453e-07, "loss": 0.0002, "reward": 1.8000000417232513, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 11808 }, { "completion_length": 250.22768783569336, "epoch": 1.980091370132864, "grad_norm": 0.2793167309232402, "kl": 0.33477783203125, "learning_rate": 4.956798985196089e-07, "loss": 0.0003, "reward": 1.7535714954137802, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 11810 }, { "completion_length": 219.44643878936768, "epoch": 1.9804266733727314, "grad_norm": 0.1115267623950336, "kl": 0.0941162109375, "learning_rate": 4.956776778161262e-07, "loss": 0.0001, "reward": 1.8107143193483353, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.810714315623045, "rewards/format_reward_func": 1.0, "step": 11812 }, { "completion_length": 239.5982255935669, "epoch": 1.980761976612599, "grad_norm": 0.23300894138513203, "kl": 0.304840087890625, "learning_rate": 4.956754565470025e-07, "loss": 0.0003, "reward": 1.762946493923664, "reward_std": 0.03219861118122935, "rewards/equation_reward_func": 0.7642857488244772, "rewards/format_reward_func": 0.9986607171595097, "step": 11814 }, { "completion_length": 246.20983505249023, "epoch": 1.9810972798524666, "grad_norm": 0.24004711572000476, "kl": 0.123046875, "learning_rate": 4.956732347122428e-07, "loss": 0.0001, "reward": 1.8107143566012383, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8107143193483353, "rewards/format_reward_func": 1.0, "step": 11816 }, { "completion_length": 243.9241180419922, "epoch": 1.9814325830923343, "grad_norm": 0.26924112990635035, "kl": 0.195953369140625, "learning_rate": 4.956710123118522e-07, "loss": 0.0002, "reward": 1.7714286223053932, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 11818 }, { "completion_length": 235.758939743042, "epoch": 1.9817678863322017, "grad_norm": 0.19173483626836108, "kl": 0.0982208251953125, "learning_rate": 4.956687893458359e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464286107569933, "rewards/format_reward_func": 1.0, "step": 11820 }, { "completion_length": 238.8259038925171, "epoch": 1.982103189572069, "grad_norm": 0.2064395106648162, "kl": 0.138946533203125, "learning_rate": 4.956665658141989e-07, "loss": 0.0001, "reward": 1.7000000849366188, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7000000290572643, "rewards/format_reward_func": 1.0, "step": 11822 }, { "completion_length": 241.92858123779297, "epoch": 1.9824384928119367, "grad_norm": 0.21597189485682244, "kl": 0.0922088623046875, "learning_rate": 4.956643417169464e-07, "loss": 0.0001, "reward": 1.750000037252903, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7589286100119352, "rewards/format_reward_func": 0.9910714328289032, "step": 11824 }, { "completion_length": 241.89733505249023, "epoch": 1.9827737960518044, "grad_norm": 0.1910625884590803, "kl": 0.1044464111328125, "learning_rate": 4.956621170540834e-07, "loss": 0.0001, "reward": 1.7803572118282318, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7848214469850063, "rewards/format_reward_func": 0.9955357164144516, "step": 11826 }, { "completion_length": 231.9866180419922, "epoch": 1.983109099291672, "grad_norm": 0.13525587988326038, "kl": 0.090179443359375, "learning_rate": 4.956598918256151e-07, "loss": 0.0001, "reward": 1.796428620815277, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7964286059141159, "rewards/format_reward_func": 1.0, "step": 11828 }, { "completion_length": 238.78572273254395, "epoch": 1.9834444025315394, "grad_norm": 0.21546699753134915, "kl": 0.71807861328125, "learning_rate": 4.956576660315468e-07, "loss": 0.0007, "reward": 1.8178571909666061, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8178571797907352, "rewards/format_reward_func": 1.0, "step": 11830 }, { "completion_length": 235.4910831451416, "epoch": 1.983779705771407, "grad_norm": 0.14166303451460713, "kl": 0.173187255859375, "learning_rate": 4.956554396718835e-07, "loss": 0.0002, "reward": 1.800000049173832, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 11832 }, { "completion_length": 246.1339406967163, "epoch": 1.9841150090112745, "grad_norm": 0.24250097169224633, "kl": 0.094482421875, "learning_rate": 4.956532127466302e-07, "loss": 0.0001, "reward": 1.733928643167019, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.738392885774374, "rewards/format_reward_func": 0.9955357164144516, "step": 11834 }, { "completion_length": 240.32590293884277, "epoch": 1.984450312251142, "grad_norm": 0.23198980180167816, "kl": 0.0967254638671875, "learning_rate": 4.956509852557921e-07, "loss": 0.0001, "reward": 1.8178571686148643, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8178571723401546, "rewards/format_reward_func": 1.0, "step": 11836 }, { "completion_length": 247.0044755935669, "epoch": 1.9847856154910097, "grad_norm": 0.2247671504249133, "kl": 0.10345458984375, "learning_rate": 4.956487571993744e-07, "loss": 0.0001, "reward": 1.7785714715719223, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 11838 }, { "completion_length": 235.56697177886963, "epoch": 1.9851209187308774, "grad_norm": 0.22602112500128277, "kl": 0.2289276123046875, "learning_rate": 4.956465285773822e-07, "loss": 0.0002, "reward": 1.789285771548748, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 11840 }, { "completion_length": 238.22322368621826, "epoch": 1.9854562219707448, "grad_norm": 0.2934779266919726, "kl": 0.118408203125, "learning_rate": 4.956442993898206e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 11842 }, { "completion_length": 241.77679920196533, "epoch": 1.9857915252106122, "grad_norm": 0.25116354054121726, "kl": 0.3289794921875, "learning_rate": 4.956420696366947e-07, "loss": 0.0003, "reward": 1.8250000476837158, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8250000327825546, "rewards/format_reward_func": 1.0, "step": 11844 }, { "completion_length": 235.3169765472412, "epoch": 1.9861268284504798, "grad_norm": 0.13130801114902244, "kl": 0.2256011962890625, "learning_rate": 4.956398393180097e-07, "loss": 0.0002, "reward": 1.807142898440361, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8071428909897804, "rewards/format_reward_func": 1.0, "step": 11846 }, { "completion_length": 248.06250953674316, "epoch": 1.9864621316903475, "grad_norm": 0.22475071490244067, "kl": 0.15399169921875, "learning_rate": 4.956376084337707e-07, "loss": 0.0002, "reward": 1.8035714775323868, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.803571455180645, "rewards/format_reward_func": 1.0, "step": 11848 }, { "completion_length": 230.65625858306885, "epoch": 1.986797434930215, "grad_norm": 0.21280194742861228, "kl": 0.1126708984375, "learning_rate": 4.956353769839829e-07, "loss": 0.0001, "reward": 1.771428644657135, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285925030708, "rewards/format_reward_func": 1.0, "step": 11850 }, { "completion_length": 245.8348331451416, "epoch": 1.9871327381700825, "grad_norm": 0.2820430557228894, "kl": 0.154571533203125, "learning_rate": 4.956331449686513e-07, "loss": 0.0002, "reward": 1.7035715207457542, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7035714648663998, "rewards/format_reward_func": 1.0, "step": 11852 }, { "completion_length": 240.2009038925171, "epoch": 1.9874680414099501, "grad_norm": 0.395312790500384, "kl": 0.4749603271484375, "learning_rate": 4.956309123877812e-07, "loss": 0.0005, "reward": 1.725000038743019, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7339286096394062, "rewards/format_reward_func": 0.9910714328289032, "step": 11854 }, { "completion_length": 239.75447368621826, "epoch": 1.9878033446498176, "grad_norm": 0.13378942220678655, "kl": 0.110565185546875, "learning_rate": 4.956286792413776e-07, "loss": 0.0001, "reward": 1.8142857626080513, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8142857402563095, "rewards/format_reward_func": 1.0, "step": 11856 }, { "completion_length": 231.61608123779297, "epoch": 1.9881386478896852, "grad_norm": 0.18475450589268128, "kl": 0.114471435546875, "learning_rate": 4.956264455294459e-07, "loss": 0.0001, "reward": 1.7607143372297287, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143260538578, "rewards/format_reward_func": 1.0, "step": 11858 }, { "completion_length": 238.84822463989258, "epoch": 1.9884739511295528, "grad_norm": 0.1555984953874135, "kl": 0.22589111328125, "learning_rate": 4.956242112519908e-07, "loss": 0.0002, "reward": 1.7785715013742447, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 11860 }, { "completion_length": 232.9776906967163, "epoch": 1.9888092543694205, "grad_norm": 0.3377725470032938, "kl": 0.1129150390625, "learning_rate": 4.956219764090178e-07, "loss": 0.0001, "reward": 1.7875000685453415, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7919643260538578, "rewards/format_reward_func": 0.9955357164144516, "step": 11862 }, { "completion_length": 235.4509048461914, "epoch": 1.9891445576092879, "grad_norm": 0.2210363508356654, "kl": 0.12188720703125, "learning_rate": 4.956197410005319e-07, "loss": 0.0001, "reward": 1.7892857864499092, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 1.0, "step": 11864 }, { "completion_length": 231.74108219146729, "epoch": 1.9894798608491553, "grad_norm": 0.1475095473252601, "kl": 0.1528472900390625, "learning_rate": 4.956175050265384e-07, "loss": 0.0002, "reward": 1.8267857804894447, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.8312500193715096, "rewards/format_reward_func": 0.9955357164144516, "step": 11866 }, { "completion_length": 245.79018878936768, "epoch": 1.989815164089023, "grad_norm": 0.15650601562466082, "kl": 0.136932373046875, "learning_rate": 4.956152684870422e-07, "loss": 0.0001, "reward": 1.796428620815277, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964286059141159, "rewards/format_reward_func": 1.0, "step": 11868 }, { "completion_length": 246.821439743042, "epoch": 1.9901504673288906, "grad_norm": 0.1704402931331183, "kl": 0.185546875, "learning_rate": 4.956130313820487e-07, "loss": 0.0002, "reward": 1.7250000983476639, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7339286003261805, "rewards/format_reward_func": 0.9910714328289032, "step": 11870 }, { "completion_length": 253.30358505249023, "epoch": 1.9904857705687582, "grad_norm": 0.38526863006362766, "kl": 0.310333251953125, "learning_rate": 4.956107937115629e-07, "loss": 0.0003, "reward": 1.7892857640981674, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 11872 }, { "completion_length": 247.47322368621826, "epoch": 1.9908210738086258, "grad_norm": 0.19867008633050648, "kl": 0.1563720703125, "learning_rate": 4.9560855547559e-07, "loss": 0.0002, "reward": 1.7232143506407738, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7276786006987095, "rewards/format_reward_func": 0.9955357164144516, "step": 11874 }, { "completion_length": 236.4866189956665, "epoch": 1.9911563770484932, "grad_norm": 0.1428311178265055, "kl": 0.11065673828125, "learning_rate": 4.956063166741351e-07, "loss": 0.0001, "reward": 1.7500000521540642, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000447034836, "rewards/format_reward_func": 1.0, "step": 11876 }, { "completion_length": 244.43751049041748, "epoch": 1.9914916802883607, "grad_norm": 0.2428530544891383, "kl": 0.1180877685546875, "learning_rate": 4.956040773072034e-07, "loss": 0.0001, "reward": 1.7696428894996643, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7741071805357933, "rewards/format_reward_func": 0.9955357164144516, "step": 11878 }, { "completion_length": 237.57590293884277, "epoch": 1.9918269835282283, "grad_norm": 1.1212923774941903, "kl": 0.1295166015625, "learning_rate": 4.956018373748001e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.06060915347188711, "rewards/equation_reward_func": 0.7732143066823483, "rewards/format_reward_func": 0.9910714328289032, "step": 11880 }, { "completion_length": 230.1071538925171, "epoch": 1.992162286768096, "grad_norm": 0.1906099981958323, "kl": 0.11297607421875, "learning_rate": 4.955995968769302e-07, "loss": 0.0001, "reward": 1.7892857640981674, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857491970062, "rewards/format_reward_func": 1.0, "step": 11882 }, { "completion_length": 234.17858219146729, "epoch": 1.9924975900079636, "grad_norm": 0.30340923600114456, "kl": 0.114898681640625, "learning_rate": 4.955973558135991e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.8089286014437675, "rewards/format_reward_func": 0.9910714328289032, "step": 11884 }, { "completion_length": 241.93751049041748, "epoch": 1.992832893247831, "grad_norm": 0.24126169281411733, "kl": 0.104156494140625, "learning_rate": 4.955951141848117e-07, "loss": 0.0001, "reward": 1.7982143312692642, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.8026785962283611, "rewards/format_reward_func": 0.9955357164144516, "step": 11886 }, { "completion_length": 242.1875114440918, "epoch": 1.9931681964876986, "grad_norm": 0.13715380355658038, "kl": 0.1492919921875, "learning_rate": 4.955928719905734e-07, "loss": 0.0001, "reward": 1.796428643167019, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.796428594738245, "rewards/format_reward_func": 1.0, "step": 11888 }, { "completion_length": 242.13393878936768, "epoch": 1.993503499727566, "grad_norm": 0.27291793247372775, "kl": 0.1129150390625, "learning_rate": 4.955906292308892e-07, "loss": 0.0001, "reward": 1.739285796880722, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7392857261002064, "rewards/format_reward_func": 1.0, "step": 11890 }, { "completion_length": 227.04465293884277, "epoch": 1.9938388029674337, "grad_norm": 0.23860127461571143, "kl": 0.0947265625, "learning_rate": 4.955883859057643e-07, "loss": 0.0001, "reward": 1.8285714611411095, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8285714630037546, "rewards/format_reward_func": 1.0, "step": 11892 }, { "completion_length": 234.54019165039062, "epoch": 1.9941741062073013, "grad_norm": 0.3011874190088839, "kl": 0.105133056640625, "learning_rate": 4.955861420152039e-07, "loss": 0.0001, "reward": 1.7892857939004898, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 11894 }, { "completion_length": 233.602689743042, "epoch": 1.994509409447169, "grad_norm": 0.24990488377435194, "kl": 0.118316650390625, "learning_rate": 4.955838975592131e-07, "loss": 0.0001, "reward": 1.7250000908970833, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7250000312924385, "rewards/format_reward_func": 1.0, "step": 11896 }, { "completion_length": 226.99554634094238, "epoch": 1.9948447126870363, "grad_norm": 0.37434620571171906, "kl": 0.120025634765625, "learning_rate": 4.955816525377971e-07, "loss": 0.0001, "reward": 1.8125000670552254, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8169643022119999, "rewards/format_reward_func": 0.9955357164144516, "step": 11898 }, { "completion_length": 228.21876049041748, "epoch": 1.9951800159269037, "grad_norm": 0.1891353284554539, "kl": 0.113739013671875, "learning_rate": 4.955794069509611e-07, "loss": 0.0001, "reward": 1.7714286297559738, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 11900 }, { "completion_length": 224.43304538726807, "epoch": 1.9955153191667714, "grad_norm": 0.6070216232285166, "kl": 0.18603515625, "learning_rate": 4.955771607987104e-07, "loss": 0.0002, "reward": 1.7303571924567223, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7348214685916901, "rewards/format_reward_func": 0.9955357164144516, "step": 11902 }, { "completion_length": 231.16072368621826, "epoch": 1.995850622406639, "grad_norm": 0.2760599269226852, "kl": 0.164215087890625, "learning_rate": 4.955749140810499e-07, "loss": 0.0002, "reward": 1.7589286342263222, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7633928768336773, "rewards/format_reward_func": 0.9955357164144516, "step": 11904 }, { "completion_length": 217.94197463989258, "epoch": 1.9961859256465067, "grad_norm": 0.2162571700265734, "kl": 0.114288330078125, "learning_rate": 4.955726667979848e-07, "loss": 0.0001, "reward": 1.810714341700077, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107143044471741, "rewards/format_reward_func": 1.0, "step": 11906 }, { "completion_length": 233.70536518096924, "epoch": 1.996521228886374, "grad_norm": 0.17218754513613752, "kl": 0.125152587890625, "learning_rate": 4.955704189495205e-07, "loss": 0.0001, "reward": 1.7803572118282318, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7848214507102966, "rewards/format_reward_func": 0.9955357164144516, "step": 11908 }, { "completion_length": 229.40179634094238, "epoch": 1.9968565321262417, "grad_norm": 0.3804112945798657, "kl": 0.1534423828125, "learning_rate": 4.955681705356621e-07, "loss": 0.0002, "reward": 1.7053572237491608, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7098214775323868, "rewards/format_reward_func": 0.9955357164144516, "step": 11910 }, { "completion_length": 230.25000953674316, "epoch": 1.9971918353661091, "grad_norm": 0.9168094662360421, "kl": 0.241485595703125, "learning_rate": 4.955659215564145e-07, "loss": 0.0002, "reward": 1.7750000655651093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 11912 }, { "completion_length": 224.8616189956665, "epoch": 1.9975271386059767, "grad_norm": 0.16848882583586025, "kl": 0.13287353515625, "learning_rate": 4.955636720117833e-07, "loss": 0.0001, "reward": 1.8107143267989159, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8107143081724644, "rewards/format_reward_func": 1.0, "step": 11914 }, { "completion_length": 230.39732933044434, "epoch": 1.9978624418458444, "grad_norm": 0.558286576121209, "kl": 0.13238525390625, "learning_rate": 4.955614219017734e-07, "loss": 0.0001, "reward": 1.7732143625617027, "reward_std": 0.08838834892958403, "rewards/equation_reward_func": 0.7866071723401546, "rewards/format_reward_func": 0.9866071492433548, "step": 11916 }, { "completion_length": 218.53572463989258, "epoch": 1.998197745085712, "grad_norm": 0.12048965951566463, "kl": 0.1212921142578125, "learning_rate": 4.9555917122639e-07, "loss": 0.0001, "reward": 1.798214316368103, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.8026786036789417, "rewards/format_reward_func": 0.9955357164144516, "step": 11918 }, { "completion_length": 229.7723331451416, "epoch": 1.9985330483255794, "grad_norm": 0.26405640247652984, "kl": 0.13812255859375, "learning_rate": 4.955569199856384e-07, "loss": 0.0001, "reward": 1.7732143327593803, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7776786014437675, "rewards/format_reward_func": 0.9955357164144516, "step": 11920 }, { "completion_length": 222.51340293884277, "epoch": 1.9988683515654468, "grad_norm": 0.20701527554064678, "kl": 0.119659423828125, "learning_rate": 4.955546681795238e-07, "loss": 0.0001, "reward": 1.753571517765522, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714730620384, "rewards/format_reward_func": 1.0, "step": 11922 }, { "completion_length": 233.08929634094238, "epoch": 1.9992036548053145, "grad_norm": 0.2992797604063413, "kl": 0.166656494140625, "learning_rate": 4.955524158080513e-07, "loss": 0.0002, "reward": 1.7267858013510704, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7312500365078449, "rewards/format_reward_func": 0.9955357164144516, "step": 11924 }, { "completion_length": 223.50447273254395, "epoch": 1.9995389580451821, "grad_norm": 0.21232811921496075, "kl": 0.116607666015625, "learning_rate": 4.955501628712259e-07, "loss": 0.0001, "reward": 1.783035770058632, "reward_std": 0.04419417306780815, "rewards/equation_reward_func": 0.7848214581608772, "rewards/format_reward_func": 0.9982142895460129, "step": 11926 }, { "completion_length": 224.19643783569336, "epoch": 1.9998742612850497, "grad_norm": 0.22939465547794297, "kl": 0.9582061767578125, "learning_rate": 4.955479093690532e-07, "loss": 0.001, "reward": 1.7303572073578835, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7348214648663998, "rewards/format_reward_func": 0.9955357164144516, "step": 11928 }, { "completion_length": 224.04546356201172, "epoch": 2.0003353032398676, "grad_norm": 0.20265008955204794, "kl": 0.11465731534090909, "learning_rate": 4.955456553015381e-07, "loss": 0.0002, "reward": 1.7909091440114109, "reward_std": 0.053262587298046456, "rewards/equation_reward_func": 0.7941558740355752, "rewards/format_reward_func": 0.9967532483014193, "step": 11930 }, { "completion_length": 219.13393878936768, "epoch": 2.0006706064797353, "grad_norm": 0.25991971602352476, "kl": 0.1035919189453125, "learning_rate": 4.955434006686859e-07, "loss": 0.0001, "reward": 1.7892857789993286, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857566475868, "rewards/format_reward_func": 1.0, "step": 11932 }, { "completion_length": 217.21429634094238, "epoch": 2.0010059097196025, "grad_norm": 0.17602677588111465, "kl": 0.135345458984375, "learning_rate": 4.955411454705016e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7964285835623741, "rewards/format_reward_func": 1.0, "step": 11934 }, { "completion_length": 226.6651906967163, "epoch": 2.00134121295947, "grad_norm": 0.23395105985453332, "kl": 0.1150360107421875, "learning_rate": 4.955388897069907e-07, "loss": 0.0001, "reward": 1.771428644657135, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 11936 }, { "completion_length": 221.41518783569336, "epoch": 2.0016765161993377, "grad_norm": 0.1894342197370109, "kl": 0.1356201171875, "learning_rate": 4.955366333781581e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 11938 }, { "completion_length": 224.43304634094238, "epoch": 2.0020118194392054, "grad_norm": 0.19172940809934583, "kl": 0.13311767578125, "learning_rate": 4.955343764840093e-07, "loss": 0.0001, "reward": 1.810714341700077, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8107143007218838, "rewards/format_reward_func": 1.0, "step": 11940 }, { "completion_length": 210.82143878936768, "epoch": 2.002347122679073, "grad_norm": 0.19378477540179337, "kl": 0.1277923583984375, "learning_rate": 4.955321190245491e-07, "loss": 0.0001, "reward": 1.7571429461240768, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.757142897695303, "rewards/format_reward_func": 1.0, "step": 11942 }, { "completion_length": 218.50000953674316, "epoch": 2.0026824259189406, "grad_norm": 0.20741615096888355, "kl": 0.123382568359375, "learning_rate": 4.955298609997831e-07, "loss": 0.0001, "reward": 1.7821429073810577, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 11944 }, { "completion_length": 226.2321538925171, "epoch": 2.003017729158808, "grad_norm": 0.15846754537055674, "kl": 0.10369873046875, "learning_rate": 4.955276024097163e-07, "loss": 0.0001, "reward": 1.7607143446803093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 11946 }, { "completion_length": 228.56697463989258, "epoch": 2.0033530323986755, "grad_norm": 0.3061465529034817, "kl": 0.119293212890625, "learning_rate": 4.955253432543539e-07, "loss": 0.0001, "reward": 1.7696429267525673, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7741071600466967, "rewards/format_reward_func": 0.9955357164144516, "step": 11948 }, { "completion_length": 217.4821538925171, "epoch": 2.003688335638543, "grad_norm": 0.373817116939758, "kl": 0.13421630859375, "learning_rate": 4.955230835337012e-07, "loss": 0.0001, "reward": 1.733928643167019, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.738392885774374, "rewards/format_reward_func": 0.9955357164144516, "step": 11950 }, { "completion_length": 220.44643878936768, "epoch": 2.0040236388784107, "grad_norm": 0.2275983755411765, "kl": 0.130859375, "learning_rate": 4.955208232477633e-07, "loss": 0.0001, "reward": 1.74642863124609, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 11952 }, { "completion_length": 208.45536708831787, "epoch": 2.0043589421182784, "grad_norm": 0.1904971770533389, "kl": 0.129486083984375, "learning_rate": 4.955185623965454e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 11954 }, { "completion_length": 225.9821538925171, "epoch": 2.0046942453581456, "grad_norm": 0.17717864307604564, "kl": 0.13543701171875, "learning_rate": 4.955163009800527e-07, "loss": 0.0001, "reward": 1.7321429327130318, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7410714738070965, "rewards/format_reward_func": 0.9910714328289032, "step": 11956 }, { "completion_length": 214.18750953674316, "epoch": 2.005029548598013, "grad_norm": 0.16732658290309635, "kl": 0.115814208984375, "learning_rate": 4.955140389982904e-07, "loss": 0.0001, "reward": 1.8142857775092125, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8142857402563095, "rewards/format_reward_func": 1.0, "step": 11958 }, { "completion_length": 229.22322463989258, "epoch": 2.005364851837881, "grad_norm": 0.21928717715473264, "kl": 0.118377685546875, "learning_rate": 4.95511776451264e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7946428954601288, "rewards/format_reward_func": 0.9910714328289032, "step": 11960 }, { "completion_length": 219.00001049041748, "epoch": 2.0057001550777485, "grad_norm": 0.11188186472262818, "kl": 0.129486083984375, "learning_rate": 4.955095133389783e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 11962 }, { "completion_length": 216.20536613464355, "epoch": 2.006035458317616, "grad_norm": 0.287615123732697, "kl": 0.158966064453125, "learning_rate": 4.955072496614386e-07, "loss": 0.0002, "reward": 1.7839286401867867, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7883928753435612, "rewards/format_reward_func": 0.9955357164144516, "step": 11964 }, { "completion_length": 221.66965293884277, "epoch": 2.0063707615574837, "grad_norm": 0.13775078111321387, "kl": 0.1495361328125, "learning_rate": 4.955049854186503e-07, "loss": 0.0001, "reward": 1.796428620815277, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964285798370838, "rewards/format_reward_func": 1.0, "step": 11966 }, { "completion_length": 220.4285831451416, "epoch": 2.006706064797351, "grad_norm": 0.20963167919580555, "kl": 0.142425537109375, "learning_rate": 4.955027206106184e-07, "loss": 0.0001, "reward": 1.7357143610715866, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7357143126428127, "rewards/format_reward_func": 1.0, "step": 11968 }, { "completion_length": 208.98661518096924, "epoch": 2.0070413680372186, "grad_norm": 0.048640389814782435, "kl": 0.104705810546875, "learning_rate": 4.955004552373483e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 11970 }, { "completion_length": 224.97322463989258, "epoch": 2.007376671277086, "grad_norm": 0.41609726327583996, "kl": 0.146697998046875, "learning_rate": 4.95498189298845e-07, "loss": 0.0001, "reward": 1.7232143878936768, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7276786044239998, "rewards/format_reward_func": 0.9955357164144516, "step": 11972 }, { "completion_length": 220.17858123779297, "epoch": 2.007711974516954, "grad_norm": 0.2719038569618615, "kl": 0.10687255859375, "learning_rate": 4.954959227951139e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 11974 }, { "completion_length": 215.39733123779297, "epoch": 2.0080472777568215, "grad_norm": 0.17050510753330186, "kl": 0.124847412109375, "learning_rate": 4.954936557261603e-07, "loss": 0.0001, "reward": 1.7196429297327995, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7241071779280901, "rewards/format_reward_func": 0.9955357164144516, "step": 11976 }, { "completion_length": 222.39286613464355, "epoch": 2.008382580996689, "grad_norm": 0.1061362122372038, "kl": 0.139892578125, "learning_rate": 4.954913880919892e-07, "loss": 0.0001, "reward": 1.7892857789993286, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 11978 }, { "completion_length": 219.17858123779297, "epoch": 2.0087178842365563, "grad_norm": 0.4607873940896255, "kl": 0.220184326171875, "learning_rate": 4.954891198926058e-07, "loss": 0.0002, "reward": 1.7821429073810577, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428999304771, "rewards/format_reward_func": 1.0, "step": 11980 }, { "completion_length": 218.12054443359375, "epoch": 2.009053187476424, "grad_norm": 0.243853842105626, "kl": 0.13153076171875, "learning_rate": 4.954868511280156e-07, "loss": 0.0001, "reward": 1.8053572103381157, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8098214529454708, "rewards/format_reward_func": 0.9955357164144516, "step": 11982 }, { "completion_length": 219.4241180419922, "epoch": 2.0093884907162916, "grad_norm": 0.17020714244723772, "kl": 0.13604736328125, "learning_rate": 4.954845817982234e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 1.0, "step": 11984 }, { "completion_length": 213.47322368621826, "epoch": 2.009723793956159, "grad_norm": 0.09836840420537803, "kl": 0.15594482421875, "learning_rate": 4.95482311903235e-07, "loss": 0.0002, "reward": 1.733928643167019, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7383928932249546, "rewards/format_reward_func": 0.9955357164144516, "step": 11986 }, { "completion_length": 212.58929538726807, "epoch": 2.010059097196027, "grad_norm": 0.275191574538384, "kl": 0.12689208984375, "learning_rate": 4.954800414430551e-07, "loss": 0.0001, "reward": 1.7714286297559738, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 11988 }, { "completion_length": 232.5357265472412, "epoch": 2.010394400435894, "grad_norm": 0.1892092833782436, "kl": 0.17681884765625, "learning_rate": 4.954777704176891e-07, "loss": 0.0002, "reward": 1.7392857670783997, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857261002064, "rewards/format_reward_func": 1.0, "step": 11990 }, { "completion_length": 217.90179538726807, "epoch": 2.0107297036757616, "grad_norm": 0.1421242256000931, "kl": 0.14764404296875, "learning_rate": 4.954754988271423e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143227010965, "rewards/format_reward_func": 1.0, "step": 11992 }, { "completion_length": 220.9910831451416, "epoch": 2.0110650069156293, "grad_norm": 0.2826369696564157, "kl": 0.155670166015625, "learning_rate": 4.9547322667142e-07, "loss": 0.0002, "reward": 1.7919643595814705, "reward_std": 0.061871842946857214, "rewards/equation_reward_func": 0.798214316368103, "rewards/format_reward_func": 0.9937500059604645, "step": 11994 }, { "completion_length": 226.55358028411865, "epoch": 2.011400310155497, "grad_norm": 0.23081500085930848, "kl": 0.143829345703125, "learning_rate": 4.954709539505272e-07, "loss": 0.0001, "reward": 1.8250000551342964, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.8339286036789417, "rewards/format_reward_func": 0.9910714328289032, "step": 11996 }, { "completion_length": 225.42411708831787, "epoch": 2.0117356133953646, "grad_norm": 0.15678403010002545, "kl": 0.1483154296875, "learning_rate": 4.954686806644692e-07, "loss": 0.0001, "reward": 1.7714286223053932, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714286148548126, "rewards/format_reward_func": 1.0, "step": 11998 }, { "completion_length": 234.8928689956665, "epoch": 2.012070916635232, "grad_norm": 0.41964272204763825, "kl": 0.121246337890625, "learning_rate": 4.954664068132514e-07, "loss": 0.0001, "reward": 1.8071429133415222, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.8160714618861675, "rewards/format_reward_func": 0.9910714328289032, "step": 12000 }, { "completion_length": 230.38840293884277, "epoch": 2.0124062198750994, "grad_norm": 0.18041820981293524, "kl": 0.126129150390625, "learning_rate": 4.954641323968788e-07, "loss": 0.0001, "reward": 1.7553571909666061, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7598214633762836, "rewards/format_reward_func": 0.9955357164144516, "step": 12002 }, { "completion_length": 234.41965579986572, "epoch": 2.012741523114967, "grad_norm": 0.2307812936228628, "kl": 0.127593994140625, "learning_rate": 4.954618574153569e-07, "loss": 0.0001, "reward": 1.8107143342494965, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8107142932713032, "rewards/format_reward_func": 1.0, "step": 12004 }, { "completion_length": 247.77233695983887, "epoch": 2.0130768263548346, "grad_norm": 0.26071255191800646, "kl": 0.129608154296875, "learning_rate": 4.954595818686907e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.7625000216066837, "rewards/format_reward_func": 0.9910714328289032, "step": 12006 }, { "completion_length": 232.83929634094238, "epoch": 2.0134121295947023, "grad_norm": 0.20923252643628967, "kl": 0.13958740234375, "learning_rate": 4.954573057568856e-07, "loss": 0.0001, "reward": 1.69821435213089, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7026786059141159, "rewards/format_reward_func": 0.9955357164144516, "step": 12008 }, { "completion_length": 237.15179634094238, "epoch": 2.01374743283457, "grad_norm": 0.25729457637255143, "kl": 0.141448974609375, "learning_rate": 4.954550290799468e-07, "loss": 0.0001, "reward": 1.7214286401867867, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7214285954833031, "rewards/format_reward_func": 1.0, "step": 12010 }, { "completion_length": 234.2946538925171, "epoch": 2.014082736074437, "grad_norm": 0.27225565276967606, "kl": 0.14306640625, "learning_rate": 4.954527518378794e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.08586296439170837, "rewards/equation_reward_func": 0.7410714589059353, "rewards/format_reward_func": 0.9910714328289032, "step": 12012 }, { "completion_length": 229.37500953674316, "epoch": 2.0144180393143047, "grad_norm": 0.14623596893714957, "kl": 0.12762451171875, "learning_rate": 4.954504740306887e-07, "loss": 0.0001, "reward": 1.8089286386966705, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.8133928813040257, "rewards/format_reward_func": 0.9955357164144516, "step": 12014 }, { "completion_length": 230.6339406967163, "epoch": 2.0147533425541724, "grad_norm": 0.2575949222882184, "kl": 0.12567138671875, "learning_rate": 4.954481956583802e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.785714328289032, "rewards/format_reward_func": 1.0, "step": 12016 }, { "completion_length": 232.95090293884277, "epoch": 2.01508864579404, "grad_norm": 0.2922948645964965, "kl": 0.154632568359375, "learning_rate": 4.954459167209588e-07, "loss": 0.0002, "reward": 1.80892863124609, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.8133928589522839, "rewards/format_reward_func": 0.9955357164144516, "step": 12018 }, { "completion_length": 243.82590198516846, "epoch": 2.0154239490339076, "grad_norm": 0.30686562447010646, "kl": 0.15203857421875, "learning_rate": 4.9544363721843e-07, "loss": 0.0002, "reward": 1.7553571984171867, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7598214484751225, "rewards/format_reward_func": 0.9955357164144516, "step": 12020 }, { "completion_length": 225.49107933044434, "epoch": 2.0157592522737753, "grad_norm": 0.20599917438686824, "kl": 0.16033935546875, "learning_rate": 4.954413571507988e-07, "loss": 0.0002, "reward": 1.7892857640981674, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 12022 }, { "completion_length": 236.26786708831787, "epoch": 2.0160945555136425, "grad_norm": 0.24825955537246205, "kl": 0.14947509765625, "learning_rate": 4.954390765180707e-07, "loss": 0.0001, "reward": 1.7696429267525673, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.7741071693599224, "rewards/format_reward_func": 0.9955357164144516, "step": 12024 }, { "completion_length": 233.32143878936768, "epoch": 2.01642985875351, "grad_norm": 0.10900224773752099, "kl": 0.155120849609375, "learning_rate": 4.954367953202509e-07, "loss": 0.0002, "reward": 1.7375000566244125, "reward_std": 0.05808377265930176, "rewards/equation_reward_func": 0.7508928813040257, "rewards/format_reward_func": 0.9866071492433548, "step": 12026 }, { "completion_length": 221.6964406967163, "epoch": 2.0167651619933777, "grad_norm": 0.2646779873755802, "kl": 0.149627685546875, "learning_rate": 4.954345135573445e-07, "loss": 0.0001, "reward": 1.812500037252903, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8169643133878708, "rewards/format_reward_func": 0.9955357164144516, "step": 12028 }, { "completion_length": 223.02679538726807, "epoch": 2.0171004652332454, "grad_norm": 0.20961948724422066, "kl": 0.15570068359375, "learning_rate": 4.954322312293568e-07, "loss": 0.0002, "reward": 1.753571480512619, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7535714656114578, "rewards/format_reward_func": 1.0, "step": 12030 }, { "completion_length": 233.98661708831787, "epoch": 2.017435768473113, "grad_norm": 0.44244999777054755, "kl": 0.162322998046875, "learning_rate": 4.954299483362932e-07, "loss": 0.0002, "reward": 1.787500061094761, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7919643111526966, "rewards/format_reward_func": 0.9955357164144516, "step": 12032 }, { "completion_length": 226.696439743042, "epoch": 2.0177710717129806, "grad_norm": 0.0029737839904434033, "kl": 0.13922119140625, "learning_rate": 4.954276648781588e-07, "loss": 0.0001, "reward": 1.7428572103381157, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 12034 }, { "completion_length": 238.12054634094238, "epoch": 2.018106374952848, "grad_norm": 0.15109126581438292, "kl": 0.155914306640625, "learning_rate": 4.95425380854959e-07, "loss": 0.0002, "reward": 1.7232143580913544, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7276785969734192, "rewards/format_reward_func": 0.9955357164144516, "step": 12036 }, { "completion_length": 240.7991180419922, "epoch": 2.0184416781927155, "grad_norm": 0.20111466564363314, "kl": 0.168670654296875, "learning_rate": 4.954230962666989e-07, "loss": 0.0002, "reward": 1.771428644657135, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7714286111295223, "rewards/format_reward_func": 1.0, "step": 12038 }, { "completion_length": 239.30358219146729, "epoch": 2.018776981432583, "grad_norm": 0.1693994281368671, "kl": 0.142608642578125, "learning_rate": 4.954208111133839e-07, "loss": 0.0001, "reward": 1.8196429014205933, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8241071701049805, "rewards/format_reward_func": 0.9955357164144516, "step": 12040 }, { "completion_length": 243.7812614440918, "epoch": 2.0191122846724507, "grad_norm": 0.3227749388676018, "kl": 0.171875, "learning_rate": 4.954185253950191e-07, "loss": 0.0002, "reward": 1.7339286729693413, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7383928839117289, "rewards/format_reward_func": 0.9955357164144516, "step": 12042 }, { "completion_length": 236.8660831451416, "epoch": 2.0194475879123184, "grad_norm": 0.17381341547292486, "kl": 0.1773681640625, "learning_rate": 4.9541623911161e-07, "loss": 0.0002, "reward": 1.796428643167019, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.8053571730852127, "rewards/format_reward_func": 0.9910714328289032, "step": 12044 }, { "completion_length": 234.32590198516846, "epoch": 2.0197828911521856, "grad_norm": 0.21425360276716288, "kl": 0.141387939453125, "learning_rate": 4.954139522631617e-07, "loss": 0.0001, "reward": 1.7285715118050575, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.7375000193715096, "rewards/format_reward_func": 0.9910714328289032, "step": 12046 }, { "completion_length": 239.37500858306885, "epoch": 2.020118194392053, "grad_norm": 0.30991828406477046, "kl": 0.19769287109375, "learning_rate": 4.954116648496793e-07, "loss": 0.0002, "reward": 1.7428572252392769, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 12048 }, { "completion_length": 242.7366189956665, "epoch": 2.020453497631921, "grad_norm": 0.16492187085630522, "kl": 0.258819580078125, "learning_rate": 4.954093768711685e-07, "loss": 0.0003, "reward": 1.800000049173832, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8000000342726707, "rewards/format_reward_func": 1.0, "step": 12050 }, { "completion_length": 244.13393878936768, "epoch": 2.0207888008717885, "grad_norm": 0.36228155609406354, "kl": 0.3056640625, "learning_rate": 4.954070883276342e-07, "loss": 0.0003, "reward": 1.8142857626080513, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857477068901, "rewards/format_reward_func": 1.0, "step": 12052 }, { "completion_length": 239.1741189956665, "epoch": 2.021124104111656, "grad_norm": 0.23091231775013618, "kl": 0.128692626953125, "learning_rate": 4.954047992190818e-07, "loss": 0.0001, "reward": 1.742857202887535, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7428571712225676, "rewards/format_reward_func": 1.0, "step": 12054 }, { "completion_length": 232.46876335144043, "epoch": 2.0214594073515237, "grad_norm": 0.24644318616572938, "kl": 0.320404052734375, "learning_rate": 4.954025095455166e-07, "loss": 0.0003, "reward": 1.8142857402563095, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.8232143186032772, "rewards/format_reward_func": 0.9910714328289032, "step": 12056 }, { "completion_length": 234.58036708831787, "epoch": 2.021794710591391, "grad_norm": 0.18374386788386235, "kl": 0.17694091796875, "learning_rate": 4.954002193069438e-07, "loss": 0.0002, "reward": 1.7500000670552254, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 12058 }, { "completion_length": 229.8794755935669, "epoch": 2.0221300138312586, "grad_norm": 0.11192113605103671, "kl": 0.22650146484375, "learning_rate": 4.953979285033687e-07, "loss": 0.0002, "reward": 1.767857201397419, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 12060 }, { "completion_length": 248.83929538726807, "epoch": 2.022465317071126, "grad_norm": 0.20104151765938574, "kl": 0.20709228515625, "learning_rate": 4.953956371347966e-07, "loss": 0.0002, "reward": 1.7000000700354576, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7089286129921675, "rewards/format_reward_func": 0.9910714328289032, "step": 12062 }, { "completion_length": 228.36161708831787, "epoch": 2.022800620310994, "grad_norm": 0.27693609985790313, "kl": 0.66949462890625, "learning_rate": 4.953933452012327e-07, "loss": 0.0007, "reward": 1.7750000730156898, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 1.0, "step": 12064 }, { "completion_length": 225.81251049041748, "epoch": 2.0231359235508615, "grad_norm": 0.26251403881160484, "kl": 0.167205810546875, "learning_rate": 4.953910527026824e-07, "loss": 0.0002, "reward": 1.7517858073115349, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7562500387430191, "rewards/format_reward_func": 0.9955357164144516, "step": 12066 }, { "completion_length": 228.04018783569336, "epoch": 2.0234712267907287, "grad_norm": 0.29266835993005574, "kl": 0.438812255859375, "learning_rate": 4.95388759639151e-07, "loss": 0.0004, "reward": 1.7803572043776512, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7848214581608772, "rewards/format_reward_func": 0.9955357164144516, "step": 12068 }, { "completion_length": 226.27679443359375, "epoch": 2.0238065300305963, "grad_norm": 0.19577650905885083, "kl": 0.363250732421875, "learning_rate": 4.953864660106435e-07, "loss": 0.0004, "reward": 1.7946429178118706, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7991071566939354, "rewards/format_reward_func": 0.9955357164144516, "step": 12070 }, { "completion_length": 230.3526906967163, "epoch": 2.024141833270464, "grad_norm": 0.14275428631407802, "kl": 0.262969970703125, "learning_rate": 4.953841718171655e-07, "loss": 0.0003, "reward": 1.7803572192788124, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7848214507102966, "rewards/format_reward_func": 0.9955357164144516, "step": 12072 }, { "completion_length": 225.99108123779297, "epoch": 2.0244771365103316, "grad_norm": 0.17367730450703955, "kl": 0.56787109375, "learning_rate": 4.953818770587221e-07, "loss": 0.0006, "reward": 1.7321429252624512, "reward_std": 0.08586296439170837, "rewards/equation_reward_func": 0.7410714514553547, "rewards/format_reward_func": 0.9910714328289032, "step": 12074 }, { "completion_length": 230.7009038925171, "epoch": 2.024812439750199, "grad_norm": 0.25745718941039286, "kl": 0.167633056640625, "learning_rate": 4.953795817353187e-07, "loss": 0.0002, "reward": 1.7857143357396126, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143022119999, "rewards/format_reward_func": 1.0, "step": 12076 }, { "completion_length": 230.60715293884277, "epoch": 2.025147742990067, "grad_norm": 0.4703462047223365, "kl": 0.290985107421875, "learning_rate": 4.953772858469605e-07, "loss": 0.0003, "reward": 1.7857143729925156, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 12078 }, { "completion_length": 225.26340198516846, "epoch": 2.025483046229934, "grad_norm": 0.209617257104548, "kl": 0.496490478515625, "learning_rate": 4.953749893936527e-07, "loss": 0.0005, "reward": 1.78035718947649, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.784821467474103, "rewards/format_reward_func": 0.9955357164144516, "step": 12080 }, { "completion_length": 222.3928680419922, "epoch": 2.0258183494698017, "grad_norm": 0.2109313296218021, "kl": 0.129119873046875, "learning_rate": 4.953726923754008e-07, "loss": 0.0001, "reward": 1.7857143357396126, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.785714328289032, "rewards/format_reward_func": 1.0, "step": 12082 }, { "completion_length": 228.6473331451416, "epoch": 2.0261536527096693, "grad_norm": 0.7119368148754645, "kl": 0.433197021484375, "learning_rate": 4.9537039479221e-07, "loss": 0.0004, "reward": 1.7767857611179352, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7812500409781933, "rewards/format_reward_func": 0.9955357164144516, "step": 12084 }, { "completion_length": 228.45983409881592, "epoch": 2.026488955949537, "grad_norm": 0.20029418610940258, "kl": 0.416046142578125, "learning_rate": 4.953680966440855e-07, "loss": 0.0004, "reward": 1.7535715252161026, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 12086 }, { "completion_length": 227.72768783569336, "epoch": 2.0268242591894046, "grad_norm": 0.28945798184832394, "kl": 0.2611083984375, "learning_rate": 4.953657979310327e-07, "loss": 0.0003, "reward": 1.7125000730156898, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7169643230736256, "rewards/format_reward_func": 0.9955357164144516, "step": 12088 }, { "completion_length": 217.47768783569336, "epoch": 2.0271595624292718, "grad_norm": 0.3697406638610568, "kl": 0.174652099609375, "learning_rate": 4.953634986530569e-07, "loss": 0.0002, "reward": 1.742857202887535, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7428571674972773, "rewards/format_reward_func": 1.0, "step": 12090 }, { "completion_length": 231.9598331451416, "epoch": 2.0274948656691394, "grad_norm": 0.19183056439734558, "kl": 0.171661376953125, "learning_rate": 4.953611988101633e-07, "loss": 0.0002, "reward": 1.7464286535978317, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 12092 }, { "completion_length": 223.97768783569336, "epoch": 2.027830168909007, "grad_norm": 0.2675128520542845, "kl": 0.1929931640625, "learning_rate": 4.953588984023573e-07, "loss": 0.0002, "reward": 1.7857143506407738, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7946428805589676, "rewards/format_reward_func": 0.9910714328289032, "step": 12094 }, { "completion_length": 225.57143783569336, "epoch": 2.0281654721488747, "grad_norm": 0.2959984170493704, "kl": 0.1334228515625, "learning_rate": 4.953565974296441e-07, "loss": 0.0001, "reward": 1.6714286804199219, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.6714286021888256, "rewards/format_reward_func": 1.0, "step": 12096 }, { "completion_length": 216.91072463989258, "epoch": 2.0285007753887423, "grad_norm": 0.3055442488342892, "kl": 0.158233642578125, "learning_rate": 4.95354295892029e-07, "loss": 0.0002, "reward": 1.8035714998841286, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 12098 }, { "completion_length": 229.08483409881592, "epoch": 2.02883607862861, "grad_norm": 0.4134827691873043, "kl": 0.18817138671875, "learning_rate": 4.953519937895174e-07, "loss": 0.0002, "reward": 1.721428632736206, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7214286159723997, "rewards/format_reward_func": 1.0, "step": 12100 }, { "completion_length": 218.18304634094238, "epoch": 2.029171381868477, "grad_norm": 0.20157766541376793, "kl": 0.151275634765625, "learning_rate": 4.953496911221145e-07, "loss": 0.0002, "reward": 1.7642857804894447, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 12102 }, { "completion_length": 231.22768783569336, "epoch": 2.0295066851083448, "grad_norm": 0.2464018435854221, "kl": 0.17156982421875, "learning_rate": 4.953473878898258e-07, "loss": 0.0002, "reward": 1.810714341700077, "reward_std": 0.0656599123030901, "rewards/equation_reward_func": 0.8196428790688515, "rewards/format_reward_func": 0.9910714328289032, "step": 12104 }, { "completion_length": 225.34375953674316, "epoch": 2.0298419883482124, "grad_norm": 0.18348432319917418, "kl": 0.15228271484375, "learning_rate": 4.953450840926563e-07, "loss": 0.0002, "reward": 1.7446429282426834, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7491071783006191, "rewards/format_reward_func": 0.9955357164144516, "step": 12106 }, { "completion_length": 228.602689743042, "epoch": 2.03017729158808, "grad_norm": 0.2064246228600626, "kl": 0.357513427734375, "learning_rate": 4.953427797306115e-07, "loss": 0.0004, "reward": 1.7803572043776512, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7848214544355869, "rewards/format_reward_func": 0.9955357164144516, "step": 12108 }, { "completion_length": 236.37054634094238, "epoch": 2.0305125948279477, "grad_norm": 0.1549182499060302, "kl": 0.149200439453125, "learning_rate": 4.953404748036965e-07, "loss": 0.0001, "reward": 1.7553571909666061, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7598214782774448, "rewards/format_reward_func": 0.9955357164144516, "step": 12110 }, { "completion_length": 237.93750953674316, "epoch": 2.0308478980678153, "grad_norm": 0.1820479868588014, "kl": 0.184326171875, "learning_rate": 4.953381693119169e-07, "loss": 0.0002, "reward": 1.74821437895298, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.752678606659174, "rewards/format_reward_func": 0.9955357164144516, "step": 12112 }, { "completion_length": 236.83929824829102, "epoch": 2.0311832013076825, "grad_norm": 0.1271072757567212, "kl": 0.206817626953125, "learning_rate": 4.95335863255278e-07, "loss": 0.0002, "reward": 1.76071435213089, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7696428783237934, "rewards/format_reward_func": 0.9910714328289032, "step": 12114 }, { "completion_length": 237.74108409881592, "epoch": 2.03151850454755, "grad_norm": 0.30312186630702503, "kl": 0.359375, "learning_rate": 4.953335566337847e-07, "loss": 0.0004, "reward": 1.726785808801651, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7312500327825546, "rewards/format_reward_func": 0.9955357164144516, "step": 12116 }, { "completion_length": 237.87500953674316, "epoch": 2.0318538077874178, "grad_norm": 0.34231948362766396, "kl": 0.210174560546875, "learning_rate": 4.953312494474427e-07, "loss": 0.0002, "reward": 1.7683036476373672, "reward_std": 0.06502856919541955, "rewards/equation_reward_func": 0.7830357365310192, "rewards/format_reward_func": 0.9852678664028645, "step": 12118 }, { "completion_length": 247.51340293884277, "epoch": 2.0321891110272854, "grad_norm": 0.24182887804491468, "kl": 0.211639404296875, "learning_rate": 4.953289416962573e-07, "loss": 0.0002, "reward": 1.7375000789761543, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7419643215835094, "rewards/format_reward_func": 0.9955357164144516, "step": 12120 }, { "completion_length": 236.508939743042, "epoch": 2.032524414267153, "grad_norm": 0.18792396805814143, "kl": 0.15826416015625, "learning_rate": 4.953266333802336e-07, "loss": 0.0002, "reward": 1.7482143342494965, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7616071589291096, "rewards/format_reward_func": 0.9866071492433548, "step": 12122 }, { "completion_length": 234.99554634094238, "epoch": 2.03285971750702, "grad_norm": 0.23981471522060907, "kl": 0.146087646484375, "learning_rate": 4.953243244993771e-07, "loss": 0.0001, "reward": 1.7839286252856255, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7883928790688515, "rewards/format_reward_func": 0.9955357164144516, "step": 12124 }, { "completion_length": 230.20090293884277, "epoch": 2.033195020746888, "grad_norm": 0.5070715919217202, "kl": 0.173065185546875, "learning_rate": 4.953220150536931e-07, "loss": 0.0002, "reward": 1.8071429058909416, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.8071428947150707, "rewards/format_reward_func": 1.0, "step": 12126 }, { "completion_length": 235.46429538726807, "epoch": 2.0335303239867555, "grad_norm": 0.20365360632250812, "kl": 0.140228271484375, "learning_rate": 4.953197050431867e-07, "loss": 0.0001, "reward": 1.7625000551342964, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643238186836, "rewards/format_reward_func": 0.9955357164144516, "step": 12128 }, { "completion_length": 228.5982255935669, "epoch": 2.033865627226623, "grad_norm": 0.3304221388416682, "kl": 0.15032958984375, "learning_rate": 4.953173944678635e-07, "loss": 0.0002, "reward": 1.8035714849829674, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.803571455180645, "rewards/format_reward_func": 1.0, "step": 12130 }, { "completion_length": 244.96429443359375, "epoch": 2.0342009304664908, "grad_norm": 0.48853927043600925, "kl": 0.200531005859375, "learning_rate": 4.953150833277286e-07, "loss": 0.0002, "reward": 1.7553571909666061, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.759821455925703, "rewards/format_reward_func": 0.9955357164144516, "step": 12132 }, { "completion_length": 233.75000953674316, "epoch": 2.0345362337063584, "grad_norm": 0.5231974895770883, "kl": 0.131744384765625, "learning_rate": 4.953127716227875e-07, "loss": 0.0001, "reward": 1.767857201397419, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571939468384, "rewards/format_reward_func": 1.0, "step": 12134 }, { "completion_length": 221.73661708831787, "epoch": 2.0348715369462256, "grad_norm": 0.156355831696372, "kl": 0.13409423828125, "learning_rate": 4.953104593530455e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 12136 }, { "completion_length": 239.66072463989258, "epoch": 2.035206840186093, "grad_norm": 0.28143142463041965, "kl": 0.167388916015625, "learning_rate": 4.953081465185077e-07, "loss": 0.0002, "reward": 1.7321429178118706, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7321428991854191, "rewards/format_reward_func": 1.0, "step": 12138 }, { "completion_length": 217.82590198516846, "epoch": 2.035542143425961, "grad_norm": 0.15327124267949874, "kl": 0.12646484375, "learning_rate": 4.953058331191797e-07, "loss": 0.0001, "reward": 1.7517857775092125, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7562500275671482, "rewards/format_reward_func": 0.9955357164144516, "step": 12140 }, { "completion_length": 220.13840198516846, "epoch": 2.0358774466658285, "grad_norm": 0.243204474048529, "kl": 0.13507080078125, "learning_rate": 4.953035191550667e-07, "loss": 0.0001, "reward": 1.775000050663948, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000413507223, "rewards/format_reward_func": 1.0, "step": 12142 }, { "completion_length": 240.28572845458984, "epoch": 2.036212749905696, "grad_norm": 0.3016617870258245, "kl": 0.14996337890625, "learning_rate": 4.95301204626174e-07, "loss": 0.0001, "reward": 1.800000049173832, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.800000011920929, "rewards/format_reward_func": 1.0, "step": 12144 }, { "completion_length": 238.83036613464355, "epoch": 2.0365480531455633, "grad_norm": 0.9705617575803153, "kl": 0.199249267578125, "learning_rate": 4.952988895325071e-07, "loss": 0.0002, "reward": 1.7464286461472511, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464285958558321, "rewards/format_reward_func": 1.0, "step": 12146 }, { "completion_length": 242.16518688201904, "epoch": 2.036883356385431, "grad_norm": 0.24141183406603373, "kl": 0.160888671875, "learning_rate": 4.952965738740712e-07, "loss": 0.0002, "reward": 1.764285758137703, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7642857581377029, "rewards/format_reward_func": 1.0, "step": 12148 }, { "completion_length": 222.35715198516846, "epoch": 2.0372186596252986, "grad_norm": 0.11416317860538366, "kl": 0.1640625, "learning_rate": 4.952942576508715e-07, "loss": 0.0002, "reward": 1.7892857566475868, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 12150 }, { "completion_length": 232.70536708831787, "epoch": 2.037553962865166, "grad_norm": 0.5464999282439347, "kl": 0.185272216796875, "learning_rate": 4.952919408629136e-07, "loss": 0.0002, "reward": 1.8321429267525673, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8321429006755352, "rewards/format_reward_func": 1.0, "step": 12152 }, { "completion_length": 231.72322463989258, "epoch": 2.037889266105034, "grad_norm": 0.19676459706372948, "kl": 0.166259765625, "learning_rate": 4.952896235102027e-07, "loss": 0.0002, "reward": 1.7178572118282318, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7178571820259094, "rewards/format_reward_func": 1.0, "step": 12154 }, { "completion_length": 233.97322463989258, "epoch": 2.0382245693449015, "grad_norm": 0.3356855438729618, "kl": 0.20611572265625, "learning_rate": 4.95287305592744e-07, "loss": 0.0002, "reward": 1.8214286267757416, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8214286006987095, "rewards/format_reward_func": 1.0, "step": 12156 }, { "completion_length": 238.13393783569336, "epoch": 2.0385598725847687, "grad_norm": 0.11742634321299134, "kl": 0.263336181640625, "learning_rate": 4.952849871105431e-07, "loss": 0.0003, "reward": 1.7575893327593803, "reward_std": 0.02967323106713593, "rewards/equation_reward_func": 0.7633928973227739, "rewards/format_reward_func": 0.9941964335739613, "step": 12158 }, { "completion_length": 234.12947273254395, "epoch": 2.0388951758246363, "grad_norm": 0.13895650350235308, "kl": 0.15374755859375, "learning_rate": 4.952826680636051e-07, "loss": 0.0002, "reward": 1.7785714864730835, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 12160 }, { "completion_length": 241.33483219146729, "epoch": 2.039230479064504, "grad_norm": 0.2290955370201332, "kl": 0.221832275390625, "learning_rate": 4.952803484519357e-07, "loss": 0.0002, "reward": 1.8214285895228386, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8214286044239998, "rewards/format_reward_func": 1.0, "step": 12162 }, { "completion_length": 248.62054824829102, "epoch": 2.0395657823043716, "grad_norm": 0.5380827733704721, "kl": 0.365081787109375, "learning_rate": 4.952780282755398e-07, "loss": 0.0004, "reward": 1.7196429148316383, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7241071630269289, "rewards/format_reward_func": 0.9955357164144516, "step": 12164 }, { "completion_length": 244.63393783569336, "epoch": 2.039901085544239, "grad_norm": 0.21779016308612434, "kl": 0.276580810546875, "learning_rate": 4.95275707534423e-07, "loss": 0.0003, "reward": 1.785714365541935, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.785714328289032, "rewards/format_reward_func": 1.0, "step": 12166 }, { "completion_length": 237.87054347991943, "epoch": 2.0402363887841064, "grad_norm": 0.12616860556579165, "kl": 0.215911865234375, "learning_rate": 4.952733862285905e-07, "loss": 0.0002, "reward": 1.7821429297327995, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7910714633762836, "rewards/format_reward_func": 0.9910714328289032, "step": 12168 }, { "completion_length": 262.1384057998657, "epoch": 2.040571692023974, "grad_norm": 0.19813592000547936, "kl": 0.26214599609375, "learning_rate": 4.952710643580478e-07, "loss": 0.0003, "reward": 1.7071429193019867, "reward_std": 0.08081220556050539, "rewards/equation_reward_func": 0.7250000312924385, "rewards/format_reward_func": 0.9821428656578064, "step": 12170 }, { "completion_length": 254.040189743042, "epoch": 2.0409069952638417, "grad_norm": 0.16061184685130958, "kl": 0.239654541015625, "learning_rate": 4.952687419228001e-07, "loss": 0.0002, "reward": 1.749553620815277, "reward_std": 0.06124049751088023, "rewards/equation_reward_func": 0.7616071701049805, "rewards/format_reward_func": 0.9879464358091354, "step": 12172 }, { "completion_length": 242.59375953674316, "epoch": 2.0412422985037093, "grad_norm": 0.2398218040558381, "kl": 0.1614990234375, "learning_rate": 4.952664189228529e-07, "loss": 0.0002, "reward": 1.7839286178350449, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 12174 }, { "completion_length": 239.91965579986572, "epoch": 2.041577601743577, "grad_norm": 0.1620207173064198, "kl": 0.160888671875, "learning_rate": 4.952640953582114e-07, "loss": 0.0002, "reward": 1.7428572103381157, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7428571879863739, "rewards/format_reward_func": 1.0, "step": 12176 }, { "completion_length": 237.92858028411865, "epoch": 2.0419129049834446, "grad_norm": 0.21874752432843478, "kl": 0.17864990234375, "learning_rate": 4.95261771228881e-07, "loss": 0.0002, "reward": 1.767857201397419, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571827709675, "rewards/format_reward_func": 1.0, "step": 12178 }, { "completion_length": 251.3616180419922, "epoch": 2.0422482082233118, "grad_norm": 0.26590850068573935, "kl": 0.19781494140625, "learning_rate": 4.952594465348672e-07, "loss": 0.0002, "reward": 1.7482143491506577, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7526785936206579, "rewards/format_reward_func": 0.9955357164144516, "step": 12180 }, { "completion_length": 225.64286708831787, "epoch": 2.0425835114631794, "grad_norm": 0.20064903092848752, "kl": 0.1510009765625, "learning_rate": 4.95257121276175e-07, "loss": 0.0002, "reward": 1.8196429088711739, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.833035733550787, "rewards/format_reward_func": 0.9866071492433548, "step": 12182 }, { "completion_length": 246.2946538925171, "epoch": 2.042918814703047, "grad_norm": 0.2570804310225239, "kl": 0.225006103515625, "learning_rate": 4.952547954528101e-07, "loss": 0.0002, "reward": 1.7053572162985802, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7098214793950319, "rewards/format_reward_func": 0.9955357164144516, "step": 12184 }, { "completion_length": 231.93304634094238, "epoch": 2.0432541179429147, "grad_norm": 0.24222254138144964, "kl": 0.16400146484375, "learning_rate": 4.952524690647778e-07, "loss": 0.0002, "reward": 1.7910714745521545, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7955357544124126, "rewards/format_reward_func": 0.9955357164144516, "step": 12186 }, { "completion_length": 236.96876049041748, "epoch": 2.0435894211827823, "grad_norm": 0.24676297049278167, "kl": 0.234893798828125, "learning_rate": 4.952501421120832e-07, "loss": 0.0002, "reward": 1.7892857566475868, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.798214316368103, "rewards/format_reward_func": 0.9910714328289032, "step": 12188 }, { "completion_length": 232.9687623977661, "epoch": 2.04392472442265, "grad_norm": 0.20240000669952848, "kl": 0.198699951171875, "learning_rate": 4.952478145947321e-07, "loss": 0.0002, "reward": 1.7482143566012383, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7616071663796902, "rewards/format_reward_func": 0.9866071492433548, "step": 12190 }, { "completion_length": 234.76340198516846, "epoch": 2.044260027662517, "grad_norm": 0.2367568215370992, "kl": 0.19390869140625, "learning_rate": 4.952454865127295e-07, "loss": 0.0002, "reward": 1.7642857879400253, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857320606709, "rewards/format_reward_func": 1.0, "step": 12192 }, { "completion_length": 226.4553680419922, "epoch": 2.0445953309023848, "grad_norm": 0.3957196761252101, "kl": 0.1790771484375, "learning_rate": 4.952431578660807e-07, "loss": 0.0002, "reward": 1.7803572118282318, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7848214544355869, "rewards/format_reward_func": 0.9955357164144516, "step": 12194 }, { "completion_length": 238.0446548461914, "epoch": 2.0449306341422524, "grad_norm": 0.41923647210505366, "kl": 0.213134765625, "learning_rate": 4.952408286547913e-07, "loss": 0.0002, "reward": 1.753571480512619, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714767873287, "rewards/format_reward_func": 1.0, "step": 12196 }, { "completion_length": 230.87054824829102, "epoch": 2.04526593738212, "grad_norm": 0.16331890639992575, "kl": 0.154632568359375, "learning_rate": 4.952384988788666e-07, "loss": 0.0002, "reward": 1.6553572416305542, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.6598214600235224, "rewards/format_reward_func": 0.9955357164144516, "step": 12198 }, { "completion_length": 224.12500858306885, "epoch": 2.0456012406219877, "grad_norm": 0.4519803190382613, "kl": 0.20135498046875, "learning_rate": 4.95236168538312e-07, "loss": 0.0002, "reward": 1.7107143625617027, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7107143215835094, "rewards/format_reward_func": 1.0, "step": 12200 }, { "completion_length": 226.50000762939453, "epoch": 2.045936543861855, "grad_norm": 0.338038635536733, "kl": 0.194580078125, "learning_rate": 4.952338376331327e-07, "loss": 0.0002, "reward": 1.7857143431901932, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 12202 }, { "completion_length": 223.4464406967163, "epoch": 2.0462718471017225, "grad_norm": 0.10915545282494744, "kl": 0.13482666015625, "learning_rate": 4.952315061633343e-07, "loss": 0.0001, "reward": 1.8285714909434319, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8285714574158192, "rewards/format_reward_func": 1.0, "step": 12204 }, { "completion_length": 224.90179824829102, "epoch": 2.04660715034159, "grad_norm": 0.09504629135042848, "kl": 0.133575439453125, "learning_rate": 4.952291741289221e-07, "loss": 0.0001, "reward": 1.7767857760190964, "reward_std": 0.0328299580141902, "rewards/equation_reward_func": 0.781250037252903, "rewards/format_reward_func": 0.9955357164144516, "step": 12206 }, { "completion_length": 225.60268878936768, "epoch": 2.0469424535814578, "grad_norm": 0.1865660206136097, "kl": 0.11761474609375, "learning_rate": 4.952268415299013e-07, "loss": 0.0001, "reward": 1.8250000402331352, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8250000104308128, "rewards/format_reward_func": 1.0, "step": 12208 }, { "completion_length": 228.60268878936768, "epoch": 2.0472777568213254, "grad_norm": 0.22629386598093604, "kl": 0.16473388671875, "learning_rate": 4.952245083662774e-07, "loss": 0.0002, "reward": 1.7535715252161026, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714507102966, "rewards/format_reward_func": 1.0, "step": 12210 }, { "completion_length": 230.33929634094238, "epoch": 2.047613060061193, "grad_norm": 0.27102016159457293, "kl": 0.1397705078125, "learning_rate": 4.952221746380557e-07, "loss": 0.0001, "reward": 1.760714367032051, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 12212 }, { "completion_length": 231.64286994934082, "epoch": 2.0479483633010602, "grad_norm": 0.29415841832577194, "kl": 0.14801025390625, "learning_rate": 4.952198403452417e-07, "loss": 0.0001, "reward": 1.7142857760190964, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.714285746216774, "rewards/format_reward_func": 1.0, "step": 12214 }, { "completion_length": 228.3616180419922, "epoch": 2.048283666540928, "grad_norm": 0.27608760539171945, "kl": 0.14324951171875, "learning_rate": 4.952175054878407e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 12216 }, { "completion_length": 221.0000057220459, "epoch": 2.0486189697807955, "grad_norm": 0.16831740463669606, "kl": 0.14044189453125, "learning_rate": 4.952151700658581e-07, "loss": 0.0001, "reward": 1.7767857611179352, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7812500298023224, "rewards/format_reward_func": 0.9955357164144516, "step": 12218 }, { "completion_length": 230.52233028411865, "epoch": 2.048954273020663, "grad_norm": 0.30209937383700664, "kl": 0.13238525390625, "learning_rate": 4.952128340792992e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857488244772, "rewards/format_reward_func": 1.0, "step": 12220 }, { "completion_length": 222.59375953674316, "epoch": 2.0492895762605308, "grad_norm": 0.3109690344702524, "kl": 0.151123046875, "learning_rate": 4.952104975281696e-07, "loss": 0.0002, "reward": 1.7535715252161026, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714693367481, "rewards/format_reward_func": 1.0, "step": 12222 }, { "completion_length": 232.5134038925171, "epoch": 2.049624879500398, "grad_norm": 0.2744549215687352, "kl": 0.1292724609375, "learning_rate": 4.952081604124743e-07, "loss": 0.0001, "reward": 1.6910715103149414, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.6955357603728771, "rewards/format_reward_func": 0.9955357164144516, "step": 12224 }, { "completion_length": 235.00001049041748, "epoch": 2.0499601827402656, "grad_norm": 0.5163864123677877, "kl": 0.248382568359375, "learning_rate": 4.952058227322191e-07, "loss": 0.0002, "reward": 1.7910714820027351, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7955357506871223, "rewards/format_reward_func": 0.9955357164144516, "step": 12226 }, { "completion_length": 220.38840293884277, "epoch": 2.0502954859801332, "grad_norm": 0.07724610768291482, "kl": 0.1690673828125, "learning_rate": 4.952034844874091e-07, "loss": 0.0002, "reward": 1.7964286133646965, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.796428594738245, "rewards/format_reward_func": 1.0, "step": 12228 }, { "completion_length": 242.69197463989258, "epoch": 2.050630789220001, "grad_norm": 0.16030103752196245, "kl": 0.132568359375, "learning_rate": 4.952011456780497e-07, "loss": 0.0001, "reward": 1.801785759627819, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.8062500134110451, "rewards/format_reward_func": 0.9955357164144516, "step": 12230 }, { "completion_length": 226.81251049041748, "epoch": 2.0509660924598685, "grad_norm": 0.17879382796345136, "kl": 0.123504638671875, "learning_rate": 4.951988063041464e-07, "loss": 0.0001, "reward": 1.7625000327825546, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7669643126428127, "rewards/format_reward_func": 0.9955357164144516, "step": 12232 }, { "completion_length": 227.62500858306885, "epoch": 2.051301395699736, "grad_norm": 0.26533960090283815, "kl": 0.143218994140625, "learning_rate": 4.951964663657046e-07, "loss": 0.0001, "reward": 1.8035714849829674, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8035714700818062, "rewards/format_reward_func": 1.0, "step": 12234 }, { "completion_length": 235.46875953674316, "epoch": 2.0516366989396033, "grad_norm": 0.3753427425352448, "kl": 0.139190673828125, "learning_rate": 4.951941258627294e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 12236 }, { "completion_length": 230.11608028411865, "epoch": 2.051972002179471, "grad_norm": 0.35682220127991493, "kl": 0.129425048828125, "learning_rate": 4.951917847952266e-07, "loss": 0.0001, "reward": 1.7321429252624512, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7321428693830967, "rewards/format_reward_func": 1.0, "step": 12238 }, { "completion_length": 222.79465198516846, "epoch": 2.0523073054193386, "grad_norm": 0.13520711911359942, "kl": 0.142059326171875, "learning_rate": 4.951894431632014e-07, "loss": 0.0001, "reward": 1.733928643167019, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7383928969502449, "rewards/format_reward_func": 0.9955357164144516, "step": 12240 }, { "completion_length": 218.290189743042, "epoch": 2.0526426086592062, "grad_norm": 0.13303137022439182, "kl": 0.121063232421875, "learning_rate": 4.951871009666591e-07, "loss": 0.0001, "reward": 1.7785715088248253, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 1.0, "step": 12242 }, { "completion_length": 223.2455472946167, "epoch": 2.052977911899074, "grad_norm": 0.2376218232090186, "kl": 0.28485107421875, "learning_rate": 4.951847582056053e-07, "loss": 0.0003, "reward": 1.8000000566244125, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 12244 }, { "completion_length": 207.01786708831787, "epoch": 2.0533132151389415, "grad_norm": 0.09265186879940798, "kl": 0.1336669921875, "learning_rate": 4.951824148800452e-07, "loss": 0.0001, "reward": 1.8071429058909416, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8071428723633289, "rewards/format_reward_func": 1.0, "step": 12246 }, { "completion_length": 219.26786613464355, "epoch": 2.0536485183788087, "grad_norm": 0.2203674172881168, "kl": 0.193115234375, "learning_rate": 4.951800709899843e-07, "loss": 0.0002, "reward": 1.7607143595814705, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 12248 }, { "completion_length": 219.40625762939453, "epoch": 2.0539838216186763, "grad_norm": 0.11510668979914045, "kl": 0.20257568359375, "learning_rate": 4.95177726535428e-07, "loss": 0.0002, "reward": 1.7500000819563866, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7500000447034836, "rewards/format_reward_func": 1.0, "step": 12250 }, { "completion_length": 217.38840198516846, "epoch": 2.054319124858544, "grad_norm": 0.16410520017656868, "kl": 0.1424102783203125, "learning_rate": 4.951753815163816e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 12252 }, { "completion_length": 211.48215198516846, "epoch": 2.0546544280984116, "grad_norm": 0.18802458515738016, "kl": 0.313232421875, "learning_rate": 4.951730359328507e-07, "loss": 0.0003, "reward": 1.7964286506175995, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.796428594738245, "rewards/format_reward_func": 1.0, "step": 12254 }, { "completion_length": 217.08483123779297, "epoch": 2.0549897313382792, "grad_norm": 0.3090721220893524, "kl": 0.201141357421875, "learning_rate": 4.951706897848404e-07, "loss": 0.0002, "reward": 1.7928571999073029, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 12256 }, { "completion_length": 214.23661518096924, "epoch": 2.0553250345781464, "grad_norm": 0.21218909376421832, "kl": 0.141326904296875, "learning_rate": 4.951683430723563e-07, "loss": 0.0001, "reward": 1.7642858102917671, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857357859612, "rewards/format_reward_func": 1.0, "step": 12258 }, { "completion_length": 216.852689743042, "epoch": 2.055660337818014, "grad_norm": 0.32296017850789377, "kl": 0.15814208984375, "learning_rate": 4.951659957954039e-07, "loss": 0.0002, "reward": 1.8017857819795609, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.8062500059604645, "rewards/format_reward_func": 0.9955357164144516, "step": 12260 }, { "completion_length": 214.16072463989258, "epoch": 2.0559956410578817, "grad_norm": 0.5911557976325647, "kl": 0.276123046875, "learning_rate": 4.951636479539883e-07, "loss": 0.0003, "reward": 1.8267857432365417, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8312500342726707, "rewards/format_reward_func": 0.9955357164144516, "step": 12262 }, { "completion_length": 226.8125123977661, "epoch": 2.0563309442977493, "grad_norm": 0.3042559685781432, "kl": 0.15435791015625, "learning_rate": 4.951612995481152e-07, "loss": 0.0002, "reward": 1.7553572058677673, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7598214671015739, "rewards/format_reward_func": 0.9955357164144516, "step": 12264 }, { "completion_length": 213.71429634094238, "epoch": 2.056666247537617, "grad_norm": 0.2080678635967977, "kl": 0.33575439453125, "learning_rate": 4.951589505777899e-07, "loss": 0.0003, "reward": 1.7821428999304771, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 12266 }, { "completion_length": 217.05804443359375, "epoch": 2.0570015507774846, "grad_norm": 0.1594908542843629, "kl": 0.2147216796875, "learning_rate": 4.951566010430177e-07, "loss": 0.0002, "reward": 1.7607143372297287, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143260538578, "rewards/format_reward_func": 1.0, "step": 12268 }, { "completion_length": 209.49554443359375, "epoch": 2.057336854017352, "grad_norm": 0.1559413315105943, "kl": 0.192535400390625, "learning_rate": 4.951542509438041e-07, "loss": 0.0002, "reward": 1.7589286342263222, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7633928954601288, "rewards/format_reward_func": 0.9955357164144516, "step": 12270 }, { "completion_length": 201.78125858306885, "epoch": 2.0576721572572194, "grad_norm": 0.271571152588572, "kl": 0.2626953125, "learning_rate": 4.951519002801546e-07, "loss": 0.0003, "reward": 1.7142858058214188, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7142857499420643, "rewards/format_reward_func": 1.0, "step": 12272 }, { "completion_length": 202.54911613464355, "epoch": 2.058007460497087, "grad_norm": 0.30856242436789477, "kl": 0.436859130859375, "learning_rate": 4.951495490520745e-07, "loss": 0.0004, "reward": 1.7571429386734962, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 12274 }, { "completion_length": 205.22322273254395, "epoch": 2.0583427637369547, "grad_norm": 0.21680426274120007, "kl": 0.1307373046875, "learning_rate": 4.951471972595694e-07, "loss": 0.0001, "reward": 1.800000049173832, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 12276 }, { "completion_length": 199.67858123779297, "epoch": 2.0586780669768223, "grad_norm": 0.10996703065060714, "kl": 0.391815185546875, "learning_rate": 4.951448449026443e-07, "loss": 0.0004, "reward": 1.7928571924567223, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7928571552038193, "rewards/format_reward_func": 1.0, "step": 12278 }, { "completion_length": 204.74108123779297, "epoch": 2.0590133702166895, "grad_norm": 0.09081324500665985, "kl": 1.105133056640625, "learning_rate": 4.95142491981305e-07, "loss": 0.0011, "reward": 1.8071429207921028, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8071428798139095, "rewards/format_reward_func": 1.0, "step": 12280 }, { "completion_length": 214.883939743042, "epoch": 2.059348673456557, "grad_norm": 0.34335413647230867, "kl": 0.147857666015625, "learning_rate": 4.951401384955568e-07, "loss": 0.0001, "reward": 1.7107143551111221, "reward_std": 0.035355339758098125, "rewards/equation_reward_func": 0.7196428906172514, "rewards/format_reward_func": 0.9910714328289032, "step": 12282 }, { "completion_length": 211.67857933044434, "epoch": 2.059683976696425, "grad_norm": 0.42040633375445585, "kl": 0.171630859375, "learning_rate": 4.951377844454051e-07, "loss": 0.0002, "reward": 1.7160715013742447, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7205357477068901, "rewards/format_reward_func": 0.9955357164144516, "step": 12284 }, { "completion_length": 216.30804634094238, "epoch": 2.0600192799362924, "grad_norm": 0.03826716338430946, "kl": 1.191925048828125, "learning_rate": 4.951354298308554e-07, "loss": 0.0012, "reward": 1.7892857789993286, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.789285734295845, "rewards/format_reward_func": 1.0, "step": 12286 }, { "completion_length": 212.11161613464355, "epoch": 2.06035458317616, "grad_norm": 0.3257955722546039, "kl": 0.256622314453125, "learning_rate": 4.951330746519129e-07, "loss": 0.0003, "reward": 1.7892857789993286, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 12288 }, { "completion_length": 211.2232255935669, "epoch": 2.0606898864160277, "grad_norm": 0.1655865158957351, "kl": 0.50634765625, "learning_rate": 4.951307189085833e-07, "loss": 0.0005, "reward": 1.7321429178118706, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7321428973227739, "rewards/format_reward_func": 1.0, "step": 12290 }, { "completion_length": 216.52679443359375, "epoch": 2.061025189655895, "grad_norm": 0.33884942243688243, "kl": 0.244232177734375, "learning_rate": 4.951283626008717e-07, "loss": 0.0002, "reward": 1.7321429401636124, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7321428917348385, "rewards/format_reward_func": 1.0, "step": 12292 }, { "completion_length": 208.66072463989258, "epoch": 2.0613604928957625, "grad_norm": 0.24913338205065425, "kl": 0.36474609375, "learning_rate": 4.951260057287839e-07, "loss": 0.0004, "reward": 1.7642857804894447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 12294 }, { "completion_length": 208.3616180419922, "epoch": 2.06169579613563, "grad_norm": 0.19352248282013856, "kl": 0.12322998046875, "learning_rate": 4.951236482923252e-07, "loss": 0.0001, "reward": 1.7892857939004898, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857566475868, "rewards/format_reward_func": 1.0, "step": 12296 }, { "completion_length": 208.8303689956665, "epoch": 2.062031099375498, "grad_norm": 0.08871718127407249, "kl": 0.1153717041015625, "learning_rate": 4.951212902915009e-07, "loss": 0.0001, "reward": 1.8196429163217545, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8241071626543999, "rewards/format_reward_func": 0.9955357164144516, "step": 12298 }, { "completion_length": 227.90625762939453, "epoch": 2.0623664026153654, "grad_norm": 0.1901467761656163, "kl": 0.38531494140625, "learning_rate": 4.951189317263164e-07, "loss": 0.0004, "reward": 1.7214286625385284, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7303571626543999, "rewards/format_reward_func": 0.9910714328289032, "step": 12300 }, { "completion_length": 209.73661518096924, "epoch": 2.0627017058552326, "grad_norm": 0.2980171673664818, "kl": 0.2169189453125, "learning_rate": 4.951165725967774e-07, "loss": 0.0002, "reward": 1.767410784959793, "reward_std": 0.0460882093757391, "rewards/equation_reward_func": 0.7705357521772385, "rewards/format_reward_func": 0.9968750067055225, "step": 12302 }, { "completion_length": 205.17411518096924, "epoch": 2.0630370090951002, "grad_norm": 0.12847052840527082, "kl": 0.134552001953125, "learning_rate": 4.951142129028891e-07, "loss": 0.0001, "reward": 1.810714341700077, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107143118977547, "rewards/format_reward_func": 1.0, "step": 12304 }, { "completion_length": 215.54018783569336, "epoch": 2.063372312334968, "grad_norm": 0.1620875253578144, "kl": 0.37359619140625, "learning_rate": 4.951118526446569e-07, "loss": 0.0004, "reward": 1.7714286372065544, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 12306 }, { "completion_length": 225.3259038925171, "epoch": 2.0637076155748355, "grad_norm": 0.24326793036443256, "kl": 0.216827392578125, "learning_rate": 4.951094918220865e-07, "loss": 0.0002, "reward": 1.814285770058632, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8142857328057289, "rewards/format_reward_func": 1.0, "step": 12308 }, { "completion_length": 211.29018878936768, "epoch": 2.064042918814703, "grad_norm": 0.16845783070573056, "kl": 0.169158935546875, "learning_rate": 4.951071304351831e-07, "loss": 0.0002, "reward": 1.7928571999073029, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 1.0, "step": 12310 }, { "completion_length": 205.84822368621826, "epoch": 2.0643782220545708, "grad_norm": 0.4214904190744296, "kl": 0.1233978271484375, "learning_rate": 4.951047684839522e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8035714626312256, "rewards/format_reward_func": 1.0, "step": 12312 }, { "completion_length": 210.79465007781982, "epoch": 2.064713525294438, "grad_norm": 0.21442170307890102, "kl": 0.15289306640625, "learning_rate": 4.951024059683993e-07, "loss": 0.0002, "reward": 1.7580357789993286, "reward_std": 0.059346459805965424, "rewards/equation_reward_func": 0.7598214745521545, "rewards/format_reward_func": 0.9982142895460129, "step": 12314 }, { "completion_length": 211.85715103149414, "epoch": 2.0650488285343056, "grad_norm": 0.3091366156084564, "kl": 0.134857177734375, "learning_rate": 4.951000428885297e-07, "loss": 0.0001, "reward": 1.844642885029316, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.849107164889574, "rewards/format_reward_func": 0.9955357164144516, "step": 12316 }, { "completion_length": 212.8571538925171, "epoch": 2.0653841317741732, "grad_norm": 0.45210288819457994, "kl": 0.677154541015625, "learning_rate": 4.95097679244349e-07, "loss": 0.0007, "reward": 1.7339286506175995, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7383928894996643, "rewards/format_reward_func": 0.9955357164144516, "step": 12318 }, { "completion_length": 227.17411708831787, "epoch": 2.065719435014041, "grad_norm": 0.2152976862107728, "kl": 0.2509307861328125, "learning_rate": 4.950953150358625e-07, "loss": 0.0003, "reward": 1.8232143223285675, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.8276785910129547, "rewards/format_reward_func": 0.9955357164144516, "step": 12320 }, { "completion_length": 227.540189743042, "epoch": 2.0660547382539085, "grad_norm": 0.5459242095577989, "kl": 0.271026611328125, "learning_rate": 4.950929502630757e-07, "loss": 0.0003, "reward": 1.782142922282219, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 12322 }, { "completion_length": 220.99554347991943, "epoch": 2.066390041493776, "grad_norm": 0.425233638393935, "kl": 0.907958984375, "learning_rate": 4.950905849259942e-07, "loss": 0.0009, "reward": 1.7821429446339607, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 12324 }, { "completion_length": 234.6250114440918, "epoch": 2.0667253447336433, "grad_norm": 0.3270812138751066, "kl": 0.9796600341796875, "learning_rate": 4.950882190246232e-07, "loss": 0.001, "reward": 1.6357143744826317, "reward_std": 0.11111677903681993, "rewards/equation_reward_func": 0.6625000331550837, "rewards/format_reward_func": 0.9732142984867096, "step": 12326 }, { "completion_length": 221.82143878936768, "epoch": 2.067060647973511, "grad_norm": 0.2345927392282807, "kl": 0.71343994140625, "learning_rate": 4.950858525589682e-07, "loss": 0.0007, "reward": 1.7857143357396126, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7946428917348385, "rewards/format_reward_func": 0.9910714328289032, "step": 12328 }, { "completion_length": 222.04018783569336, "epoch": 2.0673959512133786, "grad_norm": 0.1827131215605797, "kl": 0.113525390625, "learning_rate": 4.950834855290347e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 12330 }, { "completion_length": 212.12500953674316, "epoch": 2.0677312544532462, "grad_norm": 0.17331603165530027, "kl": 0.10479736328125, "learning_rate": 4.950811179348282e-07, "loss": 0.0001, "reward": 1.8535714596509933, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8535714522004128, "rewards/format_reward_func": 1.0, "step": 12332 }, { "completion_length": 226.7901906967163, "epoch": 2.068066557693114, "grad_norm": 0.2783940467018383, "kl": 2.2259521484375, "learning_rate": 4.950787497763541e-07, "loss": 0.0022, "reward": 1.7232143580913544, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7357143126428127, "rewards/format_reward_func": 0.9875000044703484, "step": 12334 }, { "completion_length": 220.68751049041748, "epoch": 2.068401860932981, "grad_norm": 0.2831385527880495, "kl": 0.7620086669921875, "learning_rate": 4.950763810536178e-07, "loss": 0.0008, "reward": 1.8035714998841286, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8035714589059353, "rewards/format_reward_func": 1.0, "step": 12336 }, { "completion_length": 225.80804634094238, "epoch": 2.0687371641728487, "grad_norm": 0.2587481213808961, "kl": 2.004638671875, "learning_rate": 4.950740117666248e-07, "loss": 0.002, "reward": 1.7678572311997414, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 12338 }, { "completion_length": 225.87501049041748, "epoch": 2.0690724674127163, "grad_norm": 0.14320682134950544, "kl": 0.146728515625, "learning_rate": 4.950716419153806e-07, "loss": 0.0001, "reward": 1.7892857939004898, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 12340 }, { "completion_length": 226.38840198516846, "epoch": 2.069407770652584, "grad_norm": 0.1876906013637508, "kl": 0.278961181640625, "learning_rate": 4.950692714998906e-07, "loss": 0.0003, "reward": 1.7241072282195091, "reward_std": 0.06692260596901178, "rewards/equation_reward_func": 0.7303571663796902, "rewards/format_reward_func": 0.9937500059604645, "step": 12342 }, { "completion_length": 234.2946538925171, "epoch": 2.0697430738924516, "grad_norm": 0.37498717092004047, "kl": 0.311004638671875, "learning_rate": 4.950669005201603e-07, "loss": 0.0003, "reward": 1.716071493923664, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7205357626080513, "rewards/format_reward_func": 0.9955357164144516, "step": 12344 }, { "completion_length": 218.04465579986572, "epoch": 2.0700783771323192, "grad_norm": 0.18627335088277233, "kl": 0.1470947265625, "learning_rate": 4.950645289761952e-07, "loss": 0.0001, "reward": 1.8250000476837158, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8250000178813934, "rewards/format_reward_func": 1.0, "step": 12346 }, { "completion_length": 228.9017972946167, "epoch": 2.0704136803721864, "grad_norm": 0.3317378232716388, "kl": 0.1026153564453125, "learning_rate": 4.950621568680006e-07, "loss": 0.0001, "reward": 1.7625000551342964, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643126428127, "rewards/format_reward_func": 0.9955357164144516, "step": 12348 }, { "completion_length": 215.88840198516846, "epoch": 2.070748983612054, "grad_norm": 0.13501275717490083, "kl": 0.2255859375, "learning_rate": 4.950597841955821e-07, "loss": 0.0002, "reward": 1.767857201397419, "reward_std": 0.03535533882677555, "rewards/equation_reward_func": 0.7767857350409031, "rewards/format_reward_func": 0.9910714328289032, "step": 12350 }, { "completion_length": 223.21429443359375, "epoch": 2.0710842868519217, "grad_norm": 0.1918316612561862, "kl": 0.6068115234375, "learning_rate": 4.95057410958945e-07, "loss": 0.0006, "reward": 1.7535714954137802, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 12352 }, { "completion_length": 222.90179538726807, "epoch": 2.0714195900917893, "grad_norm": 0.28909041151334275, "kl": 0.0965576171875, "learning_rate": 4.95055037158095e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857428640127, "rewards/format_reward_func": 1.0, "step": 12354 }, { "completion_length": 232.17411708831787, "epoch": 2.071754893331657, "grad_norm": 0.23212451175861568, "kl": 0.3805084228515625, "learning_rate": 4.950526627930374e-07, "loss": 0.0004, "reward": 1.7410715073347092, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7455357536673546, "rewards/format_reward_func": 0.9955357164144516, "step": 12356 }, { "completion_length": 229.99108028411865, "epoch": 2.0720901965715246, "grad_norm": 0.1909832712879006, "kl": 0.110687255859375, "learning_rate": 4.950502878637776e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7392857410013676, "rewards/format_reward_func": 1.0, "step": 12358 }, { "completion_length": 220.53572463989258, "epoch": 2.072425499811392, "grad_norm": 0.10395956700858099, "kl": 0.14569091796875, "learning_rate": 4.950479123703213e-07, "loss": 0.0001, "reward": 1.807142898440361, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8071428742259741, "rewards/format_reward_func": 1.0, "step": 12360 }, { "completion_length": 219.85715103149414, "epoch": 2.0727608030512594, "grad_norm": 0.18317384805951137, "kl": 0.105224609375, "learning_rate": 4.950455363126739e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428868919611, "rewards/format_reward_func": 1.0, "step": 12362 }, { "completion_length": 223.12500953674316, "epoch": 2.073096106291127, "grad_norm": 0.1526248796039914, "kl": 0.239898681640625, "learning_rate": 4.950431596908408e-07, "loss": 0.0002, "reward": 1.7571429312229156, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428846567869, "rewards/format_reward_func": 1.0, "step": 12364 }, { "completion_length": 220.6250114440918, "epoch": 2.0734314095309947, "grad_norm": 0.25779972652365757, "kl": 0.1161346435546875, "learning_rate": 4.950407825048273e-07, "loss": 0.0001, "reward": 1.791071467101574, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7955357432365417, "rewards/format_reward_func": 0.9955357164144516, "step": 12366 }, { "completion_length": 214.35268688201904, "epoch": 2.0737667127708623, "grad_norm": 0.19846381452816875, "kl": 0.09454345703125, "learning_rate": 4.950384047546393e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 12368 }, { "completion_length": 220.6384038925171, "epoch": 2.0741020160107295, "grad_norm": 0.21754462554569812, "kl": 0.136505126953125, "learning_rate": 4.95036026440282e-07, "loss": 0.0001, "reward": 1.751785770058632, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7562500238418579, "rewards/format_reward_func": 0.9955357164144516, "step": 12370 }, { "completion_length": 225.25001049041748, "epoch": 2.074437319250597, "grad_norm": 0.18328997328817917, "kl": 0.206512451171875, "learning_rate": 4.950336475617608e-07, "loss": 0.0002, "reward": 1.7696429193019867, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7741071656346321, "rewards/format_reward_func": 0.9955357164144516, "step": 12372 }, { "completion_length": 221.01340293884277, "epoch": 2.074772622490465, "grad_norm": 0.2382505115459546, "kl": 0.163055419921875, "learning_rate": 4.950312681190813e-07, "loss": 0.0002, "reward": 1.7660714983940125, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.770535733550787, "rewards/format_reward_func": 0.9955357164144516, "step": 12374 }, { "completion_length": 221.47322368621826, "epoch": 2.0751079257303324, "grad_norm": 0.36282886473131615, "kl": 0.253021240234375, "learning_rate": 4.950288881122491e-07, "loss": 0.0003, "reward": 1.8071429058909416, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8071428742259741, "rewards/format_reward_func": 1.0, "step": 12376 }, { "completion_length": 219.15179634094238, "epoch": 2.0754432289702, "grad_norm": 0.06999156876761552, "kl": 0.363494873046875, "learning_rate": 4.950265075412694e-07, "loss": 0.0004, "reward": 1.7750000581145287, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000171363354, "rewards/format_reward_func": 1.0, "step": 12378 }, { "completion_length": 207.70536613464355, "epoch": 2.0757785322100677, "grad_norm": 0.21250234133357188, "kl": 0.13916015625, "learning_rate": 4.95024126406148e-07, "loss": 0.0001, "reward": 1.817857176065445, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.817857176065445, "rewards/format_reward_func": 1.0, "step": 12380 }, { "completion_length": 221.18750953674316, "epoch": 2.076113835449935, "grad_norm": 0.21523313575949132, "kl": 0.14617919921875, "learning_rate": 4.950217447068901e-07, "loss": 0.0001, "reward": 1.7160715162754059, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.720535758882761, "rewards/format_reward_func": 0.9955357164144516, "step": 12382 }, { "completion_length": 211.87500953674316, "epoch": 2.0764491386898025, "grad_norm": 0.1939850515633133, "kl": 0.119110107421875, "learning_rate": 4.950193624435013e-07, "loss": 0.0001, "reward": 1.7500000521540642, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000242143869, "rewards/format_reward_func": 1.0, "step": 12384 }, { "completion_length": 220.07590198516846, "epoch": 2.07678444192967, "grad_norm": 0.23250932473523991, "kl": 0.113616943359375, "learning_rate": 4.950169796159871e-07, "loss": 0.0001, "reward": 1.7892857864499092, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 12386 }, { "completion_length": 217.35715293884277, "epoch": 2.077119745169538, "grad_norm": 0.21520081391524276, "kl": 0.161041259765625, "learning_rate": 4.950145962243529e-07, "loss": 0.0002, "reward": 1.785714328289032, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 12388 }, { "completion_length": 223.40179538726807, "epoch": 2.0774550484094054, "grad_norm": 0.751735218253887, "kl": 1.289947509765625, "learning_rate": 4.950122122686044e-07, "loss": 0.0013, "reward": 1.792857177555561, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571589291096, "rewards/format_reward_func": 1.0, "step": 12390 }, { "completion_length": 220.88840293884277, "epoch": 2.0777903516492726, "grad_norm": 0.2627694441752578, "kl": 0.12933349609375, "learning_rate": 4.950098277487468e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 12392 }, { "completion_length": 217.602689743042, "epoch": 2.0781256548891403, "grad_norm": 0.09369122758849056, "kl": 0.142425537109375, "learning_rate": 4.950074426647858e-07, "loss": 0.0001, "reward": 1.7303572297096252, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7348214723169804, "rewards/format_reward_func": 0.9955357164144516, "step": 12394 }, { "completion_length": 213.12947368621826, "epoch": 2.078460958129008, "grad_norm": 0.22262652267705993, "kl": 0.17474365234375, "learning_rate": 4.950050570167268e-07, "loss": 0.0002, "reward": 1.8000000640749931, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 12396 }, { "completion_length": 216.79911518096924, "epoch": 2.0787962613688755, "grad_norm": 0.252928583623105, "kl": 0.204071044921875, "learning_rate": 4.950026708045754e-07, "loss": 0.0002, "reward": 1.7803572416305542, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7848214581608772, "rewards/format_reward_func": 0.9955357164144516, "step": 12398 }, { "completion_length": 222.32590293884277, "epoch": 2.079131564608743, "grad_norm": 0.1992977674304534, "kl": 0.112945556640625, "learning_rate": 4.95000284028337e-07, "loss": 0.0001, "reward": 1.756696492433548, "reward_std": 0.04103744635358453, "rewards/equation_reward_func": 0.7598214615136385, "rewards/format_reward_func": 0.9968750029802322, "step": 12400 }, { "completion_length": 229.6071538925171, "epoch": 2.079466867848611, "grad_norm": 0.8789151158222025, "kl": 0.360076904296875, "learning_rate": 4.94997896688017e-07, "loss": 0.0004, "reward": 1.7660714909434319, "reward_std": 0.07828682195395231, "rewards/equation_reward_func": 0.7794643081724644, "rewards/format_reward_func": 0.9866071492433548, "step": 12402 }, { "completion_length": 228.88840103149414, "epoch": 2.079802171088478, "grad_norm": 0.1887117558689548, "kl": 0.135833740234375, "learning_rate": 4.94995508783621e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7321428842842579, "rewards/format_reward_func": 1.0, "step": 12404 }, { "completion_length": 232.11608409881592, "epoch": 2.0801374743283456, "grad_norm": 0.19187944328862966, "kl": 0.108917236328125, "learning_rate": 4.949931203151547e-07, "loss": 0.0001, "reward": 1.7375000640749931, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7419643066823483, "rewards/format_reward_func": 0.9955357164144516, "step": 12406 }, { "completion_length": 233.0803689956665, "epoch": 2.0804727775682132, "grad_norm": 0.21875754778227985, "kl": 0.12646484375, "learning_rate": 4.949907312826231e-07, "loss": 0.0001, "reward": 1.7625000551342964, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7758928798139095, "rewards/format_reward_func": 0.9866071455180645, "step": 12408 }, { "completion_length": 221.72768688201904, "epoch": 2.080808080808081, "grad_norm": 0.28006826916314953, "kl": 0.11962890625, "learning_rate": 4.949883416860322e-07, "loss": 0.0001, "reward": 1.7803571969270706, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7848214581608772, "rewards/format_reward_func": 0.9955357164144516, "step": 12410 }, { "completion_length": 231.99108028411865, "epoch": 2.0811433840479485, "grad_norm": 0.10164294385984807, "kl": 0.131683349609375, "learning_rate": 4.949859515253873e-07, "loss": 0.0001, "reward": 1.7607143372297287, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7607143260538578, "rewards/format_reward_func": 1.0, "step": 12412 }, { "completion_length": 226.0044755935669, "epoch": 2.0814786872878157, "grad_norm": 0.2045897020362465, "kl": 0.165740966796875, "learning_rate": 4.949835608006939e-07, "loss": 0.0002, "reward": 1.7821429073810577, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 12414 }, { "completion_length": 236.52679443359375, "epoch": 2.0818139905276833, "grad_norm": 0.16084116411428384, "kl": 0.12066650390625, "learning_rate": 4.949811695119574e-07, "loss": 0.0001, "reward": 1.7428572177886963, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7517857365310192, "rewards/format_reward_func": 0.9910714328289032, "step": 12416 }, { "completion_length": 232.37947845458984, "epoch": 2.082149293767551, "grad_norm": 0.21511452457931682, "kl": 0.130126953125, "learning_rate": 4.949787776591836e-07, "loss": 0.0001, "reward": 1.7767857387661934, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7812500391155481, "rewards/format_reward_func": 0.9955357164144516, "step": 12418 }, { "completion_length": 229.25447463989258, "epoch": 2.0824845970074186, "grad_norm": 0.21129750414378579, "kl": 0.1319732666015625, "learning_rate": 4.949763852423776e-07, "loss": 0.0001, "reward": 1.735714353621006, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143312692642, "rewards/format_reward_func": 1.0, "step": 12420 }, { "completion_length": 237.6964406967163, "epoch": 2.0828199002472862, "grad_norm": 0.40663412421997575, "kl": 0.1898193359375, "learning_rate": 4.949739922615454e-07, "loss": 0.0002, "reward": 1.7589286863803864, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7633928768336773, "rewards/format_reward_func": 0.9955357164144516, "step": 12422 }, { "completion_length": 230.7946538925171, "epoch": 2.083155203487154, "grad_norm": 0.16470876741545637, "kl": 0.1302337646484375, "learning_rate": 4.949715987166921e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7839285936206579, "rewards/format_reward_func": 0.9910714328289032, "step": 12424 }, { "completion_length": 230.53572368621826, "epoch": 2.083490506727021, "grad_norm": 0.15797828640324696, "kl": 0.15777587890625, "learning_rate": 4.949692046078232e-07, "loss": 0.0002, "reward": 1.7857143580913544, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 12426 }, { "completion_length": 237.6875114440918, "epoch": 2.0838258099668887, "grad_norm": 0.22081453106368287, "kl": 0.161590576171875, "learning_rate": 4.949668099349446e-07, "loss": 0.0002, "reward": 1.782142922282219, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 12428 }, { "completion_length": 228.06250858306885, "epoch": 2.0841611132067563, "grad_norm": 0.22329552621134163, "kl": 0.1348876953125, "learning_rate": 4.949644146980615e-07, "loss": 0.0001, "reward": 1.7464286163449287, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 1.0, "step": 12430 }, { "completion_length": 230.2991180419922, "epoch": 2.084496416446624, "grad_norm": 0.22023536636649677, "kl": 0.164520263671875, "learning_rate": 4.949620188971794e-07, "loss": 0.0002, "reward": 1.764285795390606, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7732143178582191, "rewards/format_reward_func": 0.9910714328289032, "step": 12432 }, { "completion_length": 224.04018878936768, "epoch": 2.0848317196864916, "grad_norm": 0.17440609622680234, "kl": 0.1301422119140625, "learning_rate": 4.94959622532304e-07, "loss": 0.0001, "reward": 1.8142857775092125, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857402563095, "rewards/format_reward_func": 1.0, "step": 12434 }, { "completion_length": 232.14733219146729, "epoch": 2.085167022926359, "grad_norm": 0.17576071552016947, "kl": 0.13531494140625, "learning_rate": 4.949572256034406e-07, "loss": 0.0001, "reward": 1.783928632736206, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7883928753435612, "rewards/format_reward_func": 0.9955357164144516, "step": 12436 }, { "completion_length": 232.62947463989258, "epoch": 2.0855023261662264, "grad_norm": 0.19723789067097333, "kl": 0.14813232421875, "learning_rate": 4.94954828110595e-07, "loss": 0.0001, "reward": 1.7785714715719223, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7875000219792128, "rewards/format_reward_func": 0.9910714328289032, "step": 12438 }, { "completion_length": 235.81697463989258, "epoch": 2.085837629406094, "grad_norm": 0.14950628149768866, "kl": 0.1369171142578125, "learning_rate": 4.949524300537726e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7589286081492901, "rewards/format_reward_func": 0.9910714328289032, "step": 12440 }, { "completion_length": 230.64286708831787, "epoch": 2.0861729326459617, "grad_norm": 0.4378233230600497, "kl": 0.23809814453125, "learning_rate": 4.949500314329787e-07, "loss": 0.0002, "reward": 1.7089286223053932, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7133928835391998, "rewards/format_reward_func": 0.9955357164144516, "step": 12442 }, { "completion_length": 222.9107255935669, "epoch": 2.0865082358858293, "grad_norm": 0.46352014756844934, "kl": 0.178680419921875, "learning_rate": 4.949476322482191e-07, "loss": 0.0002, "reward": 1.782142922282219, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428775787354, "rewards/format_reward_func": 1.0, "step": 12444 }, { "completion_length": 215.74554443359375, "epoch": 2.086843539125697, "grad_norm": 0.24448623383945575, "kl": 0.13616943359375, "learning_rate": 4.949452324994992e-07, "loss": 0.0001, "reward": 1.810714341700077, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107143081724644, "rewards/format_reward_func": 1.0, "step": 12446 }, { "completion_length": 226.6250123977661, "epoch": 2.087178842365564, "grad_norm": 0.2482961130975891, "kl": 0.15899658203125, "learning_rate": 4.949428321868246e-07, "loss": 0.0002, "reward": 1.7714286372065544, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 12448 }, { "completion_length": 232.67858409881592, "epoch": 2.087514145605432, "grad_norm": 0.31050673652855626, "kl": 0.240020751953125, "learning_rate": 4.949404313102008e-07, "loss": 0.0002, "reward": 1.7589286416769028, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7633928842842579, "rewards/format_reward_func": 0.9955357164144516, "step": 12450 }, { "completion_length": 234.43304443359375, "epoch": 2.0878494488452994, "grad_norm": 0.26036864450294417, "kl": 0.2906341552734375, "learning_rate": 4.949380298696333e-07, "loss": 0.0003, "reward": 1.7723214700818062, "reward_std": 0.049244935624301434, "rewards/equation_reward_func": 0.774107176810503, "rewards/format_reward_func": 0.9982142895460129, "step": 12452 }, { "completion_length": 229.3616180419922, "epoch": 2.088184752085167, "grad_norm": 0.09615753218038946, "kl": 0.14678955078125, "learning_rate": 4.949356278651277e-07, "loss": 0.0001, "reward": 1.74642863124609, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.746428582817316, "rewards/format_reward_func": 1.0, "step": 12454 }, { "completion_length": 237.2232255935669, "epoch": 2.0885200553250347, "grad_norm": 0.24540628173182102, "kl": 0.163970947265625, "learning_rate": 4.949332252966893e-07, "loss": 0.0002, "reward": 1.750000074505806, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7589285969734192, "rewards/format_reward_func": 0.9910714328289032, "step": 12456 }, { "completion_length": 221.55804538726807, "epoch": 2.0888553585649023, "grad_norm": 0.18356936840482607, "kl": 0.1224517822265625, "learning_rate": 4.949308221643239e-07, "loss": 0.0001, "reward": 1.731250062584877, "reward_std": 0.056821079924702644, "rewards/equation_reward_func": 0.7375000398606062, "rewards/format_reward_func": 0.9937500059604645, "step": 12458 }, { "completion_length": 229.7009038925171, "epoch": 2.0891906618047695, "grad_norm": 0.4034373111510341, "kl": 0.2835540771484375, "learning_rate": 4.949284184680369e-07, "loss": 0.0003, "reward": 1.8232143372297287, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8276785835623741, "rewards/format_reward_func": 0.9955357164144516, "step": 12460 }, { "completion_length": 220.22768878936768, "epoch": 2.089525965044637, "grad_norm": 0.1536477528310614, "kl": 0.1710205078125, "learning_rate": 4.949260142078339e-07, "loss": 0.0002, "reward": 1.8178571909666061, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.817857164889574, "rewards/format_reward_func": 1.0, "step": 12462 }, { "completion_length": 231.63393878936768, "epoch": 2.089861268284505, "grad_norm": 0.244357653418181, "kl": 0.19512939453125, "learning_rate": 4.949236093837204e-07, "loss": 0.0002, "reward": 1.8071429133415222, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8071428835391998, "rewards/format_reward_func": 1.0, "step": 12464 }, { "completion_length": 225.87054538726807, "epoch": 2.0901965715243724, "grad_norm": 0.18870748260664152, "kl": 0.23162841796875, "learning_rate": 4.949212039957019e-07, "loss": 0.0002, "reward": 1.7629465013742447, "reward_std": 0.05240166233852506, "rewards/equation_reward_func": 0.764285733923316, "rewards/format_reward_func": 0.9986607171595097, "step": 12466 }, { "completion_length": 216.29911613464355, "epoch": 2.09053187476424, "grad_norm": 0.19688421926182828, "kl": 0.33929443359375, "learning_rate": 4.949187980437841e-07, "loss": 0.0003, "reward": 1.7857143506407738, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 12468 }, { "completion_length": 222.70983219146729, "epoch": 2.0908671780041073, "grad_norm": 0.5548509345150273, "kl": 0.38800048828125, "learning_rate": 4.949163915279722e-07, "loss": 0.0004, "reward": 1.717857226729393, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7178571708500385, "rewards/format_reward_func": 1.0, "step": 12470 }, { "completion_length": 210.02232933044434, "epoch": 2.091202481243975, "grad_norm": 0.20444495515084055, "kl": 0.30419921875, "learning_rate": 4.949139844482721e-07, "loss": 0.0003, "reward": 1.7696429193019867, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7741071805357933, "rewards/format_reward_func": 0.9955357164144516, "step": 12472 }, { "completion_length": 224.08483028411865, "epoch": 2.0915377844838425, "grad_norm": 0.22854212897739953, "kl": 0.231231689453125, "learning_rate": 4.949115768046893e-07, "loss": 0.0002, "reward": 1.8196429014205933, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.8241071663796902, "rewards/format_reward_func": 0.9955357164144516, "step": 12474 }, { "completion_length": 218.58929824829102, "epoch": 2.09187308772371, "grad_norm": 0.22404496442844024, "kl": 0.5330810546875, "learning_rate": 4.949091685972291e-07, "loss": 0.0005, "reward": 1.7660714983940125, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7705357223749161, "rewards/format_reward_func": 0.9955357164144516, "step": 12476 }, { "completion_length": 225.81251049041748, "epoch": 2.092208390963578, "grad_norm": 0.26550278680486267, "kl": 0.1566162109375, "learning_rate": 4.949067598258972e-07, "loss": 0.0002, "reward": 1.7589286491274834, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7633928805589676, "rewards/format_reward_func": 0.9955357164144516, "step": 12478 }, { "completion_length": 228.25894165039062, "epoch": 2.0925436942034454, "grad_norm": 0.20035787461152282, "kl": 0.33837890625, "learning_rate": 4.949043504906992e-07, "loss": 0.0003, "reward": 1.8053571954369545, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.8098214566707611, "rewards/format_reward_func": 0.9955357164144516, "step": 12480 }, { "completion_length": 224.30358123779297, "epoch": 2.0928789974433126, "grad_norm": 0.09406973070022828, "kl": 0.242889404296875, "learning_rate": 4.949019405916405e-07, "loss": 0.0002, "reward": 1.82857146859169, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8285714648663998, "rewards/format_reward_func": 1.0, "step": 12482 }, { "completion_length": 229.59375858306885, "epoch": 2.0932143006831803, "grad_norm": 0.2354022784534689, "kl": 0.712188720703125, "learning_rate": 4.948995301287268e-07, "loss": 0.0007, "reward": 1.814285770058632, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857551574707, "rewards/format_reward_func": 1.0, "step": 12484 }, { "completion_length": 222.6607255935669, "epoch": 2.093549603923048, "grad_norm": 0.14301333677568714, "kl": 0.337249755859375, "learning_rate": 4.948971191019635e-07, "loss": 0.0003, "reward": 1.7839286401867867, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7883928865194321, "rewards/format_reward_func": 0.9955357164144516, "step": 12486 }, { "completion_length": 222.1116180419922, "epoch": 2.0938849071629155, "grad_norm": 0.09825610613763396, "kl": 0.35711669921875, "learning_rate": 4.948947075113563e-07, "loss": 0.0004, "reward": 1.7500000670552254, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 12488 }, { "completion_length": 230.32143783569336, "epoch": 2.094220210402783, "grad_norm": 0.01907351915692328, "kl": 0.244720458984375, "learning_rate": 4.948922953569107e-07, "loss": 0.0002, "reward": 1.757142923772335, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 12490 }, { "completion_length": 228.71429443359375, "epoch": 2.0945555136426504, "grad_norm": 0.19290737070434144, "kl": 0.406707763671875, "learning_rate": 4.948898826386322e-07, "loss": 0.0004, "reward": 1.735714353621006, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7357143275439739, "rewards/format_reward_func": 1.0, "step": 12492 }, { "completion_length": 230.04465293884277, "epoch": 2.094890816882518, "grad_norm": 0.19890650026038148, "kl": 0.1217041015625, "learning_rate": 4.948874693565263e-07, "loss": 0.0001, "reward": 1.7410715147852898, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7455357387661934, "rewards/format_reward_func": 0.9955357164144516, "step": 12494 }, { "completion_length": 232.09375953674316, "epoch": 2.0952261201223856, "grad_norm": 0.24585345275777434, "kl": 0.1280670166015625, "learning_rate": 4.948850555105988e-07, "loss": 0.0001, "reward": 1.7214286476373672, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7303571812808514, "rewards/format_reward_func": 0.9910714328289032, "step": 12496 }, { "completion_length": 227.74554634094238, "epoch": 2.0955614233622533, "grad_norm": 0.2588696781419015, "kl": 0.45916748046875, "learning_rate": 4.948826411008551e-07, "loss": 0.0005, "reward": 1.757142923772335, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428697556257, "rewards/format_reward_func": 1.0, "step": 12498 }, { "completion_length": 245.97768878936768, "epoch": 2.095896726602121, "grad_norm": 0.428716342350395, "kl": 0.237884521484375, "learning_rate": 4.948802261273007e-07, "loss": 0.0002, "reward": 1.7392857894301414, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.748214315623045, "rewards/format_reward_func": 0.9910714328289032, "step": 12500 }, { "completion_length": 225.5044755935669, "epoch": 2.0962320298419885, "grad_norm": 0.16797785673849946, "kl": 0.152435302734375, "learning_rate": 4.948778105899412e-07, "loss": 0.0002, "reward": 1.8142857775092125, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857402563095, "rewards/format_reward_func": 1.0, "step": 12502 }, { "completion_length": 239.165189743042, "epoch": 2.0965673330818557, "grad_norm": 0.13361972927868213, "kl": 0.1374664306640625, "learning_rate": 4.948753944887823e-07, "loss": 0.0001, "reward": 1.814285770058632, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857439815998, "rewards/format_reward_func": 1.0, "step": 12504 }, { "completion_length": 223.86608219146729, "epoch": 2.0969026363217234, "grad_norm": 0.1260006794495531, "kl": 0.21466064453125, "learning_rate": 4.948729778238293e-07, "loss": 0.0002, "reward": 1.8000000268220901, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 12506 }, { "completion_length": 233.87947368621826, "epoch": 2.097237939561591, "grad_norm": 0.21277466514518686, "kl": 0.112701416015625, "learning_rate": 4.948705605950879e-07, "loss": 0.0001, "reward": 1.7750000357627869, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7839285973459482, "rewards/format_reward_func": 0.9910714328289032, "step": 12508 }, { "completion_length": 237.5178689956665, "epoch": 2.0975732428014586, "grad_norm": 0.2227883614821665, "kl": 0.1185150146484375, "learning_rate": 4.948681428025638e-07, "loss": 0.0001, "reward": 1.775000050663948, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 12510 }, { "completion_length": 250.70983219146729, "epoch": 2.0979085460413263, "grad_norm": 0.20100054630349626, "kl": 0.16143798828125, "learning_rate": 4.948657244462624e-07, "loss": 0.0002, "reward": 1.719642922282219, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7241071797907352, "rewards/format_reward_func": 0.9955357164144516, "step": 12512 }, { "completion_length": 248.77233505249023, "epoch": 2.098243849281194, "grad_norm": 0.3901435180972612, "kl": 0.18560791015625, "learning_rate": 4.948633055261894e-07, "loss": 0.0002, "reward": 1.803571492433548, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.8035714589059353, "rewards/format_reward_func": 1.0, "step": 12514 }, { "completion_length": 248.1026906967163, "epoch": 2.098579152521061, "grad_norm": 0.17118320801676853, "kl": 0.18536376953125, "learning_rate": 4.948608860423501e-07, "loss": 0.0002, "reward": 1.825000062584877, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8250000141561031, "rewards/format_reward_func": 1.0, "step": 12516 }, { "completion_length": 253.0580472946167, "epoch": 2.0989144557609287, "grad_norm": 0.1819693796045121, "kl": 0.36138916015625, "learning_rate": 4.948584659947504e-07, "loss": 0.0004, "reward": 1.8250000402331352, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8250000327825546, "rewards/format_reward_func": 1.0, "step": 12518 }, { "completion_length": 260.1785840988159, "epoch": 2.0992497590007964, "grad_norm": 0.3183018413425357, "kl": 0.29443359375, "learning_rate": 4.948560453833956e-07, "loss": 0.0003, "reward": 1.7267857939004898, "reward_std": 0.0530330091714859, "rewards/equation_reward_func": 0.7401785999536514, "rewards/format_reward_func": 0.9866071492433548, "step": 12520 }, { "completion_length": 256.9642972946167, "epoch": 2.099585062240664, "grad_norm": 0.3034727556135848, "kl": 0.21856689453125, "learning_rate": 4.948536242082915e-07, "loss": 0.0002, "reward": 1.7339286357164383, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.738392885774374, "rewards/format_reward_func": 0.9955357164144516, "step": 12522 }, { "completion_length": 252.75447750091553, "epoch": 2.0999203654805316, "grad_norm": 0.2733843588864411, "kl": 0.21575927734375, "learning_rate": 4.948512024694436e-07, "loss": 0.0002, "reward": 1.783928632736206, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7883928883820772, "rewards/format_reward_func": 0.9955357164144516, "step": 12524 }, { "completion_length": 256.7901906967163, "epoch": 2.100255668720399, "grad_norm": 0.09788705115293139, "kl": 0.50946044921875, "learning_rate": 4.948487801668574e-07, "loss": 0.0005, "reward": 1.7785715013742447, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 12526 }, { "completion_length": 254.4241180419922, "epoch": 2.1005909719602665, "grad_norm": 0.4143288036401865, "kl": 0.23956298828125, "learning_rate": 4.948463573005384e-07, "loss": 0.0002, "reward": 1.7933036237955093, "reward_std": 0.05997780663892627, "rewards/equation_reward_func": 0.7991071790456772, "rewards/format_reward_func": 0.9941964335739613, "step": 12528 }, { "completion_length": 253.6160831451416, "epoch": 2.100926275200134, "grad_norm": 0.2037976901278312, "kl": 0.17803955078125, "learning_rate": 4.948439338704925e-07, "loss": 0.0002, "reward": 1.7428572103381157, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7517857495695353, "rewards/format_reward_func": 0.9910714328289032, "step": 12530 }, { "completion_length": 254.9375114440918, "epoch": 2.1012615784400017, "grad_norm": 0.1723439343144399, "kl": 0.4744873046875, "learning_rate": 4.94841509876725e-07, "loss": 0.0005, "reward": 1.7910715118050575, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7955357320606709, "rewards/format_reward_func": 0.9955357164144516, "step": 12532 }, { "completion_length": 255.24108505249023, "epoch": 2.1015968816798694, "grad_norm": 0.555953331373012, "kl": 0.710052490234375, "learning_rate": 4.948390853192415e-07, "loss": 0.0007, "reward": 1.7714286297559738, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7803571745753288, "rewards/format_reward_func": 0.9910714328289032, "step": 12534 }, { "completion_length": 257.3660840988159, "epoch": 2.101932184919737, "grad_norm": 0.2409187824669436, "kl": 0.139373779296875, "learning_rate": 4.948366601980479e-07, "loss": 0.0001, "reward": 1.805357202887535, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8098214566707611, "rewards/format_reward_func": 0.9955357164144516, "step": 12536 }, { "completion_length": 257.7991199493408, "epoch": 2.102267488159604, "grad_norm": 0.2669260915173354, "kl": 0.519439697265625, "learning_rate": 4.948342345131492e-07, "loss": 0.0005, "reward": 1.7803571969270706, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7848214767873287, "rewards/format_reward_func": 0.9955357164144516, "step": 12538 }, { "completion_length": 248.3526906967163, "epoch": 2.102602791399472, "grad_norm": 0.24818006289820269, "kl": 0.297607421875, "learning_rate": 4.948318082645515e-07, "loss": 0.0003, "reward": 1.7464286386966705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 12540 }, { "completion_length": 247.46429920196533, "epoch": 2.1029380946393395, "grad_norm": 0.16690281788865083, "kl": 0.3175048828125, "learning_rate": 4.948293814522602e-07, "loss": 0.0003, "reward": 1.787500038743019, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.8008929006755352, "rewards/format_reward_func": 0.9866071492433548, "step": 12542 }, { "completion_length": 249.2991180419922, "epoch": 2.103273397879207, "grad_norm": 0.3490837667156745, "kl": 0.246368408203125, "learning_rate": 4.948269540762809e-07, "loss": 0.0002, "reward": 1.789285771548748, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 12544 }, { "completion_length": 254.8884038925171, "epoch": 2.1036087011190747, "grad_norm": 0.13535564372346423, "kl": 0.2268829345703125, "learning_rate": 4.94824526136619e-07, "loss": 0.0002, "reward": 1.7232143729925156, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7276785969734192, "rewards/format_reward_func": 0.9955357164144516, "step": 12546 }, { "completion_length": 262.5759038925171, "epoch": 2.103944004358942, "grad_norm": 0.17458894142526152, "kl": 0.57427978515625, "learning_rate": 4.948220976332804e-07, "loss": 0.0006, "reward": 1.8000000715255737, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 12548 }, { "completion_length": 257.714298248291, "epoch": 2.1042793075988095, "grad_norm": 0.309491138362646, "kl": 0.987518310546875, "learning_rate": 4.948196685662705e-07, "loss": 0.001, "reward": 1.7714286521077156, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 12550 }, { "completion_length": 256.870548248291, "epoch": 2.104614610838677, "grad_norm": 0.1550566234114588, "kl": 0.277984619140625, "learning_rate": 4.948172389355951e-07, "loss": 0.0003, "reward": 1.778571493923664, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 12552 }, { "completion_length": 251.26787185668945, "epoch": 2.104949914078545, "grad_norm": 0.16131576931507266, "kl": 0.341156005859375, "learning_rate": 4.948148087412594e-07, "loss": 0.0003, "reward": 1.7928572073578835, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.8017857372760773, "rewards/format_reward_func": 0.9910714328289032, "step": 12554 }, { "completion_length": 250.04911994934082, "epoch": 2.1052852173184124, "grad_norm": 0.21502764088900536, "kl": 0.14691162109375, "learning_rate": 4.948123779832694e-07, "loss": 0.0001, "reward": 1.7589286267757416, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7633928991854191, "rewards/format_reward_func": 0.9955357164144516, "step": 12556 }, { "completion_length": 282.6250104904175, "epoch": 2.10562052055828, "grad_norm": 0.3334944368615149, "kl": 1.33514404296875, "learning_rate": 4.948099466616307e-07, "loss": 0.0013, "reward": 1.750000037252903, "reward_std": 0.08081220276653767, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 0.9821428656578064, "step": 12558 }, { "completion_length": 262.6875114440918, "epoch": 2.1059558237981473, "grad_norm": 0.0697285926398769, "kl": 0.35528564453125, "learning_rate": 4.948075147763484e-07, "loss": 0.0004, "reward": 1.7767857760190964, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7901786081492901, "rewards/format_reward_func": 0.9866071492433548, "step": 12560 }, { "completion_length": 270.2767963409424, "epoch": 2.106291127038015, "grad_norm": 0.18152372568138508, "kl": 0.301727294921875, "learning_rate": 4.948050823274287e-07, "loss": 0.0003, "reward": 1.7553572058677673, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214633762836, "rewards/format_reward_func": 0.9955357164144516, "step": 12562 }, { "completion_length": 267.56251335144043, "epoch": 2.1066264302778825, "grad_norm": 0.18210512754892727, "kl": 0.208160400390625, "learning_rate": 4.948026493148769e-07, "loss": 0.0002, "reward": 1.8107143566012383, "reward_std": 0.07576144021004438, "rewards/equation_reward_func": 0.8196428790688515, "rewards/format_reward_func": 0.9910714328289032, "step": 12564 }, { "completion_length": 267.49554538726807, "epoch": 2.10696173351775, "grad_norm": 0.654924979260132, "kl": 0.542205810546875, "learning_rate": 4.948002157386984e-07, "loss": 0.0005, "reward": 1.7785714864730835, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7875000312924385, "rewards/format_reward_func": 0.9910714328289032, "step": 12566 }, { "completion_length": 265.4642963409424, "epoch": 2.107297036757618, "grad_norm": 0.10704312735958628, "kl": 0.801483154296875, "learning_rate": 4.947977815988992e-07, "loss": 0.0008, "reward": 1.7446429431438446, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7491071783006191, "rewards/format_reward_func": 0.9955357164144516, "step": 12568 }, { "completion_length": 278.745548248291, "epoch": 2.107632339997485, "grad_norm": 0.1477785650076975, "kl": 0.212066650390625, "learning_rate": 4.947953468954848e-07, "loss": 0.0002, "reward": 1.7196429148316383, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7419643215835094, "rewards/format_reward_func": 0.977678582072258, "step": 12570 }, { "completion_length": 269.9241199493408, "epoch": 2.1079676432373526, "grad_norm": 0.2375696627186269, "kl": 0.45843505859375, "learning_rate": 4.947929116284606e-07, "loss": 0.0005, "reward": 1.7196429073810577, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7241071723401546, "rewards/format_reward_func": 0.9955357164144516, "step": 12572 }, { "completion_length": 263.5312604904175, "epoch": 2.1083029464772203, "grad_norm": 0.15276034133103347, "kl": 0.269683837890625, "learning_rate": 4.947904757978325e-07, "loss": 0.0003, "reward": 1.7464286610484123, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 12574 }, { "completion_length": 253.84375858306885, "epoch": 2.108638249717088, "grad_norm": 0.19924283727198752, "kl": 0.11798095703125, "learning_rate": 4.947880394036058e-07, "loss": 0.0001, "reward": 1.7571428939700127, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7660714443773031, "rewards/format_reward_func": 0.9910714328289032, "step": 12576 }, { "completion_length": 266.6339387893677, "epoch": 2.1089735529569555, "grad_norm": 0.5955246532748683, "kl": 0.442657470703125, "learning_rate": 4.947856024457865e-07, "loss": 0.0004, "reward": 1.7410714998841286, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7455357443541288, "rewards/format_reward_func": 0.9955357164144516, "step": 12578 }, { "completion_length": 255.42411708831787, "epoch": 2.109308856196823, "grad_norm": 0.1900532254599884, "kl": 0.520721435546875, "learning_rate": 4.947831649243798e-07, "loss": 0.0005, "reward": 1.8107143193483353, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.8196428939700127, "rewards/format_reward_func": 0.9910714328289032, "step": 12580 }, { "completion_length": 269.7991189956665, "epoch": 2.1096441594366904, "grad_norm": 0.31313371529861506, "kl": 0.828125, "learning_rate": 4.947807268393915e-07, "loss": 0.0008, "reward": 1.7093750685453415, "reward_std": 0.06755395070649683, "rewards/equation_reward_func": 0.7196428924798965, "rewards/format_reward_func": 0.9897321499884129, "step": 12582 }, { "completion_length": 256.04465198516846, "epoch": 2.109979462676558, "grad_norm": 0.2102414212533989, "kl": 1.605743408203125, "learning_rate": 4.947782881908273e-07, "loss": 0.0016, "reward": 1.8017857819795609, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8062500320374966, "rewards/format_reward_func": 0.9955357164144516, "step": 12584 }, { "completion_length": 251.70536613464355, "epoch": 2.1103147659164256, "grad_norm": 0.13884928172541447, "kl": 0.314422607421875, "learning_rate": 4.947758489786927e-07, "loss": 0.0003, "reward": 1.7464286386966705, "reward_std": 0.03535533882677555, "rewards/equation_reward_func": 0.755357176065445, "rewards/format_reward_func": 0.9910714328289032, "step": 12586 }, { "completion_length": 254.7366180419922, "epoch": 2.1106500691562933, "grad_norm": 0.19675570563541028, "kl": 0.25933837890625, "learning_rate": 4.947734092029934e-07, "loss": 0.0003, "reward": 1.714285783469677, "reward_std": 0.09091372787952423, "rewards/equation_reward_func": 0.7232143357396126, "rewards/format_reward_func": 0.9910714328289032, "step": 12588 }, { "completion_length": 251.1428680419922, "epoch": 2.110985372396161, "grad_norm": 0.24465264663289335, "kl": 0.3800048828125, "learning_rate": 4.947709688637348e-07, "loss": 0.0004, "reward": 1.7107143849134445, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7107143178582191, "rewards/format_reward_func": 1.0, "step": 12590 }, { "completion_length": 250.22322463989258, "epoch": 2.1113206756360285, "grad_norm": 0.18432661633018005, "kl": 0.72412109375, "learning_rate": 4.947685279609228e-07, "loss": 0.0007, "reward": 1.775000087916851, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 12592 }, { "completion_length": 253.7857265472412, "epoch": 2.1116559788758957, "grad_norm": 0.2395700514182744, "kl": 0.30462646484375, "learning_rate": 4.947660864945629e-07, "loss": 0.0003, "reward": 1.7535715028643608, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7535714656114578, "rewards/format_reward_func": 1.0, "step": 12594 }, { "completion_length": 243.6473331451416, "epoch": 2.1119912821157634, "grad_norm": 0.4434999353814348, "kl": 0.285491943359375, "learning_rate": 4.947636444646605e-07, "loss": 0.0003, "reward": 1.7285715267062187, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7285714596509933, "rewards/format_reward_func": 1.0, "step": 12596 }, { "completion_length": 253.28126430511475, "epoch": 2.112326585355631, "grad_norm": 0.20279855857111112, "kl": 0.1322021484375, "learning_rate": 4.947612018712218e-07, "loss": 0.0001, "reward": 1.7285715118050575, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7285714671015739, "rewards/format_reward_func": 1.0, "step": 12598 }, { "completion_length": 249.29465770721436, "epoch": 2.1126618885954986, "grad_norm": 0.28162706638188384, "kl": 0.419097900390625, "learning_rate": 4.947587587142518e-07, "loss": 0.0004, "reward": 1.7285715192556381, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7285714671015739, "rewards/format_reward_func": 1.0, "step": 12600 }, { "completion_length": 249.19643878936768, "epoch": 2.1129971918353663, "grad_norm": 0.18187601895711714, "kl": 0.145660400390625, "learning_rate": 4.947563149937565e-07, "loss": 0.0001, "reward": 1.8200893625617027, "reward_std": 0.05240166140720248, "rewards/equation_reward_func": 0.8214285932481289, "rewards/format_reward_func": 0.9986607171595097, "step": 12602 }, { "completion_length": 243.66965579986572, "epoch": 2.1133324950752335, "grad_norm": 0.23567424608565962, "kl": 0.263092041015625, "learning_rate": 4.947538707097413e-07, "loss": 0.0003, "reward": 1.8250000774860382, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.825000025331974, "rewards/format_reward_func": 1.0, "step": 12604 }, { "completion_length": 246.23215579986572, "epoch": 2.113667798315101, "grad_norm": 0.26916544402526654, "kl": 0.621246337890625, "learning_rate": 4.94751425862212e-07, "loss": 0.0006, "reward": 1.7607143446803093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143223285675, "rewards/format_reward_func": 1.0, "step": 12606 }, { "completion_length": 238.81697463989258, "epoch": 2.1140031015549687, "grad_norm": 0.18268104728073659, "kl": 0.4063720703125, "learning_rate": 4.947489804511742e-07, "loss": 0.0004, "reward": 1.757142923772335, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 12608 }, { "completion_length": 251.133939743042, "epoch": 2.1143384047948364, "grad_norm": 0.17624304213752912, "kl": 0.40753173828125, "learning_rate": 4.947465344766335e-07, "loss": 0.0004, "reward": 1.7535715103149414, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714451223612, "rewards/format_reward_func": 1.0, "step": 12610 }, { "completion_length": 248.90625762939453, "epoch": 2.114673708034704, "grad_norm": 0.28952980847030996, "kl": 0.224822998046875, "learning_rate": 4.947440879385955e-07, "loss": 0.0002, "reward": 1.7214286476373672, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7214285992085934, "rewards/format_reward_func": 1.0, "step": 12612 }, { "completion_length": 231.34822463989258, "epoch": 2.1150090112745716, "grad_norm": 0.39774063211490374, "kl": 0.192047119140625, "learning_rate": 4.947416408370659e-07, "loss": 0.0002, "reward": 1.8589285984635353, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.8633928783237934, "rewards/format_reward_func": 0.9955357164144516, "step": 12614 }, { "completion_length": 242.60268783569336, "epoch": 2.115344314514439, "grad_norm": 0.24908888175240032, "kl": 0.17919921875, "learning_rate": 4.947391931720503e-07, "loss": 0.0002, "reward": 1.7357143461704254, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143182307482, "rewards/format_reward_func": 1.0, "step": 12616 }, { "completion_length": 253.0669755935669, "epoch": 2.1156796177543065, "grad_norm": 0.19669393215519845, "kl": 0.18682861328125, "learning_rate": 4.947367449435542e-07, "loss": 0.0002, "reward": 1.7875000536441803, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7919643111526966, "rewards/format_reward_func": 0.9955357164144516, "step": 12618 }, { "completion_length": 242.2232255935669, "epoch": 2.116014920994174, "grad_norm": 0.1812298854708925, "kl": 0.133697509765625, "learning_rate": 4.947342961515835e-07, "loss": 0.0001, "reward": 1.785714365541935, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 12620 }, { "completion_length": 245.42858505249023, "epoch": 2.1163502242340417, "grad_norm": 0.6592117291790457, "kl": 0.193939208984375, "learning_rate": 4.947318467961437e-07, "loss": 0.0002, "reward": 1.7946429029107094, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7991071678698063, "rewards/format_reward_func": 0.9955357164144516, "step": 12622 }, { "completion_length": 254.62947750091553, "epoch": 2.1166855274739094, "grad_norm": 0.1700151949627714, "kl": 0.1432952880859375, "learning_rate": 4.947293968772405e-07, "loss": 0.0001, "reward": 1.7714286223053932, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714286111295223, "rewards/format_reward_func": 1.0, "step": 12624 }, { "completion_length": 242.79911708831787, "epoch": 2.1170208307137766, "grad_norm": 0.17733056326423086, "kl": 0.13336181640625, "learning_rate": 4.947269463948795e-07, "loss": 0.0001, "reward": 1.7964286133646965, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7964286170899868, "rewards/format_reward_func": 1.0, "step": 12626 }, { "completion_length": 252.6250114440918, "epoch": 2.117356133953644, "grad_norm": 0.1922726528758622, "kl": 0.155487060546875, "learning_rate": 4.947244953490662e-07, "loss": 0.0002, "reward": 1.760714367032051, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.760714340955019, "rewards/format_reward_func": 1.0, "step": 12628 }, { "completion_length": 249.55804920196533, "epoch": 2.117691437193512, "grad_norm": 0.24193949809038998, "kl": 0.129150390625, "learning_rate": 4.947220437398064e-07, "loss": 0.0001, "reward": 1.7750000357627869, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000413507223, "rewards/format_reward_func": 1.0, "step": 12630 }, { "completion_length": 243.89286708831787, "epoch": 2.1180267404333795, "grad_norm": 0.1576165068164449, "kl": 0.19390869140625, "learning_rate": 4.947195915671058e-07, "loss": 0.0002, "reward": 1.8035714775323868, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8035714738070965, "rewards/format_reward_func": 1.0, "step": 12632 }, { "completion_length": 256.64733505249023, "epoch": 2.118362043673247, "grad_norm": 0.10762529730307481, "kl": 0.192108154296875, "learning_rate": 4.947171388309699e-07, "loss": 0.0002, "reward": 1.8285714536905289, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8285714499652386, "rewards/format_reward_func": 1.0, "step": 12634 }, { "completion_length": 255.4642972946167, "epoch": 2.1186973469131147, "grad_norm": 0.1776726750654768, "kl": 0.19140625, "learning_rate": 4.947146855314045e-07, "loss": 0.0002, "reward": 1.721428669989109, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7214286103844643, "rewards/format_reward_func": 1.0, "step": 12636 }, { "completion_length": 249.75893878936768, "epoch": 2.119032650152982, "grad_norm": 0.19551152845046466, "kl": 0.137237548828125, "learning_rate": 4.947122316684151e-07, "loss": 0.0001, "reward": 1.8178572058677673, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8178571723401546, "rewards/format_reward_func": 1.0, "step": 12638 }, { "completion_length": 263.6919755935669, "epoch": 2.1193679533928496, "grad_norm": 0.16631205617192682, "kl": 0.1456298828125, "learning_rate": 4.947097772420074e-07, "loss": 0.0001, "reward": 1.810714341700077, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8107143118977547, "rewards/format_reward_func": 1.0, "step": 12640 }, { "completion_length": 262.1205463409424, "epoch": 2.119703256632717, "grad_norm": 0.25879339095547543, "kl": 0.1573333740234375, "learning_rate": 4.94707322252187e-07, "loss": 0.0002, "reward": 1.7571429461240768, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428790688515, "rewards/format_reward_func": 1.0, "step": 12642 }, { "completion_length": 265.4285840988159, "epoch": 2.120038559872585, "grad_norm": 0.008186147312404166, "kl": 0.17401123046875, "learning_rate": 4.947048666989597e-07, "loss": 0.0002, "reward": 1.7107143551111221, "reward_std": 0.025253813713788986, "rewards/equation_reward_func": 0.7196428868919611, "rewards/format_reward_func": 0.9910714328289032, "step": 12644 }, { "completion_length": 255.92858600616455, "epoch": 2.1203738631124525, "grad_norm": 0.1079018393352671, "kl": 0.259307861328125, "learning_rate": 4.947024105823311e-07, "loss": 0.0003, "reward": 1.769642896950245, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7830357346683741, "rewards/format_reward_func": 0.9866071492433548, "step": 12646 }, { "completion_length": 261.4285840988159, "epoch": 2.12070916635232, "grad_norm": 0.16324194777430936, "kl": 0.167999267578125, "learning_rate": 4.946999539023068e-07, "loss": 0.0002, "reward": 1.7410715147852898, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.745535746216774, "rewards/format_reward_func": 0.9955357164144516, "step": 12648 }, { "completion_length": 266.54465770721436, "epoch": 2.1210444695921873, "grad_norm": 0.24910040332383881, "kl": 0.21734619140625, "learning_rate": 4.946974966588924e-07, "loss": 0.0002, "reward": 1.7464286237955093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464285958558321, "rewards/format_reward_func": 1.0, "step": 12650 }, { "completion_length": 273.5401954650879, "epoch": 2.121379772832055, "grad_norm": 0.24830042928049242, "kl": 0.1650390625, "learning_rate": 4.946950388520938e-07, "loss": 0.0002, "reward": 1.739285796880722, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7392857521772385, "rewards/format_reward_func": 1.0, "step": 12652 }, { "completion_length": 259.7500104904175, "epoch": 2.1217150760719226, "grad_norm": 0.1759505675423626, "kl": 0.13311767578125, "learning_rate": 4.946925804819163e-07, "loss": 0.0001, "reward": 1.8250000476837158, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8250000141561031, "rewards/format_reward_func": 1.0, "step": 12654 }, { "completion_length": 263.8080463409424, "epoch": 2.12205037931179, "grad_norm": 0.3763846045538354, "kl": 0.1600341796875, "learning_rate": 4.946901215483659e-07, "loss": 0.0002, "reward": 1.753571480512619, "reward_std": 0.08586296532303095, "rewards/equation_reward_func": 0.7625000216066837, "rewards/format_reward_func": 0.9910714328289032, "step": 12656 }, { "completion_length": 262.22768688201904, "epoch": 2.122385682551658, "grad_norm": 0.09794541085463028, "kl": 0.1695556640625, "learning_rate": 4.94687662051448e-07, "loss": 0.0002, "reward": 1.7892857566475868, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857529222965, "rewards/format_reward_func": 1.0, "step": 12658 }, { "completion_length": 261.977689743042, "epoch": 2.122720985791525, "grad_norm": 0.21298906750501434, "kl": 0.194488525390625, "learning_rate": 4.946852019911684e-07, "loss": 0.0002, "reward": 1.7750000357627869, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000301748514, "rewards/format_reward_func": 1.0, "step": 12660 }, { "completion_length": 253.3482255935669, "epoch": 2.1230562890313927, "grad_norm": 0.1329742623791657, "kl": 0.120574951171875, "learning_rate": 4.946827413675328e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 12662 }, { "completion_length": 256.8526906967163, "epoch": 2.1233915922712603, "grad_norm": 0.27845250518816944, "kl": 0.246185302734375, "learning_rate": 4.946802801805467e-07, "loss": 0.0002, "reward": 1.7392857819795609, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857410013676, "rewards/format_reward_func": 1.0, "step": 12664 }, { "completion_length": 247.48215293884277, "epoch": 2.123726895511128, "grad_norm": 0.21240464239704232, "kl": 0.2212371826171875, "learning_rate": 4.946778184302161e-07, "loss": 0.0002, "reward": 1.7964286282658577, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.796428594738245, "rewards/format_reward_func": 1.0, "step": 12666 }, { "completion_length": 255.99554634094238, "epoch": 2.1240621987509956, "grad_norm": 0.2736472912552943, "kl": 0.261810302734375, "learning_rate": 4.946753561165462e-07, "loss": 0.0003, "reward": 1.7482143640518188, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7526786103844643, "rewards/format_reward_func": 0.9955357164144516, "step": 12668 }, { "completion_length": 255.18751049041748, "epoch": 2.124397501990863, "grad_norm": 0.14204252196879363, "kl": 0.22735595703125, "learning_rate": 4.946728932395431e-07, "loss": 0.0002, "reward": 1.716071493923664, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7205357402563095, "rewards/format_reward_func": 0.9955357164144516, "step": 12670 }, { "completion_length": 244.4464406967163, "epoch": 2.1247328052307304, "grad_norm": 0.44296381897565446, "kl": 0.254974365234375, "learning_rate": 4.946704297992121e-07, "loss": 0.0003, "reward": 1.8035714998841286, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 12672 }, { "completion_length": 250.7544765472412, "epoch": 2.125068108470598, "grad_norm": 0.1577363318076351, "kl": 0.217864990234375, "learning_rate": 4.946679657955591e-07, "loss": 0.0002, "reward": 1.789285771548748, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857268452644, "rewards/format_reward_func": 1.0, "step": 12674 }, { "completion_length": 248.09376049041748, "epoch": 2.1254034117104657, "grad_norm": 0.14258669660016726, "kl": 0.170196533203125, "learning_rate": 4.946655012285898e-07, "loss": 0.0002, "reward": 1.7589286416769028, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7633928880095482, "rewards/format_reward_func": 0.9955357164144516, "step": 12676 }, { "completion_length": 243.52233123779297, "epoch": 2.1257387149503333, "grad_norm": 0.23125273450125225, "kl": 0.18536376953125, "learning_rate": 4.946630360983098e-07, "loss": 0.0002, "reward": 1.7379465028643608, "reward_std": 0.0473508988507092, "rewards/equation_reward_func": 0.7392857559025288, "rewards/format_reward_func": 0.9986607171595097, "step": 12678 }, { "completion_length": 251.3750114440918, "epoch": 2.126074018190201, "grad_norm": 0.18106699184120606, "kl": 0.211212158203125, "learning_rate": 4.946605704047247e-07, "loss": 0.0002, "reward": 1.753571480512619, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 12680 }, { "completion_length": 246.27679538726807, "epoch": 2.126409321430068, "grad_norm": 0.6107476265404755, "kl": 0.19964599609375, "learning_rate": 4.946581041478404e-07, "loss": 0.0002, "reward": 1.7750000581145287, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 12682 }, { "completion_length": 250.76340675354004, "epoch": 2.1267446246699357, "grad_norm": 0.19478372513176684, "kl": 0.208770751953125, "learning_rate": 4.946556373276622e-07, "loss": 0.0002, "reward": 1.7535714879631996, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714711993933, "rewards/format_reward_func": 1.0, "step": 12684 }, { "completion_length": 248.64733123779297, "epoch": 2.1270799279098034, "grad_norm": 0.0650269589634784, "kl": 0.2137451171875, "learning_rate": 4.946531699441963e-07, "loss": 0.0002, "reward": 1.6892857998609543, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.6892857532948256, "rewards/format_reward_func": 1.0, "step": 12686 }, { "completion_length": 248.41519165039062, "epoch": 2.127415231149671, "grad_norm": 0.06310082996410975, "kl": 0.1319580078125, "learning_rate": 4.946507019974479e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 12688 }, { "completion_length": 251.1741189956665, "epoch": 2.1277505343895387, "grad_norm": 0.15660504026504873, "kl": 0.162445068359375, "learning_rate": 4.946482334874229e-07, "loss": 0.0002, "reward": 1.8142857626080513, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857365310192, "rewards/format_reward_func": 1.0, "step": 12690 }, { "completion_length": 240.34822463989258, "epoch": 2.1280858376294063, "grad_norm": 0.3681435884554297, "kl": 0.135772705078125, "learning_rate": 4.94645764414127e-07, "loss": 0.0001, "reward": 1.8151786178350449, "reward_std": 0.049244935624301434, "rewards/equation_reward_func": 0.8169643133878708, "rewards/format_reward_func": 0.9982142895460129, "step": 12692 }, { "completion_length": 234.3750114440918, "epoch": 2.1284211408692735, "grad_norm": 0.0995885339917896, "kl": 0.128265380859375, "learning_rate": 4.946432947775657e-07, "loss": 0.0001, "reward": 1.8035714700818062, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8035714570432901, "rewards/format_reward_func": 1.0, "step": 12694 }, { "completion_length": 245.80804538726807, "epoch": 2.128756444109141, "grad_norm": 0.5983936194365209, "kl": 0.15484619140625, "learning_rate": 4.946408245777449e-07, "loss": 0.0002, "reward": 1.7571429312229156, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7660714499652386, "rewards/format_reward_func": 0.9910714328289032, "step": 12696 }, { "completion_length": 240.03572368621826, "epoch": 2.1290917473490087, "grad_norm": 0.21612487513493533, "kl": 0.141876220703125, "learning_rate": 4.946383538146704e-07, "loss": 0.0001, "reward": 1.7785714715719223, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714510828257, "rewards/format_reward_func": 1.0, "step": 12698 }, { "completion_length": 243.2544755935669, "epoch": 2.1294270505888764, "grad_norm": 0.15909532838737647, "kl": 0.151611328125, "learning_rate": 4.946358824883476e-07, "loss": 0.0002, "reward": 1.8178571686148643, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8178571723401546, "rewards/format_reward_func": 1.0, "step": 12700 }, { "completion_length": 236.58036708831787, "epoch": 2.129762353828744, "grad_norm": 0.1954038164117403, "kl": 0.157379150390625, "learning_rate": 4.946334105987822e-07, "loss": 0.0002, "reward": 1.7214286401867867, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7214286029338837, "rewards/format_reward_func": 1.0, "step": 12702 }, { "completion_length": 237.7410831451416, "epoch": 2.130097657068611, "grad_norm": 0.15764717852820267, "kl": 0.182037353515625, "learning_rate": 4.9463093814598e-07, "loss": 0.0002, "reward": 1.7714286223053932, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7714285887777805, "rewards/format_reward_func": 1.0, "step": 12704 }, { "completion_length": 241.12501049041748, "epoch": 2.130432960308479, "grad_norm": 0.1504773466200583, "kl": 0.121063232421875, "learning_rate": 4.946284651299467e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 12706 }, { "completion_length": 237.4419765472412, "epoch": 2.1307682635483465, "grad_norm": 0.11417352849954919, "kl": 0.2191009521484375, "learning_rate": 4.946259915506879e-07, "loss": 0.0002, "reward": 1.789285771548748, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857566475868, "rewards/format_reward_func": 1.0, "step": 12708 }, { "completion_length": 238.57590579986572, "epoch": 2.131103566788214, "grad_norm": 0.5617548101285669, "kl": 0.231964111328125, "learning_rate": 4.946235174082096e-07, "loss": 0.0002, "reward": 1.7808036282658577, "reward_std": 0.03724937466904521, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 0.9986607171595097, "step": 12710 }, { "completion_length": 240.80358219146729, "epoch": 2.1314388700280817, "grad_norm": 0.23236516414959607, "kl": 0.173370361328125, "learning_rate": 4.94621042702517e-07, "loss": 0.0002, "reward": 1.7500000596046448, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000111758709, "rewards/format_reward_func": 1.0, "step": 12712 }, { "completion_length": 237.41072463989258, "epoch": 2.1317741732679494, "grad_norm": 0.24766922153293103, "kl": 0.2633056640625, "learning_rate": 4.946185674336163e-07, "loss": 0.0003, "reward": 1.7607143372297287, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 12714 }, { "completion_length": 248.7410831451416, "epoch": 2.1321094765078166, "grad_norm": 0.2851471685393492, "kl": 0.199249267578125, "learning_rate": 4.946160916015128e-07, "loss": 0.0002, "reward": 1.7678572237491608, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.767857164144516, "rewards/format_reward_func": 1.0, "step": 12716 }, { "completion_length": 235.63393878936768, "epoch": 2.132444779747684, "grad_norm": 0.24856425416445746, "kl": 0.212646484375, "learning_rate": 4.946136152062123e-07, "loss": 0.0002, "reward": 1.74508935213089, "reward_std": 0.03346130205318332, "rewards/equation_reward_func": 0.7464286088943481, "rewards/format_reward_func": 0.9986607171595097, "step": 12718 }, { "completion_length": 246.4062614440918, "epoch": 2.132780082987552, "grad_norm": 0.1818853615657871, "kl": 0.207855224609375, "learning_rate": 4.946111382477207e-07, "loss": 0.0002, "reward": 1.7642857730388641, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857637256384, "rewards/format_reward_func": 1.0, "step": 12720 }, { "completion_length": 236.11608028411865, "epoch": 2.1331153862274195, "grad_norm": 0.2388736773932256, "kl": 0.304229736328125, "learning_rate": 4.946086607260436e-07, "loss": 0.0003, "reward": 1.7928572073578835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 12722 }, { "completion_length": 234.6384038925171, "epoch": 2.133450689467287, "grad_norm": 0.1481105248262793, "kl": 0.19354248046875, "learning_rate": 4.946061826411866e-07, "loss": 0.0002, "reward": 1.8000000566244125, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000230967999, "rewards/format_reward_func": 1.0, "step": 12724 }, { "completion_length": 236.2589406967163, "epoch": 2.1337859927071543, "grad_norm": 0.19108891475425632, "kl": 0.15118408203125, "learning_rate": 4.946037039931554e-07, "loss": 0.0002, "reward": 1.787500075995922, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7919643186032772, "rewards/format_reward_func": 0.9955357164144516, "step": 12726 }, { "completion_length": 232.9821538925171, "epoch": 2.134121295947022, "grad_norm": 0.1379382492406183, "kl": 0.180023193359375, "learning_rate": 4.946012247819559e-07, "loss": 0.0002, "reward": 1.74642863124609, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 12728 }, { "completion_length": 238.2142972946167, "epoch": 2.1344565991868896, "grad_norm": 0.42170481009249156, "kl": 0.398681640625, "learning_rate": 4.945987450075936e-07, "loss": 0.0004, "reward": 1.7928571999073029, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7928571812808514, "rewards/format_reward_func": 1.0, "step": 12730 }, { "completion_length": 240.73661708831787, "epoch": 2.134791902426757, "grad_norm": 0.13332983942445129, "kl": 0.1611328125, "learning_rate": 4.945962646700744e-07, "loss": 0.0002, "reward": 1.7142857909202576, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7142857555299997, "rewards/format_reward_func": 1.0, "step": 12732 }, { "completion_length": 230.14732933044434, "epoch": 2.135127205666625, "grad_norm": 0.14906506873937503, "kl": 0.3936309814453125, "learning_rate": 4.945937837694039e-07, "loss": 0.0004, "reward": 1.805357202887535, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.8098214641213417, "rewards/format_reward_func": 0.9955357164144516, "step": 12734 }, { "completion_length": 242.59822177886963, "epoch": 2.1354625089064925, "grad_norm": 0.09027486073294985, "kl": 0.150177001953125, "learning_rate": 4.945913023055877e-07, "loss": 0.0001, "reward": 1.7214286401867867, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7214286178350449, "rewards/format_reward_func": 1.0, "step": 12736 }, { "completion_length": 246.93750953674316, "epoch": 2.1357978121463597, "grad_norm": 0.5243776630881638, "kl": 0.211212158203125, "learning_rate": 4.945888202786317e-07, "loss": 0.0002, "reward": 1.760714329779148, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143335044384, "rewards/format_reward_func": 1.0, "step": 12738 }, { "completion_length": 242.2634048461914, "epoch": 2.1361331153862273, "grad_norm": 0.29612204522098395, "kl": 0.139556884765625, "learning_rate": 4.945863376885415e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 12740 }, { "completion_length": 239.7812614440918, "epoch": 2.136468418626095, "grad_norm": 0.24427987178665603, "kl": 0.1544189453125, "learning_rate": 4.94583854535323e-07, "loss": 0.0002, "reward": 1.7696429193019867, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7741071581840515, "rewards/format_reward_func": 0.9955357164144516, "step": 12742 }, { "completion_length": 234.2276906967163, "epoch": 2.1368037218659626, "grad_norm": 0.1073191925162693, "kl": 0.1297607421875, "learning_rate": 4.945813708189816e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7714285925030708, "rewards/format_reward_func": 1.0, "step": 12744 }, { "completion_length": 248.2634048461914, "epoch": 2.13713902510583, "grad_norm": 0.19538950089251983, "kl": 0.14056396484375, "learning_rate": 4.945788865395234e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7875000201165676, "rewards/format_reward_func": 0.9910714328289032, "step": 12746 }, { "completion_length": 246.08929824829102, "epoch": 2.137474328345698, "grad_norm": 0.2802740460051694, "kl": 0.148651123046875, "learning_rate": 4.945764016969538e-07, "loss": 0.0001, "reward": 1.7196429148316383, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7241071872413158, "rewards/format_reward_func": 0.9955357164144516, "step": 12748 }, { "completion_length": 239.8750123977661, "epoch": 2.137809631585565, "grad_norm": 0.21579468742525323, "kl": 0.1495361328125, "learning_rate": 4.945739162912787e-07, "loss": 0.0001, "reward": 1.800000049173832, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000342726707, "rewards/format_reward_func": 1.0, "step": 12750 }, { "completion_length": 241.4375114440918, "epoch": 2.1381449348254327, "grad_norm": 0.1691322578412735, "kl": 0.12725830078125, "learning_rate": 4.945714303225037e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428775787354, "rewards/format_reward_func": 1.0, "step": 12752 }, { "completion_length": 232.0491180419922, "epoch": 2.1384802380653003, "grad_norm": 0.24020569233265468, "kl": 0.12933349609375, "learning_rate": 4.945689437906346e-07, "loss": 0.0001, "reward": 1.8071429133415222, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428835391998, "rewards/format_reward_func": 1.0, "step": 12754 }, { "completion_length": 243.6875123977661, "epoch": 2.138815541305168, "grad_norm": 0.2156933055364058, "kl": 0.146575927734375, "learning_rate": 4.945664566956771e-07, "loss": 0.0001, "reward": 1.8142857551574707, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857477068901, "rewards/format_reward_func": 1.0, "step": 12756 }, { "completion_length": 235.9553680419922, "epoch": 2.1391508445450356, "grad_norm": 0.23128610933191962, "kl": 0.15655517578125, "learning_rate": 4.94563969037637e-07, "loss": 0.0002, "reward": 1.710714340209961, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7107143141329288, "rewards/format_reward_func": 1.0, "step": 12758 }, { "completion_length": 234.47768878936768, "epoch": 2.139486147784903, "grad_norm": 0.36352148754101415, "kl": 0.148345947265625, "learning_rate": 4.945614808165199e-07, "loss": 0.0001, "reward": 1.732142947614193, "reward_std": 0.07576143834739923, "rewards/equation_reward_func": 0.7321428842842579, "rewards/format_reward_func": 1.0, "step": 12760 }, { "completion_length": 244.48215293884277, "epoch": 2.1398214510247704, "grad_norm": 0.14300670479017372, "kl": 0.15911865234375, "learning_rate": 4.945589920323317e-07, "loss": 0.0002, "reward": 1.7642857730388641, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 12762 }, { "completion_length": 234.37054634094238, "epoch": 2.140156754264638, "grad_norm": 0.8349680304170968, "kl": 0.28076171875, "learning_rate": 4.945565026850779e-07, "loss": 0.0003, "reward": 1.7642857879400253, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857357859612, "rewards/format_reward_func": 1.0, "step": 12764 }, { "completion_length": 241.1875114440918, "epoch": 2.1404920575045057, "grad_norm": 1.056639847178951, "kl": 0.64544677734375, "learning_rate": 4.945540127747643e-07, "loss": 0.0006, "reward": 1.7839286103844643, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7883928772062063, "rewards/format_reward_func": 0.9955357164144516, "step": 12766 }, { "completion_length": 227.62501049041748, "epoch": 2.1408273607443733, "grad_norm": 0.1428928703085797, "kl": 0.1385498046875, "learning_rate": 4.945515223013969e-07, "loss": 0.0001, "reward": 1.800000049173832, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000156462193, "rewards/format_reward_func": 1.0, "step": 12768 }, { "completion_length": 232.0491180419922, "epoch": 2.141162663984241, "grad_norm": 0.14921203769840066, "kl": 0.152801513671875, "learning_rate": 4.94549031264981e-07, "loss": 0.0002, "reward": 1.8250000551342964, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.825000025331974, "rewards/format_reward_func": 1.0, "step": 12770 }, { "completion_length": 233.75000953674316, "epoch": 2.141497967224108, "grad_norm": 0.2428657901809172, "kl": 0.146759033203125, "learning_rate": 4.945465396655227e-07, "loss": 0.0001, "reward": 1.8464286103844643, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8464285954833031, "rewards/format_reward_func": 1.0, "step": 12772 }, { "completion_length": 239.32143878936768, "epoch": 2.1418332704639758, "grad_norm": 0.12902869004400336, "kl": 0.138916015625, "learning_rate": 4.945440475030276e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 12774 }, { "completion_length": 232.8750114440918, "epoch": 2.1421685737038434, "grad_norm": 0.19235439708019675, "kl": 0.1307373046875, "learning_rate": 4.945415547775013e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 12776 }, { "completion_length": 243.0669755935669, "epoch": 2.142503876943711, "grad_norm": 0.435288208029278, "kl": 0.150848388671875, "learning_rate": 4.945390614889499e-07, "loss": 0.0002, "reward": 1.7750000655651093, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7839286029338837, "rewards/format_reward_func": 0.9910714328289032, "step": 12778 }, { "completion_length": 234.4062614440918, "epoch": 2.1428391801835787, "grad_norm": 0.23064924712066712, "kl": 0.206634521484375, "learning_rate": 4.945365676373787e-07, "loss": 0.0002, "reward": 1.7821429446339607, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 12780 }, { "completion_length": 238.9285831451416, "epoch": 2.1431744834234463, "grad_norm": 0.1791756501286884, "kl": 0.144989013671875, "learning_rate": 4.945340732227938e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7571428790688515, "rewards/format_reward_func": 1.0, "step": 12782 }, { "completion_length": 244.47769165039062, "epoch": 2.1435097866633135, "grad_norm": 0.4171668278383133, "kl": 0.13970947265625, "learning_rate": 4.945315782452008e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571626543999, "rewards/format_reward_func": 1.0, "step": 12784 }, { "completion_length": 236.4241189956665, "epoch": 2.143845089903181, "grad_norm": 0.19374842158996536, "kl": 0.1378936767578125, "learning_rate": 4.945290827046053e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143227010965, "rewards/format_reward_func": 1.0, "step": 12786 }, { "completion_length": 229.32590198516846, "epoch": 2.1441803931430488, "grad_norm": 0.21744398788212368, "kl": 0.120758056640625, "learning_rate": 4.945265866010133e-07, "loss": 0.0001, "reward": 1.7892857789993286, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 12788 }, { "completion_length": 226.24108409881592, "epoch": 2.1445156963829164, "grad_norm": 0.27278636765832526, "kl": 0.154998779296875, "learning_rate": 4.945240899344304e-07, "loss": 0.0002, "reward": 1.7642857655882835, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857581377029, "rewards/format_reward_func": 1.0, "step": 12790 }, { "completion_length": 234.80358219146729, "epoch": 2.144850999622784, "grad_norm": 0.3670106294704579, "kl": 0.1717529296875, "learning_rate": 4.945215927048623e-07, "loss": 0.0002, "reward": 1.728571504354477, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7285714671015739, "rewards/format_reward_func": 1.0, "step": 12792 }, { "completion_length": 238.80804824829102, "epoch": 2.145186302862651, "grad_norm": 0.34621680075467387, "kl": 0.151611328125, "learning_rate": 4.94519094912315e-07, "loss": 0.0002, "reward": 1.7857143580913544, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 12794 }, { "completion_length": 234.86608028411865, "epoch": 2.145521606102519, "grad_norm": 0.35599495255633246, "kl": 0.12451171875, "learning_rate": 4.945165965567939e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8035714663565159, "rewards/format_reward_func": 1.0, "step": 12796 }, { "completion_length": 233.17858123779297, "epoch": 2.1458569093423865, "grad_norm": 0.3200094677780826, "kl": 0.1365966796875, "learning_rate": 4.945140976383051e-07, "loss": 0.0001, "reward": 1.769642911851406, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.774107176810503, "rewards/format_reward_func": 0.9955357164144516, "step": 12798 }, { "completion_length": 233.56251049041748, "epoch": 2.146192212582254, "grad_norm": 0.4427323544427076, "kl": 0.123443603515625, "learning_rate": 4.94511598156854e-07, "loss": 0.0001, "reward": 1.7892857864499092, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857491970062, "rewards/format_reward_func": 1.0, "step": 12800 }, { "completion_length": 226.57590579986572, "epoch": 2.1465275158221218, "grad_norm": 0.14726816307483223, "kl": 0.128753662109375, "learning_rate": 4.945090981124468e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 1.0, "step": 12802 }, { "completion_length": 231.0535831451416, "epoch": 2.1468628190619894, "grad_norm": 0.29032372274327023, "kl": 0.143157958984375, "learning_rate": 4.945065975050888e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857544124126, "rewards/format_reward_func": 1.0, "step": 12804 }, { "completion_length": 228.04911613464355, "epoch": 2.1471981223018566, "grad_norm": 0.12590136262960336, "kl": 0.145751953125, "learning_rate": 4.94504096334786e-07, "loss": 0.0001, "reward": 1.807142898440361, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8071428835391998, "rewards/format_reward_func": 1.0, "step": 12806 }, { "completion_length": 238.69643783569336, "epoch": 2.147533425541724, "grad_norm": 0.30009913665500965, "kl": 0.144683837890625, "learning_rate": 4.945015946015442e-07, "loss": 0.0001, "reward": 1.7821429297327995, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428775787354, "rewards/format_reward_func": 1.0, "step": 12808 }, { "completion_length": 234.6294755935669, "epoch": 2.147868728781592, "grad_norm": 0.21758811899769895, "kl": 0.137908935546875, "learning_rate": 4.944990923053689e-07, "loss": 0.0001, "reward": 1.7892857789993286, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 12810 }, { "completion_length": 239.12054443359375, "epoch": 2.1482040320214595, "grad_norm": 0.27609212343688166, "kl": 0.1478271484375, "learning_rate": 4.944965894462662e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.785714304074645, "rewards/format_reward_func": 1.0, "step": 12812 }, { "completion_length": 240.602689743042, "epoch": 2.148539335261327, "grad_norm": 0.21616130808787837, "kl": 0.164459228515625, "learning_rate": 4.944940860242416e-07, "loss": 0.0002, "reward": 1.7857143431901932, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 12814 }, { "completion_length": 237.48661994934082, "epoch": 2.1488746385011943, "grad_norm": 0.24664055919005196, "kl": 0.142486572265625, "learning_rate": 4.94491582039301e-07, "loss": 0.0001, "reward": 1.8000000640749931, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000417232513, "rewards/format_reward_func": 1.0, "step": 12816 }, { "completion_length": 244.68304634094238, "epoch": 2.149209941741062, "grad_norm": 0.2179678551592934, "kl": 0.149383544921875, "learning_rate": 4.944890774914502e-07, "loss": 0.0001, "reward": 1.7178572192788124, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7178571671247482, "rewards/format_reward_func": 1.0, "step": 12818 }, { "completion_length": 238.2366180419922, "epoch": 2.1495452449809296, "grad_norm": 0.10679201305907572, "kl": 0.14337158203125, "learning_rate": 4.944865723806948e-07, "loss": 0.0001, "reward": 1.796428643167019, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 12820 }, { "completion_length": 233.70983123779297, "epoch": 2.149880548220797, "grad_norm": 0.3037664974555986, "kl": 0.140838623046875, "learning_rate": 4.944840667070406e-07, "loss": 0.0001, "reward": 1.8080357685685158, "reward_std": 0.029041885398328304, "rewards/equation_reward_func": 0.8098214641213417, "rewards/format_reward_func": 0.9982142895460129, "step": 12822 }, { "completion_length": 240.33929824829102, "epoch": 2.150215851460665, "grad_norm": 0.17097031750047306, "kl": 0.132476806640625, "learning_rate": 4.944815604704936e-07, "loss": 0.0001, "reward": 1.8071429058909416, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8071428872644901, "rewards/format_reward_func": 1.0, "step": 12824 }, { "completion_length": 240.99108123779297, "epoch": 2.1505511547005325, "grad_norm": 0.2717364472056746, "kl": 0.14019775390625, "learning_rate": 4.944790536710592e-07, "loss": 0.0001, "reward": 1.7589286491274834, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7625000290572643, "rewards/format_reward_func": 0.9964285716414452, "step": 12826 }, { "completion_length": 240.4062614440918, "epoch": 2.1508864579403997, "grad_norm": 0.06785661992573451, "kl": 0.140716552734375, "learning_rate": 4.944765463087435e-07, "loss": 0.0001, "reward": 1.8160714507102966, "reward_std": 0.017677669413387775, "rewards/equation_reward_func": 0.8205357566475868, "rewards/format_reward_func": 0.9955357164144516, "step": 12828 }, { "completion_length": 250.36608600616455, "epoch": 2.1512217611802673, "grad_norm": 0.1515955621735846, "kl": 0.15386962890625, "learning_rate": 4.944740383835521e-07, "loss": 0.0002, "reward": 1.7357143759727478, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7357143256813288, "rewards/format_reward_func": 1.0, "step": 12830 }, { "completion_length": 242.3928680419922, "epoch": 2.151557064420135, "grad_norm": 0.1461774077036663, "kl": 0.129852294921875, "learning_rate": 4.944715298954909e-07, "loss": 0.0001, "reward": 1.8142857551574707, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857477068901, "rewards/format_reward_func": 1.0, "step": 12832 }, { "completion_length": 239.88393878936768, "epoch": 2.1518923676600026, "grad_norm": 0.11773547519069961, "kl": 0.15191650390625, "learning_rate": 4.944690208445656e-07, "loss": 0.0002, "reward": 1.8107143267989159, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.810714315623045, "rewards/format_reward_func": 1.0, "step": 12834 }, { "completion_length": 242.55358028411865, "epoch": 2.15222767089987, "grad_norm": 0.13150936149774745, "kl": 0.14276123046875, "learning_rate": 4.944665112307819e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428827941418, "rewards/format_reward_func": 1.0, "step": 12836 }, { "completion_length": 247.94644260406494, "epoch": 2.1525629741397374, "grad_norm": 0.004567143634278605, "kl": 0.137481689453125, "learning_rate": 4.944640010541457e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8000000193715096, "rewards/format_reward_func": 1.0, "step": 12838 }, { "completion_length": 244.2544755935669, "epoch": 2.152898277379605, "grad_norm": 0.17917256243490093, "kl": 0.1134490966796875, "learning_rate": 4.944614903146626e-07, "loss": 0.0001, "reward": 1.8142857775092125, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857328057289, "rewards/format_reward_func": 1.0, "step": 12840 }, { "completion_length": 239.4955472946167, "epoch": 2.1532335806194727, "grad_norm": 0.166648209325707, "kl": 0.13653564453125, "learning_rate": 4.944589790123387e-07, "loss": 0.0001, "reward": 1.8107143491506577, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8107143044471741, "rewards/format_reward_func": 1.0, "step": 12842 }, { "completion_length": 244.07590579986572, "epoch": 2.1535688838593403, "grad_norm": 0.2685607135597953, "kl": 0.1331787109375, "learning_rate": 4.944564671471794e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714626312256, "rewards/format_reward_func": 1.0, "step": 12844 }, { "completion_length": 246.5491189956665, "epoch": 2.153904187099208, "grad_norm": 0.13520245836311928, "kl": 0.152587890625, "learning_rate": 4.944539547191908e-07, "loss": 0.0002, "reward": 1.739285796880722, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857484519482, "rewards/format_reward_func": 1.0, "step": 12846 }, { "completion_length": 251.59822940826416, "epoch": 2.1542394903390756, "grad_norm": 0.20708427682175876, "kl": 0.1339111328125, "learning_rate": 4.944514417283786e-07, "loss": 0.0001, "reward": 1.6928572282195091, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.6928571816533804, "rewards/format_reward_func": 1.0, "step": 12848 }, { "completion_length": 245.05804538726807, "epoch": 2.1545747935789428, "grad_norm": 0.13092118170338693, "kl": 0.134552001953125, "learning_rate": 4.944489281747485e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 12850 }, { "completion_length": 238.1785831451416, "epoch": 2.1549100968188104, "grad_norm": 0.2020475938040691, "kl": 0.13726806640625, "learning_rate": 4.944464140583063e-07, "loss": 0.0001, "reward": 1.7616072222590446, "reward_std": 0.054295698180794716, "rewards/equation_reward_func": 0.7633928917348385, "rewards/format_reward_func": 0.9982142895460129, "step": 12852 }, { "completion_length": 248.54911613464355, "epoch": 2.155245400058678, "grad_norm": 0.26429877443982663, "kl": 0.129852294921875, "learning_rate": 4.944438993790578e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714637488127, "rewards/format_reward_func": 1.0, "step": 12854 }, { "completion_length": 250.86608505249023, "epoch": 2.1555807032985457, "grad_norm": 0.18398285158750388, "kl": 0.1419677734375, "learning_rate": 4.944413841370088e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 12856 }, { "completion_length": 244.50447368621826, "epoch": 2.1559160065384133, "grad_norm": 0.27150227309966846, "kl": 0.137847900390625, "learning_rate": 4.944388683321652e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7500000316649675, "rewards/format_reward_func": 1.0, "step": 12858 }, { "completion_length": 237.32143878936768, "epoch": 2.1562513097782805, "grad_norm": 0.15710165366100282, "kl": 0.136749267578125, "learning_rate": 4.944363519645326e-07, "loss": 0.0001, "reward": 1.7178572192788124, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7178571745753288, "rewards/format_reward_func": 1.0, "step": 12860 }, { "completion_length": 239.290189743042, "epoch": 2.156586613018148, "grad_norm": 0.15330536446957824, "kl": 0.131866455078125, "learning_rate": 4.944338350341169e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 12862 }, { "completion_length": 237.24108409881592, "epoch": 2.1569219162580158, "grad_norm": 0.058822255525960855, "kl": 0.142547607421875, "learning_rate": 4.944313175409239e-07, "loss": 0.0001, "reward": 1.7928571850061417, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7928571850061417, "rewards/format_reward_func": 1.0, "step": 12864 }, { "completion_length": 254.0759048461914, "epoch": 2.1572572194978834, "grad_norm": 0.20590961982409228, "kl": 0.1185302734375, "learning_rate": 4.944287994849594e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714656114578, "rewards/format_reward_func": 1.0, "step": 12866 }, { "completion_length": 237.5759038925171, "epoch": 2.157592522737751, "grad_norm": 0.23532305877840948, "kl": 0.13934326171875, "learning_rate": 4.944262808662292e-07, "loss": 0.0001, "reward": 1.717857226729393, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7178571838885546, "rewards/format_reward_func": 1.0, "step": 12868 }, { "completion_length": 244.3482255935669, "epoch": 2.1579278259776187, "grad_norm": 0.19126064452006622, "kl": 0.137176513671875, "learning_rate": 4.94423761684739e-07, "loss": 0.0001, "reward": 1.7571429386734962, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428790688515, "rewards/format_reward_func": 1.0, "step": 12870 }, { "completion_length": 233.21876049041748, "epoch": 2.158263129217486, "grad_norm": 0.18566948164981376, "kl": 0.13421630859375, "learning_rate": 4.944212419404947e-07, "loss": 0.0001, "reward": 1.814285770058632, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857365310192, "rewards/format_reward_func": 1.0, "step": 12872 }, { "completion_length": 236.08036708831787, "epoch": 2.1585984324573535, "grad_norm": 0.2988504098040542, "kl": 0.147857666015625, "learning_rate": 4.944187216335021e-07, "loss": 0.0001, "reward": 1.836607187986374, "reward_std": 0.049244935624301434, "rewards/equation_reward_func": 0.8383928723633289, "rewards/format_reward_func": 0.9982142895460129, "step": 12874 }, { "completion_length": 240.16518783569336, "epoch": 2.158933735697221, "grad_norm": 0.17006143463215537, "kl": 0.138519287109375, "learning_rate": 4.94416200763767e-07, "loss": 0.0001, "reward": 1.7696429267525673, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7732143178582191, "rewards/format_reward_func": 0.9964285790920258, "step": 12876 }, { "completion_length": 242.79465579986572, "epoch": 2.1592690389370888, "grad_norm": 0.24022367229575953, "kl": 0.15008544921875, "learning_rate": 4.944136793312952e-07, "loss": 0.0002, "reward": 1.7857143506407738, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 12878 }, { "completion_length": 237.74554824829102, "epoch": 2.1596043421769564, "grad_norm": 0.12534665865683378, "kl": 0.155029296875, "learning_rate": 4.944111573360924e-07, "loss": 0.0002, "reward": 1.7544643729925156, "reward_std": 0.03409264795482159, "rewards/equation_reward_func": 0.7562500350177288, "rewards/format_reward_func": 0.9982142895460129, "step": 12880 }, { "completion_length": 238.2053680419922, "epoch": 2.159939645416824, "grad_norm": 0.2235627000227893, "kl": 0.134307861328125, "learning_rate": 4.944086347781646e-07, "loss": 0.0001, "reward": 1.771428644657135, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 12882 }, { "completion_length": 242.26340198516846, "epoch": 2.1602749486566912, "grad_norm": 0.21898008921483378, "kl": 0.13482666015625, "learning_rate": 4.944061116575173e-07, "loss": 0.0001, "reward": 1.7651786506175995, "reward_std": 0.029041884932667017, "rewards/equation_reward_func": 0.766964316368103, "rewards/format_reward_func": 0.9982142895460129, "step": 12884 }, { "completion_length": 242.3035831451416, "epoch": 2.160610251896559, "grad_norm": 0.24976873447595765, "kl": 0.139129638671875, "learning_rate": 4.944035879741567e-07, "loss": 0.0001, "reward": 1.72946435213089, "reward_std": 0.049244935624301434, "rewards/equation_reward_func": 0.7312500327825546, "rewards/format_reward_func": 0.9982142895460129, "step": 12886 }, { "completion_length": 246.10268878936768, "epoch": 2.1609455551364265, "grad_norm": 0.35250302204529516, "kl": 0.134246826171875, "learning_rate": 4.944010637280884e-07, "loss": 0.0001, "reward": 1.7580357939004898, "reward_std": 0.059346460737288, "rewards/equation_reward_func": 0.7598214596509933, "rewards/format_reward_func": 0.9982142895460129, "step": 12888 }, { "completion_length": 238.5848331451416, "epoch": 2.161280858376294, "grad_norm": 0.1638936631207704, "kl": 0.130126953125, "learning_rate": 4.943985389193182e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 12890 }, { "completion_length": 236.87947273254395, "epoch": 2.1616161616161618, "grad_norm": 0.117822403773612, "kl": 0.1263427734375, "learning_rate": 4.943960135478519e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 12892 }, { "completion_length": 240.99108219146729, "epoch": 2.1619514648560294, "grad_norm": 0.4960007888798321, "kl": 0.125946044921875, "learning_rate": 4.943934876136955e-07, "loss": 0.0001, "reward": 1.800000049173832, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000230967999, "rewards/format_reward_func": 1.0, "step": 12894 }, { "completion_length": 240.76786994934082, "epoch": 2.1622867680958966, "grad_norm": 0.2388921461736386, "kl": 0.147216796875, "learning_rate": 4.943909611168546e-07, "loss": 0.0001, "reward": 1.7214286550879478, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7214285880327225, "rewards/format_reward_func": 1.0, "step": 12896 }, { "completion_length": 235.0267972946167, "epoch": 2.1626220713357642, "grad_norm": 0.1771875945580934, "kl": 0.130950927734375, "learning_rate": 4.94388434057335e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 12898 }, { "completion_length": 242.977689743042, "epoch": 2.162957374575632, "grad_norm": 0.11793385067854463, "kl": 0.14813232421875, "learning_rate": 4.943859064351426e-07, "loss": 0.0001, "reward": 1.810714341700077, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107143081724644, "rewards/format_reward_func": 1.0, "step": 12900 }, { "completion_length": 236.16072273254395, "epoch": 2.1632926778154995, "grad_norm": 0.1048863379933829, "kl": 0.1263427734375, "learning_rate": 4.943833782502834e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 12902 }, { "completion_length": 248.91965293884277, "epoch": 2.163627981055367, "grad_norm": 0.24007095785937319, "kl": 0.14215087890625, "learning_rate": 4.94380849502763e-07, "loss": 0.0001, "reward": 1.7267858013510704, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7312500439584255, "rewards/format_reward_func": 0.9955357164144516, "step": 12904 }, { "completion_length": 243.36161708831787, "epoch": 2.1639632842952343, "grad_norm": 0.22506223143518816, "kl": 0.137115478515625, "learning_rate": 4.943783201925873e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571790456772, "rewards/format_reward_func": 1.0, "step": 12906 }, { "completion_length": 237.40626049041748, "epoch": 2.164298587535102, "grad_norm": 0.09530809311521778, "kl": 0.143402099609375, "learning_rate": 4.943757903197621e-07, "loss": 0.0001, "reward": 1.835714340209961, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8357142992317677, "rewards/format_reward_func": 1.0, "step": 12908 }, { "completion_length": 242.81251049041748, "epoch": 2.1646338907749696, "grad_norm": 0.2760145257903106, "kl": 0.160980224609375, "learning_rate": 4.943732598842931e-07, "loss": 0.0002, "reward": 1.682142935693264, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.6821429096162319, "rewards/format_reward_func": 1.0, "step": 12910 }, { "completion_length": 248.42858409881592, "epoch": 2.1649691940148372, "grad_norm": 0.17348253473498154, "kl": 0.135223388671875, "learning_rate": 4.943707288861864e-07, "loss": 0.0001, "reward": 1.7232143580913544, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7276786025613546, "rewards/format_reward_func": 0.9955357164144516, "step": 12912 }, { "completion_length": 246.4553680419922, "epoch": 2.165304497254705, "grad_norm": 0.187496772007708, "kl": 0.165252685546875, "learning_rate": 4.943681973254476e-07, "loss": 0.0002, "reward": 1.7321429252624512, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7321429122239351, "rewards/format_reward_func": 1.0, "step": 12914 }, { "completion_length": 236.97322368621826, "epoch": 2.1656398004945725, "grad_norm": 0.12459034225641151, "kl": 0.131591796875, "learning_rate": 4.943656652020825e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7535714507102966, "rewards/format_reward_func": 1.0, "step": 12916 }, { "completion_length": 238.65179443359375, "epoch": 2.1659751037344397, "grad_norm": 0.347977850319333, "kl": 0.138275146484375, "learning_rate": 4.943631325160971e-07, "loss": 0.0001, "reward": 1.7892857939004898, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 12918 }, { "completion_length": 235.53126049041748, "epoch": 2.1663104069743073, "grad_norm": 0.17829052465307105, "kl": 0.133636474609375, "learning_rate": 4.943605992674973e-07, "loss": 0.0001, "reward": 1.7000000849366188, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7000000402331352, "rewards/format_reward_func": 1.0, "step": 12920 }, { "completion_length": 242.2455472946167, "epoch": 2.166645710214175, "grad_norm": 0.20660870590652766, "kl": 0.1290283203125, "learning_rate": 4.943580654562886e-07, "loss": 0.0001, "reward": 1.7558036521077156, "reward_std": 0.07260471233166754, "rewards/equation_reward_func": 0.7571428790688515, "rewards/format_reward_func": 0.9986607171595097, "step": 12922 }, { "completion_length": 238.7053689956665, "epoch": 2.1669810134540426, "grad_norm": 0.10815500831875394, "kl": 0.177398681640625, "learning_rate": 4.943555310824772e-07, "loss": 0.0002, "reward": 1.7946429029107094, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7991071604192257, "rewards/format_reward_func": 0.9955357164144516, "step": 12924 }, { "completion_length": 234.91518878936768, "epoch": 2.1673163166939102, "grad_norm": 0.2615412690593561, "kl": 0.123870849609375, "learning_rate": 4.943529961460688e-07, "loss": 0.0001, "reward": 1.804464340209961, "reward_std": 0.03409264795482159, "rewards/equation_reward_func": 0.8062500320374966, "rewards/format_reward_func": 0.9982142895460129, "step": 12926 }, { "completion_length": 241.53125953674316, "epoch": 2.1676516199337774, "grad_norm": 0.23643354312928433, "kl": 0.138916015625, "learning_rate": 4.943504606470691e-07, "loss": 0.0001, "reward": 1.8392857611179352, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8392857238650322, "rewards/format_reward_func": 1.0, "step": 12928 }, { "completion_length": 236.5223331451416, "epoch": 2.167986923173645, "grad_norm": 0.20165447625301341, "kl": 0.13446044921875, "learning_rate": 4.943479245854841e-07, "loss": 0.0001, "reward": 1.7839286401867867, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7875000163912773, "rewards/format_reward_func": 0.9964285790920258, "step": 12930 }, { "completion_length": 237.37947273254395, "epoch": 2.1683222264135127, "grad_norm": 0.2801653478125558, "kl": 0.139373779296875, "learning_rate": 4.943453879613194e-07, "loss": 0.0001, "reward": 1.810714341700077, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8107143044471741, "rewards/format_reward_func": 1.0, "step": 12932 }, { "completion_length": 235.91965293884277, "epoch": 2.1686575296533803, "grad_norm": 0.24652515854083196, "kl": 0.11895751953125, "learning_rate": 4.943428507745811e-07, "loss": 0.0001, "reward": 1.7750000432133675, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000171363354, "rewards/format_reward_func": 1.0, "step": 12934 }, { "completion_length": 228.6294765472412, "epoch": 2.168992832893248, "grad_norm": 0.5322413719457854, "kl": 0.122833251953125, "learning_rate": 4.94340313025275e-07, "loss": 0.0001, "reward": 1.7892857864499092, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857305705547, "rewards/format_reward_func": 1.0, "step": 12936 }, { "completion_length": 233.21429634094238, "epoch": 2.1693281361331156, "grad_norm": 0.10032357589058459, "kl": 0.120361328125, "learning_rate": 4.94337774713407e-07, "loss": 0.0001, "reward": 1.7357143610715866, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7357143238186836, "rewards/format_reward_func": 1.0, "step": 12938 }, { "completion_length": 236.88394165039062, "epoch": 2.169663439372983, "grad_norm": 0.14355426238650287, "kl": 0.0960540771484375, "learning_rate": 4.943352358389827e-07, "loss": 0.0001, "reward": 1.7642858028411865, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 12940 }, { "completion_length": 235.64733123779297, "epoch": 2.1699987426128504, "grad_norm": 0.25435027499296264, "kl": 0.12823486328125, "learning_rate": 4.943326964020083e-07, "loss": 0.0001, "reward": 1.8089286237955093, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.8133928813040257, "rewards/format_reward_func": 0.9955357164144516, "step": 12942 }, { "completion_length": 244.31250953674316, "epoch": 2.170334045852718, "grad_norm": 0.2898807685825902, "kl": 0.119598388671875, "learning_rate": 4.943301564024892e-07, "loss": 0.0001, "reward": 1.742857202887535, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7428571581840515, "rewards/format_reward_func": 1.0, "step": 12944 }, { "completion_length": 247.62947463989258, "epoch": 2.1706693490925857, "grad_norm": 0.19631638793532102, "kl": 0.12933349609375, "learning_rate": 4.943276158404316e-07, "loss": 0.0001, "reward": 1.7000000700354576, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7000000290572643, "rewards/format_reward_func": 1.0, "step": 12946 }, { "completion_length": 243.0892972946167, "epoch": 2.1710046523324533, "grad_norm": 0.17855137823784265, "kl": 0.110931396484375, "learning_rate": 4.943250747158413e-07, "loss": 0.0001, "reward": 1.7392857819795609, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7392857596278191, "rewards/format_reward_func": 1.0, "step": 12948 }, { "completion_length": 237.7991180419922, "epoch": 2.1713399555723205, "grad_norm": 0.3307348578135927, "kl": 0.140533447265625, "learning_rate": 4.943225330287239e-07, "loss": 0.0001, "reward": 1.7714286148548126, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 12950 }, { "completion_length": 235.58036708831787, "epoch": 2.171675258812188, "grad_norm": 0.4506722857335034, "kl": 0.117889404296875, "learning_rate": 4.943199907790856e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 12952 }, { "completion_length": 236.32590293884277, "epoch": 2.172010562052056, "grad_norm": 0.20793578554397196, "kl": 0.1285400390625, "learning_rate": 4.943174479669321e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 12954 }, { "completion_length": 237.71429538726807, "epoch": 2.1723458652919234, "grad_norm": 0.15868192009752824, "kl": 0.105560302734375, "learning_rate": 4.943149045922692e-07, "loss": 0.0001, "reward": 1.7482143342494965, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7526786085218191, "rewards/format_reward_func": 0.9955357164144516, "step": 12956 }, { "completion_length": 237.6071538925171, "epoch": 2.172681168531791, "grad_norm": 0.18955779258601502, "kl": 0.109527587890625, "learning_rate": 4.943123606551028e-07, "loss": 0.0001, "reward": 1.8160714954137802, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.8205357380211353, "rewards/format_reward_func": 0.9955357164144516, "step": 12958 }, { "completion_length": 248.1562623977661, "epoch": 2.1730164717716587, "grad_norm": 0.2265311963984262, "kl": 0.206207275390625, "learning_rate": 4.943098161554388e-07, "loss": 0.0002, "reward": 1.7696429267525673, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7741071730852127, "rewards/format_reward_func": 0.9955357164144516, "step": 12960 }, { "completion_length": 239.08483409881592, "epoch": 2.173351775011526, "grad_norm": 0.24927354696810647, "kl": 0.1038360595703125, "learning_rate": 4.94307271093283e-07, "loss": 0.0001, "reward": 1.7750000730156898, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 12962 }, { "completion_length": 244.32143688201904, "epoch": 2.1736870782513935, "grad_norm": 0.12840740398816736, "kl": 0.1054229736328125, "learning_rate": 4.943047254686413e-07, "loss": 0.0001, "reward": 1.8035714700818062, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8035714570432901, "rewards/format_reward_func": 1.0, "step": 12964 }, { "completion_length": 238.6785831451416, "epoch": 2.174022381491261, "grad_norm": 0.185315617261064, "kl": 0.110504150390625, "learning_rate": 4.943021792815194e-07, "loss": 0.0001, "reward": 1.8214286342263222, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.821428582072258, "rewards/format_reward_func": 1.0, "step": 12966 }, { "completion_length": 242.80804824829102, "epoch": 2.174357684731129, "grad_norm": 0.48087252874133135, "kl": 0.13427734375, "learning_rate": 4.942996325319234e-07, "loss": 0.0001, "reward": 1.7625000849366188, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643051922321, "rewards/format_reward_func": 0.9955357164144516, "step": 12968 }, { "completion_length": 249.78126049041748, "epoch": 2.1746929879709964, "grad_norm": 0.15766073915047, "kl": 0.11785888671875, "learning_rate": 4.942970852198591e-07, "loss": 0.0001, "reward": 1.6750000715255737, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.6750000286847353, "rewards/format_reward_func": 1.0, "step": 12970 }, { "completion_length": 239.28572463989258, "epoch": 2.1750282912108636, "grad_norm": 0.09665109152968684, "kl": 0.13330078125, "learning_rate": 4.942945373453323e-07, "loss": 0.0001, "reward": 1.7589286118745804, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7633928880095482, "rewards/format_reward_func": 0.9955357164144516, "step": 12972 }, { "completion_length": 237.196439743042, "epoch": 2.1753635944507312, "grad_norm": 0.21211440287784858, "kl": 0.1326904296875, "learning_rate": 4.942919889083488e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000204890966, "rewards/format_reward_func": 1.0, "step": 12974 }, { "completion_length": 244.95090675354004, "epoch": 2.175698897690599, "grad_norm": 0.5472880673504554, "kl": 0.135406494140625, "learning_rate": 4.942894399089148e-07, "loss": 0.0001, "reward": 1.814285784959793, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857402563095, "rewards/format_reward_func": 1.0, "step": 12976 }, { "completion_length": 232.86608219146729, "epoch": 2.1760342009304665, "grad_norm": 0.4324622384788192, "kl": 0.1165771484375, "learning_rate": 4.942868903470357e-07, "loss": 0.0001, "reward": 1.8214286118745804, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8214285932481289, "rewards/format_reward_func": 1.0, "step": 12978 }, { "completion_length": 234.79018878936768, "epoch": 2.176369504170334, "grad_norm": 0.23861054744834512, "kl": 0.129150390625, "learning_rate": 4.942843402227178e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 12980 }, { "completion_length": 244.60715293884277, "epoch": 2.176704807410202, "grad_norm": 0.17599420950231748, "kl": 0.119537353515625, "learning_rate": 4.942817895359666e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8000000230967999, "rewards/format_reward_func": 1.0, "step": 12982 }, { "completion_length": 242.7232265472412, "epoch": 2.177040110650069, "grad_norm": 0.20729228630689475, "kl": 0.131744384765625, "learning_rate": 4.942792382867884e-07, "loss": 0.0001, "reward": 1.7142858058214188, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7142857555299997, "rewards/format_reward_func": 1.0, "step": 12984 }, { "completion_length": 234.30804824829102, "epoch": 2.1773754138899366, "grad_norm": 0.23350022151308217, "kl": 0.131134033203125, "learning_rate": 4.942766864751886e-07, "loss": 0.0001, "reward": 1.751785784959793, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7562500350177288, "rewards/format_reward_func": 0.9955357164144516, "step": 12986 }, { "completion_length": 243.3303689956665, "epoch": 2.1777107171298042, "grad_norm": 0.16288105801056224, "kl": 0.125274658203125, "learning_rate": 4.942741341011733e-07, "loss": 0.0001, "reward": 1.8035714849829674, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714626312256, "rewards/format_reward_func": 1.0, "step": 12988 }, { "completion_length": 238.56697177886963, "epoch": 2.178046020369672, "grad_norm": 0.17009587148192523, "kl": 0.134521484375, "learning_rate": 4.942715811647484e-07, "loss": 0.0001, "reward": 1.701785795390606, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7062500342726707, "rewards/format_reward_func": 0.9955357164144516, "step": 12990 }, { "completion_length": 241.102689743042, "epoch": 2.1783813236095395, "grad_norm": 0.1392263569922293, "kl": 0.191131591796875, "learning_rate": 4.942690276659198e-07, "loss": 0.0002, "reward": 1.7964286282658577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 12992 }, { "completion_length": 241.1116180419922, "epoch": 2.1787166268494067, "grad_norm": 0.19459083800717636, "kl": 0.20111083984375, "learning_rate": 4.942664736046933e-07, "loss": 0.0002, "reward": 1.7321429401636124, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7321428768336773, "rewards/format_reward_func": 1.0, "step": 12994 }, { "completion_length": 240.47322463989258, "epoch": 2.1790519300892743, "grad_norm": 0.22337487727499877, "kl": 0.15386962890625, "learning_rate": 4.942639189810748e-07, "loss": 0.0002, "reward": 1.7375000715255737, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419643253087997, "rewards/format_reward_func": 0.9955357164144516, "step": 12996 }, { "completion_length": 247.71876335144043, "epoch": 2.179387233329142, "grad_norm": 0.39531989522954086, "kl": 0.17596435546875, "learning_rate": 4.942613637950702e-07, "loss": 0.0002, "reward": 1.7339286655187607, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.738392885774374, "rewards/format_reward_func": 0.9955357164144516, "step": 12998 }, { "completion_length": 243.2232255935669, "epoch": 2.1797225365690096, "grad_norm": 0.10995658678733815, "kl": 0.2958984375, "learning_rate": 4.942588080466854e-07, "loss": 0.0003, "reward": 1.7571429312229156, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 13000 }, { "completion_length": 231.92858123779297, "epoch": 2.1800578398088772, "grad_norm": 0.2621288294232232, "kl": 0.099517822265625, "learning_rate": 4.942562517359262e-07, "loss": 0.0001, "reward": 1.8250000551342964, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8250000216066837, "rewards/format_reward_func": 1.0, "step": 13002 }, { "completion_length": 244.7366189956665, "epoch": 2.180393143048745, "grad_norm": 0.2000719271959465, "kl": 0.135772705078125, "learning_rate": 4.942536948627986e-07, "loss": 0.0001, "reward": 1.753571517765522, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 13004 }, { "completion_length": 236.33036708831787, "epoch": 2.180728446288612, "grad_norm": 0.19829520653945196, "kl": 0.115936279296875, "learning_rate": 4.942511374273084e-07, "loss": 0.0001, "reward": 1.725000075995922, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7250000201165676, "rewards/format_reward_func": 1.0, "step": 13006 }, { "completion_length": 242.82143878936768, "epoch": 2.1810637495284797, "grad_norm": 0.23086917895470546, "kl": 0.17926025390625, "learning_rate": 4.942485794294616e-07, "loss": 0.0002, "reward": 1.8178572058677673, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8178571574389935, "rewards/format_reward_func": 1.0, "step": 13008 }, { "completion_length": 244.8125123977661, "epoch": 2.1813990527683473, "grad_norm": 0.12326582796200601, "kl": 0.230316162109375, "learning_rate": 4.942460208692639e-07, "loss": 0.0002, "reward": 1.7392857894301414, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7392857447266579, "rewards/format_reward_func": 1.0, "step": 13010 }, { "completion_length": 246.26340579986572, "epoch": 2.181734356008215, "grad_norm": 0.5111766646617298, "kl": 0.141357421875, "learning_rate": 4.942434617467213e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000149011612, "rewards/format_reward_func": 1.0, "step": 13012 }, { "completion_length": 247.73661708831787, "epoch": 2.1820696592480826, "grad_norm": 0.18897060945754657, "kl": 0.142791748046875, "learning_rate": 4.942409020618398e-07, "loss": 0.0001, "reward": 1.7571429386734962, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7571428883820772, "rewards/format_reward_func": 1.0, "step": 13014 }, { "completion_length": 245.17858600616455, "epoch": 2.1824049624879502, "grad_norm": 0.192037173413482, "kl": 0.22442626953125, "learning_rate": 4.942383418146251e-07, "loss": 0.0002, "reward": 1.7642857804894447, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 13016 }, { "completion_length": 246.90179538726807, "epoch": 2.1827402657278174, "grad_norm": 0.1740235049742812, "kl": 0.486602783203125, "learning_rate": 4.942357810050832e-07, "loss": 0.0005, "reward": 1.743303619325161, "reward_std": 0.04987628059461713, "rewards/equation_reward_func": 0.749107176437974, "rewards/format_reward_func": 0.9941964335739613, "step": 13018 }, { "completion_length": 244.92858123779297, "epoch": 2.183075568967685, "grad_norm": 0.28280715676848855, "kl": 0.159271240234375, "learning_rate": 4.9423321963322e-07, "loss": 0.0002, "reward": 1.7669643759727478, "reward_std": 0.04671955481171608, "rewards/equation_reward_func": 0.7732143066823483, "rewards/format_reward_func": 0.9937500059604645, "step": 13020 }, { "completion_length": 241.37501049041748, "epoch": 2.1834108722075527, "grad_norm": 0.19996439624931472, "kl": 0.172210693359375, "learning_rate": 4.942306576990414e-07, "loss": 0.0002, "reward": 1.7910714820027351, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7955357395112514, "rewards/format_reward_func": 0.9955357164144516, "step": 13022 }, { "completion_length": 249.63840579986572, "epoch": 2.1837461754474203, "grad_norm": 0.0978921822772246, "kl": 0.177581787109375, "learning_rate": 4.942280952025531e-07, "loss": 0.0002, "reward": 1.7857143357396126, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 13024 }, { "completion_length": 240.29018878936768, "epoch": 2.184081478687288, "grad_norm": 0.3820928046968436, "kl": 0.283355712890625, "learning_rate": 4.942255321437614e-07, "loss": 0.0003, "reward": 1.78035718947649, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7848214656114578, "rewards/format_reward_func": 0.9955357164144516, "step": 13026 }, { "completion_length": 234.20983219146729, "epoch": 2.1844167819271556, "grad_norm": 0.31466234028861156, "kl": 0.243011474609375, "learning_rate": 4.942229685226719e-07, "loss": 0.0002, "reward": 1.7642857804894447, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 13028 }, { "completion_length": 241.95536613464355, "epoch": 2.184752085167023, "grad_norm": 0.14632813185984134, "kl": 0.114013671875, "learning_rate": 4.942204043392905e-07, "loss": 0.0001, "reward": 1.7625000774860382, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643126428127, "rewards/format_reward_func": 0.9955357164144516, "step": 13030 }, { "completion_length": 238.94197845458984, "epoch": 2.1850873884068904, "grad_norm": 0.12801467517810663, "kl": 0.314605712890625, "learning_rate": 4.942178395936232e-07, "loss": 0.0003, "reward": 1.717857226729393, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7178571540862322, "rewards/format_reward_func": 1.0, "step": 13032 }, { "completion_length": 244.12054634094238, "epoch": 2.185422691646758, "grad_norm": 0.1482522554310038, "kl": 0.12371826171875, "learning_rate": 4.942152742856759e-07, "loss": 0.0001, "reward": 1.7535714879631996, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.753571443259716, "rewards/format_reward_func": 1.0, "step": 13034 }, { "completion_length": 252.0535831451416, "epoch": 2.1857579948866257, "grad_norm": 0.328016225135122, "kl": 0.243499755859375, "learning_rate": 4.942127084154545e-07, "loss": 0.0002, "reward": 1.7750000804662704, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 13036 }, { "completion_length": 245.58483409881592, "epoch": 2.1860932981264933, "grad_norm": 0.2915103242446317, "kl": 0.1773681640625, "learning_rate": 4.942101419829649e-07, "loss": 0.0002, "reward": 1.7660715132951736, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7705357447266579, "rewards/format_reward_func": 0.9955357164144516, "step": 13038 }, { "completion_length": 242.22768688201904, "epoch": 2.1864286013663605, "grad_norm": 0.15696621358735263, "kl": 0.229949951171875, "learning_rate": 4.94207574988213e-07, "loss": 0.0002, "reward": 1.7035714760422707, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7035714611411095, "rewards/format_reward_func": 1.0, "step": 13040 }, { "completion_length": 249.46876430511475, "epoch": 2.186763904606228, "grad_norm": 0.16961914232860997, "kl": 0.256103515625, "learning_rate": 4.942050074312047e-07, "loss": 0.0003, "reward": 1.7553572058677673, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214484751225, "rewards/format_reward_func": 0.9955357164144516, "step": 13042 }, { "completion_length": 246.27233219146729, "epoch": 2.187099207846096, "grad_norm": 0.08810562177050113, "kl": 0.142333984375, "learning_rate": 4.94202439311946e-07, "loss": 0.0001, "reward": 1.7857143357396126, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 13044 }, { "completion_length": 237.0982255935669, "epoch": 2.1874345110859634, "grad_norm": 0.19939925649510315, "kl": 0.15093994140625, "learning_rate": 4.941998706304426e-07, "loss": 0.0002, "reward": 1.7928571924567223, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571812808514, "rewards/format_reward_func": 1.0, "step": 13046 }, { "completion_length": 240.7321538925171, "epoch": 2.187769814325831, "grad_norm": 0.37088907779026276, "kl": 0.181976318359375, "learning_rate": 4.941973013867007e-07, "loss": 0.0002, "reward": 1.782142922282219, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428831666708, "rewards/format_reward_func": 1.0, "step": 13048 }, { "completion_length": 241.3571548461914, "epoch": 2.1881051175656987, "grad_norm": 0.34147771560012263, "kl": 0.721923828125, "learning_rate": 4.94194731580726e-07, "loss": 0.0007, "reward": 1.766071505844593, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9955357164144516, "step": 13050 }, { "completion_length": 249.03572368621826, "epoch": 2.188440420805566, "grad_norm": 0.19678197516440343, "kl": 0.230865478515625, "learning_rate": 4.941921612125246e-07, "loss": 0.0002, "reward": 1.7607143446803093, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.769642885774374, "rewards/format_reward_func": 0.9910714328289032, "step": 13052 }, { "completion_length": 241.30358028411865, "epoch": 2.1887757240454335, "grad_norm": 0.18948886620622116, "kl": 0.227386474609375, "learning_rate": 4.941895902821022e-07, "loss": 0.0002, "reward": 1.8000000566244125, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 13054 }, { "completion_length": 237.2455472946167, "epoch": 2.189111027285301, "grad_norm": 0.16057536488565435, "kl": 0.434478759765625, "learning_rate": 4.941870187894648e-07, "loss": 0.0004, "reward": 1.7500000521540642, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 13056 }, { "completion_length": 227.5625114440918, "epoch": 2.189446330525169, "grad_norm": 0.22919726377893454, "kl": 0.162689208984375, "learning_rate": 4.941844467346183e-07, "loss": 0.0002, "reward": 1.8107143491506577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8107143007218838, "rewards/format_reward_func": 1.0, "step": 13058 }, { "completion_length": 238.60268688201904, "epoch": 2.1897816337650364, "grad_norm": 0.208520058844818, "kl": 0.284759521484375, "learning_rate": 4.941818741175689e-07, "loss": 0.0003, "reward": 1.792857177555561, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571812808514, "rewards/format_reward_func": 1.0, "step": 13060 }, { "completion_length": 237.09822463989258, "epoch": 2.1901169370049036, "grad_norm": 0.13094832960062402, "kl": 0.207489013671875, "learning_rate": 4.941793009383221e-07, "loss": 0.0002, "reward": 1.8142857626080513, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8142857365310192, "rewards/format_reward_func": 1.0, "step": 13062 }, { "completion_length": 239.67858123779297, "epoch": 2.1904522402447713, "grad_norm": 0.12603606773818665, "kl": 0.319000244140625, "learning_rate": 4.94176727196884e-07, "loss": 0.0003, "reward": 1.7517857775092125, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500350177288, "rewards/format_reward_func": 0.9955357164144516, "step": 13064 }, { "completion_length": 230.8125114440918, "epoch": 2.190787543484639, "grad_norm": 0.15633282016082847, "kl": 0.31939697265625, "learning_rate": 4.941741528932606e-07, "loss": 0.0003, "reward": 1.744642935693264, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7491071633994579, "rewards/format_reward_func": 0.9955357164144516, "step": 13066 }, { "completion_length": 234.54465293884277, "epoch": 2.1911228467245065, "grad_norm": 0.43623124322019347, "kl": 0.6263427734375, "learning_rate": 4.941715780274578e-07, "loss": 0.0006, "reward": 1.7750000804662704, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.783928606659174, "rewards/format_reward_func": 0.9910714328289032, "step": 13068 }, { "completion_length": 242.11161708831787, "epoch": 2.191458149964374, "grad_norm": 0.2560604452422528, "kl": 0.178070068359375, "learning_rate": 4.941690025994814e-07, "loss": 0.0002, "reward": 1.7357143759727478, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143089175224, "rewards/format_reward_func": 1.0, "step": 13070 }, { "completion_length": 229.65626049041748, "epoch": 2.191793453204242, "grad_norm": 0.11541466072905084, "kl": 0.167022705078125, "learning_rate": 4.941664266093375e-07, "loss": 0.0002, "reward": 1.7428572177886963, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 13072 }, { "completion_length": 238.8125114440918, "epoch": 2.192128756444109, "grad_norm": 0.1561942106742485, "kl": 0.19775390625, "learning_rate": 4.941638500570319e-07, "loss": 0.0002, "reward": 1.6517857909202576, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.6562500298023224, "rewards/format_reward_func": 0.9955357164144516, "step": 13074 }, { "completion_length": 237.8884048461914, "epoch": 2.1924640596839766, "grad_norm": 0.17344246627332055, "kl": 0.162841796875, "learning_rate": 4.941612729425706e-07, "loss": 0.0002, "reward": 1.8446429073810577, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8491071686148643, "rewards/format_reward_func": 0.9955357164144516, "step": 13076 }, { "completion_length": 241.5491189956665, "epoch": 2.1927993629238443, "grad_norm": 0.22191151876539467, "kl": 0.168365478515625, "learning_rate": 4.941586952659595e-07, "loss": 0.0002, "reward": 1.8071428909897804, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8071428947150707, "rewards/format_reward_func": 1.0, "step": 13078 }, { "completion_length": 245.51787090301514, "epoch": 2.193134666163712, "grad_norm": 0.41291817489040716, "kl": 0.185089111328125, "learning_rate": 4.941561170272047e-07, "loss": 0.0002, "reward": 1.7500000596046448, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 13080 }, { "completion_length": 234.9866180419922, "epoch": 2.1934699694035795, "grad_norm": 0.2561760676478586, "kl": 0.151092529296875, "learning_rate": 4.941535382263119e-07, "loss": 0.0002, "reward": 1.7642857804894447, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 13082 }, { "completion_length": 228.5625123977661, "epoch": 2.1938052726434467, "grad_norm": 0.2149898909073194, "kl": 0.13726806640625, "learning_rate": 4.94150958863287e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7535714507102966, "rewards/format_reward_func": 1.0, "step": 13084 }, { "completion_length": 240.17858219146729, "epoch": 2.1941405758833143, "grad_norm": 0.1678046221919406, "kl": 0.171142578125, "learning_rate": 4.941483789381362e-07, "loss": 0.0002, "reward": 1.767857201397419, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 1.0, "step": 13086 }, { "completion_length": 234.0491180419922, "epoch": 2.194475879123182, "grad_norm": 0.11829237495922411, "kl": 0.215484619140625, "learning_rate": 4.941457984508653e-07, "loss": 0.0002, "reward": 1.7821429446339607, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 13088 }, { "completion_length": 226.1785831451416, "epoch": 2.1948111823630496, "grad_norm": 0.1562739900767488, "kl": 0.147369384765625, "learning_rate": 4.941432174014803e-07, "loss": 0.0001, "reward": 1.780357226729393, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7848214544355869, "rewards/format_reward_func": 0.9955357164144516, "step": 13090 }, { "completion_length": 230.92411613464355, "epoch": 2.1951464856029173, "grad_norm": 0.16259796276652824, "kl": 0.198089599609375, "learning_rate": 4.941406357899871e-07, "loss": 0.0002, "reward": 1.7785714864730835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 1.0, "step": 13092 }, { "completion_length": 232.1875123977661, "epoch": 2.195481788842785, "grad_norm": 0.20673151685475438, "kl": 0.217559814453125, "learning_rate": 4.941380536163915e-07, "loss": 0.0002, "reward": 1.8089286237955093, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.8133928813040257, "rewards/format_reward_func": 0.9955357164144516, "step": 13094 }, { "completion_length": 236.8571538925171, "epoch": 2.195817092082652, "grad_norm": 0.1114181529377033, "kl": 0.215423583984375, "learning_rate": 4.941354708806996e-07, "loss": 0.0002, "reward": 1.7107143476605415, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7107143104076385, "rewards/format_reward_func": 1.0, "step": 13096 }, { "completion_length": 236.03126430511475, "epoch": 2.1961523953225197, "grad_norm": 0.2740025408477465, "kl": 0.39141845703125, "learning_rate": 4.941328875829175e-07, "loss": 0.0004, "reward": 1.7589286267757416, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7633928917348385, "rewards/format_reward_func": 0.9955357164144516, "step": 13098 }, { "completion_length": 228.6875114440918, "epoch": 2.1964876985623873, "grad_norm": 0.0945623183408348, "kl": 0.166290283203125, "learning_rate": 4.941303037230508e-07, "loss": 0.0002, "reward": 1.8250000402331352, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8250000290572643, "rewards/format_reward_func": 1.0, "step": 13100 }, { "completion_length": 232.227689743042, "epoch": 2.196823001802255, "grad_norm": 0.22985184246306434, "kl": 0.164337158203125, "learning_rate": 4.941277193011057e-07, "loss": 0.0002, "reward": 1.7500000670552254, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 13102 }, { "completion_length": 234.11608219146729, "epoch": 2.1971583050421226, "grad_norm": 0.20757059453359875, "kl": 0.194183349609375, "learning_rate": 4.94125134317088e-07, "loss": 0.0002, "reward": 1.7357143238186836, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7357143312692642, "rewards/format_reward_func": 1.0, "step": 13104 }, { "completion_length": 246.06697463989258, "epoch": 2.19749360828199, "grad_norm": 0.23602206759001917, "kl": 0.2412109375, "learning_rate": 4.941225487710038e-07, "loss": 0.0002, "reward": 1.7232143729925156, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7276786006987095, "rewards/format_reward_func": 0.9955357164144516, "step": 13106 }, { "completion_length": 237.20983123779297, "epoch": 2.1978289115218574, "grad_norm": 0.19656052106365124, "kl": 0.30426025390625, "learning_rate": 4.94119962662859e-07, "loss": 0.0003, "reward": 1.7767857760190964, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7812500242143869, "rewards/format_reward_func": 0.9955357164144516, "step": 13108 }, { "completion_length": 233.30804824829102, "epoch": 2.198164214761725, "grad_norm": 0.1888572394239757, "kl": 0.25299072265625, "learning_rate": 4.941173759926595e-07, "loss": 0.0003, "reward": 1.7214286625385284, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.721428606659174, "rewards/format_reward_func": 1.0, "step": 13110 }, { "completion_length": 234.5134038925171, "epoch": 2.1984995180015927, "grad_norm": 0.14359310245026907, "kl": 0.162506103515625, "learning_rate": 4.941147887604113e-07, "loss": 0.0002, "reward": 1.7808036133646965, "reward_std": 0.02714784862473607, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 0.9986607171595097, "step": 13112 }, { "completion_length": 230.72768878936768, "epoch": 2.1988348212414603, "grad_norm": 0.1815342261677029, "kl": 0.1749267578125, "learning_rate": 4.941122009661202e-07, "loss": 0.0002, "reward": 1.7571429163217545, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 13114 }, { "completion_length": 234.0134038925171, "epoch": 2.199170124481328, "grad_norm": 0.22855059183061707, "kl": 0.1219482421875, "learning_rate": 4.941096126097926e-07, "loss": 0.0001, "reward": 1.7250000834465027, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7250000275671482, "rewards/format_reward_func": 1.0, "step": 13116 }, { "completion_length": 235.95536708831787, "epoch": 2.199505427721195, "grad_norm": 0.3166301610655401, "kl": 0.2043609619140625, "learning_rate": 4.94107023691434e-07, "loss": 0.0002, "reward": 1.757142923772335, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428921073675, "rewards/format_reward_func": 1.0, "step": 13118 }, { "completion_length": 237.70536613464355, "epoch": 2.199840730961063, "grad_norm": 0.17579007961883955, "kl": 0.1287078857421875, "learning_rate": 4.941044342110504e-07, "loss": 0.0001, "reward": 1.7196429371833801, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.724107176065445, "rewards/format_reward_func": 0.9955357164144516, "step": 13120 }, { "completion_length": 240.477689743042, "epoch": 2.2001760342009304, "grad_norm": 0.25137758864888454, "kl": 0.1131591796875, "learning_rate": 4.94101844168648e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 13122 }, { "completion_length": 236.290189743042, "epoch": 2.200511337440798, "grad_norm": 0.19700262041976796, "kl": 0.11376953125, "learning_rate": 4.940992535642327e-07, "loss": 0.0001, "reward": 1.796428620815277, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7964286170899868, "rewards/format_reward_func": 1.0, "step": 13124 }, { "completion_length": 231.4375123977661, "epoch": 2.2008466406806657, "grad_norm": 0.07527575658926482, "kl": 0.156158447265625, "learning_rate": 4.940966623978103e-07, "loss": 0.0002, "reward": 1.8214286044239998, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8214286044239998, "rewards/format_reward_func": 1.0, "step": 13126 }, { "completion_length": 242.0625114440918, "epoch": 2.201181943920533, "grad_norm": 0.21908432403033265, "kl": 0.110595703125, "learning_rate": 4.94094070669387e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 13128 }, { "completion_length": 244.4509048461914, "epoch": 2.2015172471604005, "grad_norm": 0.19426442618048773, "kl": 0.130706787109375, "learning_rate": 4.940914783789685e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.775000024586916, "rewards/format_reward_func": 1.0, "step": 13130 }, { "completion_length": 230.9821538925171, "epoch": 2.201852550400268, "grad_norm": 0.13175834022378757, "kl": 0.1300048828125, "learning_rate": 4.940888855265611e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428943425417, "rewards/format_reward_func": 1.0, "step": 13132 }, { "completion_length": 225.47768783569336, "epoch": 2.202187853640136, "grad_norm": 0.33402959027393114, "kl": 0.1414337158203125, "learning_rate": 4.940862921121705e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 13134 }, { "completion_length": 236.03125953674316, "epoch": 2.2025231568800034, "grad_norm": 0.1890372794346963, "kl": 0.11383056640625, "learning_rate": 4.940836981358027e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714454948902, "rewards/format_reward_func": 1.0, "step": 13136 }, { "completion_length": 230.97322463989258, "epoch": 2.202858460119871, "grad_norm": 0.1619307898339516, "kl": 0.1507568359375, "learning_rate": 4.940811035974638e-07, "loss": 0.0002, "reward": 1.7642857879400253, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 13138 }, { "completion_length": 236.21429634094238, "epoch": 2.2031937633597383, "grad_norm": 0.24074576556891386, "kl": 0.128509521484375, "learning_rate": 4.940785084971597e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000186264515, "rewards/format_reward_func": 1.0, "step": 13140 }, { "completion_length": 223.4732265472412, "epoch": 2.203529066599606, "grad_norm": 0.11759453324945487, "kl": 0.268890380859375, "learning_rate": 4.940759128348963e-07, "loss": 0.0003, "reward": 1.7928571850061417, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.8017857410013676, "rewards/format_reward_func": 0.9910714328289032, "step": 13142 }, { "completion_length": 236.6428689956665, "epoch": 2.2038643698394735, "grad_norm": 0.17489192703639114, "kl": 0.14190673828125, "learning_rate": 4.940733166106797e-07, "loss": 0.0001, "reward": 1.7428572177886963, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7428571619093418, "rewards/format_reward_func": 1.0, "step": 13144 }, { "completion_length": 222.54465198516846, "epoch": 2.204199673079341, "grad_norm": 0.2115976973391719, "kl": 0.13714599609375, "learning_rate": 4.940707198245158e-07, "loss": 0.0001, "reward": 1.7571429386734962, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 13146 }, { "completion_length": 220.4330472946167, "epoch": 2.204534976319209, "grad_norm": 0.10467149392557401, "kl": 0.148040771484375, "learning_rate": 4.940681224764107e-07, "loss": 0.0001, "reward": 1.7321429252624512, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7321428880095482, "rewards/format_reward_func": 1.0, "step": 13148 }, { "completion_length": 224.23661613464355, "epoch": 2.2048702795590764, "grad_norm": 0.218368171315283, "kl": 0.23980712890625, "learning_rate": 4.940655245663702e-07, "loss": 0.0002, "reward": 1.7500000670552254, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 13150 }, { "completion_length": 230.19197463989258, "epoch": 2.2052055827989436, "grad_norm": 0.1515065738307242, "kl": 0.230987548828125, "learning_rate": 4.940629260944004e-07, "loss": 0.0002, "reward": 1.785714328289032, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 13152 }, { "completion_length": 223.79018688201904, "epoch": 2.2055408860388113, "grad_norm": 0.35081248015729055, "kl": 0.125213623046875, "learning_rate": 4.940603270605072e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7991071678698063, "rewards/format_reward_func": 0.9955357164144516, "step": 13154 }, { "completion_length": 223.76340293884277, "epoch": 2.205876189278679, "grad_norm": 0.4487984507252899, "kl": 0.14996337890625, "learning_rate": 4.940577274646967e-07, "loss": 0.0001, "reward": 1.7428572103381157, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7428571786731482, "rewards/format_reward_func": 1.0, "step": 13156 }, { "completion_length": 229.00893878936768, "epoch": 2.2062114925185465, "grad_norm": 0.12167153735920415, "kl": 0.259735107421875, "learning_rate": 4.940551273069748e-07, "loss": 0.0003, "reward": 1.8053571805357933, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8098214510828257, "rewards/format_reward_func": 0.9955357164144516, "step": 13158 }, { "completion_length": 229.25447463989258, "epoch": 2.206546795758414, "grad_norm": 0.1730710604136419, "kl": 0.174346923828125, "learning_rate": 4.940525265873475e-07, "loss": 0.0002, "reward": 1.778571493923664, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 13160 }, { "completion_length": 225.1875114440918, "epoch": 2.206882098998282, "grad_norm": 0.007503315078522894, "kl": 0.306854248046875, "learning_rate": 4.940499253058208e-07, "loss": 0.0003, "reward": 1.757142923772335, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 13162 }, { "completion_length": 228.38393878936768, "epoch": 2.207217402238149, "grad_norm": 0.16247735776919064, "kl": 0.292724609375, "learning_rate": 4.940473234624008e-07, "loss": 0.0003, "reward": 1.742857187986374, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.7517857439815998, "rewards/format_reward_func": 0.9910714328289032, "step": 13164 }, { "completion_length": 223.4241180419922, "epoch": 2.2075527054780166, "grad_norm": 0.4386678302614182, "kl": 0.526611328125, "learning_rate": 4.940447210570932e-07, "loss": 0.0005, "reward": 1.7714286297559738, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 13166 }, { "completion_length": 229.11607933044434, "epoch": 2.2078880087178843, "grad_norm": 0.1466795277008058, "kl": 0.63433837890625, "learning_rate": 4.940421180899042e-07, "loss": 0.0006, "reward": 1.74642863124609, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7464286126196384, "rewards/format_reward_func": 1.0, "step": 13168 }, { "completion_length": 223.15626049041748, "epoch": 2.208223311957752, "grad_norm": 0.1795933383348911, "kl": 0.19903564453125, "learning_rate": 4.940395145608398e-07, "loss": 0.0002, "reward": 1.7750000655651093, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 13170 }, { "completion_length": 224.09822368621826, "epoch": 2.2085586151976195, "grad_norm": 0.15875580392748082, "kl": 0.1873779296875, "learning_rate": 4.940369104699059e-07, "loss": 0.0002, "reward": 1.7785715013742447, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 13172 }, { "completion_length": 237.92411708831787, "epoch": 2.2088939184374867, "grad_norm": 0.21953287765735693, "kl": 0.3541259765625, "learning_rate": 4.940343058171086e-07, "loss": 0.0004, "reward": 1.7696429044008255, "reward_std": 0.08333758264780045, "rewards/equation_reward_func": 0.7741071805357933, "rewards/format_reward_func": 0.9955357164144516, "step": 13174 }, { "completion_length": 226.10268878936768, "epoch": 2.2092292216773544, "grad_norm": 0.23376125371304202, "kl": 0.185394287109375, "learning_rate": 4.940317006024539e-07, "loss": 0.0002, "reward": 1.8107143267989159, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.8196428716182709, "rewards/format_reward_func": 0.9910714328289032, "step": 13176 }, { "completion_length": 241.2991189956665, "epoch": 2.209564524917222, "grad_norm": 0.14580594406211703, "kl": 0.4073486328125, "learning_rate": 4.940290948259477e-07, "loss": 0.0004, "reward": 1.7267857939004898, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7312500197440386, "rewards/format_reward_func": 0.9955357164144516, "step": 13178 }, { "completion_length": 227.40625858306885, "epoch": 2.2098998281570896, "grad_norm": 0.21299387186202437, "kl": 0.16229248046875, "learning_rate": 4.94026488487596e-07, "loss": 0.0002, "reward": 1.7767857760190964, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7812500260770321, "rewards/format_reward_func": 0.9955357164144516, "step": 13180 }, { "completion_length": 224.30804634094238, "epoch": 2.2102351313969573, "grad_norm": 0.27077338842068727, "kl": 0.13043212890625, "learning_rate": 4.940238815874049e-07, "loss": 0.0001, "reward": 1.8017857819795609, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.806250024586916, "rewards/format_reward_func": 0.9955357164144516, "step": 13182 }, { "completion_length": 230.0491180419922, "epoch": 2.210570434636825, "grad_norm": 0.17648422454315202, "kl": 0.1396484375, "learning_rate": 4.940212741253803e-07, "loss": 0.0001, "reward": 1.7660715132951736, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7794643230736256, "rewards/format_reward_func": 0.9866071492433548, "step": 13184 }, { "completion_length": 230.9285831451416, "epoch": 2.210905737876692, "grad_norm": 0.19626567358030633, "kl": 0.319732666015625, "learning_rate": 4.940186661015283e-07, "loss": 0.0003, "reward": 1.7857143357396126, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143394649029, "rewards/format_reward_func": 1.0, "step": 13186 }, { "completion_length": 233.7767972946167, "epoch": 2.2112410411165597, "grad_norm": 0.3021643043558887, "kl": 0.31884765625, "learning_rate": 4.940160575158549e-07, "loss": 0.0003, "reward": 1.76071435213089, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7696429062634706, "rewards/format_reward_func": 0.9910714328289032, "step": 13188 }, { "completion_length": 228.15179538726807, "epoch": 2.2115763443564274, "grad_norm": 0.3841834809821566, "kl": 0.54290771484375, "learning_rate": 4.94013448368366e-07, "loss": 0.0005, "reward": 1.714285783469677, "reward_std": 0.12121830508112907, "rewards/equation_reward_func": 0.7321428917348385, "rewards/format_reward_func": 0.9821428656578064, "step": 13190 }, { "completion_length": 223.93750953674316, "epoch": 2.211911647596295, "grad_norm": 0.32209311190475476, "kl": 0.2110443115234375, "learning_rate": 4.940108386590676e-07, "loss": 0.0002, "reward": 1.714285783469677, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7232143133878708, "rewards/format_reward_func": 0.9910714328289032, "step": 13192 }, { "completion_length": 235.66072463989258, "epoch": 2.2122469508361626, "grad_norm": 0.15596004633128052, "kl": 0.768280029296875, "learning_rate": 4.940082283879658e-07, "loss": 0.0008, "reward": 1.7357143461704254, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.7446428947150707, "rewards/format_reward_func": 0.9910714328289032, "step": 13194 }, { "completion_length": 226.04018783569336, "epoch": 2.21258225407603, "grad_norm": 0.21343340214385512, "kl": 0.2079620361328125, "learning_rate": 4.940056175550666e-07, "loss": 0.0002, "reward": 1.7875000461935997, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.791964303702116, "rewards/format_reward_func": 0.9955357164144516, "step": 13196 }, { "completion_length": 229.76340293884277, "epoch": 2.2129175573158975, "grad_norm": 0.26267101558592826, "kl": 0.22930908203125, "learning_rate": 4.940030061603761e-07, "loss": 0.0002, "reward": 1.8053571730852127, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8098214641213417, "rewards/format_reward_func": 0.9955357164144516, "step": 13198 }, { "completion_length": 229.1384038925171, "epoch": 2.213252860555765, "grad_norm": 0.23785842627214238, "kl": 0.151458740234375, "learning_rate": 4.940003942039002e-07, "loss": 0.0002, "reward": 1.7125000581145287, "reward_std": 0.07323605939745903, "rewards/equation_reward_func": 0.7258928902447224, "rewards/format_reward_func": 0.9866071492433548, "step": 13200 }, { "completion_length": 241.13394260406494, "epoch": 2.2135881637956327, "grad_norm": 0.2702312817671659, "kl": 0.29144287109375, "learning_rate": 4.939977816856447e-07, "loss": 0.0003, "reward": 1.7500000670552254, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7500000447034836, "rewards/format_reward_func": 1.0, "step": 13202 }, { "completion_length": 240.51340293884277, "epoch": 2.2139234670355004, "grad_norm": 0.14928824964527235, "kl": 0.140411376953125, "learning_rate": 4.939951686056161e-07, "loss": 0.0001, "reward": 1.7375000715255737, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7419643215835094, "rewards/format_reward_func": 0.9955357164144516, "step": 13204 }, { "completion_length": 234.3125114440918, "epoch": 2.214258770275368, "grad_norm": 0.15811116279839596, "kl": 0.374359130859375, "learning_rate": 4.939925549638201e-07, "loss": 0.0004, "reward": 1.8535714745521545, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8535714522004128, "rewards/format_reward_func": 1.0, "step": 13206 }, { "completion_length": 249.30804824829102, "epoch": 2.214594073515235, "grad_norm": 0.3955689485987965, "kl": 0.528076171875, "learning_rate": 4.939899407602627e-07, "loss": 0.0005, "reward": 1.778571456670761, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7875000406056643, "rewards/format_reward_func": 0.9910714328289032, "step": 13208 }, { "completion_length": 241.12501335144043, "epoch": 2.214929376755103, "grad_norm": 0.4672937520059191, "kl": 0.34112548828125, "learning_rate": 4.939873259949499e-07, "loss": 0.0003, "reward": 1.8035714700818062, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.8125000149011612, "rewards/format_reward_func": 0.9910714328289032, "step": 13210 }, { "completion_length": 249.0446538925171, "epoch": 2.2152646799949705, "grad_norm": 0.09535743739495414, "kl": 0.265960693359375, "learning_rate": 4.939847106678881e-07, "loss": 0.0003, "reward": 1.7321429252624512, "reward_std": 0.05555839091539383, "rewards/equation_reward_func": 0.741071468219161, "rewards/format_reward_func": 0.9910714328289032, "step": 13212 }, { "completion_length": 250.58037090301514, "epoch": 2.215599983234838, "grad_norm": 0.11639680227151614, "kl": 0.145843505859375, "learning_rate": 4.939820947790828e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714656114578, "rewards/format_reward_func": 1.0, "step": 13214 }, { "completion_length": 236.36608219146729, "epoch": 2.2159352864747057, "grad_norm": 0.22684060354639632, "kl": 0.2822265625, "learning_rate": 4.939794783285403e-07, "loss": 0.0003, "reward": 1.8410714864730835, "reward_std": 0.0833375845104456, "rewards/equation_reward_func": 0.8544643111526966, "rewards/format_reward_func": 0.9866071492433548, "step": 13216 }, { "completion_length": 241.20536518096924, "epoch": 2.216270589714573, "grad_norm": 0.06232753458071066, "kl": 0.929351806640625, "learning_rate": 4.939768613162666e-07, "loss": 0.0009, "reward": 1.7696429044008255, "reward_std": 0.0328299580141902, "rewards/equation_reward_func": 0.7830357365310192, "rewards/format_reward_func": 0.9866071492433548, "step": 13218 }, { "completion_length": 225.6250114440918, "epoch": 2.2166058929544405, "grad_norm": 0.17169788855031926, "kl": 0.16473388671875, "learning_rate": 4.939742437422677e-07, "loss": 0.0002, "reward": 1.8500000312924385, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.858928594738245, "rewards/format_reward_func": 0.9910714328289032, "step": 13220 }, { "completion_length": 231.65179634094238, "epoch": 2.216941196194308, "grad_norm": 0.22926052211592493, "kl": 0.64813232421875, "learning_rate": 4.939716256065496e-07, "loss": 0.0006, "reward": 1.7928571999073029, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.8017857559025288, "rewards/format_reward_func": 0.9910714328289032, "step": 13222 }, { "completion_length": 231.18304824829102, "epoch": 2.217276499434176, "grad_norm": 0.22491708701033208, "kl": 0.4256591796875, "learning_rate": 4.939690069091185e-07, "loss": 0.0004, "reward": 1.696428656578064, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.7053571753203869, "rewards/format_reward_func": 0.9910714328289032, "step": 13224 }, { "completion_length": 225.24554443359375, "epoch": 2.2176118026740435, "grad_norm": 0.1770503668273261, "kl": 0.747161865234375, "learning_rate": 4.939663876499801e-07, "loss": 0.0007, "reward": 1.8142857775092125, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857328057289, "rewards/format_reward_func": 1.0, "step": 13226 }, { "completion_length": 240.5044755935669, "epoch": 2.217947105913911, "grad_norm": 0.1134760793769743, "kl": 1.253082275390625, "learning_rate": 4.939637678291408e-07, "loss": 0.0013, "reward": 1.7678572088479996, "reward_std": 0.03535533882677555, "rewards/equation_reward_func": 0.776785746216774, "rewards/format_reward_func": 0.9910714328289032, "step": 13228 }, { "completion_length": 223.47322463989258, "epoch": 2.2182824091537783, "grad_norm": 0.32150448400287074, "kl": 0.858489990234375, "learning_rate": 4.939611474466063e-07, "loss": 0.0009, "reward": 1.744642935693264, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7491071671247482, "rewards/format_reward_func": 0.9955357164144516, "step": 13230 }, { "completion_length": 223.62947463989258, "epoch": 2.218617712393646, "grad_norm": 0.2605905038870512, "kl": 0.36175537109375, "learning_rate": 4.939585265023828e-07, "loss": 0.0004, "reward": 1.7660714909434319, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7705357521772385, "rewards/format_reward_func": 0.9955357164144516, "step": 13232 }, { "completion_length": 239.5937614440918, "epoch": 2.2189530156335135, "grad_norm": 0.2594317754827288, "kl": 0.962310791015625, "learning_rate": 4.939559049964764e-07, "loss": 0.001, "reward": 1.7285715118050575, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7375000398606062, "rewards/format_reward_func": 0.9910714328289032, "step": 13234 }, { "completion_length": 238.602689743042, "epoch": 2.219288318873381, "grad_norm": 0.2423681610989229, "kl": 0.459014892578125, "learning_rate": 4.93953282928893e-07, "loss": 0.0005, "reward": 1.7553572058677673, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214633762836, "rewards/format_reward_func": 0.9955357164144516, "step": 13236 }, { "completion_length": 246.2857255935669, "epoch": 2.219623622113249, "grad_norm": 0.18164710354197572, "kl": 0.57537841796875, "learning_rate": 4.939506602996388e-07, "loss": 0.0006, "reward": 1.7642857655882835, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7642857618629932, "rewards/format_reward_func": 1.0, "step": 13238 }, { "completion_length": 231.62054347991943, "epoch": 2.219958925353116, "grad_norm": 0.1970273094202194, "kl": 0.5147247314453125, "learning_rate": 4.939480371087196e-07, "loss": 0.0005, "reward": 1.8089286163449287, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.813392885029316, "rewards/format_reward_func": 0.9955357164144516, "step": 13240 }, { "completion_length": 246.5625114440918, "epoch": 2.2202942285929836, "grad_norm": 0.2030433424392116, "kl": 0.534576416015625, "learning_rate": 4.939454133561415e-07, "loss": 0.0005, "reward": 1.7196429148316383, "reward_std": 0.07323605939745903, "rewards/equation_reward_func": 0.733035746961832, "rewards/format_reward_func": 0.9866071492433548, "step": 13242 }, { "completion_length": 242.4151906967163, "epoch": 2.2206295318328513, "grad_norm": 0.1793058229309209, "kl": 0.400634765625, "learning_rate": 4.939427890419108e-07, "loss": 0.0004, "reward": 1.7482143715023994, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526785954833031, "rewards/format_reward_func": 0.9955357164144516, "step": 13244 }, { "completion_length": 246.6116189956665, "epoch": 2.220964835072719, "grad_norm": 0.18217815077685284, "kl": 0.204193115234375, "learning_rate": 4.939401641660332e-07, "loss": 0.0002, "reward": 1.7535715103149414, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714656114578, "rewards/format_reward_func": 1.0, "step": 13246 }, { "completion_length": 236.48661613464355, "epoch": 2.2213001383125865, "grad_norm": 0.703389472035067, "kl": 0.359527587890625, "learning_rate": 4.93937538728515e-07, "loss": 0.0004, "reward": 1.750000074505806, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7589285895228386, "rewards/format_reward_func": 0.9910714328289032, "step": 13248 }, { "completion_length": 230.27233123779297, "epoch": 2.221635441552454, "grad_norm": 0.15593416735907303, "kl": 0.16259765625, "learning_rate": 4.939349127293621e-07, "loss": 0.0002, "reward": 1.7821429073810577, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 13250 }, { "completion_length": 247.27679920196533, "epoch": 2.2219707447923214, "grad_norm": 0.31791197842722513, "kl": 0.48486328125, "learning_rate": 4.939322861685806e-07, "loss": 0.0005, "reward": 1.7321429252624512, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7410714700818062, "rewards/format_reward_func": 0.9910714328289032, "step": 13252 }, { "completion_length": 224.00447273254395, "epoch": 2.222306048032189, "grad_norm": 0.12457813021055096, "kl": 0.187591552734375, "learning_rate": 4.939296590461765e-07, "loss": 0.0002, "reward": 1.79464291036129, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7991071529686451, "rewards/format_reward_func": 0.9955357164144516, "step": 13254 }, { "completion_length": 233.6607265472412, "epoch": 2.2226413512720566, "grad_norm": 0.16283221198189626, "kl": 0.1414794921875, "learning_rate": 4.939270313621559e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857506871223, "rewards/format_reward_func": 1.0, "step": 13256 }, { "completion_length": 237.87500667572021, "epoch": 2.2229766545119243, "grad_norm": 0.26208369447346597, "kl": 0.2811279296875, "learning_rate": 4.939244031165248e-07, "loss": 0.0003, "reward": 1.7857143580913544, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 13258 }, { "completion_length": 237.2009048461914, "epoch": 2.223311957751792, "grad_norm": 0.14783752676243633, "kl": 0.203338623046875, "learning_rate": 4.939217743092894e-07, "loss": 0.0002, "reward": 1.7250000685453415, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7250000350177288, "rewards/format_reward_func": 1.0, "step": 13260 }, { "completion_length": 233.46876335144043, "epoch": 2.223647260991659, "grad_norm": 0.5945979191598133, "kl": 0.155548095703125, "learning_rate": 4.939191449404555e-07, "loss": 0.0002, "reward": 1.8178571984171867, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8178571686148643, "rewards/format_reward_func": 1.0, "step": 13262 }, { "completion_length": 235.95090579986572, "epoch": 2.2239825642315267, "grad_norm": 0.08776234428861629, "kl": 0.2462158203125, "learning_rate": 4.939165150100294e-07, "loss": 0.0002, "reward": 1.707142949104309, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7071428950875998, "rewards/format_reward_func": 1.0, "step": 13264 }, { "completion_length": 233.6250114440918, "epoch": 2.2243178674713944, "grad_norm": 0.12217827536623904, "kl": 0.211669921875, "learning_rate": 4.93913884518017e-07, "loss": 0.0002, "reward": 1.7910714820027351, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7955357544124126, "rewards/format_reward_func": 0.9955357164144516, "step": 13266 }, { "completion_length": 229.37947463989258, "epoch": 2.224653170711262, "grad_norm": 0.19298089285952225, "kl": 0.270233154296875, "learning_rate": 4.939112534644245e-07, "loss": 0.0003, "reward": 1.7517857924103737, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7562500201165676, "rewards/format_reward_func": 0.9955357164144516, "step": 13268 }, { "completion_length": 229.87054538726807, "epoch": 2.2249884739511296, "grad_norm": 0.14434383594107386, "kl": 0.153961181640625, "learning_rate": 4.939086218492577e-07, "loss": 0.0002, "reward": 1.7107143551111221, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7107143308967352, "rewards/format_reward_func": 1.0, "step": 13270 }, { "completion_length": 230.58036708831787, "epoch": 2.2253237771909973, "grad_norm": 0.16133111860810595, "kl": 0.159027099609375, "learning_rate": 4.939059896725228e-07, "loss": 0.0002, "reward": 1.8000000640749931, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 13272 }, { "completion_length": 229.78572368621826, "epoch": 2.2256590804308645, "grad_norm": 0.2167370871722688, "kl": 0.201263427734375, "learning_rate": 4.939033569342259e-07, "loss": 0.0002, "reward": 1.7714286223053932, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714286185801029, "rewards/format_reward_func": 1.0, "step": 13274 }, { "completion_length": 229.45090293884277, "epoch": 2.225994383670732, "grad_norm": 0.2795147417582892, "kl": 0.13800048828125, "learning_rate": 4.939007236343732e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 1.0, "step": 13276 }, { "completion_length": 229.4419755935669, "epoch": 2.2263296869105997, "grad_norm": 0.26344600167128623, "kl": 0.23748779296875, "learning_rate": 4.938980897729704e-07, "loss": 0.0002, "reward": 1.775000050663948, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 13278 }, { "completion_length": 234.10715293884277, "epoch": 2.2266649901504674, "grad_norm": 0.09384786079257194, "kl": 0.217926025390625, "learning_rate": 4.938954553500238e-07, "loss": 0.0002, "reward": 1.7678571864962578, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571753203869, "rewards/format_reward_func": 1.0, "step": 13280 }, { "completion_length": 221.8928680419922, "epoch": 2.227000293390335, "grad_norm": 0.009029269373749394, "kl": 0.191070556640625, "learning_rate": 4.938928203655396e-07, "loss": 0.0002, "reward": 1.846428595483303, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8464285880327225, "rewards/format_reward_func": 1.0, "step": 13282 }, { "completion_length": 229.92858219146729, "epoch": 2.2273355966302026, "grad_norm": 0.10794024296635173, "kl": 0.130828857421875, "learning_rate": 4.938901848195236e-07, "loss": 0.0001, "reward": 1.7767857611179352, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7803571745753288, "rewards/format_reward_func": 0.9964285716414452, "step": 13284 }, { "completion_length": 232.4821538925171, "epoch": 2.22767089987007, "grad_norm": 0.10866707186198227, "kl": 0.20367431640625, "learning_rate": 4.938875487119819e-07, "loss": 0.0002, "reward": 1.7642857804894447, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7642857581377029, "rewards/format_reward_func": 1.0, "step": 13286 }, { "completion_length": 235.95983219146729, "epoch": 2.2280062031099375, "grad_norm": 0.06240347822895245, "kl": 0.181243896484375, "learning_rate": 4.938849120429207e-07, "loss": 0.0002, "reward": 1.7642857655882835, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7642857488244772, "rewards/format_reward_func": 1.0, "step": 13288 }, { "completion_length": 228.5937623977661, "epoch": 2.228341506349805, "grad_norm": 0.1655080256398611, "kl": 0.1346435546875, "learning_rate": 4.93882274812346e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 13290 }, { "completion_length": 234.00000858306885, "epoch": 2.2286768095896727, "grad_norm": 0.1567514946119727, "kl": 0.19720458984375, "learning_rate": 4.938796370202639e-07, "loss": 0.0002, "reward": 1.821428619325161, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8214285783469677, "rewards/format_reward_func": 1.0, "step": 13292 }, { "completion_length": 227.3928680419922, "epoch": 2.2290121128295404, "grad_norm": 0.1766307361388533, "kl": 0.14697265625, "learning_rate": 4.938769986666804e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7500000465661287, "rewards/format_reward_func": 1.0, "step": 13294 }, { "completion_length": 238.21429920196533, "epoch": 2.229347416069408, "grad_norm": 0.09715960629355828, "kl": 0.177581787109375, "learning_rate": 4.938743597516017e-07, "loss": 0.0002, "reward": 1.7142857983708382, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7142857536673546, "rewards/format_reward_func": 1.0, "step": 13296 }, { "completion_length": 232.15179634094238, "epoch": 2.229682719309275, "grad_norm": 0.4376923634979283, "kl": 0.1544189453125, "learning_rate": 4.938717202750338e-07, "loss": 0.0002, "reward": 1.7964286357164383, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7964286096394062, "rewards/format_reward_func": 1.0, "step": 13298 }, { "completion_length": 239.44644165039062, "epoch": 2.230018022549143, "grad_norm": 0.14513764982478072, "kl": 0.25518798828125, "learning_rate": 4.938690802369827e-07, "loss": 0.0003, "reward": 1.750000074505806, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 13300 }, { "completion_length": 235.77679634094238, "epoch": 2.2303533257890105, "grad_norm": 0.22726923725353268, "kl": 0.1492919921875, "learning_rate": 4.938664396374545e-07, "loss": 0.0001, "reward": 1.8000000640749931, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8000000230967999, "rewards/format_reward_func": 1.0, "step": 13302 }, { "completion_length": 231.8794755935669, "epoch": 2.230688629028878, "grad_norm": 0.24320216391633032, "kl": 0.149627685546875, "learning_rate": 4.938637984764555e-07, "loss": 0.0001, "reward": 1.8071429058909416, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8071428723633289, "rewards/format_reward_func": 1.0, "step": 13304 }, { "completion_length": 245.3571538925171, "epoch": 2.2310239322687457, "grad_norm": 0.22138009336756811, "kl": 0.219757080078125, "learning_rate": 4.938611567539915e-07, "loss": 0.0002, "reward": 1.814285770058632, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857328057289, "rewards/format_reward_func": 1.0, "step": 13306 }, { "completion_length": 243.50001049041748, "epoch": 2.231359235508613, "grad_norm": 0.37270995546541225, "kl": 0.167022705078125, "learning_rate": 4.938585144700688e-07, "loss": 0.0002, "reward": 1.7767857909202576, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7812500223517418, "rewards/format_reward_func": 0.9955357164144516, "step": 13308 }, { "completion_length": 237.3125123977661, "epoch": 2.2316945387484806, "grad_norm": 0.3370206438808287, "kl": 0.197845458984375, "learning_rate": 4.938558716246933e-07, "loss": 0.0002, "reward": 1.8125000447034836, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8169643133878708, "rewards/format_reward_func": 0.9955357164144516, "step": 13310 }, { "completion_length": 238.8080472946167, "epoch": 2.232029841988348, "grad_norm": 0.10725523459583197, "kl": 0.18255615234375, "learning_rate": 4.938532282178713e-07, "loss": 0.0002, "reward": 1.7428572103381157, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571693599224, "rewards/format_reward_func": 1.0, "step": 13312 }, { "completion_length": 249.25447368621826, "epoch": 2.232365145228216, "grad_norm": 0.05706609548541911, "kl": 0.271392822265625, "learning_rate": 4.938505842496086e-07, "loss": 0.0003, "reward": 1.769642911851406, "reward_std": 0.012626906856894493, "rewards/equation_reward_func": 0.7741071805357933, "rewards/format_reward_func": 0.9955357164144516, "step": 13314 }, { "completion_length": 241.30358409881592, "epoch": 2.2327004484680835, "grad_norm": 0.297844963742036, "kl": 0.2213134765625, "learning_rate": 4.938479397199115e-07, "loss": 0.0002, "reward": 1.7750000804662704, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7750000394880772, "rewards/format_reward_func": 1.0, "step": 13316 }, { "completion_length": 242.61608505249023, "epoch": 2.233035751707951, "grad_norm": 0.06049902245495544, "kl": 0.239166259765625, "learning_rate": 4.93845294628786e-07, "loss": 0.0002, "reward": 1.7660714909434319, "reward_std": 0.017677669413387775, "rewards/equation_reward_func": 0.7705357484519482, "rewards/format_reward_func": 0.9955357164144516, "step": 13318 }, { "completion_length": 253.62500858306885, "epoch": 2.2333710549478183, "grad_norm": 0.20782510215756428, "kl": 0.47796630859375, "learning_rate": 4.938426489762382e-07, "loss": 0.0005, "reward": 1.8107143715023994, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8107143118977547, "rewards/format_reward_func": 1.0, "step": 13320 }, { "completion_length": 241.4062623977661, "epoch": 2.233706358187686, "grad_norm": 0.15660245756764526, "kl": 0.2474365234375, "learning_rate": 4.938400027622744e-07, "loss": 0.0002, "reward": 1.800000049173832, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8000000342726707, "rewards/format_reward_func": 1.0, "step": 13322 }, { "completion_length": 239.20536994934082, "epoch": 2.2340416614275536, "grad_norm": 0.1377272999131664, "kl": 0.541748046875, "learning_rate": 4.938373559869003e-07, "loss": 0.0005, "reward": 1.7446429282426834, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7491071745753288, "rewards/format_reward_func": 0.9955357164144516, "step": 13324 }, { "completion_length": 234.62054538726807, "epoch": 2.234376964667421, "grad_norm": 0.4585849996854235, "kl": 0.53997802734375, "learning_rate": 4.938347086501223e-07, "loss": 0.0005, "reward": 1.755357213318348, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214522004128, "rewards/format_reward_func": 0.9955357164144516, "step": 13326 }, { "completion_length": 233.2634048461914, "epoch": 2.234712267907289, "grad_norm": 0.15034751021902695, "kl": 0.90093994140625, "learning_rate": 4.938320607519464e-07, "loss": 0.0009, "reward": 1.7410714998841286, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7455357369035482, "rewards/format_reward_func": 0.9955357164144516, "step": 13328 }, { "completion_length": 249.1116180419922, "epoch": 2.235047571147156, "grad_norm": 0.18235629112200313, "kl": 0.187347412109375, "learning_rate": 4.938294122923785e-07, "loss": 0.0002, "reward": 1.7803572118282318, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7848214618861675, "rewards/format_reward_func": 0.9955357164144516, "step": 13330 }, { "completion_length": 238.00893878936768, "epoch": 2.2353828743870237, "grad_norm": 0.2001295789142171, "kl": 1.100799560546875, "learning_rate": 4.938267632714252e-07, "loss": 0.0011, "reward": 1.7821429148316383, "reward_std": 0.03535533882677555, "rewards/equation_reward_func": 0.7910714522004128, "rewards/format_reward_func": 0.9910714328289032, "step": 13332 }, { "completion_length": 231.4910831451416, "epoch": 2.2357181776268913, "grad_norm": 0.7877813828478731, "kl": 0.62371826171875, "learning_rate": 4.938241136890921e-07, "loss": 0.0006, "reward": 1.753571517765522, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7625000178813934, "rewards/format_reward_func": 0.9910714328289032, "step": 13334 }, { "completion_length": 235.11161708831787, "epoch": 2.236053480866759, "grad_norm": 0.15697719157481013, "kl": 0.374969482421875, "learning_rate": 4.938214635453854e-07, "loss": 0.0004, "reward": 1.7464286535978317, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 13336 }, { "completion_length": 239.9241180419922, "epoch": 2.2363887841066266, "grad_norm": 0.45109904732269696, "kl": 0.39056396484375, "learning_rate": 4.938188128403114e-07, "loss": 0.0004, "reward": 1.7017858028411865, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7062500193715096, "rewards/format_reward_func": 0.9955357164144516, "step": 13338 }, { "completion_length": 239.34376049041748, "epoch": 2.236724087346494, "grad_norm": 0.21322803207128538, "kl": 1.0438232421875, "learning_rate": 4.938161615738762e-07, "loss": 0.001, "reward": 1.7178571820259094, "reward_std": 0.04545686487108469, "rewards/equation_reward_func": 0.7267857324331999, "rewards/format_reward_func": 0.9910714328289032, "step": 13340 }, { "completion_length": 240.7142972946167, "epoch": 2.2370593905863614, "grad_norm": 0.1442597372037303, "kl": 0.367919921875, "learning_rate": 4.938135097460856e-07, "loss": 0.0004, "reward": 1.7482143491506577, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526786103844643, "rewards/format_reward_func": 0.9955357164144516, "step": 13342 }, { "completion_length": 239.2053689956665, "epoch": 2.237394693826229, "grad_norm": 0.26108253632311346, "kl": 0.6842041015625, "learning_rate": 4.93810857356946e-07, "loss": 0.0007, "reward": 1.719642959535122, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7241071835160255, "rewards/format_reward_func": 0.9955357164144516, "step": 13344 }, { "completion_length": 240.7366180419922, "epoch": 2.2377299970660967, "grad_norm": 0.19328804186926843, "kl": 0.4044189453125, "learning_rate": 4.938082044064634e-07, "loss": 0.0004, "reward": 1.7392857745289803, "reward_std": 0.03535533882677555, "rewards/equation_reward_func": 0.7482143193483353, "rewards/format_reward_func": 0.9910714328289032, "step": 13346 }, { "completion_length": 236.4285831451416, "epoch": 2.2380653003059643, "grad_norm": 0.10448958503500268, "kl": 0.347503662109375, "learning_rate": 4.938055508946439e-07, "loss": 0.0003, "reward": 1.7053572088479996, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7098214663565159, "rewards/format_reward_func": 0.9955357164144516, "step": 13348 }, { "completion_length": 226.47322463989258, "epoch": 2.238400603545832, "grad_norm": 0.3026737005415429, "kl": 0.236053466796875, "learning_rate": 4.938028968214937e-07, "loss": 0.0002, "reward": 1.7410715147852898, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7455357499420643, "rewards/format_reward_func": 0.9955357164144516, "step": 13350 }, { "completion_length": 227.45090293884277, "epoch": 2.238735906785699, "grad_norm": 0.21333624860225478, "kl": 0.204345703125, "learning_rate": 4.938002421870187e-07, "loss": 0.0002, "reward": 1.741071492433548, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7455357499420643, "rewards/format_reward_func": 0.9955357164144516, "step": 13352 }, { "completion_length": 242.4241189956665, "epoch": 2.2390712100255667, "grad_norm": 0.11296037504989383, "kl": 0.187530517578125, "learning_rate": 4.937975869912252e-07, "loss": 0.0002, "reward": 1.7718750685453415, "reward_std": 0.035986683797091246, "rewards/equation_reward_func": 0.7776785846799612, "rewards/format_reward_func": 0.9941964335739613, "step": 13354 }, { "completion_length": 238.2500114440918, "epoch": 2.2394065132654344, "grad_norm": 0.4362049106828727, "kl": 0.346435546875, "learning_rate": 4.937949312341193e-07, "loss": 0.0003, "reward": 1.7183036357164383, "reward_std": 0.07513009523972869, "rewards/equation_reward_func": 0.733035746961832, "rewards/format_reward_func": 0.9852678664028645, "step": 13356 }, { "completion_length": 229.33929538726807, "epoch": 2.239741816505302, "grad_norm": 0.18681869868795595, "kl": 0.319091796875, "learning_rate": 4.937922749157071e-07, "loss": 0.0003, "reward": 1.750446505844593, "reward_std": 0.05997780757024884, "rewards/equation_reward_func": 0.7562500238418579, "rewards/format_reward_func": 0.9941964335739613, "step": 13358 }, { "completion_length": 230.15179634094238, "epoch": 2.2400771197451697, "grad_norm": 0.565797210429418, "kl": 0.379913330078125, "learning_rate": 4.937896180359946e-07, "loss": 0.0004, "reward": 1.7678572162985802, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7678571604192257, "rewards/format_reward_func": 1.0, "step": 13360 }, { "completion_length": 229.35715293884277, "epoch": 2.2404124229850373, "grad_norm": 0.25123464149445224, "kl": 0.290374755859375, "learning_rate": 4.937869605949881e-07, "loss": 0.0003, "reward": 1.7910714745521545, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7955357357859612, "rewards/format_reward_func": 0.9955357164144516, "step": 13362 }, { "completion_length": 233.9732255935669, "epoch": 2.2407477262249045, "grad_norm": 0.936629094464273, "kl": 0.34619140625, "learning_rate": 4.937843025926936e-07, "loss": 0.0003, "reward": 1.7446429282426834, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7491071578115225, "rewards/format_reward_func": 0.9955357164144516, "step": 13364 }, { "completion_length": 237.1160831451416, "epoch": 2.241083029464772, "grad_norm": 0.1248168838109331, "kl": 0.30145263671875, "learning_rate": 4.937816440291172e-07, "loss": 0.0003, "reward": 1.7750000730156898, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 13366 }, { "completion_length": 224.27233123779297, "epoch": 2.2414183327046397, "grad_norm": 0.07464853245578626, "kl": 0.222442626953125, "learning_rate": 4.937789849042651e-07, "loss": 0.0002, "reward": 1.7803572043776512, "reward_std": 0.017677669413387775, "rewards/equation_reward_func": 0.7848214581608772, "rewards/format_reward_func": 0.9955357164144516, "step": 13368 }, { "completion_length": 218.9910831451416, "epoch": 2.2417536359445074, "grad_norm": 0.3339117384872975, "kl": 0.615142822265625, "learning_rate": 4.937763252181434e-07, "loss": 0.0006, "reward": 1.7892857789993286, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7982143089175224, "rewards/format_reward_func": 0.9910714328289032, "step": 13370 }, { "completion_length": 221.57143878936768, "epoch": 2.242088939184375, "grad_norm": 0.23947230221231972, "kl": 0.119140625, "learning_rate": 4.937736649707582e-07, "loss": 0.0001, "reward": 1.853571467101574, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.8625000193715096, "rewards/format_reward_func": 0.9910714328289032, "step": 13372 }, { "completion_length": 231.50447273254395, "epoch": 2.242424242424242, "grad_norm": 0.19491513244225595, "kl": 0.304962158203125, "learning_rate": 4.937710041621156e-07, "loss": 0.0003, "reward": 1.7964286133646965, "reward_std": 0.07576143927872181, "rewards/equation_reward_func": 0.805357176810503, "rewards/format_reward_func": 0.9910714328289032, "step": 13374 }, { "completion_length": 217.4017972946167, "epoch": 2.24275954566411, "grad_norm": 0.2010389179975474, "kl": 0.448394775390625, "learning_rate": 4.937683427922218e-07, "loss": 0.0004, "reward": 1.775000050663948, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000152736902, "rewards/format_reward_func": 1.0, "step": 13376 }, { "completion_length": 228.98661613464355, "epoch": 2.2430948489039775, "grad_norm": 0.34878759281384486, "kl": 0.397796630859375, "learning_rate": 4.93765680861083e-07, "loss": 0.0004, "reward": 1.7767857760190964, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.781250037252903, "rewards/format_reward_func": 0.9955357164144516, "step": 13378 }, { "completion_length": 231.84822463989258, "epoch": 2.243430152143845, "grad_norm": 0.40787454075194673, "kl": 0.804473876953125, "learning_rate": 4.93763018368705e-07, "loss": 0.0008, "reward": 1.7982143089175224, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.8116071708500385, "rewards/format_reward_func": 0.9866071492433548, "step": 13380 }, { "completion_length": 228.08929443359375, "epoch": 2.2437654553837127, "grad_norm": 0.3151023477866274, "kl": 0.313934326171875, "learning_rate": 4.937603553150944e-07, "loss": 0.0003, "reward": 1.7803571969270706, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7848214600235224, "rewards/format_reward_func": 0.9955357164144516, "step": 13382 }, { "completion_length": 237.78572368621826, "epoch": 2.2441007586235804, "grad_norm": 0.13640015825158425, "kl": 0.5728759765625, "learning_rate": 4.937576917002569e-07, "loss": 0.0006, "reward": 1.662500061094761, "reward_std": 0.06313453428447247, "rewards/equation_reward_func": 0.6758929062634706, "rewards/format_reward_func": 0.9866071492433548, "step": 13384 }, { "completion_length": 236.40179443359375, "epoch": 2.2444360618634476, "grad_norm": 0.17800346490208352, "kl": 0.17962646484375, "learning_rate": 4.937550275241989e-07, "loss": 0.0002, "reward": 1.7941965013742447, "reward_std": 0.05871511623263359, "rewards/equation_reward_func": 0.8017857484519482, "rewards/format_reward_func": 0.9924107193946838, "step": 13386 }, { "completion_length": 232.49108409881592, "epoch": 2.244771365103315, "grad_norm": 0.6126242940664247, "kl": 0.170166015625, "learning_rate": 4.937523627869264e-07, "loss": 0.0002, "reward": 1.7683036029338837, "reward_std": 0.054927044780924916, "rewards/equation_reward_func": 0.7830357439815998, "rewards/format_reward_func": 0.9852678664028645, "step": 13388 }, { "completion_length": 246.34822463989258, "epoch": 2.245106668343183, "grad_norm": 0.32475932657472756, "kl": 0.211639404296875, "learning_rate": 4.937496974884457e-07, "loss": 0.0002, "reward": 1.725000075995922, "reward_std": 0.11616754159331322, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 0.9821428656578064, "step": 13390 }, { "completion_length": 239.3794765472412, "epoch": 2.2454419715830505, "grad_norm": 0.5876372394781416, "kl": 0.22247314453125, "learning_rate": 4.937470316287627e-07, "loss": 0.0002, "reward": 1.7875000461935997, "reward_std": 0.06818529404699802, "rewards/equation_reward_func": 0.8008928894996643, "rewards/format_reward_func": 0.9866071492433548, "step": 13392 }, { "completion_length": 236.8080472946167, "epoch": 2.245777274822918, "grad_norm": 0.36167189638405006, "kl": 0.2952880859375, "learning_rate": 4.937443652078836e-07, "loss": 0.0003, "reward": 1.709375075995922, "reward_std": 0.04735090141184628, "rewards/equation_reward_func": 0.7196428962051868, "rewards/format_reward_func": 0.9897321499884129, "step": 13394 }, { "completion_length": 232.4241180419922, "epoch": 2.2461125780627853, "grad_norm": 0.2621123567593843, "kl": 0.179229736328125, "learning_rate": 4.937416982258146e-07, "loss": 0.0002, "reward": 1.7910714745521545, "reward_std": 0.08333758357912302, "rewards/equation_reward_func": 0.8044643141329288, "rewards/format_reward_func": 0.9866071492433548, "step": 13396 }, { "completion_length": 227.4196538925171, "epoch": 2.246447881302653, "grad_norm": 0.15406421507840143, "kl": 0.264862060546875, "learning_rate": 4.93739030682562e-07, "loss": 0.0003, "reward": 1.8053571805357933, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.8187500312924385, "rewards/format_reward_func": 0.9866071492433548, "step": 13398 }, { "completion_length": 221.07590293884277, "epoch": 2.2467831845425206, "grad_norm": 0.19532516413637616, "kl": 0.7318115234375, "learning_rate": 4.937363625781317e-07, "loss": 0.0007, "reward": 1.7982143238186836, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.802678607404232, "rewards/format_reward_func": 0.9955357164144516, "step": 13400 }, { "completion_length": 219.62054634094238, "epoch": 2.247118487782388, "grad_norm": 0.1643623346466658, "kl": 0.149444580078125, "learning_rate": 4.937336939125299e-07, "loss": 0.0001, "reward": 1.7571429088711739, "reward_std": 0.030304577201604843, "rewards/equation_reward_func": 0.7660714592784643, "rewards/format_reward_func": 0.9910714328289032, "step": 13402 }, { "completion_length": 224.09822368621826, "epoch": 2.247453791022256, "grad_norm": 0.1959789257161625, "kl": 0.298797607421875, "learning_rate": 4.937310246857628e-07, "loss": 0.0003, "reward": 1.7857143431901932, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 13404 }, { "completion_length": 221.84375953674316, "epoch": 2.2477890942621235, "grad_norm": 0.2076420808661432, "kl": 0.147674560546875, "learning_rate": 4.937283548978365e-07, "loss": 0.0001, "reward": 1.755357213318348, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7598214671015739, "rewards/format_reward_func": 0.9955357164144516, "step": 13406 }, { "completion_length": 223.008939743042, "epoch": 2.2481243975019907, "grad_norm": 0.14167261600573977, "kl": 0.41143798828125, "learning_rate": 4.937256845487572e-07, "loss": 0.0004, "reward": 1.7839286178350449, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7875000350177288, "rewards/format_reward_func": 0.9964285716414452, "step": 13408 }, { "completion_length": 221.56250858306885, "epoch": 2.2484597007418583, "grad_norm": 0.2508994403319969, "kl": 0.654296875, "learning_rate": 4.93723013638531e-07, "loss": 0.0007, "reward": 1.725000075995922, "reward_std": 0.03535533882677555, "rewards/equation_reward_func": 0.7339285966008902, "rewards/format_reward_func": 0.9910714328289032, "step": 13410 }, { "completion_length": 230.81251049041748, "epoch": 2.248795003981726, "grad_norm": 0.14106200207665437, "kl": 0.626007080078125, "learning_rate": 4.93720342167164e-07, "loss": 0.0006, "reward": 1.778571479022503, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 13412 }, { "completion_length": 218.97322463989258, "epoch": 2.2491303072215936, "grad_norm": 0.36174404106793095, "kl": 0.28924560546875, "learning_rate": 4.937176701346623e-07, "loss": 0.0003, "reward": 1.778571493923664, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 13414 }, { "completion_length": 226.5491189956665, "epoch": 2.249465610461461, "grad_norm": 0.23314871303389753, "kl": 0.13134765625, "learning_rate": 4.937149975410324e-07, "loss": 0.0001, "reward": 1.7053572461009026, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7098214589059353, "rewards/format_reward_func": 0.9955357164144516, "step": 13416 }, { "completion_length": 213.50000953674316, "epoch": 2.249800913701329, "grad_norm": 0.27470988123417955, "kl": 0.274169921875, "learning_rate": 4.937123243862801e-07, "loss": 0.0003, "reward": 1.796428620815277, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 13418 }, { "completion_length": 219.67858028411865, "epoch": 2.250136216941196, "grad_norm": 0.3176363415668159, "kl": 0.127593994140625, "learning_rate": 4.937096506704116e-07, "loss": 0.0001, "reward": 1.8089286163449287, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.8133928887546062, "rewards/format_reward_func": 0.9955357164144516, "step": 13420 }, { "completion_length": 221.87501049041748, "epoch": 2.2504715201810637, "grad_norm": 0.224929363983825, "kl": 0.194793701171875, "learning_rate": 4.937069763934333e-07, "loss": 0.0002, "reward": 1.759375050663948, "reward_std": 0.027147849323228, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 0.9986607171595097, "step": 13422 }, { "completion_length": 225.65179538726807, "epoch": 2.2508068234209313, "grad_norm": 0.018372357162259132, "kl": 0.30120849609375, "learning_rate": 4.937043015553511e-07, "loss": 0.0003, "reward": 1.7892857566475868, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 13424 }, { "completion_length": 226.90626049041748, "epoch": 2.251142126660799, "grad_norm": 0.23131515399617453, "kl": 0.152099609375, "learning_rate": 4.937016261561712e-07, "loss": 0.0002, "reward": 1.8017857670783997, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8062500264495611, "rewards/format_reward_func": 0.9955357164144516, "step": 13426 }, { "completion_length": 233.4910831451416, "epoch": 2.2514774299006666, "grad_norm": 0.2435184314392744, "kl": 0.39764404296875, "learning_rate": 4.936989501958997e-07, "loss": 0.0004, "reward": 1.7392857670783997, "reward_std": 0.0656599160283804, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 0.9821428656578064, "step": 13428 }, { "completion_length": 229.040189743042, "epoch": 2.251812733140534, "grad_norm": 0.14798224341983937, "kl": 0.17626953125, "learning_rate": 4.93696273674543e-07, "loss": 0.0002, "reward": 1.817857213318348, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.817857164889574, "rewards/format_reward_func": 1.0, "step": 13430 }, { "completion_length": 234.95536708831787, "epoch": 2.2521480363804014, "grad_norm": 0.2096833069883818, "kl": 0.19940185546875, "learning_rate": 4.93693596592107e-07, "loss": 0.0002, "reward": 1.725000061094761, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7250000182539225, "rewards/format_reward_func": 1.0, "step": 13432 }, { "completion_length": 219.80358028411865, "epoch": 2.252483339620269, "grad_norm": 0.2303255478167584, "kl": 0.118499755859375, "learning_rate": 4.936909189485981e-07, "loss": 0.0001, "reward": 1.785714365541935, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 13434 }, { "completion_length": 228.52679443359375, "epoch": 2.2528186428601367, "grad_norm": 0.15056984969771808, "kl": 0.13458251953125, "learning_rate": 4.936882407440225e-07, "loss": 0.0001, "reward": 1.8035714849829674, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714589059353, "rewards/format_reward_func": 1.0, "step": 13436 }, { "completion_length": 234.61608219146729, "epoch": 2.2531539461000043, "grad_norm": 0.23384952531418834, "kl": 0.36761474609375, "learning_rate": 4.936855619783859e-07, "loss": 0.0004, "reward": 1.7642857730388641, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 13438 }, { "completion_length": 242.87054824829102, "epoch": 2.253489249339872, "grad_norm": 0.2586868516066045, "kl": 0.386260986328125, "learning_rate": 4.93682882651695e-07, "loss": 0.0004, "reward": 1.7625000551342964, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7669643238186836, "rewards/format_reward_func": 0.9955357164144516, "step": 13440 }, { "completion_length": 235.42411708831787, "epoch": 2.253824552579739, "grad_norm": 0.15906009822520661, "kl": 0.144134521484375, "learning_rate": 4.936802027639557e-07, "loss": 0.0001, "reward": 1.789285771548748, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 1.0, "step": 13442 }, { "completion_length": 228.6562614440918, "epoch": 2.2541598558196068, "grad_norm": 0.18259206308634707, "kl": 0.108551025390625, "learning_rate": 4.936775223151742e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.741071455180645, "rewards/format_reward_func": 0.9910714328289032, "step": 13444 }, { "completion_length": 251.96430015563965, "epoch": 2.2544951590594744, "grad_norm": 0.0743592992559579, "kl": 0.2841796875, "learning_rate": 4.936748413053567e-07, "loss": 0.0003, "reward": 1.7517857775092125, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7562500312924385, "rewards/format_reward_func": 0.9955357164144516, "step": 13446 }, { "completion_length": 230.42411708831787, "epoch": 2.254830462299342, "grad_norm": 0.1739434056610246, "kl": 0.193389892578125, "learning_rate": 4.936721597345093e-07, "loss": 0.0002, "reward": 1.8107143342494965, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107143193483353, "rewards/format_reward_func": 1.0, "step": 13448 }, { "completion_length": 242.21876049041748, "epoch": 2.2551657655392097, "grad_norm": 0.25470983698517347, "kl": 0.884552001953125, "learning_rate": 4.936694776026384e-07, "loss": 0.0009, "reward": 1.6821429207921028, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.6821429040282965, "rewards/format_reward_func": 1.0, "step": 13450 }, { "completion_length": 236.9955472946167, "epoch": 2.2555010687790773, "grad_norm": 0.19159469148946703, "kl": 0.196136474609375, "learning_rate": 4.9366679490975e-07, "loss": 0.0002, "reward": 1.7839286476373672, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7883928865194321, "rewards/format_reward_func": 0.9955357164144516, "step": 13452 }, { "completion_length": 243.32143878936768, "epoch": 2.2558363720189445, "grad_norm": 0.2567005414121196, "kl": 0.144256591796875, "learning_rate": 4.936641116558502e-07, "loss": 0.0001, "reward": 1.7821429297327995, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 13454 }, { "completion_length": 237.25893783569336, "epoch": 2.256171675258812, "grad_norm": 0.30205311916900046, "kl": 0.12261962890625, "learning_rate": 4.936614278409452e-07, "loss": 0.0001, "reward": 1.7625000774860382, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7669643051922321, "rewards/format_reward_func": 0.9955357164144516, "step": 13456 }, { "completion_length": 240.8928680419922, "epoch": 2.2565069784986798, "grad_norm": 0.1936254461303534, "kl": 0.12506103515625, "learning_rate": 4.936587434650414e-07, "loss": 0.0001, "reward": 1.819642886519432, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8241071626543999, "rewards/format_reward_func": 0.9955357164144516, "step": 13458 }, { "completion_length": 233.42858219146729, "epoch": 2.2568422817385474, "grad_norm": 0.06795896946491352, "kl": 0.112396240234375, "learning_rate": 4.936560585281447e-07, "loss": 0.0001, "reward": 1.7732143253087997, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7776786014437675, "rewards/format_reward_func": 0.9955357164144516, "step": 13460 }, { "completion_length": 229.0044755935669, "epoch": 2.257177584978415, "grad_norm": 0.24394781620102302, "kl": 0.173187255859375, "learning_rate": 4.936533730302615e-07, "loss": 0.0002, "reward": 1.7535714879631996, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 13462 }, { "completion_length": 226.38393783569336, "epoch": 2.257512888218282, "grad_norm": 0.26312285892319304, "kl": 0.642669677734375, "learning_rate": 4.936506869713979e-07, "loss": 0.0006, "reward": 1.7696429193019867, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.774107176810503, "rewards/format_reward_func": 0.9955357164144516, "step": 13464 }, { "completion_length": 228.5000114440918, "epoch": 2.25784819145815, "grad_norm": 0.327693517526902, "kl": 0.1544189453125, "learning_rate": 4.9364800035156e-07, "loss": 0.0002, "reward": 1.7607143744826317, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7607142981141806, "rewards/format_reward_func": 1.0, "step": 13466 }, { "completion_length": 228.26340198516846, "epoch": 2.2581834946980175, "grad_norm": 0.2079338402791647, "kl": 0.130828857421875, "learning_rate": 4.936453131707542e-07, "loss": 0.0001, "reward": 1.7214286625385284, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7214286141097546, "rewards/format_reward_func": 1.0, "step": 13468 }, { "completion_length": 235.0491180419922, "epoch": 2.258518797937885, "grad_norm": 0.09090744850758828, "kl": 0.2340087890625, "learning_rate": 4.936426254289865e-07, "loss": 0.0002, "reward": 1.7696429416537285, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7741071712225676, "rewards/format_reward_func": 0.9955357164144516, "step": 13470 }, { "completion_length": 223.758939743042, "epoch": 2.2588541011777528, "grad_norm": 0.24446218583199866, "kl": 0.10858154296875, "learning_rate": 4.936399371262631e-07, "loss": 0.0001, "reward": 1.7571429163217545, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 13472 }, { "completion_length": 231.61608028411865, "epoch": 2.2591894044176204, "grad_norm": 0.37153122030690916, "kl": 0.2354736328125, "learning_rate": 4.936372482625902e-07, "loss": 0.0002, "reward": 1.7696429044008255, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7741071693599224, "rewards/format_reward_func": 0.9955357164144516, "step": 13474 }, { "completion_length": 224.7991180419922, "epoch": 2.2595247076574876, "grad_norm": 0.27591459797161505, "kl": 0.130615234375, "learning_rate": 4.936345588379742e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 13476 }, { "completion_length": 218.008939743042, "epoch": 2.259860010897355, "grad_norm": 0.19964699963299606, "kl": 0.11248779296875, "learning_rate": 4.936318688524209e-07, "loss": 0.0001, "reward": 1.7785715088248253, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 1.0, "step": 13478 }, { "completion_length": 222.89286613464355, "epoch": 2.260195314137223, "grad_norm": 0.19133067031787318, "kl": 0.102752685546875, "learning_rate": 4.936291783059367e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7392857372760773, "rewards/format_reward_func": 1.0, "step": 13480 }, { "completion_length": 225.29465293884277, "epoch": 2.2605306173770905, "grad_norm": 0.11899468868728356, "kl": 0.1959228515625, "learning_rate": 4.93626487198528e-07, "loss": 0.0002, "reward": 1.803571492433548, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714458674192, "rewards/format_reward_func": 1.0, "step": 13482 }, { "completion_length": 219.39733123779297, "epoch": 2.260865920616958, "grad_norm": 0.21464143001614744, "kl": 0.13031005859375, "learning_rate": 4.936237955302006e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7714285925030708, "rewards/format_reward_func": 1.0, "step": 13484 }, { "completion_length": 207.0759048461914, "epoch": 2.2612012238568253, "grad_norm": 0.25222121509954876, "kl": 0.114593505859375, "learning_rate": 4.936211033009611e-07, "loss": 0.0001, "reward": 1.7464286088943481, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7464286033064127, "rewards/format_reward_func": 1.0, "step": 13486 }, { "completion_length": 217.8616180419922, "epoch": 2.261536527096693, "grad_norm": 0.15743116447756114, "kl": 0.11029052734375, "learning_rate": 4.936184105108153e-07, "loss": 0.0001, "reward": 1.778571479022503, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 13488 }, { "completion_length": 211.71429538726807, "epoch": 2.2618718303365606, "grad_norm": 0.33981219507890775, "kl": 0.143707275390625, "learning_rate": 4.936157171597697e-07, "loss": 0.0001, "reward": 1.8142857775092125, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8142857253551483, "rewards/format_reward_func": 1.0, "step": 13490 }, { "completion_length": 222.19197273254395, "epoch": 2.262207133576428, "grad_norm": 0.20660793689804108, "kl": 0.121429443359375, "learning_rate": 4.936130232478303e-07, "loss": 0.0001, "reward": 1.7678571864962578, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571920841932, "rewards/format_reward_func": 1.0, "step": 13492 }, { "completion_length": 216.99108028411865, "epoch": 2.262542436816296, "grad_norm": 0.18929196634838816, "kl": 0.161285400390625, "learning_rate": 4.936103287750035e-07, "loss": 0.0002, "reward": 1.7321429401636124, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7321428842842579, "rewards/format_reward_func": 1.0, "step": 13494 }, { "completion_length": 211.60715103149414, "epoch": 2.2628777400561635, "grad_norm": 0.24113602668906872, "kl": 0.107635498046875, "learning_rate": 4.936076337412954e-07, "loss": 0.0001, "reward": 1.7803572043776512, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7848214395344257, "rewards/format_reward_func": 0.9955357164144516, "step": 13496 }, { "completion_length": 212.38393783569336, "epoch": 2.2632130432960307, "grad_norm": 0.13588768404975396, "kl": 0.2230224609375, "learning_rate": 4.936049381467121e-07, "loss": 0.0002, "reward": 1.807142898440361, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8071428909897804, "rewards/format_reward_func": 1.0, "step": 13498 }, { "completion_length": 215.57590198516846, "epoch": 2.2635483465358983, "grad_norm": 0.17110466396901336, "kl": 0.12445068359375, "learning_rate": 4.9360224199126e-07, "loss": 0.0001, "reward": 1.832142896950245, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8321428708732128, "rewards/format_reward_func": 1.0, "step": 13500 }, { "completion_length": 217.27233028411865, "epoch": 2.263883649775766, "grad_norm": 0.2187412522132426, "kl": 0.113922119140625, "learning_rate": 4.935995452749452e-07, "loss": 0.0001, "reward": 1.7178572043776512, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7258928902447224, "rewards/format_reward_func": 0.9919642955064774, "step": 13502 }, { "completion_length": 215.1205472946167, "epoch": 2.2642189530156336, "grad_norm": 0.15344155196626352, "kl": 0.102447509765625, "learning_rate": 4.935968479977738e-07, "loss": 0.0001, "reward": 1.8035714849829674, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8035714477300644, "rewards/format_reward_func": 1.0, "step": 13504 }, { "completion_length": 214.22768688201904, "epoch": 2.264554256255501, "grad_norm": 0.06149945562538831, "kl": 0.119384765625, "learning_rate": 4.935941501597523e-07, "loss": 0.0001, "reward": 1.7357143834233284, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.735714316368103, "rewards/format_reward_func": 1.0, "step": 13506 }, { "completion_length": 211.74554538726807, "epoch": 2.2648895594953684, "grad_norm": 0.06281976986005453, "kl": 0.101837158203125, "learning_rate": 4.935914517608867e-07, "loss": 0.0001, "reward": 1.8107143193483353, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.810714315623045, "rewards/format_reward_func": 1.0, "step": 13508 }, { "completion_length": 216.7410831451416, "epoch": 2.265224862735236, "grad_norm": 0.18027577339039147, "kl": 0.097869873046875, "learning_rate": 4.935887528011833e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 13510 }, { "completion_length": 220.33929538726807, "epoch": 2.2655601659751037, "grad_norm": 0.2485328935508096, "kl": 0.113037109375, "learning_rate": 4.935860532806482e-07, "loss": 0.0001, "reward": 1.8196429163217545, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8241071663796902, "rewards/format_reward_func": 0.9955357164144516, "step": 13512 }, { "completion_length": 210.77679634094238, "epoch": 2.2658954692149713, "grad_norm": 0.1617038734571405, "kl": 0.128143310546875, "learning_rate": 4.935833531992877e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571604192257, "rewards/format_reward_func": 1.0, "step": 13514 }, { "completion_length": 210.24554538726807, "epoch": 2.266230772454839, "grad_norm": 0.1882055746221909, "kl": 0.122772216796875, "learning_rate": 4.93580652557108e-07, "loss": 0.0001, "reward": 1.7178572118282318, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7178571783006191, "rewards/format_reward_func": 1.0, "step": 13516 }, { "completion_length": 203.39732837677002, "epoch": 2.2665660756947066, "grad_norm": 0.05495894567000175, "kl": 0.1014556884765625, "learning_rate": 4.935779513541154e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 13518 }, { "completion_length": 212.12947273254395, "epoch": 2.2669013789345738, "grad_norm": 0.1476861958550593, "kl": 0.106475830078125, "learning_rate": 4.93575249590316e-07, "loss": 0.0001, "reward": 1.728571504354477, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7285714484751225, "rewards/format_reward_func": 1.0, "step": 13520 }, { "completion_length": 212.41072273254395, "epoch": 2.2672366821744414, "grad_norm": 0.32315519728920805, "kl": 0.111785888671875, "learning_rate": 4.935725472657161e-07, "loss": 0.0001, "reward": 1.8178571984171867, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8178571611642838, "rewards/format_reward_func": 1.0, "step": 13522 }, { "completion_length": 215.58036708831787, "epoch": 2.267571985414309, "grad_norm": 0.19584402312119173, "kl": 0.115447998046875, "learning_rate": 4.935698443803218e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 13524 }, { "completion_length": 207.04911613464355, "epoch": 2.2679072886541767, "grad_norm": 0.14119223662442124, "kl": 0.130584716796875, "learning_rate": 4.935671409341394e-07, "loss": 0.0001, "reward": 1.7428572177886963, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7428571730852127, "rewards/format_reward_func": 1.0, "step": 13526 }, { "completion_length": 209.13840007781982, "epoch": 2.2682425918940443, "grad_norm": 0.18142706946143367, "kl": 0.10601806640625, "learning_rate": 4.935644369271753e-07, "loss": 0.0001, "reward": 1.7892857864499092, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857491970062, "rewards/format_reward_func": 1.0, "step": 13528 }, { "completion_length": 217.67858123779297, "epoch": 2.2685778951339115, "grad_norm": 0.3059452585174862, "kl": 0.129547119140625, "learning_rate": 4.935617323594355e-07, "loss": 0.0001, "reward": 1.74642863124609, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.746428593993187, "rewards/format_reward_func": 1.0, "step": 13530 }, { "completion_length": 206.62947463989258, "epoch": 2.268913198373779, "grad_norm": 0.2880253369534212, "kl": 0.128936767578125, "learning_rate": 4.935590272309261e-07, "loss": 0.0001, "reward": 1.7821428999304771, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7821428962051868, "rewards/format_reward_func": 1.0, "step": 13532 }, { "completion_length": 208.76786708831787, "epoch": 2.2692485016136468, "grad_norm": 0.2830816012342474, "kl": 0.1079864501953125, "learning_rate": 4.935563215416537e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714544355869, "rewards/format_reward_func": 1.0, "step": 13534 }, { "completion_length": 216.34376049041748, "epoch": 2.2695838048535144, "grad_norm": 0.1421908094574526, "kl": 0.144805908203125, "learning_rate": 4.935536152916243e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714380443096, "rewards/format_reward_func": 1.0, "step": 13536 }, { "completion_length": 205.62500762939453, "epoch": 2.269919108093382, "grad_norm": 0.08285467355914013, "kl": 0.24676513671875, "learning_rate": 4.935509084808441e-07, "loss": 0.0002, "reward": 1.7821429073810577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428999304771, "rewards/format_reward_func": 1.0, "step": 13538 }, { "completion_length": 211.83483123779297, "epoch": 2.2702544113332497, "grad_norm": 0.13214472741476582, "kl": 0.20220947265625, "learning_rate": 4.935482011093195e-07, "loss": 0.0002, "reward": 1.7500000819563866, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 13540 }, { "completion_length": 207.53572463989258, "epoch": 2.270589714573117, "grad_norm": 0.08028323710669062, "kl": 0.10394287109375, "learning_rate": 4.935454931770567e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143245637417, "rewards/format_reward_func": 1.0, "step": 13542 }, { "completion_length": 206.92858028411865, "epoch": 2.2709250178129845, "grad_norm": 0.18685389674187627, "kl": 0.113983154296875, "learning_rate": 4.935427846840617e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 1.0, "step": 13544 }, { "completion_length": 202.05358028411865, "epoch": 2.271260321052852, "grad_norm": 0.3338371078997882, "kl": 0.1292724609375, "learning_rate": 4.935400756303411e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 13546 }, { "completion_length": 211.33929538726807, "epoch": 2.2715956242927198, "grad_norm": 0.18042370161512275, "kl": 0.106903076171875, "learning_rate": 4.935373660159008e-07, "loss": 0.0001, "reward": 1.7785714715719223, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 13548 }, { "completion_length": 212.53572463989258, "epoch": 2.2719309275325874, "grad_norm": 0.15448487921955562, "kl": 0.162017822265625, "learning_rate": 4.935346558407472e-07, "loss": 0.0002, "reward": 1.782142922282219, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 13550 }, { "completion_length": 207.77233028411865, "epoch": 2.2722662307724546, "grad_norm": 0.1929360171540079, "kl": 0.11138916015625, "learning_rate": 4.935319451048866e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714507102966, "rewards/format_reward_func": 1.0, "step": 13552 }, { "completion_length": 207.915189743042, "epoch": 2.2726015340123222, "grad_norm": 0.22371807794920298, "kl": 0.11895751953125, "learning_rate": 4.935292338083251e-07, "loss": 0.0001, "reward": 1.850000038743019, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8500000275671482, "rewards/format_reward_func": 1.0, "step": 13554 }, { "completion_length": 215.48215293884277, "epoch": 2.27293683725219, "grad_norm": 0.2786729445248081, "kl": 0.112701416015625, "learning_rate": 4.93526521951069e-07, "loss": 0.0001, "reward": 1.7750000432133675, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000469386578, "rewards/format_reward_func": 1.0, "step": 13556 }, { "completion_length": 217.72768592834473, "epoch": 2.2732721404920575, "grad_norm": 0.1350406475402067, "kl": 0.15032958984375, "learning_rate": 4.935238095331246e-07, "loss": 0.0002, "reward": 1.7714286223053932, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 13558 }, { "completion_length": 208.49107933044434, "epoch": 2.273607443731925, "grad_norm": 0.3304221891155211, "kl": 0.2548828125, "learning_rate": 4.935210965544981e-07, "loss": 0.0003, "reward": 1.7892857789993286, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 13560 }, { "completion_length": 215.14733028411865, "epoch": 2.2739427469717928, "grad_norm": 0.2805466392633232, "kl": 0.131866455078125, "learning_rate": 4.935183830151958e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 13562 }, { "completion_length": 203.00000858306885, "epoch": 2.2742780502116604, "grad_norm": 0.2473474152110955, "kl": 0.250518798828125, "learning_rate": 4.935156689152238e-07, "loss": 0.0002, "reward": 1.8392857313156128, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8392857350409031, "rewards/format_reward_func": 1.0, "step": 13564 }, { "completion_length": 219.06697273254395, "epoch": 2.2746133534515276, "grad_norm": 0.5644412380449078, "kl": 0.20465087890625, "learning_rate": 4.935129542545885e-07, "loss": 0.0002, "reward": 1.809821479022503, "reward_std": 0.04671955434605479, "rewards/equation_reward_func": 0.8160714544355869, "rewards/format_reward_func": 0.9937500059604645, "step": 13566 }, { "completion_length": 213.90179538726807, "epoch": 2.2749486566913952, "grad_norm": 0.11572995366572475, "kl": 0.11407470703125, "learning_rate": 4.93510239033296e-07, "loss": 0.0001, "reward": 1.7428572252392769, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7428571693599224, "rewards/format_reward_func": 1.0, "step": 13568 }, { "completion_length": 216.8973331451416, "epoch": 2.275283959931263, "grad_norm": 0.1595908587723372, "kl": 0.116973876953125, "learning_rate": 4.935075232513528e-07, "loss": 0.0001, "reward": 1.8285714760422707, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.8375000096857548, "rewards/format_reward_func": 0.9910714328289032, "step": 13570 }, { "completion_length": 217.70983123779297, "epoch": 2.2756192631711305, "grad_norm": 0.49599002369885, "kl": 0.1279296875, "learning_rate": 4.935048069087648e-07, "loss": 0.0001, "reward": 1.814285770058632, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857365310192, "rewards/format_reward_func": 1.0, "step": 13572 }, { "completion_length": 224.6071548461914, "epoch": 2.275954566410998, "grad_norm": 0.21004672028786117, "kl": 0.272674560546875, "learning_rate": 4.935020900055386e-07, "loss": 0.0003, "reward": 1.7625000551342964, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643126428127, "rewards/format_reward_func": 0.9955357164144516, "step": 13574 }, { "completion_length": 224.84822463989258, "epoch": 2.2762898696508653, "grad_norm": 0.29430683920252254, "kl": 0.14007568359375, "learning_rate": 4.934993725416803e-07, "loss": 0.0001, "reward": 1.7607143223285675, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7696428783237934, "rewards/format_reward_func": 0.9910714328289032, "step": 13576 }, { "completion_length": 224.74108409881592, "epoch": 2.276625172890733, "grad_norm": 0.1017204526352656, "kl": 0.1552734375, "learning_rate": 4.93496654517196e-07, "loss": 0.0002, "reward": 1.7535714954137802, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7625000216066837, "rewards/format_reward_func": 0.9910714328289032, "step": 13578 }, { "completion_length": 225.508939743042, "epoch": 2.2769604761306006, "grad_norm": 0.09881116987726597, "kl": 0.22412109375, "learning_rate": 4.934939359320924e-07, "loss": 0.0002, "reward": 1.8071429207921028, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.8160714544355869, "rewards/format_reward_func": 0.9910714328289032, "step": 13580 }, { "completion_length": 218.7098331451416, "epoch": 2.2772957793704682, "grad_norm": 0.1220410963795954, "kl": 0.153106689453125, "learning_rate": 4.934912167863752e-07, "loss": 0.0002, "reward": 1.7642857804894447, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 13582 }, { "completion_length": 231.77679538726807, "epoch": 2.277631082610336, "grad_norm": 0.39462603179441164, "kl": 0.32989501953125, "learning_rate": 4.934884970800511e-07, "loss": 0.0003, "reward": 1.7303572073578835, "reward_std": 0.05808377172797918, "rewards/equation_reward_func": 0.7437500320374966, "rewards/format_reward_func": 0.9866071492433548, "step": 13584 }, { "completion_length": 236.2991180419922, "epoch": 2.2779663858502035, "grad_norm": 0.14974869337815552, "kl": 0.371826171875, "learning_rate": 4.934857768131261e-07, "loss": 0.0004, "reward": 1.7214286550879478, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7303571682423353, "rewards/format_reward_func": 0.9910714328289032, "step": 13586 }, { "completion_length": 225.37054538726807, "epoch": 2.2783016890900707, "grad_norm": 0.11756198998018, "kl": 0.312469482421875, "learning_rate": 4.934830559856067e-07, "loss": 0.0003, "reward": 1.735714390873909, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7357143219560385, "rewards/format_reward_func": 1.0, "step": 13588 }, { "completion_length": 231.43751049041748, "epoch": 2.2786369923299383, "grad_norm": 0.22023421549487873, "kl": 0.3299560546875, "learning_rate": 4.934803345974989e-07, "loss": 0.0003, "reward": 1.8107143267989159, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.8196428865194321, "rewards/format_reward_func": 0.9910714328289032, "step": 13590 }, { "completion_length": 238.9687614440918, "epoch": 2.278972295569806, "grad_norm": 0.14824793990543636, "kl": 0.301483154296875, "learning_rate": 4.934776126488091e-07, "loss": 0.0003, "reward": 1.7500000521540642, "reward_std": 0.03030457627028227, "rewards/equation_reward_func": 0.7589285969734192, "rewards/format_reward_func": 0.9910714328289032, "step": 13592 }, { "completion_length": 244.1294755935669, "epoch": 2.2793075988096736, "grad_norm": 15.409615537389927, "kl": 2.38897705078125, "learning_rate": 4.934748901395436e-07, "loss": 0.0024, "reward": 1.7214286550879478, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7303571775555611, "rewards/format_reward_func": 0.9910714328289032, "step": 13594 }, { "completion_length": 227.65179634094238, "epoch": 2.2796429020495412, "grad_norm": 0.24654334349817073, "kl": 0.198028564453125, "learning_rate": 4.934721670697087e-07, "loss": 0.0002, "reward": 1.8125000223517418, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.816964328289032, "rewards/format_reward_func": 0.9955357164144516, "step": 13596 }, { "completion_length": 235.83929920196533, "epoch": 2.2799782052894084, "grad_norm": 0.30922367505226184, "kl": 0.369232177734375, "learning_rate": 4.934694434393105e-07, "loss": 0.0004, "reward": 1.719642959535122, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7241071797907352, "rewards/format_reward_func": 0.9955357164144516, "step": 13598 }, { "completion_length": 237.27679443359375, "epoch": 2.280313508529276, "grad_norm": 0.15791678668274603, "kl": 0.239410400390625, "learning_rate": 4.934667192483553e-07, "loss": 0.0002, "reward": 1.757142923772335, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571429014205933, "rewards/format_reward_func": 1.0, "step": 13600 }, { "completion_length": 230.6384038925171, "epoch": 2.2806488117691437, "grad_norm": 0.2840192280306608, "kl": 0.96710205078125, "learning_rate": 4.934639944968496e-07, "loss": 0.001, "reward": 1.7285715341567993, "reward_std": 0.0707106776535511, "rewards/equation_reward_func": 0.7375000305473804, "rewards/format_reward_func": 0.9910714328289032, "step": 13602 }, { "completion_length": 225.9732265472412, "epoch": 2.2809841150090113, "grad_norm": 0.19712667125561653, "kl": 0.264129638671875, "learning_rate": 4.934612691847994e-07, "loss": 0.0003, "reward": 1.7142857983708382, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7142857573926449, "rewards/format_reward_func": 1.0, "step": 13604 }, { "completion_length": 228.01340293884277, "epoch": 2.281319418248879, "grad_norm": 0.514089876213886, "kl": 0.3343505859375, "learning_rate": 4.934585433122112e-07, "loss": 0.0003, "reward": 1.7428572177886963, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 13606 }, { "completion_length": 233.58483219146729, "epoch": 2.2816547214887466, "grad_norm": 0.2498944000336924, "kl": 0.374603271484375, "learning_rate": 4.934558168790912e-07, "loss": 0.0004, "reward": 1.7218750789761543, "reward_std": 0.04987628129310906, "rewards/equation_reward_func": 0.7276785895228386, "rewards/format_reward_func": 0.9941964335739613, "step": 13608 }, { "completion_length": 234.68304920196533, "epoch": 2.281990024728614, "grad_norm": 0.4071963245632583, "kl": 0.64990234375, "learning_rate": 4.934530898854456e-07, "loss": 0.0006, "reward": 1.7714286297559738, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7803571596741676, "rewards/format_reward_func": 0.9910714328289032, "step": 13610 }, { "completion_length": 239.5089406967163, "epoch": 2.2823253279684814, "grad_norm": 0.1957224771809233, "kl": 0.480499267578125, "learning_rate": 4.934503623312806e-07, "loss": 0.0005, "reward": 1.7428572103381157, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7517857477068901, "rewards/format_reward_func": 0.9910714328289032, "step": 13612 }, { "completion_length": 225.28126049041748, "epoch": 2.282660631208349, "grad_norm": 0.05916231324178276, "kl": 0.1220703125, "learning_rate": 4.934476342166026e-07, "loss": 0.0001, "reward": 1.8053571581840515, "reward_std": 0.012626906856894493, "rewards/equation_reward_func": 0.8098214715719223, "rewards/format_reward_func": 0.9955357164144516, "step": 13614 }, { "completion_length": 228.18304634094238, "epoch": 2.2829959344482167, "grad_norm": 0.07388420505371393, "kl": 0.17498779296875, "learning_rate": 4.93444905541418e-07, "loss": 0.0002, "reward": 1.7553571984171867, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7598214671015739, "rewards/format_reward_func": 0.9955357164144516, "step": 13616 }, { "completion_length": 243.58036613464355, "epoch": 2.2833312376880843, "grad_norm": 0.20886338374905272, "kl": 0.190582275390625, "learning_rate": 4.93442176305733e-07, "loss": 0.0002, "reward": 1.7196429520845413, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.7241071723401546, "rewards/format_reward_func": 0.9955357164144516, "step": 13618 }, { "completion_length": 244.8705472946167, "epoch": 2.2836665409279515, "grad_norm": 0.16759168247428288, "kl": 0.393280029296875, "learning_rate": 4.934394465095537e-07, "loss": 0.0004, "reward": 1.7375000715255737, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7508928924798965, "rewards/format_reward_func": 0.9866071492433548, "step": 13620 }, { "completion_length": 236.58482933044434, "epoch": 2.284001844167819, "grad_norm": 0.23261784632714255, "kl": 0.28619384765625, "learning_rate": 4.934367161528866e-07, "loss": 0.0003, "reward": 1.789285771548748, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7982142977416515, "rewards/format_reward_func": 0.9910714328289032, "step": 13622 }, { "completion_length": 242.65625953674316, "epoch": 2.284337147407687, "grad_norm": 0.07630423716878422, "kl": 0.18927001953125, "learning_rate": 4.93433985235738e-07, "loss": 0.0002, "reward": 1.8071429133415222, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8071428723633289, "rewards/format_reward_func": 1.0, "step": 13624 }, { "completion_length": 249.1651906967163, "epoch": 2.2846724506475544, "grad_norm": 0.13332822719857498, "kl": 0.256378173828125, "learning_rate": 4.934312537581141e-07, "loss": 0.0003, "reward": 1.7375000789761543, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7419643178582191, "rewards/format_reward_func": 0.9955357164144516, "step": 13626 }, { "completion_length": 246.47769165039062, "epoch": 2.285007753887422, "grad_norm": 0.2380732736718792, "kl": 0.46343994140625, "learning_rate": 4.93428521720021e-07, "loss": 0.0005, "reward": 1.8089286237955093, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.8133928813040257, "rewards/format_reward_func": 0.9955357164144516, "step": 13628 }, { "completion_length": 257.6339387893677, "epoch": 2.2853430571272897, "grad_norm": 0.14169497889649182, "kl": 0.411865234375, "learning_rate": 4.934257891214653e-07, "loss": 0.0004, "reward": 1.7625000700354576, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.775892898440361, "rewards/format_reward_func": 0.9866071492433548, "step": 13630 }, { "completion_length": 248.64733505249023, "epoch": 2.285678360367157, "grad_norm": 0.13220287571089961, "kl": 0.20489501953125, "learning_rate": 4.934230559624533e-07, "loss": 0.0002, "reward": 1.7321429029107094, "reward_std": 0.025253813713788986, "rewards/equation_reward_func": 0.7410714533179998, "rewards/format_reward_func": 0.9910714328289032, "step": 13632 }, { "completion_length": 255.30358409881592, "epoch": 2.2860136636070245, "grad_norm": 0.18408713303571822, "kl": 0.283905029296875, "learning_rate": 4.93420322242991e-07, "loss": 0.0003, "reward": 1.7517857775092125, "reward_std": 0.08838834706693888, "rewards/equation_reward_func": 0.774107176810503, "rewards/format_reward_func": 0.977678582072258, "step": 13634 }, { "completion_length": 252.05804824829102, "epoch": 2.286348966846892, "grad_norm": 0.164943595184681, "kl": 0.294219970703125, "learning_rate": 4.934175879630849e-07, "loss": 0.0003, "reward": 1.7214286774396896, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.730357188731432, "rewards/format_reward_func": 0.9910714328289032, "step": 13636 }, { "completion_length": 251.68751049041748, "epoch": 2.28668427008676, "grad_norm": 0.08371264433430739, "kl": 0.379974365234375, "learning_rate": 4.934148531227413e-07, "loss": 0.0004, "reward": 1.7196429297327995, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.724107176065445, "rewards/format_reward_func": 0.9955357164144516, "step": 13638 }, { "completion_length": 238.352689743042, "epoch": 2.2870195733266274, "grad_norm": 0.22735480378235184, "kl": 0.209197998046875, "learning_rate": 4.934121177219664e-07, "loss": 0.0002, "reward": 1.8339286148548126, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8383928835391998, "rewards/format_reward_func": 0.9955357164144516, "step": 13640 }, { "completion_length": 238.80358505249023, "epoch": 2.2873548765664946, "grad_norm": 0.19841604872169602, "kl": 0.206878662109375, "learning_rate": 4.934093817607666e-07, "loss": 0.0002, "reward": 1.7000000849366188, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.7089286148548126, "rewards/format_reward_func": 0.9910714328289032, "step": 13642 }, { "completion_length": 244.9687614440918, "epoch": 2.2876901798063622, "grad_norm": 0.18402523268310608, "kl": 0.249176025390625, "learning_rate": 4.934066452391482e-07, "loss": 0.0002, "reward": 1.7446429282426834, "reward_std": 0.017677669413387775, "rewards/equation_reward_func": 0.7491071783006191, "rewards/format_reward_func": 0.9955357164144516, "step": 13644 }, { "completion_length": 248.65179634094238, "epoch": 2.28802548304623, "grad_norm": 0.2082230382842461, "kl": 0.2586669921875, "learning_rate": 4.934039081571174e-07, "loss": 0.0003, "reward": 1.7500000521540642, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 13646 }, { "completion_length": 241.821439743042, "epoch": 2.2883607862860975, "grad_norm": 0.17611803208102136, "kl": 0.144500732421875, "learning_rate": 4.934011705146805e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7964286096394062, "rewards/format_reward_func": 1.0, "step": 13648 }, { "completion_length": 227.36161708831787, "epoch": 2.288696089525965, "grad_norm": 0.14735190264686204, "kl": 0.245391845703125, "learning_rate": 4.93398432311844e-07, "loss": 0.0002, "reward": 1.7785714864730835, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7875000275671482, "rewards/format_reward_func": 0.9910714328289032, "step": 13650 }, { "completion_length": 237.90179824829102, "epoch": 2.289031392765833, "grad_norm": 0.25570229818737517, "kl": 0.161041259765625, "learning_rate": 4.93395693548614e-07, "loss": 0.0002, "reward": 1.7625000551342964, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643200933933, "rewards/format_reward_func": 0.9955357164144516, "step": 13652 }, { "completion_length": 238.3928689956665, "epoch": 2.2893666960057, "grad_norm": 0.1867942311968773, "kl": 0.15655517578125, "learning_rate": 4.933929542249968e-07, "loss": 0.0002, "reward": 1.7428572103381157, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7517857439815998, "rewards/format_reward_func": 0.9910714328289032, "step": 13654 }, { "completion_length": 224.59376049041748, "epoch": 2.2897019992455676, "grad_norm": 0.21953006511119944, "kl": 0.207977294921875, "learning_rate": 4.933902143409988e-07, "loss": 0.0002, "reward": 1.7705357745289803, "reward_std": 0.06187184248119593, "rewards/equation_reward_func": 0.7767857573926449, "rewards/format_reward_func": 0.9937500059604645, "step": 13656 }, { "completion_length": 247.86161994934082, "epoch": 2.2900373024854352, "grad_norm": 0.3480181787400742, "kl": 0.210174560546875, "learning_rate": 4.933874738966264e-07, "loss": 0.0002, "reward": 1.7553572058677673, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214596509933, "rewards/format_reward_func": 0.9955357164144516, "step": 13658 }, { "completion_length": 233.80358123779297, "epoch": 2.290372605725303, "grad_norm": 0.10373686583389505, "kl": 0.1361083984375, "learning_rate": 4.933847328918857e-07, "loss": 0.0001, "reward": 1.7732143551111221, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7776786014437675, "rewards/format_reward_func": 0.9955357164144516, "step": 13660 }, { "completion_length": 243.1384048461914, "epoch": 2.2907079089651705, "grad_norm": 0.10525744507139827, "kl": 0.15374755859375, "learning_rate": 4.933819913267831e-07, "loss": 0.0002, "reward": 1.7750000730156898, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 13662 }, { "completion_length": 246.4732265472412, "epoch": 2.2910432122050377, "grad_norm": 0.1968361967529215, "kl": 0.165618896484375, "learning_rate": 4.933792492013249e-07, "loss": 0.0002, "reward": 1.7785714715719223, "reward_std": 0.07071067672222853, "rewards/equation_reward_func": 0.7875000163912773, "rewards/format_reward_func": 0.9910714328289032, "step": 13664 }, { "completion_length": 235.36161708831787, "epoch": 2.2913785154449053, "grad_norm": 0.08932347581233609, "kl": 0.140716552734375, "learning_rate": 4.933765065155175e-07, "loss": 0.0001, "reward": 1.7839286103844643, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7883928865194321, "rewards/format_reward_func": 0.9955357164144516, "step": 13666 }, { "completion_length": 229.04018878936768, "epoch": 2.291713818684773, "grad_norm": 0.0748245735877684, "kl": 0.135772705078125, "learning_rate": 4.933737632693671e-07, "loss": 0.0001, "reward": 1.8392857313156128, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.839285746216774, "rewards/format_reward_func": 1.0, "step": 13668 }, { "completion_length": 231.80358123779297, "epoch": 2.2920491219246406, "grad_norm": 0.09018754149817187, "kl": 0.127655029296875, "learning_rate": 4.9337101946288e-07, "loss": 0.0001, "reward": 1.810714341700077, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107143118977547, "rewards/format_reward_func": 1.0, "step": 13670 }, { "completion_length": 236.7142972946167, "epoch": 2.2923844251645082, "grad_norm": 0.400827240902266, "kl": 0.14300537109375, "learning_rate": 4.933682750960627e-07, "loss": 0.0001, "reward": 1.7750000432133675, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000264495611, "rewards/format_reward_func": 1.0, "step": 13672 }, { "completion_length": 231.1741180419922, "epoch": 2.292719728404376, "grad_norm": 0.3008411517499098, "kl": 0.14361572265625, "learning_rate": 4.933655301689214e-07, "loss": 0.0001, "reward": 1.7446429133415222, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7580357510596514, "rewards/format_reward_func": 0.9866071492433548, "step": 13674 }, { "completion_length": 239.4687623977661, "epoch": 2.293055031644243, "grad_norm": 0.24453466502782206, "kl": 0.164886474609375, "learning_rate": 4.933627846814623e-07, "loss": 0.0002, "reward": 1.669642947614193, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.674107177183032, "rewards/format_reward_func": 0.9955357164144516, "step": 13676 }, { "completion_length": 240.4553689956665, "epoch": 2.2933903348841107, "grad_norm": 0.08003393432284185, "kl": 0.157135009765625, "learning_rate": 4.933600386336919e-07, "loss": 0.0002, "reward": 1.7500000670552254, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 13678 }, { "completion_length": 244.01340293884277, "epoch": 2.2937256381239783, "grad_norm": 0.22095536044182848, "kl": 0.130096435546875, "learning_rate": 4.933572920256165e-07, "loss": 0.0001, "reward": 1.7571429163217545, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7660714536905289, "rewards/format_reward_func": 0.9910714328289032, "step": 13680 }, { "completion_length": 248.5803680419922, "epoch": 2.294060941363846, "grad_norm": 0.1975708507074759, "kl": 0.145477294921875, "learning_rate": 4.933545448572422e-07, "loss": 0.0001, "reward": 1.7325893640518188, "reward_std": 0.04482551896944642, "rewards/equation_reward_func": 0.7473214603960514, "rewards/format_reward_func": 0.9852678664028645, "step": 13682 }, { "completion_length": 235.977689743042, "epoch": 2.2943962446037136, "grad_norm": 0.0816102094427138, "kl": 0.14111328125, "learning_rate": 4.933517971285758e-07, "loss": 0.0001, "reward": 1.8071428909897804, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8071428760886192, "rewards/format_reward_func": 1.0, "step": 13684 }, { "completion_length": 227.9375114440918, "epoch": 2.294731547843581, "grad_norm": 0.3697824132658964, "kl": 0.158905029296875, "learning_rate": 4.933490488396232e-07, "loss": 0.0002, "reward": 1.8000000640749931, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8000000305473804, "rewards/format_reward_func": 1.0, "step": 13686 }, { "completion_length": 222.3794708251953, "epoch": 2.2950668510834484, "grad_norm": 0.3124872607707278, "kl": 0.131500244140625, "learning_rate": 4.933462999903908e-07, "loss": 0.0001, "reward": 1.7982143461704254, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.8026786036789417, "rewards/format_reward_func": 0.9955357164144516, "step": 13688 }, { "completion_length": 216.25447177886963, "epoch": 2.295402154323316, "grad_norm": 0.4198675461813113, "kl": 0.137725830078125, "learning_rate": 4.93343550580885e-07, "loss": 0.0001, "reward": 1.8089286386966705, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.8133928924798965, "rewards/format_reward_func": 0.9955357164144516, "step": 13690 }, { "completion_length": 219.85715293884277, "epoch": 2.2957374575631837, "grad_norm": 0.09580375324629241, "kl": 0.1383056640625, "learning_rate": 4.933408006111121e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 13692 }, { "completion_length": 222.63393878936768, "epoch": 2.2960727608030513, "grad_norm": 0.10388410768653196, "kl": 0.134613037109375, "learning_rate": 4.933380500810785e-07, "loss": 0.0001, "reward": 1.7553572058677673, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.759821455925703, "rewards/format_reward_func": 0.9955357164144516, "step": 13694 }, { "completion_length": 211.35715293884277, "epoch": 2.296408064042919, "grad_norm": 0.13906267429864502, "kl": 0.145172119140625, "learning_rate": 4.933352989907906e-07, "loss": 0.0001, "reward": 1.7428572177886963, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7428571619093418, "rewards/format_reward_func": 1.0, "step": 13696 }, { "completion_length": 219.80358219146729, "epoch": 2.2967433672827866, "grad_norm": 0.13736673153418766, "kl": 0.1318359375, "learning_rate": 4.933325473402545e-07, "loss": 0.0001, "reward": 1.7660714909434319, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.769642885774374, "rewards/format_reward_func": 0.9964285716414452, "step": 13698 }, { "completion_length": 223.89733028411865, "epoch": 2.297078670522654, "grad_norm": 0.20006064325335238, "kl": 0.117767333984375, "learning_rate": 4.933297951294767e-07, "loss": 0.0001, "reward": 1.7392857745289803, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7392857596278191, "rewards/format_reward_func": 1.0, "step": 13700 }, { "completion_length": 218.90179634094238, "epoch": 2.2974139737625214, "grad_norm": 0.06124106879307074, "kl": 0.128936767578125, "learning_rate": 4.933270423584634e-07, "loss": 0.0001, "reward": 1.7821429371833801, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 13702 }, { "completion_length": 216.00000858306885, "epoch": 2.297749277002389, "grad_norm": 0.12457756050918946, "kl": 0.126312255859375, "learning_rate": 4.933242890272211e-07, "loss": 0.0001, "reward": 1.7982143238186836, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.8026785999536514, "rewards/format_reward_func": 0.9955357164144516, "step": 13704 }, { "completion_length": 215.63840103149414, "epoch": 2.2980845802422567, "grad_norm": 0.19463971761404908, "kl": 0.1267242431640625, "learning_rate": 4.933215351357561e-07, "loss": 0.0001, "reward": 1.7107143625617027, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7107143271714449, "rewards/format_reward_func": 1.0, "step": 13706 }, { "completion_length": 222.50001049041748, "epoch": 2.2984198834821243, "grad_norm": 0.1579762789545742, "kl": 0.125274658203125, "learning_rate": 4.933187806840747e-07, "loss": 0.0001, "reward": 1.821428619325161, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8214286044239998, "rewards/format_reward_func": 1.0, "step": 13708 }, { "completion_length": 230.0044755935669, "epoch": 2.2987551867219915, "grad_norm": 0.2105210773896155, "kl": 0.15521240234375, "learning_rate": 4.933160256721831e-07, "loss": 0.0002, "reward": 1.7339286357164383, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7383928932249546, "rewards/format_reward_func": 0.9955357164144516, "step": 13710 }, { "completion_length": 213.31251049041748, "epoch": 2.299090489961859, "grad_norm": 0.27241427140351204, "kl": 0.122161865234375, "learning_rate": 4.933132701000879e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571775555611, "rewards/format_reward_func": 1.0, "step": 13712 }, { "completion_length": 219.40626049041748, "epoch": 2.299425793201727, "grad_norm": 0.259448491171959, "kl": 0.1417236328125, "learning_rate": 4.933105139677954e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.775000037625432, "rewards/format_reward_func": 1.0, "step": 13714 }, { "completion_length": 219.7366180419922, "epoch": 2.2997610964415944, "grad_norm": 0.25233047889996035, "kl": 0.137237548828125, "learning_rate": 4.933077572753118e-07, "loss": 0.0001, "reward": 1.7107143849134445, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7107143346220255, "rewards/format_reward_func": 1.0, "step": 13716 }, { "completion_length": 221.102689743042, "epoch": 2.300096399681462, "grad_norm": 0.3017274371156937, "kl": 0.140655517578125, "learning_rate": 4.933050000226437e-07, "loss": 0.0001, "reward": 1.7250000685453415, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7250000294297934, "rewards/format_reward_func": 1.0, "step": 13718 }, { "completion_length": 209.57143783569336, "epoch": 2.3004317029213297, "grad_norm": 0.11165261758553362, "kl": 0.136688232421875, "learning_rate": 4.93302242209797e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 13720 }, { "completion_length": 216.10269165039062, "epoch": 2.300767006161197, "grad_norm": 0.29489920734681696, "kl": 0.1304931640625, "learning_rate": 4.932994838367786e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 13722 }, { "completion_length": 220.83929538726807, "epoch": 2.3011023094010645, "grad_norm": 0.0707173188716113, "kl": 0.13385009765625, "learning_rate": 4.932967249035944e-07, "loss": 0.0001, "reward": 1.739285796880722, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7392857559025288, "rewards/format_reward_func": 1.0, "step": 13724 }, { "completion_length": 221.10268878936768, "epoch": 2.301437612640932, "grad_norm": 0.3062096065804645, "kl": 0.14453125, "learning_rate": 4.93293965410251e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 13726 }, { "completion_length": 226.51786613464355, "epoch": 2.3017729158808, "grad_norm": 0.23534533660186088, "kl": 0.1221923828125, "learning_rate": 4.932912053567546e-07, "loss": 0.0001, "reward": 1.807142898440361, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8071428909897804, "rewards/format_reward_func": 1.0, "step": 13728 }, { "completion_length": 213.85715293884277, "epoch": 2.3021082191206674, "grad_norm": 0.0034166384522816224, "kl": 0.1214599609375, "learning_rate": 4.932884447431118e-07, "loss": 0.0001, "reward": 1.8107143267989159, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8107143193483353, "rewards/format_reward_func": 1.0, "step": 13730 }, { "completion_length": 220.40179538726807, "epoch": 2.3024435223605346, "grad_norm": 0.15621227615878694, "kl": 0.13421630859375, "learning_rate": 4.932856835693287e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 13732 }, { "completion_length": 218.62501049041748, "epoch": 2.3027788256004023, "grad_norm": 0.15452761746025206, "kl": 0.120635986328125, "learning_rate": 4.932829218354117e-07, "loss": 0.0001, "reward": 1.8125000223517418, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.8169643133878708, "rewards/format_reward_func": 0.9955357164144516, "step": 13734 }, { "completion_length": 225.28572368621826, "epoch": 2.30311412884027, "grad_norm": 0.2775790745195352, "kl": 0.13287353515625, "learning_rate": 4.932801595413673e-07, "loss": 0.0001, "reward": 1.775000050663948, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 13736 }, { "completion_length": 225.00893688201904, "epoch": 2.3034494320801375, "grad_norm": 0.10617741628463849, "kl": 0.125, "learning_rate": 4.932773966872017e-07, "loss": 0.0001, "reward": 1.7428572103381157, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7428571842610836, "rewards/format_reward_func": 1.0, "step": 13738 }, { "completion_length": 216.41965198516846, "epoch": 2.303784735320005, "grad_norm": 0.13860951964324789, "kl": 0.115509033203125, "learning_rate": 4.932746332729214e-07, "loss": 0.0001, "reward": 1.7803571820259094, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.784821443259716, "rewards/format_reward_func": 0.9955357164144516, "step": 13740 }, { "completion_length": 229.27679634094238, "epoch": 2.304120038559873, "grad_norm": 0.4581590275723739, "kl": 0.1278839111328125, "learning_rate": 4.932718692985327e-07, "loss": 0.0001, "reward": 1.7196429446339607, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7241071686148643, "rewards/format_reward_func": 0.9955357164144516, "step": 13742 }, { "completion_length": 224.69197368621826, "epoch": 2.30445534179974, "grad_norm": 0.2416406933707828, "kl": 0.1219482421875, "learning_rate": 4.93269104764042e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7607143260538578, "rewards/format_reward_func": 1.0, "step": 13744 }, { "completion_length": 225.04911708831787, "epoch": 2.3047906450396076, "grad_norm": 0.31154448894666115, "kl": 0.120697021484375, "learning_rate": 4.932663396694554e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714693367481, "rewards/format_reward_func": 1.0, "step": 13746 }, { "completion_length": 222.9419755935669, "epoch": 2.3051259482794753, "grad_norm": 0.14378458200593694, "kl": 0.121368408203125, "learning_rate": 4.932635740147797e-07, "loss": 0.0001, "reward": 1.8107143342494965, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8107143118977547, "rewards/format_reward_func": 1.0, "step": 13748 }, { "completion_length": 226.48661613464355, "epoch": 2.305461251519343, "grad_norm": 0.2690045565333607, "kl": 0.12030029296875, "learning_rate": 4.93260807800021e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714507102966, "rewards/format_reward_func": 1.0, "step": 13750 }, { "completion_length": 228.49107933044434, "epoch": 2.3057965547592105, "grad_norm": 0.3038352147540726, "kl": 0.12451171875, "learning_rate": 4.932580410251857e-07, "loss": 0.0001, "reward": 1.7392857894301414, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7392857484519482, "rewards/format_reward_func": 1.0, "step": 13752 }, { "completion_length": 216.6071538925171, "epoch": 2.3061318579990777, "grad_norm": 0.19908074429539108, "kl": 0.13238525390625, "learning_rate": 4.932552736902803e-07, "loss": 0.0001, "reward": 1.7535714879631996, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714525729418, "rewards/format_reward_func": 1.0, "step": 13754 }, { "completion_length": 220.8794755935669, "epoch": 2.3064671612389454, "grad_norm": 0.2938801381346632, "kl": 0.1182708740234375, "learning_rate": 4.932525057953109e-07, "loss": 0.0001, "reward": 1.814285770058632, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8142857439815998, "rewards/format_reward_func": 1.0, "step": 13756 }, { "completion_length": 224.69643783569336, "epoch": 2.306802464478813, "grad_norm": 0.08695741183974194, "kl": 0.1104888916015625, "learning_rate": 4.932497373402842e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 13758 }, { "completion_length": 223.71429538726807, "epoch": 2.3071377677186806, "grad_norm": 0.280091163735147, "kl": 0.1106109619140625, "learning_rate": 4.932469683252062e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 13760 }, { "completion_length": 226.7723331451416, "epoch": 2.3074730709585483, "grad_norm": 0.36408205627825097, "kl": 0.1078948974609375, "learning_rate": 4.932441987500837e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 13762 }, { "completion_length": 223.4241180419922, "epoch": 2.307808374198416, "grad_norm": 0.18035158937625873, "kl": 0.13739013671875, "learning_rate": 4.932414286149228e-07, "loss": 0.0001, "reward": 1.7482143566012383, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7526785992085934, "rewards/format_reward_func": 0.9955357164144516, "step": 13764 }, { "completion_length": 220.99554634094238, "epoch": 2.308143677438283, "grad_norm": 0.12655154468340585, "kl": 0.12335205078125, "learning_rate": 4.9323865791973e-07, "loss": 0.0001, "reward": 1.7857143357396126, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 13766 }, { "completion_length": 221.41965293884277, "epoch": 2.3084789806781507, "grad_norm": 0.155988583871809, "kl": 0.122344970703125, "learning_rate": 4.932358866645116e-07, "loss": 0.0001, "reward": 1.8428571820259094, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8428571578115225, "rewards/format_reward_func": 1.0, "step": 13768 }, { "completion_length": 230.5044765472412, "epoch": 2.3088142839180184, "grad_norm": 0.21690628723905842, "kl": 0.1177978515625, "learning_rate": 4.932331148492739e-07, "loss": 0.0001, "reward": 1.8178571909666061, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8178571686148643, "rewards/format_reward_func": 1.0, "step": 13770 }, { "completion_length": 230.7500114440918, "epoch": 2.309149587157886, "grad_norm": 0.22875686319240196, "kl": 0.10528564453125, "learning_rate": 4.932303424740234e-07, "loss": 0.0001, "reward": 1.7714286297559738, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714285887777805, "rewards/format_reward_func": 1.0, "step": 13772 }, { "completion_length": 240.36608219146729, "epoch": 2.3094848903977536, "grad_norm": 0.13109625511493334, "kl": 0.117034912109375, "learning_rate": 4.932275695387666e-07, "loss": 0.0001, "reward": 1.7714286297559738, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7714285925030708, "rewards/format_reward_func": 1.0, "step": 13774 }, { "completion_length": 236.40179920196533, "epoch": 2.309820193637621, "grad_norm": 0.15888328027894855, "kl": 0.106597900390625, "learning_rate": 4.932247960435096e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 13776 }, { "completion_length": 234.8125114440918, "epoch": 2.3101554968774884, "grad_norm": 0.15421914534549414, "kl": 0.1151123046875, "learning_rate": 4.932220219882591e-07, "loss": 0.0001, "reward": 1.825000062584877, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8250000178813934, "rewards/format_reward_func": 1.0, "step": 13778 }, { "completion_length": 236.29911613464355, "epoch": 2.310490800117356, "grad_norm": 0.09890678034383357, "kl": 0.12774658203125, "learning_rate": 4.932192473730213e-07, "loss": 0.0001, "reward": 1.7910714447498322, "reward_std": 0.012626906856894493, "rewards/equation_reward_func": 0.7955357562750578, "rewards/format_reward_func": 0.9955357164144516, "step": 13780 }, { "completion_length": 238.1875123977661, "epoch": 2.3108261033572237, "grad_norm": 0.16175457520796585, "kl": 0.127197265625, "learning_rate": 4.932164721978024e-07, "loss": 0.0001, "reward": 1.7017858028411865, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.706250024959445, "rewards/format_reward_func": 0.9955357164144516, "step": 13782 }, { "completion_length": 231.01340198516846, "epoch": 2.3111614065970914, "grad_norm": 0.19291834599943525, "kl": 0.1243896484375, "learning_rate": 4.932136964626093e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000432133675, "rewards/format_reward_func": 1.0, "step": 13784 }, { "completion_length": 247.75894165039062, "epoch": 2.311496709836959, "grad_norm": 0.1155973995569849, "kl": 0.126373291015625, "learning_rate": 4.932109201674478e-07, "loss": 0.0001, "reward": 1.7821428924798965, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 13786 }, { "completion_length": 242.05358219146729, "epoch": 2.311832013076826, "grad_norm": 0.2314433409991325, "kl": 0.156005859375, "learning_rate": 4.932081433123248e-07, "loss": 0.0002, "reward": 1.7875000834465027, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7919642943888903, "rewards/format_reward_func": 0.9955357164144516, "step": 13788 }, { "completion_length": 236.2946548461914, "epoch": 2.312167316316694, "grad_norm": 0.0022804056792775137, "kl": 0.1068115234375, "learning_rate": 4.932053658972464e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 13790 }, { "completion_length": 229.5982255935669, "epoch": 2.3125026195565614, "grad_norm": 0.22285338579905276, "kl": 0.125946044921875, "learning_rate": 4.932025879222191e-07, "loss": 0.0001, "reward": 1.7392858117818832, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7392857372760773, "rewards/format_reward_func": 1.0, "step": 13792 }, { "completion_length": 221.48661708831787, "epoch": 2.312837922796429, "grad_norm": 0.32382712126000757, "kl": 0.119415283203125, "learning_rate": 4.931998093872492e-07, "loss": 0.0001, "reward": 1.8285714760422707, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8285714574158192, "rewards/format_reward_func": 1.0, "step": 13794 }, { "completion_length": 228.21875858306885, "epoch": 2.3131732260362967, "grad_norm": 0.19368082670345033, "kl": 0.11956787109375, "learning_rate": 4.931970302923433e-07, "loss": 0.0001, "reward": 1.7214286401867867, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7214285992085934, "rewards/format_reward_func": 1.0, "step": 13796 }, { "completion_length": 226.20090293884277, "epoch": 2.313508529276164, "grad_norm": 0.29231224698111935, "kl": 0.12823486328125, "learning_rate": 4.931942506375076e-07, "loss": 0.0001, "reward": 1.7750000730156898, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 13798 }, { "completion_length": 230.42411708831787, "epoch": 2.3138438325160315, "grad_norm": 0.18361270601123025, "kl": 0.19329833984375, "learning_rate": 4.931914704227486e-07, "loss": 0.0002, "reward": 1.7714286521077156, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 13800 }, { "completion_length": 228.7053680419922, "epoch": 2.314179135755899, "grad_norm": 0.20926399834142734, "kl": 0.152862548828125, "learning_rate": 4.931886896480726e-07, "loss": 0.0002, "reward": 1.7321429401636124, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428954601288, "rewards/format_reward_func": 1.0, "step": 13802 }, { "completion_length": 225.32143783569336, "epoch": 2.314514438995767, "grad_norm": 0.24781507311502457, "kl": 0.138458251953125, "learning_rate": 4.931859083134861e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 13804 }, { "completion_length": 229.91518783569336, "epoch": 2.3148497422356344, "grad_norm": 0.13919086617690282, "kl": 0.14959716796875, "learning_rate": 4.931831264189954e-07, "loss": 0.0001, "reward": 1.7785714715719223, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714566707611, "rewards/format_reward_func": 1.0, "step": 13806 }, { "completion_length": 223.39286994934082, "epoch": 2.315185045475502, "grad_norm": 0.2862038229346431, "kl": 0.141693115234375, "learning_rate": 4.931803439646071e-07, "loss": 0.0001, "reward": 1.7321429252624512, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321429029107094, "rewards/format_reward_func": 1.0, "step": 13808 }, { "completion_length": 229.4330472946167, "epoch": 2.3155203487153693, "grad_norm": 0.536633798934367, "kl": 0.44781494140625, "learning_rate": 4.931775609503274e-07, "loss": 0.0004, "reward": 1.7642857804894447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 13810 }, { "completion_length": 224.14286708831787, "epoch": 2.315855651955237, "grad_norm": 0.17537454963307, "kl": 0.14599609375, "learning_rate": 4.931747773761628e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 13812 }, { "completion_length": 230.0982255935669, "epoch": 2.3161909551951045, "grad_norm": 0.21195475663310653, "kl": 0.161041259765625, "learning_rate": 4.931719932421198e-07, "loss": 0.0002, "reward": 1.7678572088479996, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571827709675, "rewards/format_reward_func": 1.0, "step": 13814 }, { "completion_length": 234.0491189956665, "epoch": 2.316526258434972, "grad_norm": 0.3517252391572919, "kl": 0.397308349609375, "learning_rate": 4.931692085482045e-07, "loss": 0.0004, "reward": 1.7522322237491608, "reward_std": 0.04735089954920113, "rewards/equation_reward_func": 0.753571480512619, "rewards/format_reward_func": 0.9986607171595097, "step": 13816 }, { "completion_length": 218.95982933044434, "epoch": 2.31686156167484, "grad_norm": 0.21637620176464864, "kl": 0.149993896484375, "learning_rate": 4.931664232944237e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 13818 }, { "completion_length": 233.7946538925171, "epoch": 2.317196864914707, "grad_norm": 0.1493142580467565, "kl": 0.184417724609375, "learning_rate": 4.931636374807836e-07, "loss": 0.0002, "reward": 1.757142923772335, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428958326578, "rewards/format_reward_func": 1.0, "step": 13820 }, { "completion_length": 238.07590293884277, "epoch": 2.3175321681545746, "grad_norm": 0.23369268131692472, "kl": 0.210357666015625, "learning_rate": 4.931608511072906e-07, "loss": 0.0002, "reward": 1.7464286461472511, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 13822 }, { "completion_length": 222.6517972946167, "epoch": 2.3178674713944423, "grad_norm": 0.18778130385122507, "kl": 0.2147216796875, "learning_rate": 4.931580641739513e-07, "loss": 0.0002, "reward": 1.8214286342263222, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8214285932481289, "rewards/format_reward_func": 1.0, "step": 13824 }, { "completion_length": 230.2321548461914, "epoch": 2.31820277463431, "grad_norm": 0.09521929881811564, "kl": 0.161590576171875, "learning_rate": 4.931552766807718e-07, "loss": 0.0002, "reward": 1.7892857417464256, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 13826 }, { "completion_length": 234.70536708831787, "epoch": 2.3185380778741775, "grad_norm": 0.31405589164753217, "kl": 0.1912841796875, "learning_rate": 4.931524886277587e-07, "loss": 0.0002, "reward": 1.7267858237028122, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7312500197440386, "rewards/format_reward_func": 0.9955357164144516, "step": 13828 }, { "completion_length": 242.9107255935669, "epoch": 2.318873381114045, "grad_norm": 0.14866603002861797, "kl": 0.142608642578125, "learning_rate": 4.931497000149185e-07, "loss": 0.0001, "reward": 1.7401786372065544, "reward_std": 0.03409264795482159, "rewards/equation_reward_func": 0.7419643048197031, "rewards/format_reward_func": 0.9982142895460129, "step": 13830 }, { "completion_length": 234.8973331451416, "epoch": 2.319208684353913, "grad_norm": 0.25162094245415106, "kl": 0.150634765625, "learning_rate": 4.931469108422576e-07, "loss": 0.0002, "reward": 1.7482143566012383, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7526785992085934, "rewards/format_reward_func": 0.9955357164144516, "step": 13832 }, { "completion_length": 246.16519355773926, "epoch": 2.31954398759378, "grad_norm": 0.11166127332516824, "kl": 0.158416748046875, "learning_rate": 4.931441211097822e-07, "loss": 0.0002, "reward": 1.7937500327825546, "reward_std": 0.029041884932667017, "rewards/equation_reward_func": 0.7955357301980257, "rewards/format_reward_func": 0.9982142895460129, "step": 13834 }, { "completion_length": 246.6875114440918, "epoch": 2.3198792908336476, "grad_norm": 0.21306896478258996, "kl": 0.1680908203125, "learning_rate": 4.93141330817499e-07, "loss": 0.0002, "reward": 1.725000075995922, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7250000312924385, "rewards/format_reward_func": 1.0, "step": 13836 }, { "completion_length": 245.06251335144043, "epoch": 2.3202145940735153, "grad_norm": 0.12055812100393454, "kl": 0.15869140625, "learning_rate": 4.931385399654143e-07, "loss": 0.0002, "reward": 1.751785784959793, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7562500238418579, "rewards/format_reward_func": 0.9955357164144516, "step": 13838 }, { "completion_length": 242.83929634094238, "epoch": 2.320549897313383, "grad_norm": 0.17749519435879707, "kl": 0.128753662109375, "learning_rate": 4.931357485535345e-07, "loss": 0.0001, "reward": 1.803571455180645, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8035714589059353, "rewards/format_reward_func": 1.0, "step": 13840 }, { "completion_length": 247.52233409881592, "epoch": 2.3208852005532505, "grad_norm": 0.05211308765186071, "kl": 0.16229248046875, "learning_rate": 4.93132956581866e-07, "loss": 0.0002, "reward": 1.775000050663948, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 13842 }, { "completion_length": 242.63840198516846, "epoch": 2.3212205037931177, "grad_norm": 0.22663570203284325, "kl": 0.146820068359375, "learning_rate": 4.931301640504154e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 13844 }, { "completion_length": 253.0312614440918, "epoch": 2.3215558070329854, "grad_norm": 0.16115678842341616, "kl": 0.141632080078125, "learning_rate": 4.93127370959189e-07, "loss": 0.0001, "reward": 1.7714286595582962, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714285925030708, "rewards/format_reward_func": 1.0, "step": 13846 }, { "completion_length": 257.8259000778198, "epoch": 2.321891110272853, "grad_norm": 0.23600484644051928, "kl": 0.141021728515625, "learning_rate": 4.931245773081931e-07, "loss": 0.0001, "reward": 1.7285715192556381, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.728571455925703, "rewards/format_reward_func": 1.0, "step": 13848 }, { "completion_length": 240.5000123977661, "epoch": 2.3222264135127206, "grad_norm": 0.14853695490593902, "kl": 0.16864013671875, "learning_rate": 4.931217830974344e-07, "loss": 0.0002, "reward": 1.7785714715719223, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 13850 }, { "completion_length": 253.27679634094238, "epoch": 2.3225617167525883, "grad_norm": 0.442639076297675, "kl": 0.14019775390625, "learning_rate": 4.931189883269193e-07, "loss": 0.0001, "reward": 1.7428572177886963, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7607143074274063, "rewards/format_reward_func": 0.9821428656578064, "step": 13852 }, { "completion_length": 256.1741199493408, "epoch": 2.322897019992456, "grad_norm": 0.2172474753408119, "kl": 0.162078857421875, "learning_rate": 4.931161929966541e-07, "loss": 0.0002, "reward": 1.821428619325161, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.821428595110774, "rewards/format_reward_func": 1.0, "step": 13854 }, { "completion_length": 272.4464416503906, "epoch": 2.323232323232323, "grad_norm": 0.25720741051210894, "kl": 0.141754150390625, "learning_rate": 4.931133971066451e-07, "loss": 0.0001, "reward": 1.7410714849829674, "reward_std": 0.09343910869210958, "rewards/equation_reward_func": 0.7544643096625805, "rewards/format_reward_func": 0.9866071492433548, "step": 13856 }, { "completion_length": 252.602689743042, "epoch": 2.3235676264721907, "grad_norm": 0.19960302259651616, "kl": 0.136322021484375, "learning_rate": 4.931106006568991e-07, "loss": 0.0001, "reward": 1.7839286252856255, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7883928939700127, "rewards/format_reward_func": 0.9955357164144516, "step": 13858 }, { "completion_length": 257.57143783569336, "epoch": 2.3239029297120584, "grad_norm": 0.22315189200039845, "kl": 0.1318359375, "learning_rate": 4.931078036474224e-07, "loss": 0.0001, "reward": 1.7982143461704254, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8026785999536514, "rewards/format_reward_func": 0.9955357164144516, "step": 13860 }, { "completion_length": 268.7410840988159, "epoch": 2.324238232951926, "grad_norm": 0.6520892118623528, "kl": 0.4154052734375, "learning_rate": 4.931050060782212e-07, "loss": 0.0004, "reward": 1.7732143476605415, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.777678593993187, "rewards/format_reward_func": 0.9955357164144516, "step": 13862 }, { "completion_length": 263.2634038925171, "epoch": 2.3245735361917936, "grad_norm": 0.29998155060416803, "kl": 0.1378173828125, "learning_rate": 4.931022079493024e-07, "loss": 0.0001, "reward": 1.7285714969038963, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7285714522004128, "rewards/format_reward_func": 1.0, "step": 13864 }, { "completion_length": 277.0759038925171, "epoch": 2.324908839431661, "grad_norm": 0.09117265999784231, "kl": 0.144500732421875, "learning_rate": 4.930994092606719e-07, "loss": 0.0001, "reward": 1.7946429029107094, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7991071604192257, "rewards/format_reward_func": 0.9955357164144516, "step": 13866 }, { "completion_length": 279.62947368621826, "epoch": 2.3252441426715285, "grad_norm": 0.17952208593591043, "kl": 0.141357421875, "learning_rate": 4.930966100123366e-07, "loss": 0.0001, "reward": 1.8535714820027351, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8535714484751225, "rewards/format_reward_func": 1.0, "step": 13868 }, { "completion_length": 270.15179538726807, "epoch": 2.325579445911396, "grad_norm": 0.1446632522156985, "kl": 0.142425537109375, "learning_rate": 4.930938102043027e-07, "loss": 0.0001, "reward": 1.8446428999304771, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.8491071723401546, "rewards/format_reward_func": 0.9955357164144516, "step": 13870 }, { "completion_length": 287.2901906967163, "epoch": 2.3259147491512637, "grad_norm": 0.09508477728521918, "kl": 0.1434326171875, "learning_rate": 4.930910098365768e-07, "loss": 0.0001, "reward": 1.767857201397419, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7767857275903225, "rewards/format_reward_func": 0.9910714328289032, "step": 13872 }, { "completion_length": 287.87947845458984, "epoch": 2.3262500523911314, "grad_norm": 0.15208370071022806, "kl": 0.157379150390625, "learning_rate": 4.930882089091652e-07, "loss": 0.0002, "reward": 1.796428643167019, "reward_std": 0.04545686487108469, "rewards/equation_reward_func": 0.8053571730852127, "rewards/format_reward_func": 0.9910714328289032, "step": 13874 }, { "completion_length": 267.7678689956665, "epoch": 2.326585355630999, "grad_norm": 0.18081008050190195, "kl": 0.143707275390625, "learning_rate": 4.930854074220744e-07, "loss": 0.0001, "reward": 1.8000000342726707, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 13876 }, { "completion_length": 276.84376335144043, "epoch": 2.326920658870866, "grad_norm": 0.048354361422872846, "kl": 0.125518798828125, "learning_rate": 4.930826053753109e-07, "loss": 0.0001, "reward": 1.7732143253087997, "reward_std": 0.007576144300401211, "rewards/equation_reward_func": 0.7776786051690578, "rewards/format_reward_func": 0.9955357164144516, "step": 13878 }, { "completion_length": 278.4732303619385, "epoch": 2.327255962110734, "grad_norm": 0.1011312406782036, "kl": 0.16424560546875, "learning_rate": 4.930798027688811e-07, "loss": 0.0002, "reward": 1.7928571924567223, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 13880 }, { "completion_length": 274.27233123779297, "epoch": 2.3275912653506015, "grad_norm": 0.11005274619203892, "kl": 0.13623046875, "learning_rate": 4.930769996027915e-07, "loss": 0.0001, "reward": 1.7732143476605415, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.777678593993187, "rewards/format_reward_func": 0.9955357164144516, "step": 13882 }, { "completion_length": 269.85268783569336, "epoch": 2.327926568590469, "grad_norm": 0.08785502020528425, "kl": 0.13714599609375, "learning_rate": 4.930741958770485e-07, "loss": 0.0001, "reward": 1.769642911851406, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7741071842610836, "rewards/format_reward_func": 0.9955357164144516, "step": 13884 }, { "completion_length": 273.22322940826416, "epoch": 2.3282618718303367, "grad_norm": 0.04808767039662426, "kl": 0.149322509765625, "learning_rate": 4.930713915916585e-07, "loss": 0.0001, "reward": 1.7767857760190964, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7812500223517418, "rewards/format_reward_func": 0.9955357164144516, "step": 13886 }, { "completion_length": 267.8437614440918, "epoch": 2.328597175070204, "grad_norm": 0.0033872799839121735, "kl": 0.1512451171875, "learning_rate": 4.930685867466281e-07, "loss": 0.0002, "reward": 1.8214285969734192, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.8214286006987095, "rewards/format_reward_func": 1.0, "step": 13888 }, { "completion_length": 277.64287185668945, "epoch": 2.3289324783100716, "grad_norm": 0.17916554169069204, "kl": 0.153594970703125, "learning_rate": 4.930657813419638e-07, "loss": 0.0002, "reward": 1.7303571850061417, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7437500283122063, "rewards/format_reward_func": 0.9866071492433548, "step": 13890 }, { "completion_length": 281.90626525878906, "epoch": 2.329267781549939, "grad_norm": 0.3585679228241435, "kl": 0.16705322265625, "learning_rate": 4.930629753776717e-07, "loss": 0.0002, "reward": 1.7464286386966705, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7553571686148643, "rewards/format_reward_func": 0.9910714328289032, "step": 13892 }, { "completion_length": 265.11162185668945, "epoch": 2.329603084789807, "grad_norm": 0.18914141170325072, "kl": 0.13250732421875, "learning_rate": 4.930601688537586e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.753571443259716, "rewards/format_reward_func": 1.0, "step": 13894 }, { "completion_length": 262.7500104904175, "epoch": 2.3299383880296745, "grad_norm": 0.03789824323268437, "kl": 0.121246337890625, "learning_rate": 4.930573617702311e-07, "loss": 0.0001, "reward": 1.7839286178350449, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 13896 }, { "completion_length": 262.20537185668945, "epoch": 2.330273691269542, "grad_norm": 0.2800962462007646, "kl": 0.1329345703125, "learning_rate": 4.930545541270952e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571753203869, "rewards/format_reward_func": 1.0, "step": 13898 }, { "completion_length": 267.5892963409424, "epoch": 2.3306089945094093, "grad_norm": 0.31285336060043917, "kl": 0.133331298828125, "learning_rate": 4.930517459243576e-07, "loss": 0.0001, "reward": 1.7803572043776512, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7848214544355869, "rewards/format_reward_func": 0.9955357164144516, "step": 13900 }, { "completion_length": 262.29465770721436, "epoch": 2.330944297749277, "grad_norm": 0.2676019483729929, "kl": 0.142913818359375, "learning_rate": 4.930489371620247e-07, "loss": 0.0001, "reward": 1.7696429342031479, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7741071656346321, "rewards/format_reward_func": 0.9955357164144516, "step": 13902 }, { "completion_length": 249.6071538925171, "epoch": 2.3312796009891446, "grad_norm": 0.2603757129759202, "kl": 0.135467529296875, "learning_rate": 4.930461278401033e-07, "loss": 0.0001, "reward": 1.8142857551574707, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857402563095, "rewards/format_reward_func": 1.0, "step": 13904 }, { "completion_length": 255.84822463989258, "epoch": 2.331614904229012, "grad_norm": 0.17282725467474225, "kl": 0.143646240234375, "learning_rate": 4.930433179585995e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 13906 }, { "completion_length": 260.2187614440918, "epoch": 2.33195020746888, "grad_norm": 0.08766943170738209, "kl": 0.1429443359375, "learning_rate": 4.930405075175197e-07, "loss": 0.0001, "reward": 1.778571479022503, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 13908 }, { "completion_length": 256.4375114440918, "epoch": 2.332285510708747, "grad_norm": 0.20621206310280166, "kl": 0.1400146484375, "learning_rate": 4.930376965168707e-07, "loss": 0.0001, "reward": 1.8214286267757416, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8214285969734192, "rewards/format_reward_func": 1.0, "step": 13910 }, { "completion_length": 263.1875114440918, "epoch": 2.3326208139486146, "grad_norm": 0.20267854080407852, "kl": 0.13336181640625, "learning_rate": 4.930348849566587e-07, "loss": 0.0001, "reward": 1.7250000834465027, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7250000350177288, "rewards/format_reward_func": 1.0, "step": 13912 }, { "completion_length": 247.66072750091553, "epoch": 2.3329561171884823, "grad_norm": 0.149466506521507, "kl": 0.114837646484375, "learning_rate": 4.930320728368904e-07, "loss": 0.0001, "reward": 1.730357214808464, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7348214611411095, "rewards/format_reward_func": 0.9955357164144516, "step": 13914 }, { "completion_length": 261.1339406967163, "epoch": 2.33329142042835, "grad_norm": 0.21434438156759789, "kl": 0.137481689453125, "learning_rate": 4.930292601575721e-07, "loss": 0.0001, "reward": 1.8035714849829674, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714328289032, "rewards/format_reward_func": 1.0, "step": 13916 }, { "completion_length": 250.38393783569336, "epoch": 2.3336267236682176, "grad_norm": 0.16715115682608317, "kl": 0.128936767578125, "learning_rate": 4.930264469187103e-07, "loss": 0.0001, "reward": 1.764285758137703, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7642857432365417, "rewards/format_reward_func": 1.0, "step": 13918 }, { "completion_length": 255.82144165039062, "epoch": 2.333962026908085, "grad_norm": 0.12919460625180537, "kl": 0.16015625, "learning_rate": 4.930236331203116e-07, "loss": 0.0002, "reward": 1.789285808801651, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857380211353, "rewards/format_reward_func": 1.0, "step": 13920 }, { "completion_length": 254.96429920196533, "epoch": 2.3342973301479524, "grad_norm": 0.19431944763476283, "kl": 0.1590576171875, "learning_rate": 4.930208187623823e-07, "loss": 0.0002, "reward": 1.7892857640981674, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857529222965, "rewards/format_reward_func": 1.0, "step": 13922 }, { "completion_length": 252.14733409881592, "epoch": 2.33463263338782, "grad_norm": 0.5509848451277506, "kl": 0.17974853515625, "learning_rate": 4.93018003844929e-07, "loss": 0.0002, "reward": 1.758928619325161, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7633928880095482, "rewards/format_reward_func": 0.9955357164144516, "step": 13924 }, { "completion_length": 248.67858123779297, "epoch": 2.3349679366276876, "grad_norm": 0.06223604978287845, "kl": 0.12445068359375, "learning_rate": 4.930151883679582e-07, "loss": 0.0001, "reward": 1.7321429178118706, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7321428954601288, "rewards/format_reward_func": 1.0, "step": 13926 }, { "completion_length": 251.88840293884277, "epoch": 2.3353032398675553, "grad_norm": 0.2945459558718312, "kl": 0.14276123046875, "learning_rate": 4.930123723314763e-07, "loss": 0.0001, "reward": 1.7696429193019867, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7830357365310192, "rewards/format_reward_func": 0.9866071492433548, "step": 13928 }, { "completion_length": 246.0223331451416, "epoch": 2.335638543107423, "grad_norm": 0.20297409443292536, "kl": 0.1510009765625, "learning_rate": 4.930095557354897e-07, "loss": 0.0002, "reward": 1.7821429446339607, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428682655096, "rewards/format_reward_func": 1.0, "step": 13930 }, { "completion_length": 252.99108505249023, "epoch": 2.33597384634729, "grad_norm": 0.17278858736936387, "kl": 0.183074951171875, "learning_rate": 4.930067385800051e-07, "loss": 0.0002, "reward": 1.785714365541935, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 13932 }, { "completion_length": 261.7634048461914, "epoch": 2.3363091495871577, "grad_norm": 0.3142320730365933, "kl": 0.22943115234375, "learning_rate": 4.93003920865029e-07, "loss": 0.0002, "reward": 1.6785715073347092, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.6785714589059353, "rewards/format_reward_func": 1.0, "step": 13934 }, { "completion_length": 263.3169755935669, "epoch": 2.3366444528270254, "grad_norm": 0.16214082641153868, "kl": 0.15496826171875, "learning_rate": 4.930011025905677e-07, "loss": 0.0002, "reward": 1.7714286223053932, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7803571727126837, "rewards/format_reward_func": 0.9910714328289032, "step": 13936 }, { "completion_length": 264.99554920196533, "epoch": 2.336979756066893, "grad_norm": 0.29927774244425503, "kl": 0.158111572265625, "learning_rate": 4.929982837566277e-07, "loss": 0.0002, "reward": 1.7517857626080513, "reward_std": 0.08838834520429373, "rewards/equation_reward_func": 0.7651786096394062, "rewards/format_reward_func": 0.9866071492433548, "step": 13938 }, { "completion_length": 270.9241199493408, "epoch": 2.3373150593067606, "grad_norm": 0.13127494720037441, "kl": 0.209259033203125, "learning_rate": 4.929954643632156e-07, "loss": 0.0002, "reward": 1.7696429342031479, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7741071805357933, "rewards/format_reward_func": 0.9955357164144516, "step": 13940 }, { "completion_length": 260.0669765472412, "epoch": 2.3376503625466283, "grad_norm": 0.1928191917215741, "kl": 0.188751220703125, "learning_rate": 4.929926444103378e-07, "loss": 0.0002, "reward": 1.7767857685685158, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7812500223517418, "rewards/format_reward_func": 0.9955357164144516, "step": 13942 }, { "completion_length": 259.8750123977661, "epoch": 2.3379856657864955, "grad_norm": 0.07497032977190916, "kl": 0.191162109375, "learning_rate": 4.92989823898001e-07, "loss": 0.0002, "reward": 1.6946429312229156, "reward_std": 0.017677669413387775, "rewards/equation_reward_func": 0.699107188731432, "rewards/format_reward_func": 0.9955357164144516, "step": 13944 }, { "completion_length": 255.8348331451416, "epoch": 2.338320969026363, "grad_norm": 0.1732593334616933, "kl": 0.129974365234375, "learning_rate": 4.929870028262113e-07, "loss": 0.0001, "reward": 1.7910714596509933, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.795535746961832, "rewards/format_reward_func": 0.9955357164144516, "step": 13946 }, { "completion_length": 258.7187614440918, "epoch": 2.3386562722662307, "grad_norm": 0.17122023310010204, "kl": 33.64996337890625, "learning_rate": 4.929841811949755e-07, "loss": 0.0336, "reward": 1.7410714849829674, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7455357424914837, "rewards/format_reward_func": 0.9955357164144516, "step": 13948 }, { "completion_length": 257.1428689956665, "epoch": 2.3389915755060984, "grad_norm": 0.15583217546042194, "kl": 0.14776611328125, "learning_rate": 4.929813590043001e-07, "loss": 0.0001, "reward": 1.6910715028643608, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.695535758510232, "rewards/format_reward_func": 0.9955357164144516, "step": 13950 }, { "completion_length": 248.93750858306885, "epoch": 2.339326878745966, "grad_norm": 0.20410850654255108, "kl": 0.15264892578125, "learning_rate": 4.929785362541915e-07, "loss": 0.0002, "reward": 1.7214286401867867, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7303571775555611, "rewards/format_reward_func": 0.9910714328289032, "step": 13952 }, { "completion_length": 246.821439743042, "epoch": 2.339662181985833, "grad_norm": 0.19591857120170567, "kl": 0.222259521484375, "learning_rate": 4.929757129446562e-07, "loss": 0.0002, "reward": 1.739285796880722, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7482143118977547, "rewards/format_reward_func": 0.9910714328289032, "step": 13954 }, { "completion_length": 242.6428689956665, "epoch": 2.339997485225701, "grad_norm": 0.1094692940084371, "kl": 0.172698974609375, "learning_rate": 4.929728890757007e-07, "loss": 0.0002, "reward": 1.8017857670783997, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.806250024586916, "rewards/format_reward_func": 0.9955357164144516, "step": 13956 }, { "completion_length": 267.64732933044434, "epoch": 2.3403327884655685, "grad_norm": 0.2566541790875162, "kl": 0.1883544921875, "learning_rate": 4.929700646473315e-07, "loss": 0.0002, "reward": 1.7571429312229156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 13958 }, { "completion_length": 254.52233123779297, "epoch": 2.340668091705436, "grad_norm": 0.17341243088634806, "kl": 0.224151611328125, "learning_rate": 4.929672396595553e-07, "loss": 0.0002, "reward": 1.7625000551342964, "reward_std": 0.07323605939745903, "rewards/equation_reward_func": 0.7848214488476515, "rewards/format_reward_func": 0.977678582072258, "step": 13960 }, { "completion_length": 253.27233028411865, "epoch": 2.3410033949453037, "grad_norm": 0.22604397912429652, "kl": 0.131744384765625, "learning_rate": 4.929644141123783e-07, "loss": 0.0001, "reward": 1.7732143327593803, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7776786014437675, "rewards/format_reward_func": 0.9955357164144516, "step": 13962 }, { "completion_length": 251.7678680419922, "epoch": 2.3413386981851714, "grad_norm": 0.6640328142423192, "kl": 0.463836669921875, "learning_rate": 4.92961588005807e-07, "loss": 0.0005, "reward": 1.7267857789993286, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.740178607404232, "rewards/format_reward_func": 0.9866071492433548, "step": 13964 }, { "completion_length": 248.24108600616455, "epoch": 2.341674001425039, "grad_norm": 0.10695830413721986, "kl": 0.135406494140625, "learning_rate": 4.929587613398481e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 13966 }, { "completion_length": 243.7053689956665, "epoch": 2.342009304664906, "grad_norm": 0.24921575355770892, "kl": 0.1546630859375, "learning_rate": 4.929559341145082e-07, "loss": 0.0002, "reward": 1.7767857685685158, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7812500409781933, "rewards/format_reward_func": 0.9955357164144516, "step": 13968 }, { "completion_length": 245.3750114440918, "epoch": 2.342344607904774, "grad_norm": 0.1565350284699649, "kl": 0.681243896484375, "learning_rate": 4.929531063297936e-07, "loss": 0.0007, "reward": 1.800000049173832, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8000000230967999, "rewards/format_reward_func": 1.0, "step": 13970 }, { "completion_length": 242.3437614440918, "epoch": 2.3426799111446415, "grad_norm": 0.15593943795376905, "kl": 0.13665771484375, "learning_rate": 4.929502779857108e-07, "loss": 0.0001, "reward": 1.7160715088248253, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7205357365310192, "rewards/format_reward_func": 0.9955357164144516, "step": 13972 }, { "completion_length": 246.63840293884277, "epoch": 2.343015214384509, "grad_norm": 0.13590393267799425, "kl": 0.175994873046875, "learning_rate": 4.929474490822665e-07, "loss": 0.0002, "reward": 1.825000062584877, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8250000290572643, "rewards/format_reward_func": 1.0, "step": 13974 }, { "completion_length": 234.18751049041748, "epoch": 2.3433505176243767, "grad_norm": 0.4499483960615894, "kl": 0.209136962890625, "learning_rate": 4.92944619619467e-07, "loss": 0.0002, "reward": 1.7910714894533157, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7955357357859612, "rewards/format_reward_func": 0.9955357164144516, "step": 13976 }, { "completion_length": 241.38394165039062, "epoch": 2.343685820864244, "grad_norm": 0.25160773088122984, "kl": 0.16693115234375, "learning_rate": 4.929417895973189e-07, "loss": 0.0002, "reward": 1.7678572237491608, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 1.0, "step": 13978 }, { "completion_length": 242.8392972946167, "epoch": 2.3440211241041116, "grad_norm": 0.19259065367274678, "kl": 0.203277587890625, "learning_rate": 4.929389590158287e-07, "loss": 0.0002, "reward": 1.719642922282219, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.724107176065445, "rewards/format_reward_func": 0.9955357164144516, "step": 13980 }, { "completion_length": 234.7187614440918, "epoch": 2.344356427343979, "grad_norm": 0.16228758744483066, "kl": 0.11175537109375, "learning_rate": 4.929361278750031e-07, "loss": 0.0001, "reward": 1.8107143193483353, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107143230736256, "rewards/format_reward_func": 1.0, "step": 13982 }, { "completion_length": 237.6205472946167, "epoch": 2.344691730583847, "grad_norm": 0.1776062458911018, "kl": 0.115142822265625, "learning_rate": 4.929332961748482e-07, "loss": 0.0001, "reward": 1.7910714969038963, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7955357395112514, "rewards/format_reward_func": 0.9955357164144516, "step": 13984 }, { "completion_length": 249.27679538726807, "epoch": 2.3450270338237145, "grad_norm": 0.26582332087857086, "kl": 0.185760498046875, "learning_rate": 4.92930463915371e-07, "loss": 0.0002, "reward": 1.7571429461240768, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7660714536905289, "rewards/format_reward_func": 0.9910714328289032, "step": 13986 }, { "completion_length": 237.51786708831787, "epoch": 2.345362337063582, "grad_norm": 0.26260524688968806, "kl": 0.1392822265625, "learning_rate": 4.929276310965778e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.06060915160924196, "rewards/equation_reward_func": 0.7946428805589676, "rewards/format_reward_func": 0.9910714328289032, "step": 13988 }, { "completion_length": 242.7634048461914, "epoch": 2.3456976403034493, "grad_norm": 0.5846938540785703, "kl": 0.22698974609375, "learning_rate": 4.92924797718475e-07, "loss": 0.0002, "reward": 1.8017857745289803, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8062500208616257, "rewards/format_reward_func": 0.9955357164144516, "step": 13990 }, { "completion_length": 247.4241180419922, "epoch": 2.346032943543317, "grad_norm": 0.232436541671342, "kl": 0.162109375, "learning_rate": 4.929219637810693e-07, "loss": 0.0002, "reward": 1.848214328289032, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.8526785969734192, "rewards/format_reward_func": 0.9955357164144516, "step": 13992 }, { "completion_length": 249.7321548461914, "epoch": 2.3463682467831846, "grad_norm": 0.29302366800324925, "kl": 0.1395263671875, "learning_rate": 4.929191292843672e-07, "loss": 0.0001, "reward": 1.7303572297096252, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7348214462399483, "rewards/format_reward_func": 0.9955357164144516, "step": 13994 }, { "completion_length": 246.446439743042, "epoch": 2.346703550023052, "grad_norm": 0.18002907127930834, "kl": 0.11785888671875, "learning_rate": 4.929162942283751e-07, "loss": 0.0001, "reward": 1.7839286252856255, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7883928865194321, "rewards/format_reward_func": 0.9955357164144516, "step": 13996 }, { "completion_length": 248.5357255935669, "epoch": 2.34703885326292, "grad_norm": 0.30579652194045426, "kl": 0.1209716796875, "learning_rate": 4.929134586130998e-07, "loss": 0.0001, "reward": 1.758928656578064, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7633928991854191, "rewards/format_reward_func": 0.9955357164144516, "step": 13998 }, { "completion_length": 249.0446538925171, "epoch": 2.347374156502787, "grad_norm": 0.2357568781576199, "kl": 0.103302001953125, "learning_rate": 4.929106224385475e-07, "loss": 0.0001, "reward": 1.7964286729693413, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 14000 }, { "completion_length": 258.7678699493408, "epoch": 2.3477094597426547, "grad_norm": 0.13462703461072748, "kl": 0.127471923828125, "learning_rate": 4.929077857047249e-07, "loss": 0.0001, "reward": 1.7250000834465027, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7339286115020514, "rewards/format_reward_func": 0.9910714328289032, "step": 14002 }, { "completion_length": 253.50447368621826, "epoch": 2.3480447629825223, "grad_norm": 0.25273754623362055, "kl": 0.14434814453125, "learning_rate": 4.929049484116386e-07, "loss": 0.0001, "reward": 1.7607143446803093, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.7696428708732128, "rewards/format_reward_func": 0.9910714328289032, "step": 14004 }, { "completion_length": 258.54018783569336, "epoch": 2.34838006622239, "grad_norm": 0.19787426664635152, "kl": 0.123138427734375, "learning_rate": 4.929021105592949e-07, "loss": 0.0001, "reward": 1.8232143074274063, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.827678594738245, "rewards/format_reward_func": 0.9955357164144516, "step": 14006 }, { "completion_length": 271.6116199493408, "epoch": 2.3487153694622576, "grad_norm": 0.14992894677494317, "kl": 0.147003173828125, "learning_rate": 4.928992721477005e-07, "loss": 0.0001, "reward": 1.7714286297559738, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7803571708500385, "rewards/format_reward_func": 0.9910714328289032, "step": 14008 }, { "completion_length": 263.29911708831787, "epoch": 2.349050672702125, "grad_norm": 0.0617426976233598, "kl": 0.131195068359375, "learning_rate": 4.928964331768621e-07, "loss": 0.0001, "reward": 1.7571429163217545, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7571428772062063, "rewards/format_reward_func": 1.0, "step": 14010 }, { "completion_length": 253.83929920196533, "epoch": 2.3493859759419924, "grad_norm": 0.07002114062756537, "kl": 0.154541015625, "learning_rate": 4.928935936467859e-07, "loss": 0.0002, "reward": 1.748214341700077, "reward_std": 0.012626906856894493, "rewards/equation_reward_func": 0.7526786122471094, "rewards/format_reward_func": 0.9955357164144516, "step": 14012 }, { "completion_length": 250.7991180419922, "epoch": 2.34972127918186, "grad_norm": 0.25174814949002133, "kl": 0.28607177734375, "learning_rate": 4.928907535574786e-07, "loss": 0.0003, "reward": 1.7625000551342964, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7669643238186836, "rewards/format_reward_func": 0.9955357164144516, "step": 14014 }, { "completion_length": 253.5625123977661, "epoch": 2.3500565824217277, "grad_norm": 0.17426929995654813, "kl": 0.119384765625, "learning_rate": 4.928879129089468e-07, "loss": 0.0001, "reward": 1.7732143551111221, "reward_std": 0.03788072057068348, "rewards/equation_reward_func": 0.7776786163449287, "rewards/format_reward_func": 0.9955357164144516, "step": 14016 }, { "completion_length": 262.9821538925171, "epoch": 2.3503918856615953, "grad_norm": 0.1572711767458029, "kl": 0.310943603515625, "learning_rate": 4.928850717011969e-07, "loss": 0.0003, "reward": 1.6875000819563866, "reward_std": 0.07828682288527489, "rewards/equation_reward_func": 0.7008928880095482, "rewards/format_reward_func": 0.9866071492433548, "step": 14018 }, { "completion_length": 249.61608219146729, "epoch": 2.350727188901463, "grad_norm": 0.1388379776191087, "kl": 0.14971923828125, "learning_rate": 4.928822299342355e-07, "loss": 0.0001, "reward": 1.7910714969038963, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7955357395112514, "rewards/format_reward_func": 0.9955357164144516, "step": 14020 }, { "completion_length": 249.49554634094238, "epoch": 2.35106249214133, "grad_norm": 0.17107019366412024, "kl": 0.875244140625, "learning_rate": 4.928793876080692e-07, "loss": 0.0009, "reward": 1.7482143491506577, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7526785880327225, "rewards/format_reward_func": 0.9955357164144516, "step": 14022 }, { "completion_length": 242.68304824829102, "epoch": 2.3513977953811978, "grad_norm": 0.1720167275916922, "kl": 0.295806884765625, "learning_rate": 4.928765447227044e-07, "loss": 0.0003, "reward": 1.753571480512619, "reward_std": 0.04545686487108469, "rewards/equation_reward_func": 0.7625000327825546, "rewards/format_reward_func": 0.9910714328289032, "step": 14024 }, { "completion_length": 244.09376430511475, "epoch": 2.3517330986210654, "grad_norm": 0.133536958377386, "kl": 0.123748779296875, "learning_rate": 4.928737012781479e-07, "loss": 0.0001, "reward": 1.737500049173832, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7419643308967352, "rewards/format_reward_func": 0.9955357164144516, "step": 14026 }, { "completion_length": 246.2053680419922, "epoch": 2.352068401860933, "grad_norm": 0.26511754390693604, "kl": 0.18597412109375, "learning_rate": 4.92870857274406e-07, "loss": 0.0002, "reward": 1.7446428909897804, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7491071857511997, "rewards/format_reward_func": 0.9955357164144516, "step": 14028 }, { "completion_length": 239.23661708831787, "epoch": 2.3524037051008007, "grad_norm": 0.13234529295434244, "kl": 0.11456298828125, "learning_rate": 4.928680127114854e-07, "loss": 0.0001, "reward": 1.8250000178813934, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8250000197440386, "rewards/format_reward_func": 1.0, "step": 14030 }, { "completion_length": 241.6696548461914, "epoch": 2.3527390083406683, "grad_norm": 0.1214244184569556, "kl": 0.145416259765625, "learning_rate": 4.928651675893925e-07, "loss": 0.0001, "reward": 1.787500038743019, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7919643186032772, "rewards/format_reward_func": 0.9955357164144516, "step": 14032 }, { "completion_length": 240.49108409881592, "epoch": 2.3530743115805355, "grad_norm": 0.434751465835638, "kl": 1.688201904296875, "learning_rate": 4.92862321908134e-07, "loss": 0.0017, "reward": 1.773214340209961, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7776785865426064, "rewards/format_reward_func": 0.9955357164144516, "step": 14034 }, { "completion_length": 250.0357265472412, "epoch": 2.353409614820403, "grad_norm": 0.15028585969842692, "kl": 0.15380859375, "learning_rate": 4.928594756677163e-07, "loss": 0.0002, "reward": 1.796428620815277, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 14036 }, { "completion_length": 231.30358219146729, "epoch": 2.3537449180602708, "grad_norm": 0.2420700954221213, "kl": 0.0980072021484375, "learning_rate": 4.928566288681461e-07, "loss": 0.0001, "reward": 1.8075893595814705, "reward_std": 0.0498762815259397, "rewards/equation_reward_func": 0.8133928757160902, "rewards/format_reward_func": 0.9941964335739613, "step": 14038 }, { "completion_length": 237.0892972946167, "epoch": 2.3540802213001384, "grad_norm": 0.14530807503245907, "kl": 0.228179931640625, "learning_rate": 4.928537815094299e-07, "loss": 0.0002, "reward": 1.8250000402331352, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8250000290572643, "rewards/format_reward_func": 1.0, "step": 14040 }, { "completion_length": 244.4509048461914, "epoch": 2.354415524540006, "grad_norm": 0.11180813148997293, "kl": 1.75067138671875, "learning_rate": 4.928509335915742e-07, "loss": 0.0017, "reward": 1.7803571820259094, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7848214525729418, "rewards/format_reward_func": 0.9955357164144516, "step": 14042 }, { "completion_length": 241.87947845458984, "epoch": 2.354750827779873, "grad_norm": 0.2982834793049277, "kl": 0.1754150390625, "learning_rate": 4.928480851145857e-07, "loss": 0.0002, "reward": 1.8321428894996643, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8321428783237934, "rewards/format_reward_func": 1.0, "step": 14044 }, { "completion_length": 242.92411708831787, "epoch": 2.355086131019741, "grad_norm": 0.08737988500493561, "kl": 0.383636474609375, "learning_rate": 4.928452360784709e-07, "loss": 0.0004, "reward": 1.751785770058632, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7562500294297934, "rewards/format_reward_func": 0.9955357164144516, "step": 14046 }, { "completion_length": 240.31697463989258, "epoch": 2.3554214342596085, "grad_norm": 0.20044606430797915, "kl": 0.116455078125, "learning_rate": 4.928423864832362e-07, "loss": 0.0001, "reward": 1.7428571805357933, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7517857290804386, "rewards/format_reward_func": 0.9910714328289032, "step": 14048 }, { "completion_length": 235.4866189956665, "epoch": 2.355756737499476, "grad_norm": 0.2400343553634341, "kl": 0.223541259765625, "learning_rate": 4.928395363288883e-07, "loss": 0.0002, "reward": 1.8500000685453415, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.8589285761117935, "rewards/format_reward_func": 0.9910714328289032, "step": 14050 }, { "completion_length": 224.12947463989258, "epoch": 2.3560920407393438, "grad_norm": 0.11322100214358818, "kl": 0.1439056396484375, "learning_rate": 4.928366856154338e-07, "loss": 0.0001, "reward": 1.7714286223053932, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7714286111295223, "rewards/format_reward_func": 1.0, "step": 14052 }, { "completion_length": 241.1250114440918, "epoch": 2.3564273439792114, "grad_norm": 0.2621128270019946, "kl": 0.87982177734375, "learning_rate": 4.928338343428791e-07, "loss": 0.0009, "reward": 1.7732143253087997, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7866071797907352, "rewards/format_reward_func": 0.9866071492433548, "step": 14054 }, { "completion_length": 237.6250114440918, "epoch": 2.3567626472190786, "grad_norm": 0.30913754789060155, "kl": 0.264068603515625, "learning_rate": 4.928309825112311e-07, "loss": 0.0003, "reward": 1.76071435213089, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143186032772, "rewards/format_reward_func": 1.0, "step": 14056 }, { "completion_length": 232.92858219146729, "epoch": 2.357097950458946, "grad_norm": 0.00804325255606672, "kl": 0.174468994140625, "learning_rate": 4.92828130120496e-07, "loss": 0.0002, "reward": 1.8035714775323868, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.803571455180645, "rewards/format_reward_func": 1.0, "step": 14058 }, { "completion_length": 236.0357255935669, "epoch": 2.357433253698814, "grad_norm": 0.10639365932382884, "kl": 0.263580322265625, "learning_rate": 4.928252771706805e-07, "loss": 0.0003, "reward": 1.8250000476837158, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8250000178813934, "rewards/format_reward_func": 1.0, "step": 14060 }, { "completion_length": 228.81250858306885, "epoch": 2.3577685569386815, "grad_norm": 0.22752689175076363, "kl": 0.15313720703125, "learning_rate": 4.928224236617912e-07, "loss": 0.0002, "reward": 1.7928571924567223, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571775555611, "rewards/format_reward_func": 1.0, "step": 14062 }, { "completion_length": 244.99554634094238, "epoch": 2.358103860178549, "grad_norm": 0.13870489497186492, "kl": 0.566314697265625, "learning_rate": 4.928195695938346e-07, "loss": 0.0006, "reward": 1.7535714879631996, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7625000327825546, "rewards/format_reward_func": 0.9910714328289032, "step": 14064 }, { "completion_length": 239.48215103149414, "epoch": 2.3584391634184163, "grad_norm": 0.28570805232389446, "kl": 0.2423095703125, "learning_rate": 4.928167149668174e-07, "loss": 0.0002, "reward": 1.8071429133415222, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428798139095, "rewards/format_reward_func": 1.0, "step": 14066 }, { "completion_length": 234.80804920196533, "epoch": 2.358774466658284, "grad_norm": 0.08342651959363287, "kl": 0.23736572265625, "learning_rate": 4.928138597807461e-07, "loss": 0.0002, "reward": 1.7357143387198448, "reward_std": 0.012626906856894493, "rewards/equation_reward_func": 0.7437500357627869, "rewards/format_reward_func": 0.9919642880558968, "step": 14068 }, { "completion_length": 242.87947368621826, "epoch": 2.3591097698981516, "grad_norm": 0.14099043974641465, "kl": 1.082427978515625, "learning_rate": 4.928110040356272e-07, "loss": 0.0011, "reward": 1.7553572058677673, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7598214484751225, "rewards/format_reward_func": 0.9955357164144516, "step": 14070 }, { "completion_length": 228.77233123779297, "epoch": 2.359445073138019, "grad_norm": 0.17428381709094845, "kl": 0.250213623046875, "learning_rate": 4.928081477314674e-07, "loss": 0.0003, "reward": 1.805357202887535, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.8098214529454708, "rewards/format_reward_func": 0.9955357164144516, "step": 14072 }, { "completion_length": 223.40179538726807, "epoch": 2.359780376377887, "grad_norm": 0.0054469211535804855, "kl": 0.17437744140625, "learning_rate": 4.928052908682731e-07, "loss": 0.0002, "reward": 1.783928595483303, "reward_std": 0.012626906856894493, "rewards/equation_reward_func": 0.7883929014205933, "rewards/format_reward_func": 0.9955357164144516, "step": 14074 }, { "completion_length": 228.59822273254395, "epoch": 2.3601156796177545, "grad_norm": 0.1825644958873576, "kl": 0.489898681640625, "learning_rate": 4.928024334460513e-07, "loss": 0.0005, "reward": 1.7553572058677673, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7598214671015739, "rewards/format_reward_func": 0.9955357164144516, "step": 14076 }, { "completion_length": 227.62947273254395, "epoch": 2.3604509828576217, "grad_norm": 0.28957667120115893, "kl": 0.1666259765625, "learning_rate": 4.92799575464808e-07, "loss": 0.0002, "reward": 1.825000062584877, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8250000216066837, "rewards/format_reward_func": 1.0, "step": 14078 }, { "completion_length": 228.77233123779297, "epoch": 2.3607862860974893, "grad_norm": 0.15762237281826855, "kl": 0.22186279296875, "learning_rate": 4.9279671692455e-07, "loss": 0.0002, "reward": 1.7964286133646965, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7964286003261805, "rewards/format_reward_func": 1.0, "step": 14080 }, { "completion_length": 231.45536708831787, "epoch": 2.361121589337357, "grad_norm": 0.1459108784338708, "kl": 0.273040771484375, "learning_rate": 4.927938578252841e-07, "loss": 0.0003, "reward": 1.7892857939004898, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 14082 }, { "completion_length": 233.0803689956665, "epoch": 2.3614568925772246, "grad_norm": 0.35305294761911016, "kl": 0.149017333984375, "learning_rate": 4.927909981670166e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857413738966, "rewards/format_reward_func": 1.0, "step": 14084 }, { "completion_length": 242.602689743042, "epoch": 2.361792195817092, "grad_norm": 0.13617110402336785, "kl": 0.170318603515625, "learning_rate": 4.927881379497544e-07, "loss": 0.0002, "reward": 1.7357143685221672, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 14086 }, { "completion_length": 237.3884038925171, "epoch": 2.3621274990569594, "grad_norm": 0.135181646646026, "kl": 0.1388092041015625, "learning_rate": 4.927852771735037e-07, "loss": 0.0001, "reward": 1.775000050663948, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 14088 }, { "completion_length": 227.25000953674316, "epoch": 2.362462802296827, "grad_norm": 0.17809936062049062, "kl": 0.260589599609375, "learning_rate": 4.927824158382713e-07, "loss": 0.0003, "reward": 1.8053572103381157, "reward_std": 0.0328299580141902, "rewards/equation_reward_func": 0.8098214454948902, "rewards/format_reward_func": 0.9955357164144516, "step": 14090 }, { "completion_length": 231.6562614440918, "epoch": 2.3627981055366947, "grad_norm": 0.2369996115166829, "kl": 0.130584716796875, "learning_rate": 4.927795539440638e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 14092 }, { "completion_length": 243.64286613464355, "epoch": 2.3631334087765623, "grad_norm": 0.15600298133011206, "kl": 0.2369384765625, "learning_rate": 4.927766914908878e-07, "loss": 0.0002, "reward": 1.7928571850061417, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 1.0, "step": 14094 }, { "completion_length": 229.9241189956665, "epoch": 2.36346871201643, "grad_norm": 0.2560262655058894, "kl": 0.256927490234375, "learning_rate": 4.927738284787497e-07, "loss": 0.0003, "reward": 1.7446429207921028, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7491071689873934, "rewards/format_reward_func": 0.9955357164144516, "step": 14096 }, { "completion_length": 238.24108409881592, "epoch": 2.3638040152562976, "grad_norm": 0.13041247601855738, "kl": 0.255828857421875, "learning_rate": 4.927709649076564e-07, "loss": 0.0003, "reward": 1.8035714849829674, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714514553547, "rewards/format_reward_func": 1.0, "step": 14098 }, { "completion_length": 244.14286994934082, "epoch": 2.364139318496165, "grad_norm": 0.18896033080219826, "kl": 0.119232177734375, "learning_rate": 4.927681007776142e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000223517418, "rewards/format_reward_func": 1.0, "step": 14100 }, { "completion_length": 230.97322463989258, "epoch": 2.3644746217360324, "grad_norm": 0.2740426705203649, "kl": 0.307281494140625, "learning_rate": 4.927652360886299e-07, "loss": 0.0003, "reward": 1.7375000566244125, "reward_std": 0.017677669413387775, "rewards/equation_reward_func": 0.74196432903409, "rewards/format_reward_func": 0.9955357164144516, "step": 14102 }, { "completion_length": 243.58036994934082, "epoch": 2.3648099249759, "grad_norm": 0.28152166534803, "kl": 0.393951416015625, "learning_rate": 4.927623708407099e-07, "loss": 0.0004, "reward": 1.7339286357164383, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.738392885774374, "rewards/format_reward_func": 0.9955357164144516, "step": 14104 }, { "completion_length": 231.9553689956665, "epoch": 2.3651452282157677, "grad_norm": 0.1982005460353846, "kl": 0.279541015625, "learning_rate": 4.92759505033861e-07, "loss": 0.0003, "reward": 1.750000074505806, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7500000186264515, "rewards/format_reward_func": 1.0, "step": 14106 }, { "completion_length": 236.05358123779297, "epoch": 2.3654805314556353, "grad_norm": 0.3090752866343735, "kl": 0.13604736328125, "learning_rate": 4.927566386680897e-07, "loss": 0.0001, "reward": 1.739285796880722, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7482143118977547, "rewards/format_reward_func": 0.9910714328289032, "step": 14108 }, { "completion_length": 236.45090579986572, "epoch": 2.365815834695503, "grad_norm": 0.3107913359321936, "kl": 0.2520751953125, "learning_rate": 4.927537717434027e-07, "loss": 0.0003, "reward": 1.7785714864730835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714529454708, "rewards/format_reward_func": 1.0, "step": 14110 }, { "completion_length": 236.11161994934082, "epoch": 2.36615113793537, "grad_norm": 0.29293999141656574, "kl": 0.412841796875, "learning_rate": 4.927509042598064e-07, "loss": 0.0004, "reward": 1.7250000834465027, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7250000201165676, "rewards/format_reward_func": 1.0, "step": 14112 }, { "completion_length": 225.22768783569336, "epoch": 2.3664864411752378, "grad_norm": 0.37355280432683863, "kl": 0.17962646484375, "learning_rate": 4.927480362173075e-07, "loss": 0.0002, "reward": 1.8000000715255737, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000230967999, "rewards/format_reward_func": 1.0, "step": 14114 }, { "completion_length": 237.2276906967163, "epoch": 2.3668217444151054, "grad_norm": 0.0930373437809911, "kl": 0.1688232421875, "learning_rate": 4.927451676159127e-07, "loss": 0.0002, "reward": 1.785714328289032, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 14116 }, { "completion_length": 238.4107255935669, "epoch": 2.367157047654973, "grad_norm": 0.17907423202000464, "kl": 0.161224365234375, "learning_rate": 4.927422984556284e-07, "loss": 0.0002, "reward": 1.785714328289032, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.7946428991854191, "rewards/format_reward_func": 0.9910714328289032, "step": 14118 }, { "completion_length": 230.6384038925171, "epoch": 2.3674923508948407, "grad_norm": 0.21644035641179135, "kl": 0.115264892578125, "learning_rate": 4.927394287364614e-07, "loss": 0.0001, "reward": 1.7892857640981674, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857491970062, "rewards/format_reward_func": 1.0, "step": 14120 }, { "completion_length": 230.62947463989258, "epoch": 2.3678276541347083, "grad_norm": 0.1731059367453474, "kl": 0.129791259765625, "learning_rate": 4.927365584584184e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.764285746961832, "rewards/format_reward_func": 1.0, "step": 14122 }, { "completion_length": 239.2678680419922, "epoch": 2.3681629573745755, "grad_norm": 0.1786554966406942, "kl": 0.126251220703125, "learning_rate": 4.927336876215055e-07, "loss": 0.0001, "reward": 1.7767857760190964, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7812500149011612, "rewards/format_reward_func": 0.9955357164144516, "step": 14124 }, { "completion_length": 235.62500858306885, "epoch": 2.368498260614443, "grad_norm": 0.15925308523575013, "kl": 0.12420654296875, "learning_rate": 4.927308162257299e-07, "loss": 0.0001, "reward": 1.7678571790456772, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571827709675, "rewards/format_reward_func": 1.0, "step": 14126 }, { "completion_length": 230.16965293884277, "epoch": 2.3688335638543108, "grad_norm": 0.08282613935284787, "kl": 0.15203857421875, "learning_rate": 4.927279442710979e-07, "loss": 0.0002, "reward": 1.7892857789993286, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7892857417464256, "rewards/format_reward_func": 1.0, "step": 14128 }, { "completion_length": 235.19197368621826, "epoch": 2.3691688670941784, "grad_norm": 0.2480821388756342, "kl": 0.123291015625, "learning_rate": 4.927250717576161e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 14130 }, { "completion_length": 226.73661708831787, "epoch": 2.369504170334046, "grad_norm": 0.17368345542346247, "kl": 0.1526947021484375, "learning_rate": 4.927221986852913e-07, "loss": 0.0002, "reward": 1.7357143387198448, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143238186836, "rewards/format_reward_func": 1.0, "step": 14132 }, { "completion_length": 239.01340293884277, "epoch": 2.3698394735739132, "grad_norm": 0.20367427730227183, "kl": 0.244476318359375, "learning_rate": 4.927193250541299e-07, "loss": 0.0002, "reward": 1.7482143491506577, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7526785992085934, "rewards/format_reward_func": 0.9955357164144516, "step": 14134 }, { "completion_length": 237.95536613464355, "epoch": 2.370174776813781, "grad_norm": 0.0815353276461526, "kl": 0.142181396484375, "learning_rate": 4.927164508641388e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000447034836, "rewards/format_reward_func": 1.0, "step": 14136 }, { "completion_length": 229.46429538726807, "epoch": 2.3705100800536485, "grad_norm": 0.10713861064394091, "kl": 0.175323486328125, "learning_rate": 4.927135761153242e-07, "loss": 0.0002, "reward": 1.7767857909202576, "reward_std": 0.0328299580141902, "rewards/equation_reward_func": 0.7812500260770321, "rewards/format_reward_func": 0.9955357164144516, "step": 14138 }, { "completion_length": 225.665189743042, "epoch": 2.370845383293516, "grad_norm": 0.17673572626422573, "kl": 0.131622314453125, "learning_rate": 4.927107008076932e-07, "loss": 0.0001, "reward": 1.7160715013742447, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7205357495695353, "rewards/format_reward_func": 0.9955357164144516, "step": 14140 }, { "completion_length": 227.540189743042, "epoch": 2.3711806865333838, "grad_norm": 0.33349945659697183, "kl": 0.13836669921875, "learning_rate": 4.92707824941252e-07, "loss": 0.0001, "reward": 1.7839286476373672, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 14142 }, { "completion_length": 220.33929347991943, "epoch": 2.3715159897732514, "grad_norm": 0.1168489542544372, "kl": 0.1265869140625, "learning_rate": 4.927049485160075e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 14144 }, { "completion_length": 238.7634048461914, "epoch": 2.3718512930131186, "grad_norm": 0.1929672755226574, "kl": 0.138336181640625, "learning_rate": 4.927020715319662e-07, "loss": 0.0001, "reward": 1.8267857506871223, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8312500268220901, "rewards/format_reward_func": 0.9955357164144516, "step": 14146 }, { "completion_length": 226.50893783569336, "epoch": 2.3721865962529862, "grad_norm": 0.22830195961372327, "kl": 0.1256103515625, "learning_rate": 4.926991939891346e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428962051868, "rewards/format_reward_func": 1.0, "step": 14148 }, { "completion_length": 223.10268878936768, "epoch": 2.372521899492854, "grad_norm": 0.1226224587835622, "kl": 0.162078857421875, "learning_rate": 4.926963158875197e-07, "loss": 0.0002, "reward": 1.8214286044239998, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8214286081492901, "rewards/format_reward_func": 1.0, "step": 14150 }, { "completion_length": 239.80358123779297, "epoch": 2.3728572027327215, "grad_norm": 0.13037067304308758, "kl": 0.135223388671875, "learning_rate": 4.926934372271277e-07, "loss": 0.0001, "reward": 1.7625000849366188, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7669643275439739, "rewards/format_reward_func": 0.9955357164144516, "step": 14152 }, { "completion_length": 234.99554538726807, "epoch": 2.373192505972589, "grad_norm": 0.2723590433917922, "kl": 0.140380859375, "learning_rate": 4.926905580079655e-07, "loss": 0.0001, "reward": 1.7892857789993286, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7892857305705547, "rewards/format_reward_func": 1.0, "step": 14154 }, { "completion_length": 249.51786613464355, "epoch": 2.3735278092124563, "grad_norm": 0.2768983996081548, "kl": 0.42657470703125, "learning_rate": 4.926876782300397e-07, "loss": 0.0004, "reward": 1.76071435213089, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7696428894996643, "rewards/format_reward_func": 0.9910714328289032, "step": 14156 }, { "completion_length": 228.89733028411865, "epoch": 2.373863112452324, "grad_norm": 0.2700517762375239, "kl": 0.125091552734375, "learning_rate": 4.926847978933568e-07, "loss": 0.0001, "reward": 1.760714367032051, "reward_std": 0.04545686487108469, "rewards/equation_reward_func": 0.7696428839117289, "rewards/format_reward_func": 0.9910714328289032, "step": 14158 }, { "completion_length": 228.62947463989258, "epoch": 2.3741984156921916, "grad_norm": 0.16048736519894619, "kl": 0.1554718017578125, "learning_rate": 4.926819169979235e-07, "loss": 0.0002, "reward": 1.7428572252392769, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571674972773, "rewards/format_reward_func": 1.0, "step": 14160 }, { "completion_length": 227.23215293884277, "epoch": 2.3745337189320592, "grad_norm": 0.18227184233243968, "kl": 0.13006591796875, "learning_rate": 4.926790355437466e-07, "loss": 0.0001, "reward": 1.7125000655651093, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7169643193483353, "rewards/format_reward_func": 0.9955357164144516, "step": 14162 }, { "completion_length": 241.8660831451416, "epoch": 2.374869022171927, "grad_norm": 0.289783282635767, "kl": 0.160186767578125, "learning_rate": 4.926761535308324e-07, "loss": 0.0002, "reward": 1.844642885029316, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.849107176065445, "rewards/format_reward_func": 0.9955357164144516, "step": 14164 }, { "completion_length": 232.65179443359375, "epoch": 2.3752043254117945, "grad_norm": 0.1962872977790209, "kl": 0.124542236328125, "learning_rate": 4.926732709591878e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 14166 }, { "completion_length": 240.821439743042, "epoch": 2.3755396286516617, "grad_norm": 0.22684733865237583, "kl": 0.1270751953125, "learning_rate": 4.926703878288194e-07, "loss": 0.0001, "reward": 1.712500087916851, "reward_std": 0.07323605846613646, "rewards/equation_reward_func": 0.716964315623045, "rewards/format_reward_func": 0.9955357164144516, "step": 14168 }, { "completion_length": 239.41518878936768, "epoch": 2.3758749318915293, "grad_norm": 0.21397832178290516, "kl": 0.120819091796875, "learning_rate": 4.926675041397338e-07, "loss": 0.0001, "reward": 1.7339286282658577, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7383928783237934, "rewards/format_reward_func": 0.9955357164144516, "step": 14170 }, { "completion_length": 232.6428689956665, "epoch": 2.376210235131397, "grad_norm": 0.18256904282426623, "kl": 0.129730224609375, "learning_rate": 4.926646198919375e-07, "loss": 0.0001, "reward": 1.7821428999304771, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428887546062, "rewards/format_reward_func": 1.0, "step": 14172 }, { "completion_length": 226.29911613464355, "epoch": 2.3765455383712646, "grad_norm": 0.2511379796899202, "kl": 0.1105499267578125, "learning_rate": 4.926617350854373e-07, "loss": 0.0001, "reward": 1.728571504354477, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7285714596509933, "rewards/format_reward_func": 1.0, "step": 14174 }, { "completion_length": 234.5357265472412, "epoch": 2.3768808416111322, "grad_norm": 0.11663380199458673, "kl": 0.11688232421875, "learning_rate": 4.926588497202399e-07, "loss": 0.0001, "reward": 1.7821428924798965, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 14176 }, { "completion_length": 232.5759048461914, "epoch": 2.3772161448509994, "grad_norm": 0.14246938662770156, "kl": 0.097503662109375, "learning_rate": 4.926559637963518e-07, "loss": 0.0001, "reward": 1.733928643167019, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7383928932249546, "rewards/format_reward_func": 0.9955357164144516, "step": 14178 }, { "completion_length": 243.15179538726807, "epoch": 2.377551448090867, "grad_norm": 0.1807404842578879, "kl": 0.113311767578125, "learning_rate": 4.926530773137798e-07, "loss": 0.0001, "reward": 1.7464286610484123, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 14180 }, { "completion_length": 237.43750953674316, "epoch": 2.3778867513307347, "grad_norm": 0.2560356006133875, "kl": 0.116668701171875, "learning_rate": 4.926501902725303e-07, "loss": 0.0001, "reward": 1.8196429014205933, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8241071775555611, "rewards/format_reward_func": 0.9955357164144516, "step": 14182 }, { "completion_length": 238.04911708831787, "epoch": 2.3782220545706023, "grad_norm": 0.17475930712994994, "kl": 0.139495849609375, "learning_rate": 4.926473026726101e-07, "loss": 0.0001, "reward": 1.6910715028643608, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.6955357547849417, "rewards/format_reward_func": 0.9955357164144516, "step": 14184 }, { "completion_length": 245.30358123779297, "epoch": 2.37855735781047, "grad_norm": 0.17446587499780564, "kl": 0.1220703125, "learning_rate": 4.92644414514026e-07, "loss": 0.0001, "reward": 1.7517857775092125, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7562500275671482, "rewards/format_reward_func": 0.9955357164144516, "step": 14186 }, { "completion_length": 234.08036518096924, "epoch": 2.3788926610503376, "grad_norm": 0.27303907383285336, "kl": 0.19439697265625, "learning_rate": 4.926415257967843e-07, "loss": 0.0002, "reward": 1.7571429386734962, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 14188 }, { "completion_length": 253.4732255935669, "epoch": 2.3792279642902048, "grad_norm": 0.14374585376478768, "kl": 0.106475830078125, "learning_rate": 4.92638636520892e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8035714700818062, "rewards/format_reward_func": 1.0, "step": 14190 }, { "completion_length": 238.92411613464355, "epoch": 2.3795632675300724, "grad_norm": 0.09037984547445888, "kl": 0.112701416015625, "learning_rate": 4.926357466863556e-07, "loss": 0.0001, "reward": 1.7750000432133675, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7750000208616257, "rewards/format_reward_func": 1.0, "step": 14192 }, { "completion_length": 250.32143878936768, "epoch": 2.37989857076994, "grad_norm": 0.2056104027875523, "kl": 0.109954833984375, "learning_rate": 4.926328562931816e-07, "loss": 0.0001, "reward": 1.712500087916851, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7169643193483353, "rewards/format_reward_func": 0.9955357164144516, "step": 14194 }, { "completion_length": 254.5714406967163, "epoch": 2.3802338740098077, "grad_norm": 0.21454443708666152, "kl": 0.124755859375, "learning_rate": 4.92629965341377e-07, "loss": 0.0001, "reward": 1.800000049173832, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000230967999, "rewards/format_reward_func": 1.0, "step": 14196 }, { "completion_length": 245.61161708831787, "epoch": 2.3805691772496753, "grad_norm": 0.21721769675845912, "kl": 0.1088714599609375, "learning_rate": 4.92627073830948e-07, "loss": 0.0001, "reward": 1.796428643167019, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 14198 }, { "completion_length": 251.32590675354004, "epoch": 2.3809044804895425, "grad_norm": 0.14698313947537994, "kl": 0.118255615234375, "learning_rate": 4.926241817619017e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 14200 }, { "completion_length": 247.36161518096924, "epoch": 2.38123978372941, "grad_norm": 0.2984387094846763, "kl": 0.11102294921875, "learning_rate": 4.926212891342445e-07, "loss": 0.0001, "reward": 1.8160714954137802, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.820535734295845, "rewards/format_reward_func": 0.9955357164144516, "step": 14202 }, { "completion_length": 246.8259038925171, "epoch": 2.3815750869692778, "grad_norm": 0.19331540875940945, "kl": 0.115264892578125, "learning_rate": 4.926183959479833e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.753571467474103, "rewards/format_reward_func": 1.0, "step": 14204 }, { "completion_length": 245.6026906967163, "epoch": 2.3819103902091454, "grad_norm": 0.0032176377544802333, "kl": 0.115203857421875, "learning_rate": 4.926155022031244e-07, "loss": 0.0001, "reward": 1.7928571924567223, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 1.0, "step": 14206 }, { "completion_length": 240.30357933044434, "epoch": 2.382245693449013, "grad_norm": 0.22095254078246251, "kl": 0.111480712890625, "learning_rate": 4.926126078996747e-07, "loss": 0.0001, "reward": 1.8017857745289803, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8062500283122063, "rewards/format_reward_func": 0.9955357164144516, "step": 14208 }, { "completion_length": 252.48661613464355, "epoch": 2.3825809966888807, "grad_norm": 0.268309403775508, "kl": 0.125274658203125, "learning_rate": 4.926097130376409e-07, "loss": 0.0001, "reward": 1.7196429297327995, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7241071723401546, "rewards/format_reward_func": 0.9955357164144516, "step": 14210 }, { "completion_length": 244.227689743042, "epoch": 2.382916299928748, "grad_norm": 0.1499611496941687, "kl": 0.1090087890625, "learning_rate": 4.926068176170295e-07, "loss": 0.0001, "reward": 1.771428644657135, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714285962283611, "rewards/format_reward_func": 1.0, "step": 14212 }, { "completion_length": 250.77233123779297, "epoch": 2.3832516031686155, "grad_norm": 0.108530994446496, "kl": 0.17041015625, "learning_rate": 4.926039216378472e-07, "loss": 0.0002, "reward": 1.7696429267525673, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7741071619093418, "rewards/format_reward_func": 0.9955357164144516, "step": 14214 }, { "completion_length": 239.0535831451416, "epoch": 2.383586906408483, "grad_norm": 0.14041276773962685, "kl": 0.10272216796875, "learning_rate": 4.926010251001008e-07, "loss": 0.0001, "reward": 1.8321429044008255, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8321428745985031, "rewards/format_reward_func": 1.0, "step": 14216 }, { "completion_length": 237.2901906967163, "epoch": 2.3839222096483508, "grad_norm": 0.12151716109331748, "kl": 0.14166259765625, "learning_rate": 4.92598128003797e-07, "loss": 0.0001, "reward": 1.7196429297327995, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7241071742027998, "rewards/format_reward_func": 0.9955357164144516, "step": 14218 }, { "completion_length": 240.2455472946167, "epoch": 2.3842575128882184, "grad_norm": 0.16364630154217485, "kl": 0.126220703125, "learning_rate": 4.925952303489422e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7535714693367481, "rewards/format_reward_func": 1.0, "step": 14220 }, { "completion_length": 233.61608219146729, "epoch": 2.3845928161280856, "grad_norm": 0.17529658175337104, "kl": 0.133697509765625, "learning_rate": 4.925923321355433e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143092900515, "rewards/format_reward_func": 1.0, "step": 14222 }, { "completion_length": 241.4732255935669, "epoch": 2.3849281193679532, "grad_norm": 0.09252615676509375, "kl": 0.133056640625, "learning_rate": 4.925894333636069e-07, "loss": 0.0001, "reward": 1.7285714969038963, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7285714671015739, "rewards/format_reward_func": 1.0, "step": 14224 }, { "completion_length": 234.6294765472412, "epoch": 2.385263422607821, "grad_norm": 0.12747518818637713, "kl": 0.110015869140625, "learning_rate": 4.925865340331398e-07, "loss": 0.0001, "reward": 1.7214286550879478, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7214286122471094, "rewards/format_reward_func": 1.0, "step": 14226 }, { "completion_length": 242.12947463989258, "epoch": 2.3855987258476885, "grad_norm": 0.23217568953704362, "kl": 0.135894775390625, "learning_rate": 4.925836341441484e-07, "loss": 0.0001, "reward": 1.6642858162522316, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.6642857529222965, "rewards/format_reward_func": 1.0, "step": 14228 }, { "completion_length": 241.42858409881592, "epoch": 2.385934029087556, "grad_norm": 0.32283599449486605, "kl": 0.107147216796875, "learning_rate": 4.925807336966395e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 14230 }, { "completion_length": 241.62054824829102, "epoch": 2.3862693323274238, "grad_norm": 0.4361940214247699, "kl": 0.099517822265625, "learning_rate": 4.925778326906198e-07, "loss": 0.0001, "reward": 1.7821429371833801, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428757160902, "rewards/format_reward_func": 1.0, "step": 14232 }, { "completion_length": 241.95983409881592, "epoch": 2.3866046355672914, "grad_norm": 0.15672447960175115, "kl": 0.114654541015625, "learning_rate": 4.925749311260962e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 14234 }, { "completion_length": 247.38840293884277, "epoch": 2.3869399388071586, "grad_norm": 0.12357675091633724, "kl": 0.127593994140625, "learning_rate": 4.92572029003075e-07, "loss": 0.0001, "reward": 1.8392857760190964, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8392857313156128, "rewards/format_reward_func": 1.0, "step": 14236 }, { "completion_length": 238.7009048461914, "epoch": 2.3872752420470262, "grad_norm": 0.34950113473575367, "kl": 0.118927001953125, "learning_rate": 4.92569126321563e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143059372902, "rewards/format_reward_func": 1.0, "step": 14238 }, { "completion_length": 238.56250762939453, "epoch": 2.387610545286894, "grad_norm": 0.005908223008530719, "kl": 0.1171875, "learning_rate": 4.925662230815671e-07, "loss": 0.0001, "reward": 1.703571505844593, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7035714685916901, "rewards/format_reward_func": 1.0, "step": 14240 }, { "completion_length": 238.3705472946167, "epoch": 2.3879458485267615, "grad_norm": 0.23473823838715496, "kl": 0.10906982421875, "learning_rate": 4.925633192830937e-07, "loss": 0.0001, "reward": 1.7660714909434319, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7705357410013676, "rewards/format_reward_func": 0.9955357164144516, "step": 14242 }, { "completion_length": 234.7678680419922, "epoch": 2.3882811517666287, "grad_norm": 0.26786727999529003, "kl": 0.112030029296875, "learning_rate": 4.925604149261497e-07, "loss": 0.0001, "reward": 1.7464286461472511, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 14244 }, { "completion_length": 233.0223331451416, "epoch": 2.3886164550064963, "grad_norm": 0.18724417602482607, "kl": 0.10772705078125, "learning_rate": 4.925575100107415e-07, "loss": 0.0001, "reward": 1.7714286297559738, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714286185801029, "rewards/format_reward_func": 1.0, "step": 14246 }, { "completion_length": 240.4776906967163, "epoch": 2.388951758246364, "grad_norm": 0.1619747174711402, "kl": 0.1134033203125, "learning_rate": 4.925546045368762e-07, "loss": 0.0001, "reward": 1.850000038743019, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8500000201165676, "rewards/format_reward_func": 1.0, "step": 14248 }, { "completion_length": 235.22322463989258, "epoch": 2.3892870614862316, "grad_norm": 0.14819214172171727, "kl": 0.11566162109375, "learning_rate": 4.925516985045601e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 14250 }, { "completion_length": 238.0446538925171, "epoch": 2.3896223647260992, "grad_norm": 0.14841388129257704, "kl": 0.229644775390625, "learning_rate": 4.925487919138001e-07, "loss": 0.0002, "reward": 1.7928571924567223, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571850061417, "rewards/format_reward_func": 1.0, "step": 14252 }, { "completion_length": 227.77679634094238, "epoch": 2.389957667965967, "grad_norm": 0.20086124023025853, "kl": 0.114776611328125, "learning_rate": 4.925458847646029e-07, "loss": 0.0001, "reward": 1.8107143566012383, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.810714315623045, "rewards/format_reward_func": 1.0, "step": 14254 }, { "completion_length": 233.6250114440918, "epoch": 2.3902929712058345, "grad_norm": 0.2715224088342795, "kl": 0.108551025390625, "learning_rate": 4.925429770569751e-07, "loss": 0.0001, "reward": 1.7839286252856255, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.788392897695303, "rewards/format_reward_func": 0.9955357164144516, "step": 14256 }, { "completion_length": 227.0446538925171, "epoch": 2.3906282744457017, "grad_norm": 0.18961583525723497, "kl": 0.1087646484375, "learning_rate": 4.925400687909235e-07, "loss": 0.0001, "reward": 1.7357143610715866, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7357143294066191, "rewards/format_reward_func": 1.0, "step": 14258 }, { "completion_length": 240.30804443359375, "epoch": 2.3909635776855693, "grad_norm": 0.20396875238100765, "kl": 0.1153564453125, "learning_rate": 4.925371599664547e-07, "loss": 0.0001, "reward": 1.8125000298023224, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.816964328289032, "rewards/format_reward_func": 0.9955357164144516, "step": 14260 }, { "completion_length": 235.96429538726807, "epoch": 2.391298880925437, "grad_norm": 1.6884351687415977, "kl": 0.130615234375, "learning_rate": 4.925342505835754e-07, "loss": 0.0001, "reward": 1.8107143118977547, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.810714328661561, "rewards/format_reward_func": 1.0, "step": 14262 }, { "completion_length": 245.05804538726807, "epoch": 2.3916341841653046, "grad_norm": 0.1875430097990074, "kl": 0.109161376953125, "learning_rate": 4.925313406422924e-07, "loss": 0.0001, "reward": 1.7642858028411865, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857413738966, "rewards/format_reward_func": 1.0, "step": 14264 }, { "completion_length": 240.56250953674316, "epoch": 2.3919694874051722, "grad_norm": 0.17942496787693438, "kl": 0.10748291015625, "learning_rate": 4.925284301426122e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7785714454948902, "rewards/format_reward_func": 1.0, "step": 14266 }, { "completion_length": 237.4821538925171, "epoch": 2.3923047906450394, "grad_norm": 0.09619269901935326, "kl": 0.11822509765625, "learning_rate": 4.925255190845418e-07, "loss": 0.0001, "reward": 1.7892857491970062, "reward_std": 0.005050762556493282, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 14268 }, { "completion_length": 235.8928680419922, "epoch": 2.392640093884907, "grad_norm": 0.11474742306725783, "kl": 0.108551025390625, "learning_rate": 4.925226074680876e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7500000223517418, "rewards/format_reward_func": 1.0, "step": 14270 }, { "completion_length": 247.57143878936768, "epoch": 2.3929753971247747, "grad_norm": 0.15741634440553448, "kl": 0.1328125, "learning_rate": 4.925196952932565e-07, "loss": 0.0001, "reward": 1.792857214808464, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 14272 }, { "completion_length": 251.08929920196533, "epoch": 2.3933107003646423, "grad_norm": 0.10766437431018219, "kl": 0.12640380859375, "learning_rate": 4.925167825600552e-07, "loss": 0.0001, "reward": 1.7750000432133675, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.775000037625432, "rewards/format_reward_func": 1.0, "step": 14274 }, { "completion_length": 257.86609077453613, "epoch": 2.39364600360451, "grad_norm": 0.16834165502722323, "kl": 0.108551025390625, "learning_rate": 4.925138692684902e-07, "loss": 0.0001, "reward": 1.8214286267757416, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8214286006987095, "rewards/format_reward_func": 1.0, "step": 14276 }, { "completion_length": 247.4509048461914, "epoch": 2.3939813068443776, "grad_norm": 0.17944890412209027, "kl": 0.102783203125, "learning_rate": 4.925109554185685e-07, "loss": 0.0001, "reward": 1.801785759627819, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.8062500283122063, "rewards/format_reward_func": 0.9955357164144516, "step": 14278 }, { "completion_length": 251.76340198516846, "epoch": 2.394316610084245, "grad_norm": 0.11806261551808396, "kl": 0.233489990234375, "learning_rate": 4.925080410102965e-07, "loss": 0.0002, "reward": 1.6642857864499092, "reward_std": 0.03030457627028227, "rewards/equation_reward_func": 0.6732143275439739, "rewards/format_reward_func": 0.9910714328289032, "step": 14280 }, { "completion_length": 256.3169755935669, "epoch": 2.3946519133241124, "grad_norm": 0.282034937595845, "kl": 0.12908935546875, "learning_rate": 4.925051260436812e-07, "loss": 0.0001, "reward": 1.7750000581145287, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000283122063, "rewards/format_reward_func": 1.0, "step": 14282 }, { "completion_length": 252.32144260406494, "epoch": 2.39498721656398, "grad_norm": 0.17036559088687567, "kl": 0.13946533203125, "learning_rate": 4.925022105187291e-07, "loss": 0.0001, "reward": 1.778571479022503, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714790225029, "rewards/format_reward_func": 1.0, "step": 14284 }, { "completion_length": 258.7276887893677, "epoch": 2.3953225198038477, "grad_norm": 0.1503807619374923, "kl": 0.144378662109375, "learning_rate": 4.924992944354472e-07, "loss": 0.0001, "reward": 1.7892857640981674, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 14286 }, { "completion_length": 244.72768783569336, "epoch": 2.3956578230437153, "grad_norm": 0.21012577005227273, "kl": 0.1162109375, "learning_rate": 4.924963777938417e-07, "loss": 0.0001, "reward": 1.8000000715255737, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 14288 }, { "completion_length": 248.89733219146729, "epoch": 2.3959931262835825, "grad_norm": 0.3899478536470825, "kl": 0.423553466796875, "learning_rate": 4.924934605939199e-07, "loss": 0.0004, "reward": 1.7821428924798965, "reward_std": 0.045456865802407265, "rewards/equation_reward_func": 0.7910714633762836, "rewards/format_reward_func": 0.9910714328289032, "step": 14290 }, { "completion_length": 246.04018688201904, "epoch": 2.39632842952345, "grad_norm": 0.13040493195453964, "kl": 0.135955810546875, "learning_rate": 4.924905428356882e-07, "loss": 0.0001, "reward": 1.7732143476605415, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7776786088943481, "rewards/format_reward_func": 0.9955357164144516, "step": 14292 }, { "completion_length": 256.4910840988159, "epoch": 2.396663732763318, "grad_norm": 0.22800534481410561, "kl": 0.128997802734375, "learning_rate": 4.924876245191532e-07, "loss": 0.0001, "reward": 1.7839286178350449, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7883928902447224, "rewards/format_reward_func": 0.9955357164144516, "step": 14294 }, { "completion_length": 260.1651887893677, "epoch": 2.3969990360031854, "grad_norm": 0.25404098138978654, "kl": 0.384521484375, "learning_rate": 4.924847056443219e-07, "loss": 0.0004, "reward": 1.8250000774860382, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8250000216066837, "rewards/format_reward_func": 1.0, "step": 14296 }, { "completion_length": 250.84376335144043, "epoch": 2.397334339243053, "grad_norm": 0.2945103278601156, "kl": 0.115875244140625, "learning_rate": 4.924817862112008e-07, "loss": 0.0001, "reward": 1.751785784959793, "reward_std": 0.05808377172797918, "rewards/equation_reward_func": 0.7651786096394062, "rewards/format_reward_func": 0.9866071492433548, "step": 14298 }, { "completion_length": 262.4776916503906, "epoch": 2.3976696424829207, "grad_norm": 0.13702960543474418, "kl": 0.113006591796875, "learning_rate": 4.924788662197968e-07, "loss": 0.0001, "reward": 1.7053572237491608, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7098214775323868, "rewards/format_reward_func": 0.9955357164144516, "step": 14300 }, { "completion_length": 260.9151906967163, "epoch": 2.398004945722788, "grad_norm": 0.16906523532274226, "kl": 0.54217529296875, "learning_rate": 4.924759456701166e-07, "loss": 0.0005, "reward": 1.7392857745289803, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7482143081724644, "rewards/format_reward_func": 0.9910714328289032, "step": 14302 }, { "completion_length": 258.808048248291, "epoch": 2.3983402489626555, "grad_norm": 0.37236756985967784, "kl": 0.363616943359375, "learning_rate": 4.924730245621669e-07, "loss": 0.0004, "reward": 1.8000000417232513, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8000000342726707, "rewards/format_reward_func": 1.0, "step": 14304 }, { "completion_length": 255.62054538726807, "epoch": 2.398675552202523, "grad_norm": 0.0919150041745982, "kl": 0.917205810546875, "learning_rate": 4.924701028959543e-07, "loss": 0.0009, "reward": 1.7750000432133675, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7750000394880772, "rewards/format_reward_func": 1.0, "step": 14306 }, { "completion_length": 247.10269165039062, "epoch": 2.399010855442391, "grad_norm": 0.20273143904873656, "kl": 0.12310791015625, "learning_rate": 4.924671806714856e-07, "loss": 0.0001, "reward": 1.7589286342263222, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7633928917348385, "rewards/format_reward_func": 0.9955357164144516, "step": 14308 }, { "completion_length": 246.0580472946167, "epoch": 2.3993461586822584, "grad_norm": 0.19527974290438713, "kl": 0.1595458984375, "learning_rate": 4.924642578887675e-07, "loss": 0.0002, "reward": 1.8339286148548126, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8383928760886192, "rewards/format_reward_func": 0.9955357164144516, "step": 14310 }, { "completion_length": 260.1919746398926, "epoch": 2.3996814619221256, "grad_norm": 0.4782948238589906, "kl": 0.147247314453125, "learning_rate": 4.924613345478069e-07, "loss": 0.0001, "reward": 1.6857143640518188, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.6857143305242062, "rewards/format_reward_func": 1.0, "step": 14312 }, { "completion_length": 261.4419775009155, "epoch": 2.4000167651619932, "grad_norm": 0.5121912858089605, "kl": 2.307708740234375, "learning_rate": 4.924584106486105e-07, "loss": 0.0023, "reward": 1.7446429207921028, "reward_std": 0.06818529684096575, "rewards/equation_reward_func": 0.7580357529222965, "rewards/format_reward_func": 0.9866071492433548, "step": 14314 }, { "completion_length": 245.25447463989258, "epoch": 2.400352068401861, "grad_norm": 0.18907697095639242, "kl": 0.214569091796875, "learning_rate": 4.924554861911847e-07, "loss": 0.0002, "reward": 1.7571429312229156, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428902447224, "rewards/format_reward_func": 1.0, "step": 14316 }, { "completion_length": 242.28125953674316, "epoch": 2.4006873716417285, "grad_norm": 0.24829432582435484, "kl": 1.02117919921875, "learning_rate": 4.924525611755366e-07, "loss": 0.001, "reward": 1.8035714849829674, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.8125000260770321, "rewards/format_reward_func": 0.9910714328289032, "step": 14318 }, { "completion_length": 246.0178689956665, "epoch": 2.401022674881596, "grad_norm": 0.5520300313907123, "kl": 0.346954345703125, "learning_rate": 4.924496356016729e-07, "loss": 0.0003, "reward": 1.7267857864499092, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7401785887777805, "rewards/format_reward_func": 0.9866071492433548, "step": 14320 }, { "completion_length": 255.9821538925171, "epoch": 2.401357978121464, "grad_norm": 0.23108229551636367, "kl": 0.32366943359375, "learning_rate": 4.924467094696001e-07, "loss": 0.0003, "reward": 1.728571504354477, "reward_std": 0.06060915347188711, "rewards/equation_reward_func": 0.7375000342726707, "rewards/format_reward_func": 0.9910714328289032, "step": 14322 }, { "completion_length": 258.02679538726807, "epoch": 2.401693281361331, "grad_norm": 0.5556203153502393, "kl": 1.0614013671875, "learning_rate": 4.924437827793251e-07, "loss": 0.0011, "reward": 1.705357201397419, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.718750037252903, "rewards/format_reward_func": 0.9866071492433548, "step": 14324 }, { "completion_length": 250.02233505249023, "epoch": 2.4020285846011986, "grad_norm": 0.33785567885457496, "kl": 0.931396484375, "learning_rate": 4.924408555308547e-07, "loss": 0.0009, "reward": 1.7732143327593803, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7866071797907352, "rewards/format_reward_func": 0.9866071492433548, "step": 14326 }, { "completion_length": 257.00894260406494, "epoch": 2.4023638878410662, "grad_norm": 0.5425677862198246, "kl": 3.427093505859375, "learning_rate": 4.924379277241955e-07, "loss": 0.0034, "reward": 1.7357143685221672, "reward_std": 0.0505076264962554, "rewards/equation_reward_func": 0.7446428909897804, "rewards/format_reward_func": 0.9910714328289032, "step": 14328 }, { "completion_length": 241.6562614440918, "epoch": 2.402699191080934, "grad_norm": 0.08042267566696397, "kl": 0.248321533203125, "learning_rate": 4.924349993593543e-07, "loss": 0.0002, "reward": 1.7821428924798965, "reward_std": 0.03535533882677555, "rewards/equation_reward_func": 0.7910714633762836, "rewards/format_reward_func": 0.9910714328289032, "step": 14330 }, { "completion_length": 250.99108409881592, "epoch": 2.4030344943208015, "grad_norm": 0.23087088388512614, "kl": 0.36553955078125, "learning_rate": 4.924320704363378e-07, "loss": 0.0004, "reward": 1.7357143461704254, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 0.9821428656578064, "step": 14332 }, { "completion_length": 253.00447750091553, "epoch": 2.4033697975606687, "grad_norm": 0.32966255660926874, "kl": 0.310394287109375, "learning_rate": 4.924291409551529e-07, "loss": 0.0003, "reward": 1.717857226729393, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7267857417464256, "rewards/format_reward_func": 0.9910714328289032, "step": 14334 }, { "completion_length": 240.29465103149414, "epoch": 2.4037051008005363, "grad_norm": 0.23779095458913324, "kl": 0.32281494140625, "learning_rate": 4.924262109158061e-07, "loss": 0.0003, "reward": 1.7839286252856255, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7973214611411095, "rewards/format_reward_func": 0.9866071492433548, "step": 14336 }, { "completion_length": 241.95536994934082, "epoch": 2.404040404040404, "grad_norm": 0.1600356620901426, "kl": 0.13165283203125, "learning_rate": 4.924232803183044e-07, "loss": 0.0001, "reward": 1.762500062584877, "reward_std": 0.0328299580141902, "rewards/equation_reward_func": 0.7669643200933933, "rewards/format_reward_func": 0.9955357164144516, "step": 14338 }, { "completion_length": 250.74108219146729, "epoch": 2.4043757072802716, "grad_norm": 0.24163101976938872, "kl": 0.1605224609375, "learning_rate": 4.924203491626544e-07, "loss": 0.0002, "reward": 1.7589286267757416, "reward_std": 0.07828682102262974, "rewards/equation_reward_func": 0.7723214514553547, "rewards/format_reward_func": 0.9866071492433548, "step": 14340 }, { "completion_length": 239.1517972946167, "epoch": 2.4047110105201392, "grad_norm": 0.2275927010975339, "kl": 0.12322998046875, "learning_rate": 4.924174174488628e-07, "loss": 0.0001, "reward": 1.7982143387198448, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8026785999536514, "rewards/format_reward_func": 0.9955357164144516, "step": 14342 }, { "completion_length": 230.55804538726807, "epoch": 2.405046313760007, "grad_norm": 0.18734378399333992, "kl": 0.105255126953125, "learning_rate": 4.924144851769365e-07, "loss": 0.0001, "reward": 1.7732143551111221, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7776786051690578, "rewards/format_reward_func": 0.9955357164144516, "step": 14344 }, { "completion_length": 238.5267972946167, "epoch": 2.405381616999874, "grad_norm": 0.18580847734440029, "kl": 0.16204833984375, "learning_rate": 4.924115523468821e-07, "loss": 0.0002, "reward": 1.716071493923664, "reward_std": 0.05808377172797918, "rewards/equation_reward_func": 0.7294643204659224, "rewards/format_reward_func": 0.9866071492433548, "step": 14346 }, { "completion_length": 231.34375953674316, "epoch": 2.4057169202397417, "grad_norm": 0.19578708489691765, "kl": 0.121368408203125, "learning_rate": 4.924086189587065e-07, "loss": 0.0001, "reward": 1.7910714969038963, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7955357283353806, "rewards/format_reward_func": 0.9955357164144516, "step": 14348 }, { "completion_length": 231.26786994934082, "epoch": 2.4060522234796093, "grad_norm": 0.19892274262116755, "kl": 0.140625, "learning_rate": 4.924056850124164e-07, "loss": 0.0001, "reward": 1.719642922282219, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7241071686148643, "rewards/format_reward_func": 0.9955357164144516, "step": 14350 }, { "completion_length": 241.85715579986572, "epoch": 2.406387526719477, "grad_norm": 0.1743829897833669, "kl": 0.119781494140625, "learning_rate": 4.924027505080185e-07, "loss": 0.0001, "reward": 1.7857143357396126, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143245637417, "rewards/format_reward_func": 1.0, "step": 14352 }, { "completion_length": 239.40626049041748, "epoch": 2.4067228299593446, "grad_norm": 0.39051117823511666, "kl": 0.1546630859375, "learning_rate": 4.923998154455195e-07, "loss": 0.0002, "reward": 1.7732143104076385, "reward_std": 0.06818529777228832, "rewards/equation_reward_func": 0.7955357357859612, "rewards/format_reward_func": 0.977678582072258, "step": 14354 }, { "completion_length": 240.5580472946167, "epoch": 2.407058133199212, "grad_norm": 0.003754627012270124, "kl": 0.13525390625, "learning_rate": 4.923968798249264e-07, "loss": 0.0001, "reward": 1.7821429073810577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 14356 }, { "completion_length": 234.4821538925171, "epoch": 2.4073934364390794, "grad_norm": 0.1461006787847886, "kl": 0.112762451171875, "learning_rate": 4.923939436462457e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 14358 }, { "completion_length": 248.38393878936768, "epoch": 2.407728739678947, "grad_norm": 0.24628964012318919, "kl": 0.124847412109375, "learning_rate": 4.923910069094843e-07, "loss": 0.0001, "reward": 1.7504465207457542, "reward_std": 0.04987628059461713, "rewards/equation_reward_func": 0.7562500201165676, "rewards/format_reward_func": 0.9941964335739613, "step": 14360 }, { "completion_length": 235.34376049041748, "epoch": 2.4080640429188147, "grad_norm": 0.2245625890808432, "kl": 0.107025146484375, "learning_rate": 4.92388069614649e-07, "loss": 0.0001, "reward": 1.7107143700122833, "reward_std": 0.05555838905274868, "rewards/equation_reward_func": 0.7196428887546062, "rewards/format_reward_func": 0.9910714328289032, "step": 14362 }, { "completion_length": 249.2187623977661, "epoch": 2.4083993461586823, "grad_norm": 0.17260432031337294, "kl": 0.115936279296875, "learning_rate": 4.923851317617464e-07, "loss": 0.0001, "reward": 1.751785784959793, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7562500350177288, "rewards/format_reward_func": 0.9955357164144516, "step": 14364 }, { "completion_length": 244.25447463989258, "epoch": 2.40873464939855, "grad_norm": 0.2208763728434164, "kl": 0.113006591796875, "learning_rate": 4.923821933507834e-07, "loss": 0.0001, "reward": 1.796428643167019, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.796428594738245, "rewards/format_reward_func": 1.0, "step": 14366 }, { "completion_length": 245.88394165039062, "epoch": 2.4090699526384176, "grad_norm": 0.23033703151636636, "kl": 0.10723876953125, "learning_rate": 4.923792543817667e-07, "loss": 0.0001, "reward": 1.7785715013742447, "reward_std": 0.04040610138326883, "rewards/equation_reward_func": 0.7875000312924385, "rewards/format_reward_func": 0.9910714328289032, "step": 14368 }, { "completion_length": 236.25894165039062, "epoch": 2.409405255878285, "grad_norm": 0.2851470594450002, "kl": 0.143096923828125, "learning_rate": 4.923763148547031e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 14370 }, { "completion_length": 235.01786708831787, "epoch": 2.4097405591181524, "grad_norm": 0.1443877591575158, "kl": 0.10845947265625, "learning_rate": 4.923733747695994e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428999304771, "rewards/format_reward_func": 1.0, "step": 14372 }, { "completion_length": 236.65179538726807, "epoch": 2.41007586235802, "grad_norm": 0.0027849364925823456, "kl": 0.105072021484375, "learning_rate": 4.923704341264623e-07, "loss": 0.0001, "reward": 1.776785783469677, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7812500335276127, "rewards/format_reward_func": 0.9955357164144516, "step": 14374 }, { "completion_length": 236.2455472946167, "epoch": 2.4104111655978877, "grad_norm": 0.2298392056406199, "kl": 0.108001708984375, "learning_rate": 4.923674929252985e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714581608772, "rewards/format_reward_func": 1.0, "step": 14376 }, { "completion_length": 234.21876049041748, "epoch": 2.410746468837755, "grad_norm": 0.1903379262753221, "kl": 0.114990234375, "learning_rate": 4.92364551166115e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.803571455180645, "rewards/format_reward_func": 1.0, "step": 14378 }, { "completion_length": 238.33036613464355, "epoch": 2.4110817720776225, "grad_norm": 0.2296659492073621, "kl": 0.106414794921875, "learning_rate": 4.923616088489183e-07, "loss": 0.0001, "reward": 1.787500061094761, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7919643074274063, "rewards/format_reward_func": 0.9955357164144516, "step": 14380 }, { "completion_length": 240.4509048461914, "epoch": 2.41141707531749, "grad_norm": 0.1183545544888929, "kl": 0.10882568359375, "learning_rate": 4.923586659737154e-07, "loss": 0.0001, "reward": 1.7696429044008255, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7741071879863739, "rewards/format_reward_func": 0.9955357164144516, "step": 14382 }, { "completion_length": 241.94197463989258, "epoch": 2.411752378557358, "grad_norm": 0.2451140468106907, "kl": 0.1093902587890625, "learning_rate": 4.92355722540513e-07, "loss": 0.0001, "reward": 1.7875000536441803, "reward_std": 0.06818529590964317, "rewards/equation_reward_func": 0.7919643186032772, "rewards/format_reward_func": 0.9955357164144516, "step": 14384 }, { "completion_length": 231.56251049041748, "epoch": 2.4120876817972254, "grad_norm": 0.15612636244581551, "kl": 0.103759765625, "learning_rate": 4.923527785493179e-07, "loss": 0.0001, "reward": 1.8178571835160255, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8178571686148643, "rewards/format_reward_func": 1.0, "step": 14386 }, { "completion_length": 237.1205472946167, "epoch": 2.412422985037093, "grad_norm": 0.22794551959835807, "kl": 0.105438232421875, "learning_rate": 4.923498340001369e-07, "loss": 0.0001, "reward": 1.7839286103844643, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7883928939700127, "rewards/format_reward_func": 0.9955357164144516, "step": 14388 }, { "completion_length": 241.57590675354004, "epoch": 2.4127582882769607, "grad_norm": 0.21870437344172014, "kl": 0.112030029296875, "learning_rate": 4.923468888929766e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428842842579, "rewards/format_reward_func": 1.0, "step": 14390 }, { "completion_length": 237.82143878936768, "epoch": 2.413093591516828, "grad_norm": 0.3153485246471543, "kl": 0.113677978515625, "learning_rate": 4.923439432278439e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 14392 }, { "completion_length": 240.8303680419922, "epoch": 2.4134288947566955, "grad_norm": 0.1720802940204395, "kl": 0.1099853515625, "learning_rate": 4.923409970047456e-07, "loss": 0.0001, "reward": 1.767857201397419, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571846336126, "rewards/format_reward_func": 1.0, "step": 14394 }, { "completion_length": 242.5312614440918, "epoch": 2.413764197996563, "grad_norm": 0.2575036019942336, "kl": 0.109466552734375, "learning_rate": 4.923380502236885e-07, "loss": 0.0001, "reward": 1.7625000774860382, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7669643126428127, "rewards/format_reward_func": 0.9955357164144516, "step": 14396 }, { "completion_length": 249.15179824829102, "epoch": 2.414099501236431, "grad_norm": 0.12303614393479674, "kl": 0.11273193359375, "learning_rate": 4.923351028846794e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000342726707, "rewards/format_reward_func": 1.0, "step": 14398 }, { "completion_length": 237.24554634094238, "epoch": 2.4144348044762984, "grad_norm": 0.22113499831589165, "kl": 0.114288330078125, "learning_rate": 4.92332154987725e-07, "loss": 0.0001, "reward": 1.7803572416305542, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.784821443259716, "rewards/format_reward_func": 0.9955357164144516, "step": 14400 }, { "completion_length": 246.93304920196533, "epoch": 2.4147701077161656, "grad_norm": 0.34530621108268905, "kl": 0.130767822265625, "learning_rate": 4.923292065328321e-07, "loss": 0.0001, "reward": 1.7464286535978317, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 14402 }, { "completion_length": 253.29018878936768, "epoch": 2.4151054109560333, "grad_norm": 0.22128352179118405, "kl": 0.125274658203125, "learning_rate": 4.923262575200076e-07, "loss": 0.0001, "reward": 1.7946428805589676, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.7991071753203869, "rewards/format_reward_func": 0.9955357164144516, "step": 14404 }, { "completion_length": 251.0848331451416, "epoch": 2.415440714195901, "grad_norm": 0.06485660401128394, "kl": 0.122039794921875, "learning_rate": 4.923233079492582e-07, "loss": 0.0001, "reward": 1.758928619325161, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7633928693830967, "rewards/format_reward_func": 0.9955357164144516, "step": 14406 }, { "completion_length": 249.37501430511475, "epoch": 2.4157760174357685, "grad_norm": 0.11814345401091123, "kl": 0.10894775390625, "learning_rate": 4.923203578205906e-07, "loss": 0.0001, "reward": 1.792857214808464, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 14408 }, { "completion_length": 246.2678689956665, "epoch": 2.416111320675636, "grad_norm": 0.044181690154128005, "kl": 0.11151123046875, "learning_rate": 4.923174071340117e-07, "loss": 0.0001, "reward": 1.7892857640981674, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7892857566475868, "rewards/format_reward_func": 1.0, "step": 14410 }, { "completion_length": 254.5000123977661, "epoch": 2.416446623915504, "grad_norm": 0.12192966153626056, "kl": 0.104949951171875, "learning_rate": 4.923144558895283e-07, "loss": 0.0001, "reward": 1.805357187986374, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.8098214603960514, "rewards/format_reward_func": 0.9955357164144516, "step": 14412 }, { "completion_length": 250.5625123977661, "epoch": 2.416781927155371, "grad_norm": 0.18429756344803158, "kl": 0.102203369140625, "learning_rate": 4.923115040871472e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 14414 }, { "completion_length": 249.3169755935669, "epoch": 2.4171172303952386, "grad_norm": 0.15038284538850447, "kl": 0.105133056640625, "learning_rate": 4.923085517268752e-07, "loss": 0.0001, "reward": 1.805357187986374, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.8187500275671482, "rewards/format_reward_func": 0.9866071492433548, "step": 14416 }, { "completion_length": 254.67411994934082, "epoch": 2.4174525336351063, "grad_norm": 0.05553958299680993, "kl": 0.1148681640625, "learning_rate": 4.92305598808719e-07, "loss": 0.0001, "reward": 1.7785714715719223, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714752972126, "rewards/format_reward_func": 1.0, "step": 14418 }, { "completion_length": 251.93751049041748, "epoch": 2.417787836874974, "grad_norm": 0.219178777856766, "kl": 0.1243896484375, "learning_rate": 4.923026453326856e-07, "loss": 0.0001, "reward": 1.739285759627819, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.739285746589303, "rewards/format_reward_func": 1.0, "step": 14420 }, { "completion_length": 250.5267972946167, "epoch": 2.4181231401148415, "grad_norm": 0.21583952375718596, "kl": 0.101226806640625, "learning_rate": 4.922996912987815e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143171131611, "rewards/format_reward_func": 1.0, "step": 14422 }, { "completion_length": 252.9821548461914, "epoch": 2.4184584433547087, "grad_norm": 0.14685803789461777, "kl": 0.110504150390625, "learning_rate": 4.922967367070139e-07, "loss": 0.0001, "reward": 1.7428572177886963, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7428571619093418, "rewards/format_reward_func": 1.0, "step": 14424 }, { "completion_length": 250.14286613464355, "epoch": 2.4187937465945764, "grad_norm": 0.19287115760743048, "kl": 0.1021728515625, "learning_rate": 4.922937815573892e-07, "loss": 0.0001, "reward": 1.7232143580913544, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7276786081492901, "rewards/format_reward_func": 0.9955357164144516, "step": 14426 }, { "completion_length": 253.7053689956665, "epoch": 2.419129049834444, "grad_norm": 0.17641124181431558, "kl": 0.116424560546875, "learning_rate": 4.922908258499144e-07, "loss": 0.0001, "reward": 1.74642863124609, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7464286033064127, "rewards/format_reward_func": 1.0, "step": 14428 }, { "completion_length": 246.64733505249023, "epoch": 2.4194643530743116, "grad_norm": 0.24730402074427776, "kl": 0.0968017578125, "learning_rate": 4.922878695845964e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.07071067858487368, "rewards/equation_reward_func": 0.7589286118745804, "rewards/format_reward_func": 0.9910714328289032, "step": 14430 }, { "completion_length": 242.6205472946167, "epoch": 2.4197996563141793, "grad_norm": 0.1609843188328104, "kl": 0.107147216796875, "learning_rate": 4.922849127614417e-07, "loss": 0.0001, "reward": 1.8071429058909416, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8071428872644901, "rewards/format_reward_func": 1.0, "step": 14432 }, { "completion_length": 246.95090293884277, "epoch": 2.420134959554047, "grad_norm": 0.08023298552883207, "kl": 0.1016845703125, "learning_rate": 4.922819553804575e-07, "loss": 0.0001, "reward": 1.810714341700077, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107143081724644, "rewards/format_reward_func": 1.0, "step": 14434 }, { "completion_length": 258.5803689956665, "epoch": 2.420470262793914, "grad_norm": 0.1708562641566658, "kl": 0.09808349609375, "learning_rate": 4.922789974416503e-07, "loss": 0.0001, "reward": 1.805357187986374, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.8098214417695999, "rewards/format_reward_func": 0.9955357164144516, "step": 14436 }, { "completion_length": 253.1294755935669, "epoch": 2.4208055660337817, "grad_norm": 0.1869148624482686, "kl": 0.091583251953125, "learning_rate": 4.92276038945027e-07, "loss": 0.0001, "reward": 1.7892857864499092, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857529222965, "rewards/format_reward_func": 1.0, "step": 14438 }, { "completion_length": 245.75893878936768, "epoch": 2.4211408692736494, "grad_norm": 0.23055645746875422, "kl": 0.1028900146484375, "learning_rate": 4.922730798905944e-07, "loss": 0.0001, "reward": 1.732142947614193, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7321428898721933, "rewards/format_reward_func": 1.0, "step": 14440 }, { "completion_length": 245.9330472946167, "epoch": 2.421476172513517, "grad_norm": 0.2640415440209392, "kl": 0.10809326171875, "learning_rate": 4.922701202783593e-07, "loss": 0.0001, "reward": 1.8107143491506577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8107143118977547, "rewards/format_reward_func": 1.0, "step": 14442 }, { "completion_length": 249.7053689956665, "epoch": 2.4218114757533846, "grad_norm": 0.16123161749216006, "kl": 0.100982666015625, "learning_rate": 4.922671601083287e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143208384514, "rewards/format_reward_func": 1.0, "step": 14444 }, { "completion_length": 243.96429443359375, "epoch": 2.422146778993252, "grad_norm": 0.19344738804761819, "kl": 0.101409912109375, "learning_rate": 4.922641993805092e-07, "loss": 0.0001, "reward": 1.7214286550879478, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7214285954833031, "rewards/format_reward_func": 1.0, "step": 14446 }, { "completion_length": 243.48662090301514, "epoch": 2.4224820822331194, "grad_norm": 0.1864402544792117, "kl": 0.108001708984375, "learning_rate": 4.922612380949077e-07, "loss": 0.0001, "reward": 1.800000049173832, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8000000230967999, "rewards/format_reward_func": 1.0, "step": 14448 }, { "completion_length": 254.58483123779297, "epoch": 2.422817385472987, "grad_norm": 0.18771169862166331, "kl": 0.095550537109375, "learning_rate": 4.922582762515311e-07, "loss": 0.0001, "reward": 1.810714341700077, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107143193483353, "rewards/format_reward_func": 1.0, "step": 14450 }, { "completion_length": 256.90626335144043, "epoch": 2.4231526887128547, "grad_norm": 0.152067826141475, "kl": 0.112213134765625, "learning_rate": 4.92255313850386e-07, "loss": 0.0001, "reward": 1.7500000447034836, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 14452 }, { "completion_length": 244.7232255935669, "epoch": 2.4234879919527224, "grad_norm": 0.15337086305242245, "kl": 0.1212158203125, "learning_rate": 4.922523508914794e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7321428693830967, "rewards/format_reward_func": 1.0, "step": 14454 }, { "completion_length": 249.39733123779297, "epoch": 2.42382329519259, "grad_norm": 0.19277169712502537, "kl": 0.10882568359375, "learning_rate": 4.92249387374818e-07, "loss": 0.0001, "reward": 1.7285715118050575, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7285714447498322, "rewards/format_reward_func": 1.0, "step": 14456 }, { "completion_length": 239.508939743042, "epoch": 2.424158598432457, "grad_norm": 0.23544722794422707, "kl": 0.1253662109375, "learning_rate": 4.922464233004088e-07, "loss": 0.0001, "reward": 1.8000000789761543, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000081956387, "rewards/format_reward_func": 1.0, "step": 14458 }, { "completion_length": 246.9285831451416, "epoch": 2.424493901672325, "grad_norm": 0.16456192966586017, "kl": 0.104736328125, "learning_rate": 4.922434586682584e-07, "loss": 0.0001, "reward": 1.7607143372297287, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7607142981141806, "rewards/format_reward_func": 1.0, "step": 14460 }, { "completion_length": 252.95983219146729, "epoch": 2.4248292049121924, "grad_norm": 0.22480382634898416, "kl": 0.097076416015625, "learning_rate": 4.922404934783738e-07, "loss": 0.0001, "reward": 1.7321429252624512, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7321428898721933, "rewards/format_reward_func": 1.0, "step": 14462 }, { "completion_length": 250.0044755935669, "epoch": 2.42516450815206, "grad_norm": 0.11925795390631981, "kl": 0.1038055419921875, "learning_rate": 4.922375277307618e-07, "loss": 0.0001, "reward": 1.7553572207689285, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7598214522004128, "rewards/format_reward_func": 0.9955357164144516, "step": 14464 }, { "completion_length": 234.99554538726807, "epoch": 2.4254998113919277, "grad_norm": 0.15510926176807355, "kl": 0.11212158203125, "learning_rate": 4.922345614254292e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571827709675, "rewards/format_reward_func": 1.0, "step": 14466 }, { "completion_length": 239.2634048461914, "epoch": 2.425835114631795, "grad_norm": 0.26056520083745166, "kl": 0.114044189453125, "learning_rate": 4.922315945623827e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000335276127, "rewards/format_reward_func": 1.0, "step": 14468 }, { "completion_length": 241.0669755935669, "epoch": 2.4261704178716625, "grad_norm": 0.1362923694488133, "kl": 0.10235595703125, "learning_rate": 4.922286271416294e-07, "loss": 0.0001, "reward": 1.7892857566475868, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 14470 }, { "completion_length": 245.40626335144043, "epoch": 2.42650572111153, "grad_norm": 0.1893996972953304, "kl": 0.101776123046875, "learning_rate": 4.92225659163176e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 14472 }, { "completion_length": 239.88393878936768, "epoch": 2.426841024351398, "grad_norm": 0.2687254884525047, "kl": 0.107421875, "learning_rate": 4.922226906270292e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7750000357627869, "rewards/format_reward_func": 1.0, "step": 14474 }, { "completion_length": 240.14733219146729, "epoch": 2.4271763275912654, "grad_norm": 0.192068117999007, "kl": 0.109466552734375, "learning_rate": 4.92219721533196e-07, "loss": 0.0001, "reward": 1.8178571835160255, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8178571686148643, "rewards/format_reward_func": 1.0, "step": 14476 }, { "completion_length": 241.20983219146729, "epoch": 2.427511630831133, "grad_norm": 0.26444603268357253, "kl": 0.10882568359375, "learning_rate": 4.922167518816832e-07, "loss": 0.0001, "reward": 1.8000000640749931, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8000000193715096, "rewards/format_reward_func": 1.0, "step": 14478 }, { "completion_length": 239.41072368621826, "epoch": 2.4278469340710003, "grad_norm": 0.09661934193422012, "kl": 0.1143798828125, "learning_rate": 4.922137816724976e-07, "loss": 0.0001, "reward": 1.7303572073578835, "reward_std": 0.017677669413387775, "rewards/equation_reward_func": 0.7348214648663998, "rewards/format_reward_func": 0.9955357164144516, "step": 14480 }, { "completion_length": 249.3214406967163, "epoch": 2.428182237310868, "grad_norm": 0.11849886046639548, "kl": 0.1033935546875, "learning_rate": 4.922108109056461e-07, "loss": 0.0001, "reward": 1.769642896950245, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7741071619093418, "rewards/format_reward_func": 0.9955357164144516, "step": 14482 }, { "completion_length": 241.2544765472412, "epoch": 2.4285175405507355, "grad_norm": 0.19818905557945873, "kl": 0.1063232421875, "learning_rate": 4.922078395811355e-07, "loss": 0.0001, "reward": 1.7928571924567223, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7928571738302708, "rewards/format_reward_func": 1.0, "step": 14484 }, { "completion_length": 257.62501430511475, "epoch": 2.428852843790603, "grad_norm": 0.16333273125407996, "kl": 0.12158203125, "learning_rate": 4.922048676989728e-07, "loss": 0.0001, "reward": 1.691071517765522, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.6955357491970062, "rewards/format_reward_func": 0.9955357164144516, "step": 14486 }, { "completion_length": 244.50894260406494, "epoch": 2.429188147030471, "grad_norm": 0.06860305031557319, "kl": 0.102630615234375, "learning_rate": 4.922018952591645e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7535714656114578, "rewards/format_reward_func": 1.0, "step": 14488 }, { "completion_length": 255.4285831451416, "epoch": 2.429523450270338, "grad_norm": 0.18951155606865122, "kl": 0.11529541015625, "learning_rate": 4.921989222617177e-07, "loss": 0.0001, "reward": 1.7714286372065544, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7714285999536514, "rewards/format_reward_func": 1.0, "step": 14490 }, { "completion_length": 243.508939743042, "epoch": 2.4298587535102056, "grad_norm": 0.06729385120696377, "kl": 0.10260009765625, "learning_rate": 4.921959487066392e-07, "loss": 0.0001, "reward": 1.7535715103149414, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714600235224, "rewards/format_reward_func": 1.0, "step": 14492 }, { "completion_length": 243.26340198516846, "epoch": 2.4301940567500733, "grad_norm": 0.21808525209582078, "kl": 0.1015625, "learning_rate": 4.921929745939357e-07, "loss": 0.0001, "reward": 1.8214286267757416, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8214286044239998, "rewards/format_reward_func": 1.0, "step": 14494 }, { "completion_length": 242.9866189956665, "epoch": 2.430529359989941, "grad_norm": 0.10459009473345975, "kl": 0.10888671875, "learning_rate": 4.921899999236143e-07, "loss": 0.0001, "reward": 1.7535714954137802, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 14496 }, { "completion_length": 243.4062614440918, "epoch": 2.4308646632298085, "grad_norm": 0.20468565131128358, "kl": 0.09014892578125, "learning_rate": 4.921870246956817e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 14498 }, { "completion_length": 240.4196548461914, "epoch": 2.431199966469676, "grad_norm": 0.2839771802697649, "kl": 0.102447509765625, "learning_rate": 4.921840489101447e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571827709675, "rewards/format_reward_func": 1.0, "step": 14500 }, { "completion_length": 243.91965293884277, "epoch": 2.431535269709544, "grad_norm": 0.13837876458820203, "kl": 0.09967041015625, "learning_rate": 4.921810725670104e-07, "loss": 0.0001, "reward": 1.796428620815277, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7964285984635353, "rewards/format_reward_func": 1.0, "step": 14502 }, { "completion_length": 248.0759048461914, "epoch": 2.431870572949411, "grad_norm": 0.0625324599377449, "kl": 0.10888671875, "learning_rate": 4.921780956662853e-07, "loss": 0.0001, "reward": 1.7357143461704254, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7357143275439739, "rewards/format_reward_func": 1.0, "step": 14504 }, { "completion_length": 240.58929824829102, "epoch": 2.4322058761892786, "grad_norm": 0.14144479185833814, "kl": 0.116973876953125, "learning_rate": 4.921751182079765e-07, "loss": 0.0001, "reward": 1.7750000655651093, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 14506 }, { "completion_length": 238.99554920196533, "epoch": 2.4325411794291463, "grad_norm": 0.12906166659910961, "kl": 0.115478515625, "learning_rate": 4.921721401920907e-07, "loss": 0.0001, "reward": 1.7464286461472511, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7464286051690578, "rewards/format_reward_func": 1.0, "step": 14508 }, { "completion_length": 244.4062614440918, "epoch": 2.432876482669014, "grad_norm": 0.19870262101296146, "kl": 0.1026611328125, "learning_rate": 4.921691616186349e-07, "loss": 0.0001, "reward": 1.7892857491970062, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 14510 }, { "completion_length": 244.85269165039062, "epoch": 2.433211785908881, "grad_norm": 0.34484337934409914, "kl": 0.0943756103515625, "learning_rate": 4.921661824876161e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143022119999, "rewards/format_reward_func": 1.0, "step": 14512 }, { "completion_length": 238.5848331451416, "epoch": 2.4335470891487487, "grad_norm": 0.16552693092879947, "kl": 0.10479736328125, "learning_rate": 4.921632027990408e-07, "loss": 0.0001, "reward": 1.7625000551342964, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7669643238186836, "rewards/format_reward_func": 0.9955357164144516, "step": 14514 }, { "completion_length": 233.008939743042, "epoch": 2.4338823923886164, "grad_norm": 0.23178198865499708, "kl": 0.109375, "learning_rate": 4.921602225529159e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 14516 }, { "completion_length": 241.25447940826416, "epoch": 2.434217695628484, "grad_norm": 0.23617625683615337, "kl": 0.1003265380859375, "learning_rate": 4.921572417492485e-07, "loss": 0.0001, "reward": 1.7500000968575478, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000260770321, "rewards/format_reward_func": 1.0, "step": 14518 }, { "completion_length": 236.50447463989258, "epoch": 2.4345529988683516, "grad_norm": 0.09500009038811856, "kl": 0.108428955078125, "learning_rate": 4.921542603880453e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000223517418, "rewards/format_reward_func": 1.0, "step": 14520 }, { "completion_length": 241.37947368621826, "epoch": 2.4348883021082193, "grad_norm": 0.10477360638602254, "kl": 0.10235595703125, "learning_rate": 4.921512784693132e-07, "loss": 0.0001, "reward": 1.8017857521772385, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.8062500394880772, "rewards/format_reward_func": 0.9955357164144516, "step": 14522 }, { "completion_length": 244.4375123977661, "epoch": 2.435223605348087, "grad_norm": 0.5641491334390114, "kl": 0.11566162109375, "learning_rate": 4.921482959930592e-07, "loss": 0.0001, "reward": 1.7464286461472511, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7464286126196384, "rewards/format_reward_func": 1.0, "step": 14524 }, { "completion_length": 239.9107265472412, "epoch": 2.435558908587954, "grad_norm": 0.18693819338494735, "kl": 0.114654541015625, "learning_rate": 4.9214531295929e-07, "loss": 0.0001, "reward": 1.741071492433548, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7455357313156128, "rewards/format_reward_func": 0.9955357164144516, "step": 14526 }, { "completion_length": 233.2009048461914, "epoch": 2.4358942118278217, "grad_norm": 0.20661760679939017, "kl": 0.104034423828125, "learning_rate": 4.921423293680124e-07, "loss": 0.0001, "reward": 1.8250000551342964, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.825000025331974, "rewards/format_reward_func": 1.0, "step": 14528 }, { "completion_length": 230.52679634094238, "epoch": 2.4362295150676894, "grad_norm": 0.10369726032449242, "kl": 0.112823486328125, "learning_rate": 4.921393452192334e-07, "loss": 0.0001, "reward": 1.8035714700818062, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8035714626312256, "rewards/format_reward_func": 1.0, "step": 14530 }, { "completion_length": 240.18751049041748, "epoch": 2.436564818307557, "grad_norm": 0.3213602942407995, "kl": 0.10504150390625, "learning_rate": 4.9213636051296e-07, "loss": 0.0001, "reward": 1.7785714715719223, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.778571467846632, "rewards/format_reward_func": 1.0, "step": 14532 }, { "completion_length": 231.95983219146729, "epoch": 2.4369001215474246, "grad_norm": 0.2630550350345542, "kl": 0.102081298828125, "learning_rate": 4.921333752491987e-07, "loss": 0.0001, "reward": 1.7821429073810577, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428924798965, "rewards/format_reward_func": 1.0, "step": 14534 }, { "completion_length": 230.3482265472412, "epoch": 2.437235424787292, "grad_norm": 0.13355432159632558, "kl": 0.104248046875, "learning_rate": 4.921303894279568e-07, "loss": 0.0001, "reward": 1.810714341700077, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8107143118977547, "rewards/format_reward_func": 1.0, "step": 14536 }, { "completion_length": 242.2857265472412, "epoch": 2.4375707280271595, "grad_norm": 0.23392933237799146, "kl": 0.1086273193359375, "learning_rate": 4.921274030492408e-07, "loss": 0.0001, "reward": 1.7178572416305542, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7178571857511997, "rewards/format_reward_func": 1.0, "step": 14538 }, { "completion_length": 233.24108505249023, "epoch": 2.437906031267027, "grad_norm": 0.23157423824280435, "kl": 0.10504150390625, "learning_rate": 4.921244161130578e-07, "loss": 0.0001, "reward": 1.7500000819563866, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7589286100119352, "rewards/format_reward_func": 0.9910714328289032, "step": 14540 }, { "completion_length": 237.29911518096924, "epoch": 2.4382413345068947, "grad_norm": 0.08473106919533094, "kl": 0.11676025390625, "learning_rate": 4.921214286194147e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571678698063, "rewards/format_reward_func": 1.0, "step": 14542 }, { "completion_length": 228.8571548461914, "epoch": 2.4385766377467624, "grad_norm": 0.1442209266300916, "kl": 0.10284423828125, "learning_rate": 4.921184405683182e-07, "loss": 0.0001, "reward": 1.789285771548748, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857454717159, "rewards/format_reward_func": 1.0, "step": 14544 }, { "completion_length": 227.5669755935669, "epoch": 2.43891194098663, "grad_norm": 0.0418952892430739, "kl": 0.0934600830078125, "learning_rate": 4.921154519597753e-07, "loss": 0.0001, "reward": 1.789285771548748, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.789285734295845, "rewards/format_reward_func": 1.0, "step": 14546 }, { "completion_length": 241.20983219146729, "epoch": 2.439247244226497, "grad_norm": 0.15438437741472294, "kl": 0.097076416015625, "learning_rate": 4.921124627937929e-07, "loss": 0.0001, "reward": 1.7946429178118706, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7991071678698063, "rewards/format_reward_func": 0.9955357164144516, "step": 14548 }, { "completion_length": 237.89733409881592, "epoch": 2.439582547466365, "grad_norm": 0.22889737678043695, "kl": 0.110015869140625, "learning_rate": 4.921094730703778e-07, "loss": 0.0001, "reward": 1.765178643167019, "reward_std": 0.03914341004565358, "rewards/equation_reward_func": 0.7669643200933933, "rewards/format_reward_func": 0.9982142895460129, "step": 14550 }, { "completion_length": 236.3214406967163, "epoch": 2.4399178507062325, "grad_norm": 0.09313547110453636, "kl": 0.1150360107421875, "learning_rate": 4.921064827895369e-07, "loss": 0.0001, "reward": 1.8178571686148643, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8178571555763483, "rewards/format_reward_func": 1.0, "step": 14552 }, { "completion_length": 237.19197750091553, "epoch": 2.4402531539461, "grad_norm": 0.10693348598193116, "kl": 0.093170166015625, "learning_rate": 4.921034919512772e-07, "loss": 0.0001, "reward": 1.7839286103844643, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7883928827941418, "rewards/format_reward_func": 0.9955357164144516, "step": 14554 }, { "completion_length": 239.31697368621826, "epoch": 2.4405884571859677, "grad_norm": 0.17416553763203155, "kl": 0.0992889404296875, "learning_rate": 4.921005005556055e-07, "loss": 0.0001, "reward": 1.7750000357627869, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7750000394880772, "rewards/format_reward_func": 1.0, "step": 14556 }, { "completion_length": 233.5312614440918, "epoch": 2.440923760425835, "grad_norm": 0.5400463217476548, "kl": 0.1147308349609375, "learning_rate": 4.920975086025286e-07, "loss": 0.0001, "reward": 1.8321429193019867, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8321428708732128, "rewards/format_reward_func": 1.0, "step": 14558 }, { "completion_length": 224.7678680419922, "epoch": 2.4412590636657026, "grad_norm": 0.246319043845516, "kl": 0.093414306640625, "learning_rate": 4.920945160920535e-07, "loss": 0.0001, "reward": 1.783928632736206, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7875000312924385, "rewards/format_reward_func": 0.9964285716414452, "step": 14560 }, { "completion_length": 239.2634048461914, "epoch": 2.44159436690557, "grad_norm": 0.1861008204380244, "kl": 0.1091461181640625, "learning_rate": 4.920915230241871e-07, "loss": 0.0001, "reward": 1.7535714730620384, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714618861675, "rewards/format_reward_func": 1.0, "step": 14562 }, { "completion_length": 232.36161994934082, "epoch": 2.441929670145438, "grad_norm": 0.22969570435893646, "kl": 0.108428955078125, "learning_rate": 4.920885293989362e-07, "loss": 0.0001, "reward": 1.7428571954369545, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 14564 }, { "completion_length": 236.60268783569336, "epoch": 2.4422649733853055, "grad_norm": 0.17641325910517738, "kl": 0.0992279052734375, "learning_rate": 4.920855352163077e-07, "loss": 0.0001, "reward": 1.8000000417232513, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.8000000230967999, "rewards/format_reward_func": 1.0, "step": 14566 }, { "completion_length": 243.73661708831787, "epoch": 2.442600276625173, "grad_norm": 0.2636670701693931, "kl": 0.107452392578125, "learning_rate": 4.920825404763086e-07, "loss": 0.0001, "reward": 1.7696429342031479, "reward_std": 0.07323605753481388, "rewards/equation_reward_func": 0.7741071619093418, "rewards/format_reward_func": 0.9955357164144516, "step": 14568 }, { "completion_length": 235.40625858306885, "epoch": 2.4429355798650403, "grad_norm": 0.20063605323852532, "kl": 0.106597900390625, "learning_rate": 4.920795451789458e-07, "loss": 0.0001, "reward": 1.7803572192788124, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7848214507102966, "rewards/format_reward_func": 0.9955357164144516, "step": 14570 }, { "completion_length": 237.8928689956665, "epoch": 2.443270883104908, "grad_norm": 0.2844660939043713, "kl": 0.105743408203125, "learning_rate": 4.920765493242261e-07, "loss": 0.0001, "reward": 1.7839286401867867, "reward_std": 0.06313453335314989, "rewards/equation_reward_func": 0.7883928790688515, "rewards/format_reward_func": 0.9955357164144516, "step": 14572 }, { "completion_length": 237.571439743042, "epoch": 2.4436061863447756, "grad_norm": 0.14908936651878493, "kl": 0.09649658203125, "learning_rate": 4.920735529121563e-07, "loss": 0.0001, "reward": 1.7910714596509933, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.8044643010944128, "rewards/format_reward_func": 0.9866071492433548, "step": 14574 }, { "completion_length": 240.28125953674316, "epoch": 2.443941489584643, "grad_norm": 0.1813296833370229, "kl": 0.111358642578125, "learning_rate": 4.920705559427436e-07, "loss": 0.0001, "reward": 1.7339286282658577, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.738392885774374, "rewards/format_reward_func": 0.9955357164144516, "step": 14576 }, { "completion_length": 245.95983219146729, "epoch": 2.444276792824511, "grad_norm": 0.08269867957004332, "kl": 0.124542236328125, "learning_rate": 4.920675584159947e-07, "loss": 0.0001, "reward": 1.7500000521540642, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 14578 }, { "completion_length": 247.1294765472412, "epoch": 2.444612096064378, "grad_norm": 0.2082926494731955, "kl": 0.1058349609375, "learning_rate": 4.920645603319164e-07, "loss": 0.0001, "reward": 1.7428571954369545, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571581840515, "rewards/format_reward_func": 1.0, "step": 14580 }, { "completion_length": 238.02233409881592, "epoch": 2.4449473993042456, "grad_norm": 0.25334244067411815, "kl": 0.1026611328125, "learning_rate": 4.920615616905158e-07, "loss": 0.0001, "reward": 1.8089286237955093, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.8133928701281548, "rewards/format_reward_func": 0.9955357164144516, "step": 14582 }, { "completion_length": 244.36608219146729, "epoch": 2.4452827025441133, "grad_norm": 0.22762717588504205, "kl": 0.1002197265625, "learning_rate": 4.920585624917998e-07, "loss": 0.0001, "reward": 1.8178571984171867, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.8267857246100903, "rewards/format_reward_func": 0.9910714328289032, "step": 14584 }, { "completion_length": 238.6741189956665, "epoch": 2.445618005783981, "grad_norm": 0.1654778832690156, "kl": 0.0996856689453125, "learning_rate": 4.920555627357752e-07, "loss": 0.0001, "reward": 1.7678571864962578, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571734577417, "rewards/format_reward_func": 1.0, "step": 14586 }, { "completion_length": 243.37501049041748, "epoch": 2.4459533090238486, "grad_norm": 0.2654349418206511, "kl": 0.099334716796875, "learning_rate": 4.920525624224489e-07, "loss": 0.0001, "reward": 1.76607146859169, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7705357596278191, "rewards/format_reward_func": 0.9955357164144516, "step": 14588 }, { "completion_length": 250.31697463989258, "epoch": 2.446288612263716, "grad_norm": 0.20491560862196873, "kl": 0.113525390625, "learning_rate": 4.920495615518279e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 14590 }, { "completion_length": 256.28126335144043, "epoch": 2.4466239155035834, "grad_norm": 0.0950820708130051, "kl": 0.118316650390625, "learning_rate": 4.92046560123919e-07, "loss": 0.0001, "reward": 1.8107143118977547, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8107143081724644, "rewards/format_reward_func": 1.0, "step": 14592 }, { "completion_length": 253.70536613464355, "epoch": 2.446959218743451, "grad_norm": 0.15718448474235885, "kl": 0.112945556640625, "learning_rate": 4.920435581387293e-07, "loss": 0.0001, "reward": 1.805357187986374, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.809821467846632, "rewards/format_reward_func": 0.9955357164144516, "step": 14594 }, { "completion_length": 259.1651906967163, "epoch": 2.4472945219833186, "grad_norm": 0.09285379066929722, "kl": 0.122222900390625, "learning_rate": 4.920405555962654e-07, "loss": 0.0001, "reward": 1.7821429073810577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 14596 }, { "completion_length": 256.9509057998657, "epoch": 2.4476298252231863, "grad_norm": 0.10765725585666376, "kl": 0.12353515625, "learning_rate": 4.920375524965346e-07, "loss": 0.0001, "reward": 1.7482143491506577, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7526785954833031, "rewards/format_reward_func": 0.9955357164144516, "step": 14598 }, { "completion_length": 254.65179634094238, "epoch": 2.447965128463054, "grad_norm": 0.09697412589677633, "kl": 0.116363525390625, "learning_rate": 4.920345488395436e-07, "loss": 0.0001, "reward": 1.744642935693264, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7491071745753288, "rewards/format_reward_func": 0.9955357164144516, "step": 14600 }, { "completion_length": 258.3660831451416, "epoch": 2.448300431702921, "grad_norm": 0.2110317592273284, "kl": 0.10479736328125, "learning_rate": 4.920315446252992e-07, "loss": 0.0001, "reward": 1.7214286550879478, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7392857484519482, "rewards/format_reward_func": 0.9821428656578064, "step": 14602 }, { "completion_length": 259.3482246398926, "epoch": 2.4486357349427887, "grad_norm": 0.16048095784784516, "kl": 0.11065673828125, "learning_rate": 4.920285398538085e-07, "loss": 0.0001, "reward": 1.6964286491274834, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.6964286062866449, "rewards/format_reward_func": 1.0, "step": 14604 }, { "completion_length": 265.58929347991943, "epoch": 2.4489710381826564, "grad_norm": 0.3574648813796875, "kl": 0.12420654296875, "learning_rate": 4.920255345250784e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.08081220183521509, "rewards/equation_reward_func": 0.7589286044239998, "rewards/format_reward_func": 0.9910714328289032, "step": 14606 }, { "completion_length": 260.214298248291, "epoch": 2.449306341422524, "grad_norm": 0.22111108570420177, "kl": 0.118621826171875, "learning_rate": 4.920225286391157e-07, "loss": 0.0001, "reward": 1.740178644657135, "reward_std": 0.06439722329378128, "rewards/equation_reward_func": 0.7419643141329288, "rewards/format_reward_func": 0.9982142895460129, "step": 14608 }, { "completion_length": 254.24108123779297, "epoch": 2.4496416446623916, "grad_norm": 0.21975558457475472, "kl": 0.10028076171875, "learning_rate": 4.920195221959275e-07, "loss": 0.0001, "reward": 1.8000000640749931, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8000000193715096, "rewards/format_reward_func": 1.0, "step": 14610 }, { "completion_length": 258.1562623977661, "epoch": 2.4499769479022593, "grad_norm": 0.3037889689093855, "kl": 0.12811279296875, "learning_rate": 4.920165151955205e-07, "loss": 0.0001, "reward": 1.7446429431438446, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7491071708500385, "rewards/format_reward_func": 0.9955357164144516, "step": 14612 }, { "completion_length": 251.09376430511475, "epoch": 2.4503122511421265, "grad_norm": 0.11148829311822392, "kl": 0.096405029296875, "learning_rate": 4.920135076379019e-07, "loss": 0.0001, "reward": 1.7446429058909416, "reward_std": 0.047982245683670044, "rewards/equation_reward_func": 0.7491071801632643, "rewards/format_reward_func": 0.9955357164144516, "step": 14614 }, { "completion_length": 259.15626525878906, "epoch": 2.450647554381994, "grad_norm": 0.2130659164666184, "kl": 0.120697021484375, "learning_rate": 4.920104995230784e-07, "loss": 0.0001, "reward": 1.7910714969038963, "reward_std": 0.04293148312717676, "rewards/equation_reward_func": 0.7955357432365417, "rewards/format_reward_func": 0.9955357164144516, "step": 14616 }, { "completion_length": 253.92412185668945, "epoch": 2.4509828576218617, "grad_norm": 0.1594905819651956, "kl": 0.123779296875, "learning_rate": 4.92007490851057e-07, "loss": 0.0001, "reward": 1.725000075995922, "reward_std": 0.06565991509705782, "rewards/equation_reward_func": 0.733928594738245, "rewards/format_reward_func": 0.9910714328289032, "step": 14618 }, { "completion_length": 258.1116180419922, "epoch": 2.4513181608617294, "grad_norm": 0.28001387833506153, "kl": 0.139801025390625, "learning_rate": 4.920044816218446e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.03535533882677555, "rewards/equation_reward_func": 0.7910714484751225, "rewards/format_reward_func": 0.9910714328289032, "step": 14620 }, { "completion_length": 256.36608028411865, "epoch": 2.451653464101597, "grad_norm": 0.15087349096042746, "kl": 0.132415771484375, "learning_rate": 4.920014718354481e-07, "loss": 0.0001, "reward": 1.7517857924103737, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500238418579, "rewards/format_reward_func": 0.9955357164144516, "step": 14622 }, { "completion_length": 251.73661994934082, "epoch": 2.451988767341464, "grad_norm": 0.0777120217393265, "kl": 0.12347412109375, "learning_rate": 4.919984614918746e-07, "loss": 0.0001, "reward": 1.7642857879400253, "reward_std": 0.05050762742757797, "rewards/equation_reward_func": 0.7821428775787354, "rewards/format_reward_func": 0.9821428656578064, "step": 14624 }, { "completion_length": 250.46876335144043, "epoch": 2.452324070581332, "grad_norm": 0.061192203297932936, "kl": 0.110107421875, "learning_rate": 4.919954505911309e-07, "loss": 0.0001, "reward": 1.8107143342494965, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8107143193483353, "rewards/format_reward_func": 1.0, "step": 14626 }, { "completion_length": 264.4732255935669, "epoch": 2.4526593738211995, "grad_norm": 0.25103566878335815, "kl": 0.142913818359375, "learning_rate": 4.919924391332239e-07, "loss": 0.0001, "reward": 1.730357214808464, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7348214406520128, "rewards/format_reward_func": 0.9955357164144516, "step": 14628 }, { "completion_length": 249.88393878936768, "epoch": 2.452994677061067, "grad_norm": 0.12985395819053508, "kl": 0.1202392578125, "learning_rate": 4.919894271181606e-07, "loss": 0.0001, "reward": 1.7892857640981674, "reward_std": 0.045456863939762115, "rewards/equation_reward_func": 0.798214316368103, "rewards/format_reward_func": 0.9910714328289032, "step": 14630 }, { "completion_length": 257.48215675354004, "epoch": 2.4533299803009347, "grad_norm": 0.20780368372354519, "kl": 0.151947021484375, "learning_rate": 4.91986414545948e-07, "loss": 0.0002, "reward": 1.7464286386966705, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7464285977184772, "rewards/format_reward_func": 1.0, "step": 14632 }, { "completion_length": 256.6294765472412, "epoch": 2.4536652835408024, "grad_norm": 0.20687599287668856, "kl": 0.11669921875, "learning_rate": 4.919834014165929e-07, "loss": 0.0001, "reward": 1.807142935693264, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.8160714581608772, "rewards/format_reward_func": 0.9910714328289032, "step": 14634 }, { "completion_length": 252.1294755935669, "epoch": 2.45400058678067, "grad_norm": 0.1576095082803586, "kl": 0.1048583984375, "learning_rate": 4.919803877301022e-07, "loss": 0.0001, "reward": 1.8017857819795609, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8062500208616257, "rewards/format_reward_func": 0.9955357164144516, "step": 14636 }, { "completion_length": 254.8571538925171, "epoch": 2.454335890020537, "grad_norm": 0.10649602704630118, "kl": 0.137786865234375, "learning_rate": 4.91977373486483e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 14638 }, { "completion_length": 251.45090579986572, "epoch": 2.454671193260405, "grad_norm": 0.17848525693874534, "kl": 0.128631591796875, "learning_rate": 4.919743586857422e-07, "loss": 0.0001, "reward": 1.742857202887535, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7428571749478579, "rewards/format_reward_func": 1.0, "step": 14640 }, { "completion_length": 256.9285840988159, "epoch": 2.4550064965002725, "grad_norm": 0.1561715109998979, "kl": 0.1240692138671875, "learning_rate": 4.919713433278866e-07, "loss": 0.0001, "reward": 1.7553571835160255, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7598214596509933, "rewards/format_reward_func": 0.9955357164144516, "step": 14642 }, { "completion_length": 250.8928680419922, "epoch": 2.45534179974014, "grad_norm": 0.09381988812620205, "kl": 0.139129638671875, "learning_rate": 4.919683274129234e-07, "loss": 0.0001, "reward": 1.8214286342263222, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8214285857975483, "rewards/format_reward_func": 1.0, "step": 14644 }, { "completion_length": 254.1785831451416, "epoch": 2.4556771029800073, "grad_norm": 0.1776446950481363, "kl": 0.1341552734375, "learning_rate": 4.919653109408593e-07, "loss": 0.0001, "reward": 1.776785746216774, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7812500260770321, "rewards/format_reward_func": 0.9955357164144516, "step": 14646 }, { "completion_length": 251.08483600616455, "epoch": 2.456012406219875, "grad_norm": 0.16014605591091147, "kl": 0.126861572265625, "learning_rate": 4.919622939117013e-07, "loss": 0.0001, "reward": 1.7758929058909416, "reward_std": 0.06439722329378128, "rewards/equation_reward_func": 0.7776786088943481, "rewards/format_reward_func": 0.9982142895460129, "step": 14648 }, { "completion_length": 254.19197750091553, "epoch": 2.4563477094597426, "grad_norm": 0.0025472339599904464, "kl": 0.126617431640625, "learning_rate": 4.919592763254564e-07, "loss": 0.0001, "reward": 1.791071467101574, "reward_std": 0.022728432901203632, "rewards/equation_reward_func": 0.7955357506871223, "rewards/format_reward_func": 0.9955357164144516, "step": 14650 }, { "completion_length": 250.99108409881592, "epoch": 2.45668301269961, "grad_norm": 0.294690839121686, "kl": 0.19085693359375, "learning_rate": 4.919562581821316e-07, "loss": 0.0002, "reward": 1.7410714626312256, "reward_std": 0.09343910962343216, "rewards/equation_reward_func": 0.7544643059372902, "rewards/format_reward_func": 0.9866071492433548, "step": 14652 }, { "completion_length": 241.83929634094238, "epoch": 2.457018315939478, "grad_norm": 0.2685941546167499, "kl": 0.141021728515625, "learning_rate": 4.919532394817338e-07, "loss": 0.0001, "reward": 1.8000000640749931, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 14654 }, { "completion_length": 249.29911994934082, "epoch": 2.4573536191793455, "grad_norm": 0.12056836743824613, "kl": 0.155242919921875, "learning_rate": 4.919502202242699e-07, "loss": 0.0002, "reward": 1.7767857685685158, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7812500316649675, "rewards/format_reward_func": 0.9955357164144516, "step": 14656 }, { "completion_length": 237.00893878936768, "epoch": 2.457688922419213, "grad_norm": 0.1632951800033488, "kl": 0.137969970703125, "learning_rate": 4.919472004097468e-07, "loss": 0.0001, "reward": 1.7517857551574707, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7562500350177288, "rewards/format_reward_func": 0.9955357164144516, "step": 14658 }, { "completion_length": 242.84375953674316, "epoch": 2.4580242256590803, "grad_norm": 0.38433586783614954, "kl": 0.147674560546875, "learning_rate": 4.919441800381717e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.06565991323441267, "rewards/equation_reward_func": 0.7607143148779869, "rewards/format_reward_func": 1.0, "step": 14660 }, { "completion_length": 233.5178689956665, "epoch": 2.458359528898948, "grad_norm": 0.15426940267481715, "kl": 0.13128662109375, "learning_rate": 4.919411591095512e-07, "loss": 0.0001, "reward": 1.7357143387198448, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 14662 }, { "completion_length": 238.9330472946167, "epoch": 2.4586948321388156, "grad_norm": 0.2574320114609894, "kl": 0.115234375, "learning_rate": 4.919381376238927e-07, "loss": 0.0001, "reward": 1.8214286267757416, "reward_std": 0.07071067579090595, "rewards/equation_reward_func": 0.821428582072258, "rewards/format_reward_func": 1.0, "step": 14664 }, { "completion_length": 235.0625123977661, "epoch": 2.459030135378683, "grad_norm": 0.2578124993607625, "kl": 0.14154052734375, "learning_rate": 4.919351155812027e-07, "loss": 0.0001, "reward": 1.8017857521772385, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8062500208616257, "rewards/format_reward_func": 0.9955357164144516, "step": 14666 }, { "completion_length": 229.8616180419922, "epoch": 2.459365438618551, "grad_norm": 0.18994664944682665, "kl": 0.11993408203125, "learning_rate": 4.919320929814884e-07, "loss": 0.0001, "reward": 1.7571429163217545, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428883820772, "rewards/format_reward_func": 1.0, "step": 14668 }, { "completion_length": 226.6071538925171, "epoch": 2.459700741858418, "grad_norm": 0.08334989454251703, "kl": 0.107177734375, "learning_rate": 4.919290698247568e-07, "loss": 0.0001, "reward": 1.8250000402331352, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.825000025331974, "rewards/format_reward_func": 1.0, "step": 14670 }, { "completion_length": 236.5759038925171, "epoch": 2.4600360450982857, "grad_norm": 0.24064977610399677, "kl": 0.124664306640625, "learning_rate": 4.919260461110146e-07, "loss": 0.0001, "reward": 1.7428572177886963, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.742857176810503, "rewards/format_reward_func": 1.0, "step": 14672 }, { "completion_length": 228.68304538726807, "epoch": 2.4603713483381533, "grad_norm": 0.3965086922599267, "kl": 0.116363525390625, "learning_rate": 4.919230218402692e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7500000298023224, "rewards/format_reward_func": 1.0, "step": 14674 }, { "completion_length": 227.08483219146729, "epoch": 2.460706651578021, "grad_norm": 0.07705829929685584, "kl": 0.12017822265625, "learning_rate": 4.919199970125271e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8035714365541935, "rewards/format_reward_func": 1.0, "step": 14676 }, { "completion_length": 229.67411518096924, "epoch": 2.4610419548178886, "grad_norm": 0.13454988026363998, "kl": 0.10888671875, "learning_rate": 4.919169716277956e-07, "loss": 0.0001, "reward": 1.7321429327130318, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7321428898721933, "rewards/format_reward_func": 1.0, "step": 14678 }, { "completion_length": 236.2053680419922, "epoch": 2.461377258057756, "grad_norm": 0.251892997636684, "kl": 0.114898681640625, "learning_rate": 4.919139456860814e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 14680 }, { "completion_length": 227.08483028411865, "epoch": 2.4617125612976234, "grad_norm": 0.3806151196214663, "kl": 0.105987548828125, "learning_rate": 4.919109191873917e-07, "loss": 0.0001, "reward": 1.8142857551574707, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857477068901, "rewards/format_reward_func": 1.0, "step": 14682 }, { "completion_length": 240.7991189956665, "epoch": 2.462047864537491, "grad_norm": 0.15874235465177416, "kl": 0.12335205078125, "learning_rate": 4.919078921317334e-07, "loss": 0.0001, "reward": 1.7196429371833801, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.7241071946918964, "rewards/format_reward_func": 0.9955357164144516, "step": 14684 }, { "completion_length": 233.2812614440918, "epoch": 2.4623831677773587, "grad_norm": 0.24433920234307233, "kl": 0.116546630859375, "learning_rate": 4.919048645191133e-07, "loss": 0.0001, "reward": 1.8196429163217545, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.8241071589291096, "rewards/format_reward_func": 0.9955357164144516, "step": 14686 }, { "completion_length": 233.66518878936768, "epoch": 2.4627184710172263, "grad_norm": 0.1526931970382, "kl": 0.1051025390625, "learning_rate": 4.919018363495386e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 14688 }, { "completion_length": 238.74108219146729, "epoch": 2.463053774257094, "grad_norm": 0.10079218212524813, "kl": 0.10546875, "learning_rate": 4.918988076230161e-07, "loss": 0.0001, "reward": 1.7714286297559738, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7714286036789417, "rewards/format_reward_func": 1.0, "step": 14690 }, { "completion_length": 239.29911708831787, "epoch": 2.463389077496961, "grad_norm": 0.14274660573087072, "kl": 0.113311767578125, "learning_rate": 4.91895778339553e-07, "loss": 0.0001, "reward": 1.796428605914116, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 14692 }, { "completion_length": 246.5580472946167, "epoch": 2.4637243807368288, "grad_norm": 0.20910002876872932, "kl": 0.101806640625, "learning_rate": 4.91892748499156e-07, "loss": 0.0001, "reward": 1.7517857626080513, "reward_std": 0.05808377079665661, "rewards/equation_reward_func": 0.7562500163912773, "rewards/format_reward_func": 0.9955357164144516, "step": 14694 }, { "completion_length": 240.37054538726807, "epoch": 2.4640596839766964, "grad_norm": 0.3170325404349844, "kl": 0.119476318359375, "learning_rate": 4.918897181018323e-07, "loss": 0.0001, "reward": 1.762500062584877, "reward_std": 0.053033008240163326, "rewards/equation_reward_func": 0.7669643126428127, "rewards/format_reward_func": 0.9955357164144516, "step": 14696 }, { "completion_length": 237.92411518096924, "epoch": 2.464394987216564, "grad_norm": 0.17860278813200464, "kl": 0.110260009765625, "learning_rate": 4.918866871475887e-07, "loss": 0.0001, "reward": 1.791071467101574, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.795535746961832, "rewards/format_reward_func": 0.9955357164144516, "step": 14698 }, { "completion_length": 240.602689743042, "epoch": 2.4647302904564317, "grad_norm": 0.19296638822722686, "kl": 0.103668212890625, "learning_rate": 4.918836556364324e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 14700 }, { "completion_length": 244.63393783569336, "epoch": 2.4650655936962993, "grad_norm": 0.1819558090672466, "kl": 0.107757568359375, "learning_rate": 4.918806235683701e-07, "loss": 0.0001, "reward": 1.8017857745289803, "reward_std": 0.04798224475234747, "rewards/equation_reward_func": 0.8062500208616257, "rewards/format_reward_func": 0.9955357164144516, "step": 14702 }, { "completion_length": 239.3884048461914, "epoch": 2.4654008969361665, "grad_norm": 0.16922460661830055, "kl": 0.110870361328125, "learning_rate": 4.91877590943409e-07, "loss": 0.0001, "reward": 1.8500000685453415, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.850000012665987, "rewards/format_reward_func": 1.0, "step": 14704 }, { "completion_length": 243.34822177886963, "epoch": 2.465736200176034, "grad_norm": 0.174083358236681, "kl": 0.11090087890625, "learning_rate": 4.918745577615559e-07, "loss": 0.0001, "reward": 1.758928619325161, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7633928712457418, "rewards/format_reward_func": 0.9955357164144516, "step": 14706 }, { "completion_length": 236.4866180419922, "epoch": 2.4660715034159018, "grad_norm": 0.18333538920013637, "kl": 0.104248046875, "learning_rate": 4.918715240228181e-07, "loss": 0.0001, "reward": 1.7321429327130318, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7321428954601288, "rewards/format_reward_func": 1.0, "step": 14708 }, { "completion_length": 249.57590198516846, "epoch": 2.4664068066557694, "grad_norm": 0.050767253993794496, "kl": 0.119903564453125, "learning_rate": 4.918684897272022e-07, "loss": 0.0001, "reward": 1.7482143491506577, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.7526785843074322, "rewards/format_reward_func": 0.9955357164144516, "step": 14710 }, { "completion_length": 240.40179634094238, "epoch": 2.466742109895637, "grad_norm": 0.20012551468951106, "kl": 0.118896484375, "learning_rate": 4.918654548747154e-07, "loss": 0.0001, "reward": 1.7625000774860382, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.766964316368103, "rewards/format_reward_func": 0.9955357164144516, "step": 14712 }, { "completion_length": 244.321439743042, "epoch": 2.467077413135504, "grad_norm": 0.18902951420859376, "kl": 0.1083984375, "learning_rate": 4.918624194653646e-07, "loss": 0.0001, "reward": 1.771428644657135, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.771428607404232, "rewards/format_reward_func": 1.0, "step": 14714 }, { "completion_length": 238.95983028411865, "epoch": 2.467412716375372, "grad_norm": 0.16578032130190862, "kl": 0.136688232421875, "learning_rate": 4.918593834991569e-07, "loss": 0.0001, "reward": 1.821428619325161, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.8214285895228386, "rewards/format_reward_func": 1.0, "step": 14716 }, { "completion_length": 240.70983123779297, "epoch": 2.4677480196152395, "grad_norm": 0.2250877718203696, "kl": 0.108428955078125, "learning_rate": 4.918563469760992e-07, "loss": 0.0001, "reward": 1.8000000715255737, "reward_std": 0.06060915254056454, "rewards/equation_reward_func": 0.8089285865426064, "rewards/format_reward_func": 0.9910714328289032, "step": 14718 }, { "completion_length": 248.5803689956665, "epoch": 2.468083322855107, "grad_norm": 0.13311610670420726, "kl": 0.112213134765625, "learning_rate": 4.918533098961985e-07, "loss": 0.0001, "reward": 1.8142857551574707, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857439815998, "rewards/format_reward_func": 1.0, "step": 14720 }, { "completion_length": 242.0312614440918, "epoch": 2.4684186260949748, "grad_norm": 0.17924485307370552, "kl": 0.13079833984375, "learning_rate": 4.918502722594619e-07, "loss": 0.0001, "reward": 1.7642857730388641, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7642857600003481, "rewards/format_reward_func": 1.0, "step": 14722 }, { "completion_length": 249.31697940826416, "epoch": 2.4687539293348424, "grad_norm": 0.2366084108225645, "kl": 0.127410888671875, "learning_rate": 4.918472340658961e-07, "loss": 0.0001, "reward": 1.807142935693264, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.8071428798139095, "rewards/format_reward_func": 1.0, "step": 14724 }, { "completion_length": 248.94644451141357, "epoch": 2.4690892325747096, "grad_norm": 0.1999633689515607, "kl": 0.123321533203125, "learning_rate": 4.918441953155085e-07, "loss": 0.0001, "reward": 1.7500000670552254, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.0, "step": 14726 }, { "completion_length": 251.47768878936768, "epoch": 2.469424535814577, "grad_norm": 0.1271370337954408, "kl": 0.123931884765625, "learning_rate": 4.918411560083058e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.796428594738245, "rewards/format_reward_func": 1.0, "step": 14728 }, { "completion_length": 249.41965293884277, "epoch": 2.469759839054445, "grad_norm": 0.11077971547908103, "kl": 0.110931396484375, "learning_rate": 4.918381161442951e-07, "loss": 0.0001, "reward": 1.7732143178582191, "reward_std": 0.017677669413387775, "rewards/equation_reward_func": 0.7776786051690578, "rewards/format_reward_func": 0.9955357164144516, "step": 14730 }, { "completion_length": 246.1651906967163, "epoch": 2.4700951422943125, "grad_norm": 0.21287808362756122, "kl": 0.1270751953125, "learning_rate": 4.918350757234834e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7821428701281548, "rewards/format_reward_func": 1.0, "step": 14732 }, { "completion_length": 250.77233409881592, "epoch": 2.47043044553418, "grad_norm": 0.1836312903043879, "kl": 0.114288330078125, "learning_rate": 4.918320347458777e-07, "loss": 0.0001, "reward": 1.7839286476373672, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7883928790688515, "rewards/format_reward_func": 0.9955357164144516, "step": 14734 }, { "completion_length": 248.48661708831787, "epoch": 2.4707657487740473, "grad_norm": 0.13193506465499494, "kl": 0.11639404296875, "learning_rate": 4.91828993211485e-07, "loss": 0.0001, "reward": 1.8285714760422707, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8285714536905289, "rewards/format_reward_func": 1.0, "step": 14736 }, { "completion_length": 257.45536613464355, "epoch": 2.471101052013915, "grad_norm": 0.14764558623734747, "kl": 0.123046875, "learning_rate": 4.918259511203122e-07, "loss": 0.0001, "reward": 1.8017857670783997, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.8062500152736902, "rewards/format_reward_func": 0.9955357164144516, "step": 14738 }, { "completion_length": 239.48215293884277, "epoch": 2.4714363552537826, "grad_norm": 0.22586394380238098, "kl": 0.103912353515625, "learning_rate": 4.918229084723665e-07, "loss": 0.0001, "reward": 1.82589291036129, "reward_std": 0.0340926474891603, "rewards/equation_reward_func": 0.8276785910129547, "rewards/format_reward_func": 0.9982142895460129, "step": 14740 }, { "completion_length": 254.2321548461914, "epoch": 2.47177165849365, "grad_norm": 0.25298517588656405, "kl": 0.115631103515625, "learning_rate": 4.918198652676547e-07, "loss": 0.0001, "reward": 1.8125000447034836, "reward_std": 0.06313453242182732, "rewards/equation_reward_func": 0.8169643059372902, "rewards/format_reward_func": 0.9955357164144516, "step": 14742 }, { "completion_length": 259.14287090301514, "epoch": 2.472106961733518, "grad_norm": 0.16073060918881626, "kl": 0.110382080078125, "learning_rate": 4.918168215061841e-07, "loss": 0.0001, "reward": 1.7964286357164383, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8053571619093418, "rewards/format_reward_func": 0.9910714328289032, "step": 14744 }, { "completion_length": 260.5535821914673, "epoch": 2.4724422649733855, "grad_norm": 0.3182146734872481, "kl": 0.12188720703125, "learning_rate": 4.918137771879614e-07, "loss": 0.0001, "reward": 1.76071435213089, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143316417933, "rewards/format_reward_func": 1.0, "step": 14746 }, { "completion_length": 251.88840579986572, "epoch": 2.4727775682132527, "grad_norm": 0.3186785826320642, "kl": 0.12640380859375, "learning_rate": 4.918107323129937e-07, "loss": 0.0001, "reward": 1.7964286282658577, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 14748 }, { "completion_length": 255.2187614440918, "epoch": 2.4731128714531203, "grad_norm": 0.2865873297261211, "kl": 0.111236572265625, "learning_rate": 4.91807686881288e-07, "loss": 0.0001, "reward": 1.7892857789993286, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.7892857491970062, "rewards/format_reward_func": 1.0, "step": 14750 }, { "completion_length": 248.99554920196533, "epoch": 2.473448174692988, "grad_norm": 0.1606260793115146, "kl": 0.104583740234375, "learning_rate": 4.918046408928515e-07, "loss": 0.0001, "reward": 1.8178572207689285, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8178571611642838, "rewards/format_reward_func": 1.0, "step": 14752 }, { "completion_length": 252.3928737640381, "epoch": 2.4737834779328556, "grad_norm": 0.3196020255422196, "kl": 0.107391357421875, "learning_rate": 4.91801594347691e-07, "loss": 0.0001, "reward": 1.792857214808464, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 14754 }, { "completion_length": 254.96429824829102, "epoch": 2.474118781172723, "grad_norm": 0.13566741367227556, "kl": 0.116729736328125, "learning_rate": 4.917985472458135e-07, "loss": 0.0001, "reward": 1.7517857775092125, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7562500275671482, "rewards/format_reward_func": 0.9955357164144516, "step": 14756 }, { "completion_length": 254.2500114440918, "epoch": 2.4744540844125904, "grad_norm": 0.20523246500830702, "kl": 0.113677978515625, "learning_rate": 4.917954995872262e-07, "loss": 0.0001, "reward": 1.773214340209961, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7767857313156128, "rewards/format_reward_func": 0.9964285716414452, "step": 14758 }, { "completion_length": 255.88393878936768, "epoch": 2.474789387652458, "grad_norm": 0.17088202418743903, "kl": 0.1111907958984375, "learning_rate": 4.917924513719359e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.750000037252903, "rewards/format_reward_func": 1.0, "step": 14760 }, { "completion_length": 248.63840770721436, "epoch": 2.4751246908923257, "grad_norm": 0.060042085703267425, "kl": 0.103485107421875, "learning_rate": 4.917894025999498e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7857143059372902, "rewards/format_reward_func": 1.0, "step": 14762 }, { "completion_length": 258.9285821914673, "epoch": 2.4754599941321933, "grad_norm": 0.10379061448307643, "kl": 0.118896484375, "learning_rate": 4.917863532712748e-07, "loss": 0.0001, "reward": 1.7750000730156898, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7839286029338837, "rewards/format_reward_func": 0.9910714328289032, "step": 14764 }, { "completion_length": 245.46876335144043, "epoch": 2.475795297372061, "grad_norm": 0.08166250755892202, "kl": 0.11309814453125, "learning_rate": 4.91783303385918e-07, "loss": 0.0001, "reward": 1.769642911851406, "reward_std": 0.022728431969881058, "rewards/equation_reward_func": 0.774107176810503, "rewards/format_reward_func": 0.9955357164144516, "step": 14766 }, { "completion_length": 242.16965579986572, "epoch": 2.4761306006119286, "grad_norm": 0.11966890403423652, "kl": 0.120269775390625, "learning_rate": 4.917802529438863e-07, "loss": 0.0001, "reward": 1.7857143506407738, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7857143096625805, "rewards/format_reward_func": 1.0, "step": 14768 }, { "completion_length": 244.65179538726807, "epoch": 2.476465903851796, "grad_norm": 0.06438476377658822, "kl": 0.12042236328125, "learning_rate": 4.91777201945187e-07, "loss": 0.0001, "reward": 1.7535714730620384, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7535714507102966, "rewards/format_reward_func": 1.0, "step": 14770 }, { "completion_length": 248.00000953674316, "epoch": 2.4768012070916634, "grad_norm": 0.25166210607398165, "kl": 0.1202392578125, "learning_rate": 4.917741503898268e-07, "loss": 0.0001, "reward": 1.8035714849829674, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8035714495927095, "rewards/format_reward_func": 1.0, "step": 14772 }, { "completion_length": 243.7455472946167, "epoch": 2.477136510331531, "grad_norm": 0.12492238999535173, "kl": 0.111572265625, "learning_rate": 4.917710982778129e-07, "loss": 0.0001, "reward": 1.7107143700122833, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7107143048197031, "rewards/format_reward_func": 1.0, "step": 14774 }, { "completion_length": 256.3303689956665, "epoch": 2.4774718135713987, "grad_norm": 0.23558559483290095, "kl": 0.10693359375, "learning_rate": 4.917680456091523e-07, "loss": 0.0001, "reward": 1.798214353621006, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.8026785925030708, "rewards/format_reward_func": 0.9955357164144516, "step": 14776 }, { "completion_length": 251.633939743042, "epoch": 2.4778071168112663, "grad_norm": 0.18466931698986533, "kl": 0.118194580078125, "learning_rate": 4.917649923838521e-07, "loss": 0.0001, "reward": 1.7607143595814705, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 14778 }, { "completion_length": 254.3214406967163, "epoch": 2.4781424200511335, "grad_norm": 0.18040244234467673, "kl": 0.122772216796875, "learning_rate": 4.917619386019191e-07, "loss": 0.0001, "reward": 1.7482143640518188, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7526785880327225, "rewards/format_reward_func": 0.9955357164144516, "step": 14780 }, { "completion_length": 246.53572463989258, "epoch": 2.478477723291001, "grad_norm": 0.0026001646841756804, "kl": 0.106536865234375, "learning_rate": 4.917588842633605e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8035714402794838, "rewards/format_reward_func": 1.0, "step": 14782 }, { "completion_length": 257.0714406967163, "epoch": 2.4788130265308688, "grad_norm": 0.2176137843266907, "kl": 0.1085205078125, "learning_rate": 4.917558293681834e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.778571454808116, "rewards/format_reward_func": 1.0, "step": 14784 }, { "completion_length": 252.01340293884277, "epoch": 2.4791483297707364, "grad_norm": 0.08079521274065427, "kl": 0.102691650390625, "learning_rate": 4.917527739163947e-07, "loss": 0.0001, "reward": 1.7750000804662704, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000320374966, "rewards/format_reward_func": 1.0, "step": 14786 }, { "completion_length": 241.19197750091553, "epoch": 2.479483633010604, "grad_norm": 0.06630944765696818, "kl": 0.09710693359375, "learning_rate": 4.917497179080013e-07, "loss": 0.0001, "reward": 1.8250000476837158, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8250000178813934, "rewards/format_reward_func": 1.0, "step": 14788 }, { "completion_length": 244.2857255935669, "epoch": 2.4798189362504717, "grad_norm": 0.15402009555722682, "kl": 0.108489990234375, "learning_rate": 4.917466613430107e-07, "loss": 0.0001, "reward": 1.7678572162985802, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 14790 }, { "completion_length": 253.34376430511475, "epoch": 2.4801542394903393, "grad_norm": 0.22679226122504897, "kl": 0.116607666015625, "learning_rate": 4.917436042214294e-07, "loss": 0.0001, "reward": 1.7464286386966705, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.755357164889574, "rewards/format_reward_func": 0.9910714328289032, "step": 14792 }, { "completion_length": 248.57143878936768, "epoch": 2.4804895427302065, "grad_norm": 0.14180333609406645, "kl": 0.107330322265625, "learning_rate": 4.917405465432649e-07, "loss": 0.0001, "reward": 1.757142923772335, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428939700127, "rewards/format_reward_func": 1.0, "step": 14794 }, { "completion_length": 250.1607255935669, "epoch": 2.480824845970074, "grad_norm": 0.21500318228619053, "kl": 0.118072509765625, "learning_rate": 4.91737488308524e-07, "loss": 0.0001, "reward": 1.7821428924798965, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428813040257, "rewards/format_reward_func": 1.0, "step": 14796 }, { "completion_length": 248.2678689956665, "epoch": 2.4811601492099418, "grad_norm": 0.17072486776960497, "kl": 0.1063385009765625, "learning_rate": 4.917344295172137e-07, "loss": 0.0001, "reward": 1.8000000566244125, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8000000268220901, "rewards/format_reward_func": 1.0, "step": 14798 }, { "completion_length": 241.50447845458984, "epoch": 2.4814954524498094, "grad_norm": 0.16210839854375575, "kl": 0.114166259765625, "learning_rate": 4.917313701693412e-07, "loss": 0.0001, "reward": 1.792857214808464, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571701049805, "rewards/format_reward_func": 1.0, "step": 14800 }, { "completion_length": 245.27233695983887, "epoch": 2.481830755689677, "grad_norm": 0.18009075898418456, "kl": 0.11663818359375, "learning_rate": 4.917283102649133e-07, "loss": 0.0001, "reward": 1.7821429371833801, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 14802 }, { "completion_length": 252.86161994934082, "epoch": 2.4821660589295442, "grad_norm": 0.22212604391809182, "kl": 0.1253662109375, "learning_rate": 4.917252498039374e-07, "loss": 0.0001, "reward": 1.7589286267757416, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7633928880095482, "rewards/format_reward_func": 0.9955357164144516, "step": 14804 }, { "completion_length": 247.05804824829102, "epoch": 2.482501362169412, "grad_norm": 0.17060599530084553, "kl": 0.10174560546875, "learning_rate": 4.917221887864202e-07, "loss": 0.0001, "reward": 1.776785783469677, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7812500186264515, "rewards/format_reward_func": 0.9955357164144516, "step": 14806 }, { "completion_length": 248.0044765472412, "epoch": 2.4828366654092795, "grad_norm": 0.19718760886727332, "kl": 0.108856201171875, "learning_rate": 4.91719127212369e-07, "loss": 0.0001, "reward": 1.7607143446803093, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143092900515, "rewards/format_reward_func": 1.0, "step": 14808 }, { "completion_length": 256.0312604904175, "epoch": 2.483171968649147, "grad_norm": 0.002740858816126789, "kl": 0.102996826171875, "learning_rate": 4.917160650817906e-07, "loss": 0.0001, "reward": 1.735714353621006, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7357143200933933, "rewards/format_reward_func": 1.0, "step": 14810 }, { "completion_length": 246.3884048461914, "epoch": 2.4835072718890148, "grad_norm": 0.24967328188320786, "kl": 0.11102294921875, "learning_rate": 4.917130023946924e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7678571790456772, "rewards/format_reward_func": 1.0, "step": 14812 }, { "completion_length": 247.1696548461914, "epoch": 2.4838425751288824, "grad_norm": 0.14186057536510902, "kl": 0.105712890625, "learning_rate": 4.917099391510811e-07, "loss": 0.0001, "reward": 1.8464286103844643, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8464285992085934, "rewards/format_reward_func": 1.0, "step": 14814 }, { "completion_length": 243.70090293884277, "epoch": 2.4841778783687496, "grad_norm": 0.16875123376236464, "kl": 0.115570068359375, "learning_rate": 4.917068753509639e-07, "loss": 0.0001, "reward": 1.7821429148316383, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.782142885029316, "rewards/format_reward_func": 1.0, "step": 14816 }, { "completion_length": 251.5178680419922, "epoch": 2.4845131816086172, "grad_norm": 0.18352318078910093, "kl": 0.108489990234375, "learning_rate": 4.917038109943479e-07, "loss": 0.0001, "reward": 1.7071429267525673, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7071429006755352, "rewards/format_reward_func": 1.0, "step": 14818 }, { "completion_length": 245.8437623977661, "epoch": 2.484848484848485, "grad_norm": 0.16090355001841908, "kl": 0.105133056640625, "learning_rate": 4.917007460812401e-07, "loss": 0.0001, "reward": 1.8214286118745804, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8214285857975483, "rewards/format_reward_func": 1.0, "step": 14820 }, { "completion_length": 246.42858219146729, "epoch": 2.4851837880883525, "grad_norm": 0.1380564013172623, "kl": 0.09600830078125, "learning_rate": 4.916976806116476e-07, "loss": 0.0001, "reward": 1.8196429163217545, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.8241071663796902, "rewards/format_reward_func": 0.9955357164144516, "step": 14822 }, { "completion_length": 246.72322750091553, "epoch": 2.48551909132822, "grad_norm": 0.15147188409912418, "kl": 0.097808837890625, "learning_rate": 4.916946145855774e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857357859612, "rewards/format_reward_func": 1.0, "step": 14824 }, { "completion_length": 249.93304634094238, "epoch": 2.4858543945680873, "grad_norm": 0.25257574082870304, "kl": 0.1040191650390625, "learning_rate": 4.916915480030365e-07, "loss": 0.0001, "reward": 1.7464286535978317, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.746428593993187, "rewards/format_reward_func": 1.0, "step": 14826 }, { "completion_length": 247.65626049041748, "epoch": 2.486189697807955, "grad_norm": 0.09384685704779432, "kl": 0.09881591796875, "learning_rate": 4.916884808640323e-07, "loss": 0.0001, "reward": 1.7464286163449287, "reward_std": 0.005050762556493282, "rewards/equation_reward_func": 0.7464286126196384, "rewards/format_reward_func": 1.0, "step": 14828 }, { "completion_length": 248.2321548461914, "epoch": 2.4865250010478226, "grad_norm": 0.16828925861257932, "kl": 0.105712890625, "learning_rate": 4.916854131685713e-07, "loss": 0.0001, "reward": 1.755357213318348, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.7598214708268642, "rewards/format_reward_func": 0.9955357164144516, "step": 14830 }, { "completion_length": 242.9285831451416, "epoch": 2.4868603042876902, "grad_norm": 0.15619959437078137, "kl": 0.10552978515625, "learning_rate": 4.916823449166611e-07, "loss": 0.0001, "reward": 1.7267857939004898, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.7312500365078449, "rewards/format_reward_func": 0.9955357164144516, "step": 14832 }, { "completion_length": 254.9955472946167, "epoch": 2.487195607527558, "grad_norm": 0.2567715716716288, "kl": 0.0962982177734375, "learning_rate": 4.916792761083084e-07, "loss": 0.0001, "reward": 1.7428572103381157, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7428571805357933, "rewards/format_reward_func": 1.0, "step": 14834 }, { "completion_length": 255.8794755935669, "epoch": 2.4875309107674255, "grad_norm": 0.13534000071902677, "kl": 0.105438232421875, "learning_rate": 4.916762067435204e-07, "loss": 0.0001, "reward": 1.7928571999073029, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7928571812808514, "rewards/format_reward_func": 1.0, "step": 14836 }, { "completion_length": 252.67858600616455, "epoch": 2.4878662140072927, "grad_norm": 0.24524325288408574, "kl": 0.112060546875, "learning_rate": 4.916731368223042e-07, "loss": 0.0001, "reward": 1.7142857983708382, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.714285746216774, "rewards/format_reward_func": 1.0, "step": 14838 }, { "completion_length": 254.23215198516846, "epoch": 2.4882015172471603, "grad_norm": 0.21139274158098173, "kl": 0.104644775390625, "learning_rate": 4.916700663446668e-07, "loss": 0.0001, "reward": 1.7607143446803093, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7607143297791481, "rewards/format_reward_func": 1.0, "step": 14840 }, { "completion_length": 259.102689743042, "epoch": 2.488536820487028, "grad_norm": 0.14614650915831776, "kl": 0.117706298828125, "learning_rate": 4.916669953106154e-07, "loss": 0.0001, "reward": 1.7214286401867867, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7214285917580128, "rewards/format_reward_func": 1.0, "step": 14842 }, { "completion_length": 248.5491189956665, "epoch": 2.4888721237268956, "grad_norm": 0.24970700735528656, "kl": 0.1097412109375, "learning_rate": 4.916639237201568e-07, "loss": 0.0001, "reward": 1.7357143610715866, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7357143126428127, "rewards/format_reward_func": 1.0, "step": 14844 }, { "completion_length": 250.30804538726807, "epoch": 2.4892074269667632, "grad_norm": 0.15992926840127522, "kl": 0.1121826171875, "learning_rate": 4.916608515732984e-07, "loss": 0.0001, "reward": 1.7035714834928513, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7035714685916901, "rewards/format_reward_func": 1.0, "step": 14846 }, { "completion_length": 249.04018878936768, "epoch": 2.4895427302066304, "grad_norm": 0.20425696513495342, "kl": 0.096221923828125, "learning_rate": 4.916577788700471e-07, "loss": 0.0001, "reward": 1.79464291036129, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7991071715950966, "rewards/format_reward_func": 0.9955357164144516, "step": 14848 }, { "completion_length": 255.94644165039062, "epoch": 2.489878033446498, "grad_norm": 0.2703017241730625, "kl": 0.104248046875, "learning_rate": 4.9165470561041e-07, "loss": 0.0001, "reward": 1.7660714834928513, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7705357447266579, "rewards/format_reward_func": 0.9955357164144516, "step": 14850 }, { "completion_length": 251.53572750091553, "epoch": 2.4902133366863657, "grad_norm": 0.08715365745771998, "kl": 0.109375, "learning_rate": 4.916516317943942e-07, "loss": 0.0001, "reward": 1.7464286237955093, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7464286014437675, "rewards/format_reward_func": 1.0, "step": 14852 }, { "completion_length": 242.1071548461914, "epoch": 2.4905486399262333, "grad_norm": 0.1947782628519035, "kl": 0.105072021484375, "learning_rate": 4.916485574220066e-07, "loss": 0.0001, "reward": 1.7767857760190964, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.7812500223517418, "rewards/format_reward_func": 0.9955357164144516, "step": 14854 }, { "completion_length": 244.5178689956665, "epoch": 2.490883943166101, "grad_norm": 0.17149906856996142, "kl": 0.099365234375, "learning_rate": 4.916454824932545e-07, "loss": 0.0001, "reward": 1.8321429193019867, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.8321428745985031, "rewards/format_reward_func": 1.0, "step": 14856 }, { "completion_length": 253.70536613464355, "epoch": 2.4912192464059686, "grad_norm": 0.225006882706186, "kl": 0.106292724609375, "learning_rate": 4.916424070081448e-07, "loss": 0.0001, "reward": 1.7357143685221672, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7357143089175224, "rewards/format_reward_func": 1.0, "step": 14858 }, { "completion_length": 258.6651906967163, "epoch": 2.491554549645836, "grad_norm": 0.19474515995727912, "kl": 0.110687255859375, "learning_rate": 4.916393309666849e-07, "loss": 0.0001, "reward": 1.7892857640981674, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7892857491970062, "rewards/format_reward_func": 1.0, "step": 14860 }, { "completion_length": 252.4821548461914, "epoch": 2.4918898528857034, "grad_norm": 0.25165869699775634, "kl": 0.0942230224609375, "learning_rate": 4.916362543688816e-07, "loss": 0.0001, "reward": 1.7392857819795609, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.739285746589303, "rewards/format_reward_func": 1.0, "step": 14862 }, { "completion_length": 253.31697750091553, "epoch": 2.492225156125571, "grad_norm": 0.4725974865113826, "kl": 0.11981201171875, "learning_rate": 4.91633177214742e-07, "loss": 0.0001, "reward": 1.7517857924103737, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7562500238418579, "rewards/format_reward_func": 0.9955357164144516, "step": 14864 }, { "completion_length": 246.87947463989258, "epoch": 2.4925604593654387, "grad_norm": 0.16160872714609656, "kl": 0.10552978515625, "learning_rate": 4.916300995042732e-07, "loss": 0.0001, "reward": 1.8142857626080513, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857514321804, "rewards/format_reward_func": 1.0, "step": 14866 }, { "completion_length": 250.52679443359375, "epoch": 2.4928957626053063, "grad_norm": 0.09402971091803046, "kl": 0.097747802734375, "learning_rate": 4.916270212374824e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8035714440047741, "rewards/format_reward_func": 1.0, "step": 14868 }, { "completion_length": 256.6116189956665, "epoch": 2.4932310658451735, "grad_norm": 0.16822883694092985, "kl": 0.1134033203125, "learning_rate": 4.916239424143766e-07, "loss": 0.0001, "reward": 1.7375000640749931, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7419643253087997, "rewards/format_reward_func": 0.9955357164144516, "step": 14870 }, { "completion_length": 247.7544765472412, "epoch": 2.493566369085041, "grad_norm": 0.17898003366783408, "kl": 0.10797119140625, "learning_rate": 4.916208630349628e-07, "loss": 0.0001, "reward": 1.835714340209961, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.835714302957058, "rewards/format_reward_func": 1.0, "step": 14872 }, { "completion_length": 237.7410831451416, "epoch": 2.493901672324909, "grad_norm": 0.15222057489193858, "kl": 0.100433349609375, "learning_rate": 4.916177830992482e-07, "loss": 0.0001, "reward": 1.8107143267989159, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.8107143044471741, "rewards/format_reward_func": 1.0, "step": 14874 }, { "completion_length": 253.51786994934082, "epoch": 2.4942369755647764, "grad_norm": 0.16371729482573208, "kl": 0.10443115234375, "learning_rate": 4.9161470260724e-07, "loss": 0.0001, "reward": 1.7732143476605415, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7776785977184772, "rewards/format_reward_func": 0.9955357164144516, "step": 14876 }, { "completion_length": 263.071439743042, "epoch": 2.494572278804644, "grad_norm": 0.19259044880307022, "kl": 0.119903564453125, "learning_rate": 4.916116215589451e-07, "loss": 0.0001, "reward": 1.7732143476605415, "reward_std": 0.0883883461356163, "rewards/equation_reward_func": 0.7776786051690578, "rewards/format_reward_func": 0.9955357164144516, "step": 14878 }, { "completion_length": 253.60268878936768, "epoch": 2.4949075820445117, "grad_norm": 0.13104247323147747, "kl": 0.109039306640625, "learning_rate": 4.916085399543707e-07, "loss": 0.0001, "reward": 1.7321429178118706, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7321428973227739, "rewards/format_reward_func": 1.0, "step": 14880 }, { "completion_length": 247.28126335144043, "epoch": 2.495242885284379, "grad_norm": 0.0019853055196513178, "kl": 0.099822998046875, "learning_rate": 4.916054577935238e-07, "loss": 0.0001, "reward": 1.791071467101574, "reward_std": 0.012626906856894493, "rewards/equation_reward_func": 0.7955357432365417, "rewards/format_reward_func": 0.9955357164144516, "step": 14882 }, { "completion_length": 252.61608600616455, "epoch": 2.4955781885242465, "grad_norm": 0.18710983271999201, "kl": 0.117523193359375, "learning_rate": 4.916023750764116e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 14884 }, { "completion_length": 252.84375858306885, "epoch": 2.495913491764114, "grad_norm": 0.18797587280849457, "kl": 0.1116943359375, "learning_rate": 4.915992918030412e-07, "loss": 0.0001, "reward": 1.8125000596046448, "reward_std": 0.03282995708286762, "rewards/equation_reward_func": 0.8169643096625805, "rewards/format_reward_func": 0.9955357164144516, "step": 14886 }, { "completion_length": 247.48662090301514, "epoch": 2.496248795003982, "grad_norm": 0.27004660458537005, "kl": 0.1292724609375, "learning_rate": 4.915962079734195e-07, "loss": 0.0001, "reward": 1.7964286357164383, "reward_std": 0.04545686300843954, "rewards/equation_reward_func": 0.7964285910129547, "rewards/format_reward_func": 1.0, "step": 14888 }, { "completion_length": 252.14733219146729, "epoch": 2.4965840982438494, "grad_norm": 0.12158380895645579, "kl": 0.120574951171875, "learning_rate": 4.915931235875538e-07, "loss": 0.0001, "reward": 1.7785714864730835, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 14890 }, { "completion_length": 261.8794765472412, "epoch": 2.4969194014837166, "grad_norm": 0.12848837493711257, "kl": 0.1173095703125, "learning_rate": 4.915900386454512e-07, "loss": 0.0001, "reward": 1.817857213318348, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.8178571723401546, "rewards/format_reward_func": 1.0, "step": 14892 }, { "completion_length": 257.508939743042, "epoch": 2.4972547047235842, "grad_norm": 0.37214306927685425, "kl": 0.11920166015625, "learning_rate": 4.915869531471188e-07, "loss": 0.0001, "reward": 1.7642857804894447, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857357859612, "rewards/format_reward_func": 1.0, "step": 14894 }, { "completion_length": 257.415189743042, "epoch": 2.497590007963452, "grad_norm": 0.19139334417935533, "kl": 0.1249847412109375, "learning_rate": 4.915838670925636e-07, "loss": 0.0001, "reward": 1.778571479022503, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7785714585334063, "rewards/format_reward_func": 1.0, "step": 14896 }, { "completion_length": 258.83037090301514, "epoch": 2.4979253112033195, "grad_norm": 0.1739975764465377, "kl": 0.11895751953125, "learning_rate": 4.915807804817927e-07, "loss": 0.0001, "reward": 1.7625000402331352, "reward_std": 0.06313453521579504, "rewards/equation_reward_func": 0.775892885401845, "rewards/format_reward_func": 0.9866071492433548, "step": 14898 }, { "completion_length": 257.80358028411865, "epoch": 2.498260614443187, "grad_norm": 0.12046699172711495, "kl": 0.124237060546875, "learning_rate": 4.915776933148135e-07, "loss": 0.0001, "reward": 1.81428574770689, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8142857402563095, "rewards/format_reward_func": 1.0, "step": 14900 }, { "completion_length": 263.2187604904175, "epoch": 2.498595917683055, "grad_norm": 0.35322802369354245, "kl": 0.1260986328125, "learning_rate": 4.915746055916327e-07, "loss": 0.0001, "reward": 1.7803571969270706, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7848214488476515, "rewards/format_reward_func": 0.9955357164144516, "step": 14902 }, { "completion_length": 246.6428689956665, "epoch": 2.4989312209229224, "grad_norm": 0.004772484218738324, "kl": 0.1209869384765625, "learning_rate": 4.915715173122575e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 14904 }, { "completion_length": 258.6875104904175, "epoch": 2.4992665241627896, "grad_norm": 0.14919904400282405, "kl": 0.1072998046875, "learning_rate": 4.915684284766953e-07, "loss": 0.0001, "reward": 1.7785714715719223, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714641213417, "rewards/format_reward_func": 1.0, "step": 14906 }, { "completion_length": 253.6205472946167, "epoch": 2.4996018274026572, "grad_norm": 0.25196101649712355, "kl": 0.10772705078125, "learning_rate": 4.915653390849529e-07, "loss": 0.0001, "reward": 1.832142911851406, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8321428708732128, "rewards/format_reward_func": 1.0, "step": 14908 }, { "completion_length": 253.071439743042, "epoch": 2.499937130642525, "grad_norm": 0.1945168808074864, "kl": 0.124420166015625, "learning_rate": 4.915622491370376e-07, "loss": 0.0001, "reward": 1.7821429073810577, "reward_std": 0.055558389984071255, "rewards/equation_reward_func": 0.7910714522004128, "rewards/format_reward_func": 0.9910714328289032, "step": 14910 }, { "completion_length": 250.1384038925171, "epoch": 2.5002724338823925, "grad_norm": 0.15781246856605685, "kl": 0.1234130859375, "learning_rate": 4.915591586329563e-07, "loss": 0.0001, "reward": 1.7928571850061417, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7928571812808514, "rewards/format_reward_func": 1.0, "step": 14912 }, { "completion_length": 255.00894165039062, "epoch": 2.5006077371222597, "grad_norm": 0.2201245062876672, "kl": 0.155975341796875, "learning_rate": 4.915560675727164e-07, "loss": 0.0002, "reward": 1.8053572103381157, "reward_std": 0.05303300730884075, "rewards/equation_reward_func": 0.8098214566707611, "rewards/format_reward_func": 0.9955357164144516, "step": 14914 }, { "completion_length": 252.48215579986572, "epoch": 2.5009430403621273, "grad_norm": 0.14755556808748146, "kl": 0.123779296875, "learning_rate": 4.915529759563248e-07, "loss": 0.0001, "reward": 1.796428620815277, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964286096394062, "rewards/format_reward_func": 1.0, "step": 14916 }, { "completion_length": 250.58483123779297, "epoch": 2.501278343601995, "grad_norm": 0.16455636465986687, "kl": 0.120697021484375, "learning_rate": 4.915498837837887e-07, "loss": 0.0001, "reward": 1.785714365541935, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857143133878708, "rewards/format_reward_func": 1.0, "step": 14918 }, { "completion_length": 260.75894260406494, "epoch": 2.5016136468418626, "grad_norm": 0.17272260899471706, "kl": 0.1383056640625, "learning_rate": 4.915467910551153e-07, "loss": 0.0001, "reward": 1.814285784959793, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8142857477068901, "rewards/format_reward_func": 1.0, "step": 14920 }, { "completion_length": 256.6205463409424, "epoch": 2.5019489500817302, "grad_norm": 0.19178782600827965, "kl": 0.25885009765625, "learning_rate": 4.915436977703114e-07, "loss": 0.0003, "reward": 1.8107143566012383, "reward_std": 0.055558388121426105, "rewards/equation_reward_func": 0.8107143063098192, "rewards/format_reward_func": 1.0, "step": 14922 }, { "completion_length": 258.1964416503906, "epoch": 2.502284253321598, "grad_norm": 0.39726649210975157, "kl": 0.156768798828125, "learning_rate": 4.915406039293845e-07, "loss": 0.0002, "reward": 1.7821428924798965, "reward_std": 0.06565991416573524, "rewards/equation_reward_func": 0.7910714633762836, "rewards/format_reward_func": 0.9910714328289032, "step": 14924 }, { "completion_length": 268.22769355773926, "epoch": 2.5026195565614655, "grad_norm": 0.2066904623535352, "kl": 0.150665283203125, "learning_rate": 4.915375095323417e-07, "loss": 0.0002, "reward": 1.780357226729393, "reward_std": 0.0681852949783206, "rewards/equation_reward_func": 0.7848214618861675, "rewards/format_reward_func": 0.9955357164144516, "step": 14926 }, { "completion_length": 257.2232246398926, "epoch": 2.5029548598013327, "grad_norm": 0.21913485169174565, "kl": 0.150390625, "learning_rate": 4.915344145791898e-07, "loss": 0.0002, "reward": 1.7142858058214188, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.7142857611179352, "rewards/format_reward_func": 1.0, "step": 14928 }, { "completion_length": 263.6607255935669, "epoch": 2.5032901630412003, "grad_norm": 0.2649583866173159, "kl": 0.17791748046875, "learning_rate": 4.915313190699362e-07, "loss": 0.0002, "reward": 1.7160714864730835, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7205357383936644, "rewards/format_reward_func": 0.9955357164144516, "step": 14930 }, { "completion_length": 266.9955472946167, "epoch": 2.503625466281068, "grad_norm": 0.20459147581001738, "kl": 0.191192626953125, "learning_rate": 4.915282230045878e-07, "loss": 0.0002, "reward": 1.7732143253087997, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7776786051690578, "rewards/format_reward_func": 0.9955357164144516, "step": 14932 }, { "completion_length": 255.90180015563965, "epoch": 2.5039607695209356, "grad_norm": 0.25484307197764544, "kl": 0.142059326171875, "learning_rate": 4.91525126383152e-07, "loss": 0.0001, "reward": 1.7964286357164383, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7964286021888256, "rewards/format_reward_func": 1.0, "step": 14934 }, { "completion_length": 267.54912090301514, "epoch": 2.504296072760803, "grad_norm": 0.15722467533159465, "kl": 0.2144775390625, "learning_rate": 4.915220292056359e-07, "loss": 0.0002, "reward": 1.753571517765522, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714730620384, "rewards/format_reward_func": 1.0, "step": 14936 }, { "completion_length": 250.33483219146729, "epoch": 2.5046313760006704, "grad_norm": 0.13771303946559407, "kl": 0.122711181640625, "learning_rate": 4.915189314720465e-07, "loss": 0.0001, "reward": 1.7982143387198448, "reward_std": 0.04293148219585419, "rewards/equation_reward_func": 0.802678607404232, "rewards/format_reward_func": 0.9955357164144516, "step": 14938 }, { "completion_length": 247.95536708831787, "epoch": 2.504966679240538, "grad_norm": 0.15147710220354377, "kl": 0.14837646484375, "learning_rate": 4.915158331823909e-07, "loss": 0.0001, "reward": 1.796428643167019, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7964285910129547, "rewards/format_reward_func": 1.0, "step": 14940 }, { "completion_length": 252.75000953674316, "epoch": 2.5053019824804057, "grad_norm": 0.09408187522982184, "kl": 0.2237548828125, "learning_rate": 4.915127343366763e-07, "loss": 0.0002, "reward": 1.760714367032051, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7607143111526966, "rewards/format_reward_func": 1.0, "step": 14942 }, { "completion_length": 250.71430015563965, "epoch": 2.5056372857202733, "grad_norm": 0.09563232486259707, "kl": 0.1910400390625, "learning_rate": 4.915096349349098e-07, "loss": 0.0002, "reward": 1.7875000312924385, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.7919643223285675, "rewards/format_reward_func": 0.9955357164144516, "step": 14944 }, { "completion_length": 249.0937623977661, "epoch": 2.505972588960141, "grad_norm": 0.17560738401963297, "kl": 0.17327880859375, "learning_rate": 4.915065349770987e-07, "loss": 0.0002, "reward": 1.7428572252392769, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7428571805357933, "rewards/format_reward_func": 1.0, "step": 14946 }, { "completion_length": 247.321439743042, "epoch": 2.5063078922000086, "grad_norm": 0.4454910772698461, "kl": 0.14111328125, "learning_rate": 4.9150343446325e-07, "loss": 0.0001, "reward": 1.7571429312229156, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7571428865194321, "rewards/format_reward_func": 1.0, "step": 14948 }, { "completion_length": 249.8571538925171, "epoch": 2.506643195439876, "grad_norm": 0.24536262681171336, "kl": 0.170684814453125, "learning_rate": 4.915003333933708e-07, "loss": 0.0002, "reward": 1.7937500774860382, "reward_std": 0.049244935624301434, "rewards/equation_reward_func": 0.7955357395112514, "rewards/format_reward_func": 0.9982142895460129, "step": 14950 }, { "completion_length": 238.4687614440918, "epoch": 2.5069784986797434, "grad_norm": 0.25230609198150733, "kl": 0.148651123046875, "learning_rate": 4.914972317674683e-07, "loss": 0.0001, "reward": 1.782142885029316, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428831666708, "rewards/format_reward_func": 1.0, "step": 14952 }, { "completion_length": 250.16072368621826, "epoch": 2.507313801919611, "grad_norm": 0.1501950138335119, "kl": 0.25262451171875, "learning_rate": 4.914941295855496e-07, "loss": 0.0003, "reward": 1.7785714715719223, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 1.0, "step": 14954 }, { "completion_length": 233.84376335144043, "epoch": 2.5076491051594787, "grad_norm": 0.10516466220468688, "kl": 0.159942626953125, "learning_rate": 4.91491026847622e-07, "loss": 0.0002, "reward": 1.7857143431901932, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.7857142947614193, "rewards/format_reward_func": 1.0, "step": 14956 }, { "completion_length": 245.0982265472412, "epoch": 2.507984408399346, "grad_norm": 0.22351192817369384, "kl": 0.1419677734375, "learning_rate": 4.914879235536924e-07, "loss": 0.0001, "reward": 1.8250000402331352, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.8250000290572643, "rewards/format_reward_func": 1.0, "step": 14958 }, { "completion_length": 234.88840293884277, "epoch": 2.5083197116392135, "grad_norm": 0.10862469988182176, "kl": 0.117095947265625, "learning_rate": 4.914848197037681e-07, "loss": 0.0001, "reward": 1.7232143506407738, "reward_std": 0.02777919452637434, "rewards/equation_reward_func": 0.727678595110774, "rewards/format_reward_func": 0.9955357164144516, "step": 14960 }, { "completion_length": 246.5982255935669, "epoch": 2.508655014879081, "grad_norm": 0.34570987382328267, "kl": 0.117919921875, "learning_rate": 4.914817152978561e-07, "loss": 0.0001, "reward": 1.7428572103381157, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7428571693599224, "rewards/format_reward_func": 1.0, "step": 14962 }, { "completion_length": 245.7232265472412, "epoch": 2.508990318118949, "grad_norm": 0.16383356087510462, "kl": 0.11492919921875, "learning_rate": 4.914786103359639e-07, "loss": 0.0001, "reward": 1.7678571939468384, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7678571902215481, "rewards/format_reward_func": 1.0, "step": 14964 }, { "completion_length": 229.80804634094238, "epoch": 2.5093256213588164, "grad_norm": 0.3562128005704095, "kl": 0.173583984375, "learning_rate": 4.914755048180981e-07, "loss": 0.0002, "reward": 1.7875000536441803, "reward_std": 0.058083769865334034, "rewards/equation_reward_func": 0.7919643074274063, "rewards/format_reward_func": 0.9955357164144516, "step": 14966 }, { "completion_length": 235.73215293884277, "epoch": 2.509660924598684, "grad_norm": 0.20378825155179361, "kl": 0.159698486328125, "learning_rate": 4.914723987442664e-07, "loss": 0.0002, "reward": 1.7785715013742447, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7785714603960514, "rewards/format_reward_func": 1.0, "step": 14968 }, { "completion_length": 240.1250114440918, "epoch": 2.5099962278385517, "grad_norm": 0.0031109090255418614, "kl": 0.117584228515625, "learning_rate": 4.914692921144756e-07, "loss": 0.0001, "reward": 1.8500000312924385, "reward_std": 0.010101525112986565, "rewards/equation_reward_func": 0.8500000312924385, "rewards/format_reward_func": 1.0, "step": 14970 }, { "completion_length": 243.4732265472412, "epoch": 2.510331531078419, "grad_norm": 0.1671077710760345, "kl": 0.12384033203125, "learning_rate": 4.91466184928733e-07, "loss": 0.0001, "reward": 1.8285714611411095, "reward_std": 0.04040610045194626, "rewards/equation_reward_func": 0.8285714574158192, "rewards/format_reward_func": 1.0, "step": 14972 }, { "completion_length": 243.3616180419922, "epoch": 2.5106668343182865, "grad_norm": 0.0040697662114411014, "kl": 0.123992919921875, "learning_rate": 4.914630771870457e-07, "loss": 0.0001, "reward": 1.7607143372297287, "reward_std": 0.015152287669479847, "rewards/equation_reward_func": 0.7607143260538578, "rewards/format_reward_func": 1.0, "step": 14974 }, { "completion_length": 233.06251049041748, "epoch": 2.511002137558154, "grad_norm": 0.2969220308731453, "kl": 0.117645263671875, "learning_rate": 4.914599688894208e-07, "loss": 0.0001, "reward": 1.7928572073578835, "reward_std": 0.06060915067791939, "rewards/equation_reward_func": 0.7928571663796902, "rewards/format_reward_func": 1.0, "step": 14976 }, { "completion_length": 244.55804443359375, "epoch": 2.511337440798022, "grad_norm": 0.14675272791289684, "kl": 0.14947509765625, "learning_rate": 4.914568600358656e-07, "loss": 0.0001, "reward": 1.7750000432133675, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7750000432133675, "rewards/format_reward_func": 1.0, "step": 14978 }, { "completion_length": 242.08929824829102, "epoch": 2.5116727440378894, "grad_norm": 0.23792415652608712, "kl": 0.123748779296875, "learning_rate": 4.914537506263871e-07, "loss": 0.0001, "reward": 1.7285715118050575, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7285714522004128, "rewards/format_reward_func": 1.0, "step": 14980 }, { "completion_length": 237.70983219146729, "epoch": 2.5120080472777566, "grad_norm": 0.16721352864248729, "kl": 0.1259307861328125, "learning_rate": 4.914506406609927e-07, "loss": 0.0001, "reward": 1.7732143476605415, "reward_std": 0.037880719639360905, "rewards/equation_reward_func": 0.7776786014437675, "rewards/format_reward_func": 0.9955357164144516, "step": 14982 }, { "completion_length": 242.0669755935669, "epoch": 2.5123433505176243, "grad_norm": 0.1793158228133712, "kl": 0.114105224609375, "learning_rate": 4.914475301396893e-07, "loss": 0.0001, "reward": 1.767857201397419, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7678571715950966, "rewards/format_reward_func": 1.0, "step": 14984 }, { "completion_length": 234.45090198516846, "epoch": 2.512678653757492, "grad_norm": 0.1808318129904359, "kl": 0.1202392578125, "learning_rate": 4.914444190624842e-07, "loss": 0.0001, "reward": 1.778571493923664, "reward_std": 0.02020305022597313, "rewards/equation_reward_func": 0.7785714492201805, "rewards/format_reward_func": 1.0, "step": 14986 }, { "completion_length": 232.07590198516846, "epoch": 2.5130139569973595, "grad_norm": 0.00458665470919745, "kl": 0.172943115234375, "learning_rate": 4.914413074293845e-07, "loss": 0.0002, "reward": 1.7642857655882835, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7642857395112514, "rewards/format_reward_func": 1.0, "step": 14988 }, { "completion_length": 241.29018783569336, "epoch": 2.513349260237227, "grad_norm": 0.21982994897106903, "kl": 0.13336181640625, "learning_rate": 4.914381952403974e-07, "loss": 0.0001, "reward": 1.8142857626080513, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8142857402563095, "rewards/format_reward_func": 1.0, "step": 14990 }, { "completion_length": 230.55358123779297, "epoch": 2.513684563477095, "grad_norm": 0.23499309856862302, "kl": 0.129669189453125, "learning_rate": 4.9143508249553e-07, "loss": 0.0001, "reward": 1.7535715028643608, "reward_std": 0.035355337895452976, "rewards/equation_reward_func": 0.7535714767873287, "rewards/format_reward_func": 1.0, "step": 14992 }, { "completion_length": 240.4241180419922, "epoch": 2.5140198667169624, "grad_norm": 0.15111986044046005, "kl": 0.165252685546875, "learning_rate": 4.914319691947896e-07, "loss": 0.0002, "reward": 1.807142898440361, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.8071428798139095, "rewards/format_reward_func": 1.0, "step": 14994 }, { "completion_length": 227.11608028411865, "epoch": 2.5143551699568296, "grad_norm": 0.13801908442240357, "kl": 0.121490478515625, "learning_rate": 4.914288553381833e-07, "loss": 0.0001, "reward": 1.782142922282219, "reward_std": 0.02525381278246641, "rewards/equation_reward_func": 0.7821428906172514, "rewards/format_reward_func": 1.0, "step": 14996 }, { "completion_length": 232.95090293884277, "epoch": 2.5146904731966973, "grad_norm": 0.13381548188941406, "kl": 0.12237548828125, "learning_rate": 4.914257409257182e-07, "loss": 0.0001, "reward": 1.8071429058909416, "reward_std": 0.05050762556493282, "rewards/equation_reward_func": 0.8071428798139095, "rewards/format_reward_func": 1.0, "step": 14998 }, { "completion_length": 230.81250953674316, "epoch": 2.515025776436565, "grad_norm": 0.003928641599324006, "kl": 0.128692626953125, "learning_rate": 4.914226259574015e-07, "loss": 0.0001, "reward": 1.7357143610715866, "reward_std": 0.030304575338959694, "rewards/equation_reward_func": 0.7357143014669418, "rewards/format_reward_func": 1.0, "step": 15000 } ], "logging_steps": 2, "max_steps": 134996, "num_input_tokens_seen": 0, "num_train_epochs": 23, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }