diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,1570 +1,50230 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.0, - "eval_steps": 100, - "global_step": 566, + "epoch": 0.9997759689343589, + "eval_steps": 500, + "global_step": 3347, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "completion_length": 1013.5505332946777, - "epoch": 0.0088339222614841, - "grad_norm": 0.05966813489794731, - "kl": 0.00011321306228637696, - "learning_rate": 1.7543859649122807e-06, + "clip_ratio": 0.0, + "completion_length": 1023.6071472167969, + "epoch": 0.00029870808752146963, + "grad_norm": 0.0751478523015976, + "kl": 0.0, + "learning_rate": 5.970149253731344e-08, + "loss": 0.0015, + "reward": 0.2885044738650322, + "reward_std": 0.015625000465661287, + "rewards/accuracy_reward": 0.03794643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.250558041036129, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.8102874755859, + "epoch": 0.0005974161750429393, + "grad_norm": 0.05037630721926689, + "kl": 0.0, + "learning_rate": 1.1940298507462688e-07, + "loss": 0.0003, + "reward": 0.3270089402794838, + "reward_std": 0.008173753973096609, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2555803582072258, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.0008961242625644089, + "grad_norm": 0.0352492593228817, + "kl": 0.00017976760864257812, + "learning_rate": 1.7910447761194033e-07, + "loss": 0.0, + "reward": 0.2522321492433548, + "reward_std": 0.003992978483438492, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2522321492433548, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.0011948323500858785, + "grad_norm": 0.0006523040356114507, + "kl": 0.0001761913299560547, + "learning_rate": 2.3880597014925377e-07, "loss": 0.0, - "reward": 0.04401041765231639, - "reward_std": 0.055143982777372, - "rewards/accuracy_reward": 0.04401041765231639, + "reward": 0.321428582072258, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.25, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.9754638671875, + "epoch": 0.0014935404376073482, + "grad_norm": 0.05841083824634552, + "kl": 0.00017952919006347656, + "learning_rate": 2.9850746268656716e-07, + "loss": 0.0043, + "reward": 0.3286830484867096, + "reward_std": 0.008063508197665215, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2572544664144516, "step": 5 }, { - "completion_length": 1006.8708503723144, - "epoch": 0.0176678445229682, - "grad_norm": 0.09209787100553513, - "kl": 0.0002698063850402832, - "learning_rate": 3.5087719298245615e-06, + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.0017922485251288178, + "grad_norm": 0.078989677131176, + "kl": 0.00018906593322753906, + "learning_rate": 3.5820895522388065e-07, + "loss": 0.0, + "reward": 0.2879464402794838, + "reward_std": 0.00845726439729333, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2522321492433548, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.0020909566126502874, + "grad_norm": 0.06333835422992706, + "kl": 0.00018548965454101562, + "learning_rate": 4.179104477611941e-07, + "loss": 0.0, + "reward": 0.2500000074505806, + "reward_std": 0.004464285913854837, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2500000074505806, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.002389664700171757, + "grad_norm": 0.054782044142484665, + "kl": 0.00018787384033203125, + "learning_rate": 4.776119402985075e-07, + "loss": 0.0, + "reward": 0.3225446566939354, + "reward_std": 0.004464285913854837, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.251116082072258, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.0026883727876932267, + "grad_norm": 0.03450215980410576, + "kl": 0.0001900196075439453, + "learning_rate": 5.373134328358209e-07, + "loss": 0.0, + "reward": 0.2868303656578064, + "reward_std": 0.0030496877152472734, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2511160746216774, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.0029870808752146963, + "grad_norm": 0.07214590907096863, + "kl": 0.0001811981201171875, + "learning_rate": 5.970149253731343e-07, "loss": 0.0, - "reward": 0.07447916835080833, - "reward_std": 0.09157753810286522, - "rewards/accuracy_reward": 0.07447916835080833, + "reward": 0.3264509066939354, + "reward_std": 0.018674687715247273, + "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2527901828289032, "step": 10 }, { - "completion_length": 976.4518432617188, - "epoch": 0.026501766784452298, - "grad_norm": 0.0886874720454216, - "kl": 0.0022804975509643556, - "learning_rate": 5.263157894736842e-06, - "loss": 0.0001, - "reward": 0.1317708358168602, - "reward_std": 0.11912086028605699, - "rewards/accuracy_reward": 0.1317708358168602, + "clip_ratio": 0.0, + "completion_length": 1023.6450958251953, + "epoch": 0.003285788962736166, + "grad_norm": 0.06939146667718887, + "kl": 0.00018024444580078125, + "learning_rate": 6.567164179104478e-07, + "loss": 0.0006, + "reward": 0.2935267984867096, + "reward_std": 0.011653360910713673, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2578125074505806, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.0035844970502576356, + "grad_norm": 0.03321776166558266, + "kl": 0.00017189979553222656, + "learning_rate": 7.164179104477613e-07, + "loss": 0.0, + "reward": 0.321986623108387, + "reward_std": 0.0022321429569274187, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.250558041036129, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.6383972167969, + "epoch": 0.003883205137779105, + "grad_norm": 0.0678122490644455, + "kl": 0.00017642974853515625, + "learning_rate": 7.761194029850747e-07, + "loss": 0.0027, + "reward": 0.2924107313156128, + "reward_std": 0.008928571827709675, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2566964402794838, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.004181913225300575, + "grad_norm": 0.03163949027657509, + "kl": 0.00018024444580078125, + "learning_rate": 8.358208955223882e-07, + "loss": 0.0, + "reward": 0.321986623108387, + "reward_std": 0.0022321429569274187, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.250558041036129, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2477722167969, + "epoch": 0.004480621312822045, + "grad_norm": 0.06948882341384888, + "kl": 0.00019431114196777344, + "learning_rate": 8.955223880597015e-07, + "loss": 0.0015, + "reward": 0.2946428656578064, + "reward_std": 0.01989922858774662, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2566964328289032, "step": 15 }, { - "completion_length": 903.1588745117188, - "epoch": 0.0353356890459364, - "grad_norm": 0.10459358990192413, - "kl": 0.009592819213867187, - "learning_rate": 7.017543859649123e-06, - "loss": 0.0004, - "reward": 0.22578125605359672, - "reward_std": 0.15993888727389277, - "rewards/accuracy_reward": 0.22578125605359672, + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.004779329400343514, + "grad_norm": 0.03945206478238106, + "kl": 0.00020074844360351562, + "learning_rate": 9.55223880597015e-07, + "loss": 0.0, + "reward": 0.2885044738650322, + "reward_std": 0.01116071455180645, + "rewards/accuracy_reward": 0.03794643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.250558041036129, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.005078037487864984, + "grad_norm": 0.05198342725634575, + "kl": 0.00016546249389648438, + "learning_rate": 1.0149253731343285e-06, + "loss": 0.0, + "reward": 0.290736623108387, + "reward_std": 0.020089286379516125, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.250558041036129, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.0848388671875, + "epoch": 0.005376745575386453, + "grad_norm": 0.05902159586548805, + "kl": 0.000194549560546875, + "learning_rate": 1.0746268656716418e-06, + "loss": 0.002, + "reward": 0.3270089477300644, + "reward_std": 0.006506371544674039, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2555803656578064, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.2522430419922, + "epoch": 0.0056754536629079234, + "grad_norm": 0.07848473638296127, + "kl": 0.00019431114196777344, + "learning_rate": 1.1343283582089555e-06, + "loss": 0.0053, + "reward": 0.3130580484867096, + "reward_std": 0.05599551647901535, + "rewards/accuracy_reward": 0.0558035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2572544738650322, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.005974161750429393, + "grad_norm": 0.03742428869009018, + "kl": 0.0001838207244873047, + "learning_rate": 1.1940298507462686e-06, + "loss": 0.0, + "reward": 0.333147332072258, + "reward_std": 0.019755832850933075, + "rewards/accuracy_reward": 0.0803571492433548, "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2527901828289032, "step": 20 }, { - "completion_length": 697.5211124420166, - "epoch": 0.044169611307420496, - "grad_norm": 0.1883080005645752, - "kl": 0.032245635986328125, - "learning_rate": 8.771929824561405e-06, + "clip_ratio": 0.0, + "completion_length": 1023.7053680419922, + "epoch": 0.006272869837950863, + "grad_norm": 0.046205632388591766, + "kl": 0.000179290771484375, + "learning_rate": 1.253731343283582e-06, + "loss": 0.0003, + "reward": 0.3264509066939354, + "reward_std": 0.006842837436124682, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.255022332072258, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2410888671875, + "epoch": 0.006571577925472332, + "grad_norm": 0.06906045228242874, + "kl": 0.00018525123596191406, + "learning_rate": 1.3134328358208956e-06, + "loss": 0.0019, + "reward": 0.258928582072258, + "reward_std": 0.02029980206862092, + "rewards/accuracy_reward": 0.004464285913854837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2544642984867096, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.006870286012993802, + "grad_norm": 0.0024495425168424845, + "kl": 0.0001976490020751953, + "learning_rate": 1.373134328358209e-06, + "loss": 0.0, + "reward": 0.285714291036129, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.25, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.9375152587891, + "epoch": 0.007168994100515271, + "grad_norm": 0.07108546793460846, + "kl": 0.00021314620971679688, + "learning_rate": 1.4328358208955226e-06, "loss": 0.0013, - "reward": 0.31822917638346554, - "reward_std": 0.21630380251444875, - "rewards/accuracy_reward": 0.31822917638346554, + "reward": 0.3839285895228386, + "reward_std": 0.04322826047427952, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2566964402794838, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.007467702188036741, + "grad_norm": 0.045215509831905365, + "kl": 0.0002143383026123047, + "learning_rate": 1.4925373134328358e-06, + "loss": 0.0, + "reward": 0.2885044738650322, + "reward_std": 0.01116071455180645, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.250558041036129, "step": 25 }, { - "completion_length": 606.0070510864258, - "epoch": 0.053003533568904596, - "grad_norm": 0.14597563445568085, - "kl": 0.030352783203125, - "learning_rate": 1.0526315789473684e-05, - "loss": 0.0012, - "reward": 0.36692709662020206, - "reward_std": 0.22790117720142006, - "rewards/accuracy_reward": 0.36510417982935905, - "rewards/format_reward": 0.001822916720993817, + "clip_ratio": 0.0, + "completion_length": 1023.7522430419922, + "epoch": 0.00776641027555821, + "grad_norm": 0.07724171876907349, + "kl": 0.00022912025451660156, + "learning_rate": 1.5522388059701494e-06, + "loss": 0.0009, + "reward": 0.297433041036129, + "reward_std": 0.030520827043801546, + "rewards/accuracy_reward": 0.04241071501746774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.255022332072258, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.7142944335938, + "epoch": 0.00806511836307968, + "grad_norm": 0.03978069871664047, + "kl": 0.0002627372741699219, + "learning_rate": 1.6119402985074628e-06, + "loss": 0.0007, + "reward": 0.3939732313156128, + "reward_std": 0.0030496877152472734, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2511160746216774, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.3303680419922, + "epoch": 0.00836382645060115, + "grad_norm": 0.03692740947008133, + "kl": 0.00023889541625976562, + "learning_rate": 1.6716417910447764e-06, + "loss": 0.0008, + "reward": 0.2890625074505806, + "reward_std": 0.004464285913854837, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2533482164144516, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.7455444335938, + "epoch": 0.008662534538122619, + "grad_norm": 0.09612418711185455, + "kl": 0.00029921531677246094, + "learning_rate": 1.7313432835820898e-06, + "loss": 0.0067, + "reward": 0.2979910895228386, + "reward_std": 0.05986018409021199, + "rewards/accuracy_reward": 0.031250000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.266741082072258, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.5424194335938, + "epoch": 0.00896124262564409, + "grad_norm": 0.09618248790502548, + "kl": 0.00033283233642578125, + "learning_rate": 1.791044776119403e-06, + "loss": 0.0023, + "reward": 0.3733259066939354, + "reward_std": 0.016252488363534212, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2661830484867096, "step": 30 }, { - "completion_length": 636.2380416870117, - "epoch": 0.061837455830388695, - "grad_norm": 0.20492921769618988, - "kl": 0.09047088623046876, - "learning_rate": 1.2280701754385966e-05, - "loss": 0.0036, - "reward": 0.3328125111525878, - "reward_std": 0.220460736611858, - "rewards/accuracy_reward": 0.3221354271983728, - "rewards/format_reward": 0.010677083604969084, + "clip_ratio": 0.0, + "completion_length": 1018.0357208251953, + "epoch": 0.009259950713165559, + "grad_norm": 0.08126083016395569, + "kl": 0.0003542900085449219, + "learning_rate": 1.8507462686567165e-06, + "loss": 0.0048, + "reward": 0.3085937649011612, + "reward_std": 0.030236493097618222, + "rewards/accuracy_reward": 0.04687500186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2617187574505806, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2901916503906, + "epoch": 0.009558658800687028, + "grad_norm": 0.06632189452648163, + "kl": 0.0003542900085449219, + "learning_rate": 1.91044776119403e-06, + "loss": 0.0009, + "reward": 0.340959832072258, + "reward_std": 0.028546550776809454, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2561383992433548, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.1138610839844, + "epoch": 0.009857366888208497, + "grad_norm": 0.07241165637969971, + "kl": 0.0004105567932128906, + "learning_rate": 1.9701492537313433e-06, + "loss": 0.002, + "reward": 0.3007812649011612, + "reward_std": 0.028213097481057048, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2561384066939354, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.010156074975729968, + "grad_norm": 0.08299077302217484, + "kl": 0.000446319580078125, + "learning_rate": 2.029850746268657e-06, + "loss": 0.0, + "reward": 0.301897332072258, + "reward_std": 0.035454992670565844, + "rewards/accuracy_reward": 0.04687500209547579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2550223395228386, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.3236846923828, + "epoch": 0.010454783063251438, + "grad_norm": 0.06465920805931091, + "kl": 0.0005779266357421875, + "learning_rate": 2.08955223880597e-06, + "loss": 0.0038, + "reward": 0.3152901902794838, + "reward_std": 0.026652695145457983, + "rewards/accuracy_reward": 0.05357143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2617187574505806, "step": 35 }, { - "completion_length": 618.694030380249, - "epoch": 0.0706713780918728, - "grad_norm": 0.2138100117444992, - "kl": 0.16015625, - "learning_rate": 1.4035087719298246e-05, - "loss": 0.0064, - "reward": 1.017968778871, - "reward_std": 0.4298093942925334, - "rewards/accuracy_reward": 0.3385416760342196, - "rewards/format_reward": 0.6794270988553762, + "clip_ratio": 0.0, + "completion_length": 1022.6517944335938, + "epoch": 0.010753491150772907, + "grad_norm": 0.08378256857395172, + "kl": 0.0006313323974609375, + "learning_rate": 2.1492537313432837e-06, + "loss": 0.0012, + "reward": 0.2801339402794838, + "reward_std": 0.033984160516411066, + "rewards/accuracy_reward": 0.0200892873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2600446566939354, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.4665222167969, + "epoch": 0.011052199238294378, + "grad_norm": 0.09047219902276993, + "kl": 0.00070953369140625, + "learning_rate": 2.2089552238805973e-06, + "loss": 0.0006, + "reward": 0.297991082072258, + "reward_std": 0.029569087782874703, + "rewards/accuracy_reward": 0.0401785746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2578125074505806, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.7968902587891, + "epoch": 0.011350907325815847, + "grad_norm": 0.07533952593803406, + "kl": 0.00069427490234375, + "learning_rate": 2.268656716417911e-06, + "loss": 0.0005, + "reward": 0.2901785895228386, + "reward_std": 0.011506952345371246, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2544642984867096, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.7901916503906, + "epoch": 0.011649615413337316, + "grad_norm": 0.07415170222520828, + "kl": 0.000820159912109375, + "learning_rate": 2.328358208955224e-06, + "loss": 0.001, + "reward": 0.266183041036129, + "reward_std": 0.02455122722312808, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2572544664144516, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.2031402587891, + "epoch": 0.011948323500858785, + "grad_norm": 0.10437313467264175, + "kl": 0.0010995864868164062, + "learning_rate": 2.3880597014925373e-06, + "loss": 0.0065, + "reward": 0.3683035895228386, + "reward_std": 0.06828411226160824, + "rewards/accuracy_reward": 0.10044643399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2678571566939354, "step": 40 }, { - "completion_length": 624.4760665893555, - "epoch": 0.07950530035335689, - "grad_norm": 0.2600502073764801, - "kl": 0.16715087890625, - "learning_rate": 1.578947368421053e-05, - "loss": 0.0067, - "reward": 1.152864619344473, - "reward_std": 0.4190581990405917, - "rewards/accuracy_reward": 0.33307292573153974, - "rewards/format_reward": 0.8197916857898235, + "clip_ratio": 0.0, + "completion_length": 1023.7589416503906, + "epoch": 0.012247031588380256, + "grad_norm": 0.09542272239923477, + "kl": 0.0010671615600585938, + "learning_rate": 2.447761194029851e-06, + "loss": 0.0006, + "reward": 0.3671875149011612, + "reward_std": 0.03191147721372545, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.255580373108387, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.012545739675901725, + "grad_norm": 0.0874330922961235, + "kl": 0.000988006591796875, + "learning_rate": 2.507462686567164e-06, + "loss": 0.0, + "reward": 0.2924107313156128, + "reward_std": 0.021127322455868125, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2522321566939354, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.8817138671875, + "epoch": 0.012844447763423195, + "grad_norm": 0.10233943909406662, + "kl": 0.001270294189453125, + "learning_rate": 2.5671641791044776e-06, + "loss": 0.004, + "reward": 0.2779017984867096, + "reward_std": 0.07127907313406467, + "rewards/accuracy_reward": 0.017857144121080637, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2600446492433548, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.5268249511719, + "epoch": 0.013143155850944664, + "grad_norm": 0.12922854721546173, + "kl": 0.001861572265625, + "learning_rate": 2.6268656716417912e-06, + "loss": 0.0066, + "reward": 0.3270089402794838, + "reward_std": 0.0643874048255384, + "rewards/accuracy_reward": 0.05580357392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2712053656578064, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.3102874755859, + "epoch": 0.013441863938466135, + "grad_norm": 0.12331593781709671, + "kl": 0.0018310546875, + "learning_rate": 2.686567164179105e-06, + "loss": 0.0037, + "reward": 0.3052455559372902, + "reward_std": 0.027776234317570925, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2695312649011612, "step": 45 }, { - "completion_length": 615.0362155914306, - "epoch": 0.08833922261484099, - "grad_norm": 0.23412857949733734, - "kl": 0.25006103515625, - "learning_rate": 1.754385964912281e-05, - "loss": 0.01, - "reward": 1.0809896141290665, - "reward_std": 0.48847288116812704, - "rewards/accuracy_reward": 0.32395834361668674, - "rewards/format_reward": 0.7570312678813934, + "clip_ratio": 0.0, + "completion_length": 1023.1473388671875, + "epoch": 0.013740572025987604, + "grad_norm": 0.0911710262298584, + "kl": 0.0016422271728515625, + "learning_rate": 2.746268656716418e-06, + "loss": 0.0032, + "reward": 0.3281250149011612, + "reward_std": 0.02321313344873488, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2544642984867096, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.1339569091797, + "epoch": 0.014039280113509073, + "grad_norm": 0.119549460709095, + "kl": 0.0025844573974609375, + "learning_rate": 2.8059701492537316e-06, + "loss": 0.0043, + "reward": 0.3448660895228386, + "reward_std": 0.04036526405252516, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.271205373108387, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.8192443847656, + "epoch": 0.014337988201030542, + "grad_norm": 0.12551471590995789, + "kl": 0.0030918121337890625, + "learning_rate": 2.8656716417910452e-06, + "loss": 0.0024, + "reward": 0.2974330484867096, + "reward_std": 0.030765820061787963, + "rewards/accuracy_reward": 0.0334821455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2639509066939354, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.7500305175781, + "epoch": 0.014636696288552013, + "grad_norm": 0.10964136570692062, + "kl": 0.003856658935546875, + "learning_rate": 2.925373134328359e-06, + "loss": 0.0003, + "reward": 0.3085937649011612, + "reward_std": 0.04489441215991974, + "rewards/accuracy_reward": 0.042410716181620955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2661830484867096, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.497802734375, + "epoch": 0.014935404376073482, + "grad_norm": 0.1490764617919922, + "kl": 0.00424957275390625, + "learning_rate": 2.9850746268656716e-06, + "loss": 0.005, + "reward": 0.3214285895228386, + "reward_std": 0.09050451219081879, + "rewards/accuracy_reward": 0.03794643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2834821492433548, "step": 50 }, { - "completion_length": 676.0700714111329, - "epoch": 0.09717314487632508, - "grad_norm": 0.17208200693130493, - "kl": 0.298388671875, - "learning_rate": 1.929824561403509e-05, - "loss": 0.0119, - "reward": 1.0783854477107524, - "reward_std": 0.5204509677365422, - "rewards/accuracy_reward": 0.35520834298804405, - "rewards/format_reward": 0.7231771059334278, + "clip_ratio": 0.0, + "completion_length": 1019.3750305175781, + "epoch": 0.015234112463594952, + "grad_norm": 0.12326274812221527, + "kl": 0.004718780517578125, + "learning_rate": 3.044776119402985e-06, + "loss": 0.0066, + "reward": 0.380022332072258, + "reward_std": 0.034625288331881166, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2728794813156128, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.7433471679688, + "epoch": 0.01553282055111642, + "grad_norm": 0.14518426358699799, + "kl": 0.005863189697265625, + "learning_rate": 3.1044776119402988e-06, + "loss": 0.0045, + "reward": 0.3275669738650322, + "reward_std": 0.059175061993300915, + "rewards/accuracy_reward": 0.04910714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.278459832072258, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.8571929931641, + "epoch": 0.01583152863863789, + "grad_norm": 0.17860957980155945, + "kl": 0.0060272216796875, + "learning_rate": 3.164179104477612e-06, + "loss": 0.0078, + "reward": 0.3716518059372902, + "reward_std": 0.10336522944271564, + "rewards/accuracy_reward": 0.08482143259607255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2868303656578064, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.0335235595703, + "epoch": 0.01613023672615936, + "grad_norm": 0.15927623212337494, + "kl": 0.00890350341796875, + "learning_rate": 3.2238805970149255e-06, + "loss": 0.0041, + "reward": 0.4017857238650322, + "reward_std": 0.08400714676827192, + "rewards/accuracy_reward": 0.1049107164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2968750223517418, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.9844055175781, + "epoch": 0.016428944813680832, + "grad_norm": 0.15508444607257843, + "kl": 0.00812530517578125, + "learning_rate": 3.283582089552239e-06, + "loss": 0.0093, + "reward": 0.314174123108387, + "reward_std": 0.06957660522311926, + "rewards/accuracy_reward": 0.0200892873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.294084832072258, "step": 55 }, { - "completion_length": 713.4130424499511, - "epoch": 0.10600706713780919, - "grad_norm": 3.232530117034912, - "kl": 1.23798828125, - "learning_rate": 1.9998285788966027e-05, - "loss": 0.0495, - "reward": 0.8888021018356085, - "reward_std": 0.6034601878374815, - "rewards/accuracy_reward": 0.30364583982154725, - "rewards/format_reward": 0.5851562701165676, + "clip_ratio": 0.0, + "completion_length": 993.435302734375, + "epoch": 0.0167276529012023, + "grad_norm": 0.16929394006729126, + "kl": 0.01177215576171875, + "learning_rate": 3.3432835820895528e-06, + "loss": 0.017, + "reward": 0.317522332072258, + "reward_std": 0.09692493174225092, + "rewards/accuracy_reward": 0.017857143888249993, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2996651828289032, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.3504943847656, + "epoch": 0.01702636098872377, + "grad_norm": 0.23093706369400024, + "kl": 0.0146026611328125, + "learning_rate": 3.402985074626866e-06, + "loss": 0.0231, + "reward": 0.380022332072258, + "reward_std": 0.12357715051621199, + "rewards/accuracy_reward": 0.0625000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.317522332072258, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.1183471679688, + "epoch": 0.017325069076245238, + "grad_norm": 0.192862868309021, + "kl": 0.01385498046875, + "learning_rate": 3.4626865671641795e-06, + "loss": 0.0111, + "reward": 0.426897332072258, + "reward_std": 0.1292344257235527, + "rewards/accuracy_reward": 0.11383929569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3130580484867096, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.7768402099609, + "epoch": 0.01762377716376671, + "grad_norm": 0.20153336226940155, + "kl": 0.01800537109375, + "learning_rate": 3.5223880597014927e-06, + "loss": 0.0148, + "reward": 0.4168526977300644, + "reward_std": 0.0970492959022522, + "rewards/accuracy_reward": 0.07589285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.340959832072258, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.2054138183594, + "epoch": 0.01792248525128818, + "grad_norm": 0.18050658702850342, + "kl": 0.0183258056640625, + "learning_rate": 3.582089552238806e-06, + "loss": 0.0165, + "reward": 0.3577008992433548, + "reward_std": 0.1040149750187993, + "rewards/accuracy_reward": 0.05357143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3041294813156128, "step": 60 }, { - "completion_length": 632.6083526611328, - "epoch": 0.11484098939929328, - "grad_norm": 7.755772590637207, - "kl": 6.648046875, - "learning_rate": 1.998781218310425e-05, - "loss": 0.2658, - "reward": 0.7747396051883697, - "reward_std": 0.5958925042301416, - "rewards/accuracy_reward": 0.23645834093913437, - "rewards/format_reward": 0.5382812663912773, + "clip_ratio": 0.0, + "completion_length": 942.3683471679688, + "epoch": 0.018221193338809647, + "grad_norm": 0.23832127451896667, + "kl": 0.031219482421875, + "learning_rate": 3.6417910447761195e-06, + "loss": 0.0369, + "reward": 0.4012276902794838, + "reward_std": 0.11255132593214512, + "rewards/accuracy_reward": 0.04464285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3565848395228386, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.9821929931641, + "epoch": 0.018519901426331118, + "grad_norm": 0.20858728885650635, + "kl": 0.03289794921875, + "learning_rate": 3.701492537313433e-06, + "loss": 0.0239, + "reward": 0.3889509066939354, + "reward_std": 0.10311859752982855, + "rewards/accuracy_reward": 0.006696428870782256, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3822544813156128, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.9799499511719, + "epoch": 0.01881860951385259, + "grad_norm": 0.22513005137443542, + "kl": 0.03155517578125, + "learning_rate": 3.7611940298507467e-06, + "loss": 0.0311, + "reward": 0.372767873108387, + "reward_std": 0.12530755810439587, + "rewards/accuracy_reward": 0.011160715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3616071566939354, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.1518402099609, + "epoch": 0.019117317601374056, + "grad_norm": 0.2475094348192215, + "kl": 0.04022216796875, + "learning_rate": 3.82089552238806e-06, + "loss": 0.0286, + "reward": 0.5474330633878708, + "reward_std": 0.16661866754293442, + "rewards/accuracy_reward": 0.14732143748551607, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4001116305589676, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.2567291259766, + "epoch": 0.019416025688895527, + "grad_norm": 0.2404819130897522, + "kl": 0.04083251953125, + "learning_rate": 3.8805970149253735e-06, + "loss": 0.0245, + "reward": 0.3777901977300644, + "reward_std": 0.1338137723505497, + "rewards/accuracy_reward": 0.011160714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3666294738650322, "step": 65 }, { - "completion_length": 557.6237148284912, - "epoch": 0.12367491166077739, - "grad_norm": 4.744003772735596, - "kl": 3.2984375, - "learning_rate": 1.9967827272672407e-05, - "loss": 0.132, - "reward": 1.003645859658718, - "reward_std": 0.5127768199890852, - "rewards/accuracy_reward": 0.24166667386889457, - "rewards/format_reward": 0.7619791872799396, - "step": 70 + "clip_ratio": 0.0, + "completion_length": 957.7902221679688, + "epoch": 0.019714733776416995, + "grad_norm": 0.24281099438667297, + "kl": 0.04852294921875, + "learning_rate": 3.940298507462687e-06, + "loss": 0.0332, + "reward": 0.4458705484867096, + "reward_std": 0.156218895688653, + "rewards/accuracy_reward": 0.058035717345774174, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3878348395228386, + "step": 66 }, { - "completion_length": 557.2198093414306, - "epoch": 0.13250883392226148, - "grad_norm": 3.193718671798706, - "kl": 2.5065673828125, - "learning_rate": 1.993835008912268e-05, - "loss": 0.1003, - "reward": 1.1052083663642407, - "reward_std": 0.4220279671251774, - "rewards/accuracy_reward": 0.2424479237990454, - "rewards/format_reward": 0.8627604350447655, - "step": 75 + "clip_ratio": 0.0, + "completion_length": 964.0424499511719, + "epoch": 0.020013441863938466, + "grad_norm": 0.22950349748134613, + "kl": 0.05462646484375, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0206, + "reward": 0.4810268059372902, + "reward_std": 0.15524197556078434, + "rewards/accuracy_reward": 0.08035714458674192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.400669664144516, + "step": 67 }, { - "completion_length": 776.5580970764161, - "epoch": 0.1413427561837456, - "grad_norm": 2.637300729751587, - "kl": 4.01103515625, - "learning_rate": 1.9899408703314383e-05, - "loss": 0.1607, - "reward": 0.6713541846722364, - "reward_std": 0.5498738510534167, - "rewards/accuracy_reward": 0.14401042095851152, - "rewards/format_reward": 0.5273437643423676, - "step": 80 + "clip_ratio": 0.0, + "completion_length": 925.8370819091797, + "epoch": 0.020312149951459937, + "grad_norm": 0.24157212674617767, + "kl": 0.0689697265625, + "learning_rate": 4.059701492537314e-06, + "loss": 0.0256, + "reward": 0.541294664144516, + "reward_std": 0.1676010899245739, + "rewards/accuracy_reward": 0.11830357369035482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.422991082072258, + "step": 68 }, { - "completion_length": 687.9679878234863, - "epoch": 0.1501766784452297, - "grad_norm": 0.6225027441978455, - "kl": 0.4655029296875, - "learning_rate": 1.985104019878233e-05, - "loss": 0.0186, - "reward": 1.0481771223247052, - "reward_std": 0.4813871243968606, - "rewards/accuracy_reward": 0.25260417349636555, - "rewards/format_reward": 0.7955729350447655, - "step": 85 + "clip_ratio": 0.0, + "completion_length": 883.8504791259766, + "epoch": 0.020610858038981404, + "grad_norm": 0.26598554849624634, + "kl": 0.07861328125, + "learning_rate": 4.119402985074627e-06, + "loss": 0.0338, + "reward": 0.4983259215950966, + "reward_std": 0.11011723428964615, + "rewards/accuracy_reward": 0.053571430034935474, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4447544887661934, + "step": 69 }, { - "completion_length": 603.3086116790771, - "epoch": 0.15901060070671377, - "grad_norm": 0.5264981985092163, - "kl": 0.457861328125, - "learning_rate": 1.9793290636422503e-05, - "loss": 0.0183, - "reward": 1.109895868599415, - "reward_std": 0.4686966996639967, - "rewards/accuracy_reward": 0.2692708411952481, - "rewards/format_reward": 0.8406250208616257, - "step": 90 + "clip_ratio": 0.0, + "completion_length": 873.9754791259766, + "epoch": 0.020909566126502875, + "grad_norm": 0.2744751572608948, + "kl": 0.0899658203125, + "learning_rate": 4.17910447761194e-06, + "loss": 0.0456, + "reward": 0.5368303805589676, + "reward_std": 0.1843670718371868, + "rewards/accuracy_reward": 0.08482143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.452008955180645, + "step": 70 }, { - "completion_length": 667.2664253234864, - "epoch": 0.16784452296819788, - "grad_norm": 0.7446408271789551, - "kl": 0.4615966796875, - "learning_rate": 1.9726215010628717e-05, - "loss": 0.0185, - "reward": 1.0520833618938923, - "reward_std": 0.48771157767623663, - "rewards/accuracy_reward": 0.25989584187045695, - "rewards/format_reward": 0.7921875193715096, - "step": 95 + "clip_ratio": 0.0, + "completion_length": 827.6518249511719, + "epoch": 0.021208274214024346, + "grad_norm": 0.19786466658115387, + "kl": 0.0999755859375, + "learning_rate": 4.238805970149254e-06, + "loss": 0.0193, + "reward": 0.5664062798023224, + "reward_std": 0.11361684463918209, + "rewards/accuracy_reward": 0.08258929010480642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169813156128, + "step": 71 }, { - "completion_length": 597.6562690734863, - "epoch": 0.17667844522968199, - "grad_norm": 0.5097666382789612, - "kl": 0.4584228515625, - "learning_rate": 1.96498771969219e-05, - "loss": 0.0183, - "reward": 1.035937537997961, - "reward_std": 0.4228799358010292, - "rewards/accuracy_reward": 0.17239583788905294, - "rewards/format_reward": 0.8635416880249978, - "step": 100 + "clip_ratio": 0.0, + "completion_length": 863.5625457763672, + "epoch": 0.021506982301545814, + "grad_norm": 0.2514093518257141, + "kl": 0.113037109375, + "learning_rate": 4.298507462686567e-06, + "loss": 0.0266, + "reward": 0.5731026977300644, + "reward_std": 0.11702866107225418, + "rewards/accuracy_reward": 0.10937500605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4637276977300644, + "step": 72 }, { - "epoch": 0.17667844522968199, - "eval_completion_length": 580.9896065848214, - "eval_kl": 0.375, - "eval_loss": 0.014898359775543213, - "eval_reward": 1.0907738719667708, - "eval_reward_std": 0.3588516584464482, - "eval_rewards/accuracy_reward": 0.18303572067192622, - "eval_rewards/format_reward": 0.9077381065913609, - "eval_runtime": 60.5321, - "eval_samples_per_second": 1.635, - "eval_steps_per_second": 0.033, - "step": 100 + "clip_ratio": 0.0, + "completion_length": 814.0647735595703, + "epoch": 0.021805690389067284, + "grad_norm": 0.22492463886737823, + "kl": 0.1239013671875, + "learning_rate": 4.358208955223881e-06, + "loss": 0.015, + "reward": 0.6484375298023224, + "reward_std": 0.09207698702812195, + "rewards/accuracy_reward": 0.17187500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4765625223517418, + "step": 73 }, { - "completion_length": 598.6833534240723, - "epoch": 0.1855123674911661, - "grad_norm": 1.9710701704025269, - "kl": 0.380517578125, - "learning_rate": 1.9564349891122017e-05, - "loss": 0.0152, - "reward": 1.1434896238148213, - "reward_std": 0.3401633009314537, - "rewards/accuracy_reward": 0.22395833923947067, - "rewards/format_reward": 0.9195312671363354, - "step": 105 + "clip_ratio": 0.0, + "completion_length": 876.6272735595703, + "epoch": 0.022104398476588755, + "grad_norm": 0.28744417428970337, + "kl": 0.1357421875, + "learning_rate": 4.417910447761195e-06, + "loss": 0.0254, + "reward": 0.5212053880095482, + "reward_std": 0.13485162984579802, + "rewards/accuracy_reward": 0.05133928847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4698660969734192, + "step": 74 }, { - "completion_length": 603.9349147796631, - "epoch": 0.19434628975265017, - "grad_norm": 1.9823681116104126, - "kl": 340.9431396484375, - "learning_rate": 1.946971454012051e-05, - "loss": 13.6598, - "reward": 1.080468788743019, - "reward_std": 0.444459193572402, - "rewards/accuracy_reward": 0.24557292219251395, - "rewards/format_reward": 0.8348958536982536, - "step": 110 + "clip_ratio": 0.0, + "completion_length": 864.1741485595703, + "epoch": 0.022403106564110223, + "grad_norm": 0.29183611273765564, + "kl": 0.15673828125, + "learning_rate": 4.477611940298508e-06, + "loss": 0.0287, + "reward": 0.5664062649011612, + "reward_std": 0.140617111697793, + "rewards/accuracy_reward": 0.09598214644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4704241380095482, + "step": 75 }, { - "completion_length": 521.6375156402588, - "epoch": 0.20318021201413428, - "grad_norm": 0.2644859850406647, - "kl": 0.611279296875, - "learning_rate": 1.9366061264319112e-05, - "loss": 0.0245, - "reward": 1.164322955161333, - "reward_std": 0.30429360857233406, - "rewards/accuracy_reward": 0.22239583961199968, - "rewards/format_reward": 0.9419271014630795, - "step": 115 + "clip_ratio": 0.0, + "completion_length": 862.1495971679688, + "epoch": 0.022701814651631694, + "grad_norm": 0.3076688349246979, + "kl": 0.201416015625, + "learning_rate": 4.537313432835822e-06, + "loss": 0.0276, + "reward": 0.537946455180645, + "reward_std": 0.07972304616123438, + "rewards/accuracy_reward": 0.06250000302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.475446455180645, + "step": 76 }, { - "completion_length": 610.4583511352539, - "epoch": 0.21201413427561838, - "grad_norm": 3.911526679992676, - "kl": 0.66932373046875, - "learning_rate": 1.9253488771809024e-05, - "loss": 0.0268, - "reward": 1.0346354454755784, - "reward_std": 0.4576338665559888, - "rewards/accuracy_reward": 0.252083339728415, - "rewards/format_reward": 0.7825521025806665, - "step": 120 + "clip_ratio": 0.0, + "completion_length": 858.2098693847656, + "epoch": 0.02300052273915316, + "grad_norm": 0.34871283173561096, + "kl": 0.2197265625, + "learning_rate": 4.597014925373134e-06, + "loss": 0.0316, + "reward": 0.5641741305589676, + "reward_std": 0.11127364751882851, + "rewards/accuracy_reward": 0.08035714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169813156128, + "step": 77 }, { - "completion_length": 543.1140796661377, - "epoch": 0.22084805653710246, - "grad_norm": 30.835792541503906, - "kl": 2.0382080078125, - "learning_rate": 1.9132104264372065e-05, - "loss": 0.0816, - "reward": 1.1661458693444728, - "reward_std": 0.40763126518577336, - "rewards/accuracy_reward": 0.27812500749714675, - "rewards/format_reward": 0.8880208529531955, - "step": 125 + "clip_ratio": 0.0, + "completion_length": 915.0826263427734, + "epoch": 0.023299230826674632, + "grad_norm": 0.3924891948699951, + "kl": 0.26806640625, + "learning_rate": 4.656716417910448e-06, + "loss": 0.0398, + "reward": 0.5998884215950966, + "reward_std": 0.13031796365976334, + "rewards/accuracy_reward": 0.1361607238650322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4637276977300644, + "step": 78 }, { - "completion_length": 499.7648586273193, - "epoch": 0.22968197879858657, - "grad_norm": 5.783499240875244, - "kl": 0.8605224609375, - "learning_rate": 1.9002023335393366e-05, - "loss": 0.0344, - "reward": 1.2244791992008686, - "reward_std": 0.36452554948627947, - "rewards/accuracy_reward": 0.31171875859145076, - "rewards/format_reward": 0.9127604357898236, - "step": 130 + "clip_ratio": 0.0, + "completion_length": 930.4888916015625, + "epoch": 0.023597938914196103, + "grad_norm": 0.37868160009384155, + "kl": 0.287109375, + "learning_rate": 4.716417910447761e-06, + "loss": 0.0338, + "reward": 0.5033482387661934, + "reward_std": 0.08646446093916893, + "rewards/accuracy_reward": 0.03794643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4654018133878708, + "step": 79 }, { - "completion_length": 511.65626640319823, - "epoch": 0.23851590106007067, - "grad_norm": 3.061586856842041, - "kl": 2.383056640625, - "learning_rate": 1.8863369859782824e-05, - "loss": 0.0954, - "reward": 1.143489620089531, - "reward_std": 0.4212816862389445, - "rewards/accuracy_reward": 0.2687500066822395, - "rewards/format_reward": 0.8747396059334278, - "step": 135 + "clip_ratio": 0.0, + "completion_length": 908.9665679931641, + "epoch": 0.02389664700171757, + "grad_norm": 0.3620331883430481, + "kl": 0.2880859375, + "learning_rate": 4.7761194029850745e-06, + "loss": 0.0292, + "reward": 0.538504496216774, + "reward_std": 0.16548570804297924, + "rewards/accuracy_reward": 0.07589285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4626116305589676, + "step": 80 }, { - "completion_length": 474.1388137817383, - "epoch": 0.24734982332155478, - "grad_norm": 0.26143816113471985, - "kl": 0.4742919921875, - "learning_rate": 1.8716275876010135e-05, - "loss": 0.019, - "reward": 1.250520869344473, - "reward_std": 0.3101725110784173, - "rewards/accuracy_reward": 0.30026042584795504, - "rewards/format_reward": 0.9502604372799397, - "step": 140 + "clip_ratio": 0.0, + "completion_length": 938.8504791259766, + "epoch": 0.02419535508923904, + "grad_norm": 0.5609309077262878, + "kl": 0.3037109375, + "learning_rate": 4.8358208955223885e-06, + "loss": 0.0338, + "reward": 0.5563616305589676, + "reward_std": 0.18622678145766258, + "rewards/accuracy_reward": 0.13616072130389512, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.4179687649011612, + "step": 81 }, { - "completion_length": 490.17579498291013, - "epoch": 0.25618374558303886, - "grad_norm": 0.5461937189102173, - "kl": 0.3491943359375, - "learning_rate": 1.8560881460365726e-05, - "loss": 0.014, - "reward": 1.185937537252903, - "reward_std": 0.3633216926828027, - "rewards/accuracy_reward": 0.2671875073108822, - "rewards/format_reward": 0.9187500171363354, - "step": 145 + "clip_ratio": 0.0, + "completion_length": 929.0580749511719, + "epoch": 0.024494063176760512, + "grad_norm": 0.7582675814628601, + "kl": 0.30029296875, + "learning_rate": 4.895522388059702e-06, + "loss": 0.0664, + "reward": 0.4095982238650322, + "reward_std": 0.18518700450658798, + "rewards/accuracy_reward": 0.033482144586741924, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3761160895228386, + "step": 82 }, { - "completion_length": 526.072413635254, - "epoch": 0.26501766784452296, - "grad_norm": 0.19244790077209473, - "kl": 0.3271240234375, - "learning_rate": 1.8397334593567347e-05, - "loss": 0.0131, - "reward": 1.183854202926159, - "reward_std": 0.40570886749774215, - "rewards/accuracy_reward": 0.2950520919170231, - "rewards/format_reward": 0.8888021044433116, - "step": 150 + "clip_ratio": 0.0, + "completion_length": 908.607177734375, + "epoch": 0.02479277126428198, + "grad_norm": 4.737978935241699, + "kl": 0.2861328125, + "learning_rate": 4.955223880597016e-06, + "loss": 0.0719, + "reward": 0.3883928805589676, + "reward_std": 0.16539951413869858, + "rewards/accuracy_reward": 0.0223214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3660714477300644, + "step": 83 }, { - "completion_length": 555.1791851043702, - "epoch": 0.27385159010600707, - "grad_norm": 0.26226282119750977, - "kl": 0.30872802734375, - "learning_rate": 1.8225791019839375e-05, - "loss": 0.0124, - "reward": 0.9653646148741245, - "reward_std": 0.5673671968281269, - "rewards/accuracy_reward": 0.2473958402639255, - "rewards/format_reward": 0.7179687693715096, - "step": 155 + "clip_ratio": 0.0, + "completion_length": 911.4241485595703, + "epoch": 0.02509147935180345, + "grad_norm": 14.508854866027832, + "kl": 0.3349609375, + "learning_rate": 5.014925373134328e-06, + "loss": 0.0965, + "reward": 0.3878348395228386, + "reward_std": 0.2229548655450344, + "rewards/accuracy_reward": 0.03794643050059676, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3498883992433548, + "step": 84 }, { - "completion_length": 339.8388107299805, - "epoch": 0.2826855123674912, - "grad_norm": 0.6711130738258362, - "kl": 0.5605224609375, - "learning_rate": 1.8046414098598947e-05, - "loss": 0.0224, - "reward": 1.0901041992008687, - "reward_std": 0.38608927368186413, - "rewards/accuracy_reward": 0.22369792361278087, - "rewards/format_reward": 0.8664062671363354, - "step": 160 + "clip_ratio": 0.0, + "completion_length": 918.4018096923828, + "epoch": 0.02539018743932492, + "grad_norm": 12.68013858795166, + "kl": 0.431640625, + "learning_rate": 5.074626865671642e-06, + "loss": 0.0823, + "reward": 0.460937537252903, + "reward_std": 0.20223992690443993, + "rewards/accuracy_reward": 0.10937500302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3515625223517418, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 913.3460388183594, + "epoch": 0.02568889552684639, + "grad_norm": 14.316661834716797, + "kl": 0.541015625, + "learning_rate": 5.134328358208955e-06, + "loss": 0.0888, + "reward": 0.415736623108387, + "reward_std": 0.18256152793765068, + "rewards/accuracy_reward": 0.06026785937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3554687649011612, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 905.6094055175781, + "epoch": 0.02598760361436786, + "grad_norm": 55.25035095214844, + "kl": 1.07373046875, + "learning_rate": 5.194029850746269e-06, + "loss": 0.1225, + "reward": 0.4776785969734192, + "reward_std": 0.20144647359848022, + "rewards/accuracy_reward": 0.1093750074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.368303582072258, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completion_length": 906.3348693847656, + "epoch": 0.026286311701889328, + "grad_norm": 8.275175094604492, + "kl": 0.7666015625, + "learning_rate": 5.2537313432835825e-06, + "loss": 0.0901, + "reward": 0.4068080559372902, + "reward_std": 0.22353703156113625, + "rewards/accuracy_reward": 0.0580357164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.348772332072258, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 882.7589721679688, + "epoch": 0.0265850197894108, + "grad_norm": 63.186100006103516, + "kl": 1.240234375, + "learning_rate": 5.3134328358208965e-06, + "loss": 0.1082, + "reward": 0.5050223469734192, + "reward_std": 0.19170860201120377, + "rewards/accuracy_reward": 0.1160714291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.388950914144516, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completion_length": 849.1808471679688, + "epoch": 0.02688372787693227, + "grad_norm": 44.37150192260742, + "kl": 1.3037109375, + "learning_rate": 5.37313432835821e-06, + "loss": 0.0924, + "reward": 0.4720982313156128, + "reward_std": 0.21317226439714432, + "rewards/accuracy_reward": 0.07812500419095159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3939732313156128, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 830.3616485595703, + "epoch": 0.027182435964453737, + "grad_norm": 8.94607925415039, + "kl": 0.6904296875, + "learning_rate": 5.432835820895522e-06, + "loss": 0.0378, + "reward": 0.4430803805589676, + "reward_std": 0.2083820104598999, + "rewards/accuracy_reward": 0.03571428591385484, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4073660895228386, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completion_length": 818.7969055175781, + "epoch": 0.027481144051975208, + "grad_norm": 7.796857833862305, + "kl": 0.83203125, + "learning_rate": 5.492537313432836e-06, + "loss": -0.0188, + "reward": 0.4358259215950966, + "reward_std": 0.2140517234802246, + "rewards/accuracy_reward": 0.03794643119908869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3978794813156128, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 755.9777221679688, + "epoch": 0.027779852139496675, + "grad_norm": 2.8753950595855713, + "kl": 0.62744140625, + "learning_rate": 5.552238805970149e-06, + "loss": -0.0779, + "reward": 0.4687500223517418, + "reward_std": 0.21278522536158562, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.377232164144516, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completion_length": 808.7835083007812, + "epoch": 0.028078560227018146, + "grad_norm": 8.365134239196777, + "kl": 0.75341796875, + "learning_rate": 5.611940298507463e-06, + "loss": -0.1142, + "reward": 0.4458705559372902, + "reward_std": 0.1986469253897667, + "rewards/accuracy_reward": 0.046875002793967724, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3989955559372902, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 790.1719055175781, + "epoch": 0.028377268314539617, + "grad_norm": 2.5462989807128906, + "kl": 0.49755859375, + "learning_rate": 5.671641791044776e-06, + "loss": -0.146, + "reward": 0.4642857387661934, + "reward_std": 0.1414599046111107, + "rewards/accuracy_reward": 0.04241071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4218750223517418, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 811.7232666015625, + "epoch": 0.028675976402061085, + "grad_norm": 3.0434494018554688, + "kl": 0.369140625, + "learning_rate": 5.7313432835820904e-06, + "loss": -0.1388, + "reward": 0.5083705559372902, + "reward_std": 0.174253448843956, + "rewards/accuracy_reward": 0.09375000675208867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4146205559372902, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 842.2969055175781, + "epoch": 0.028974684489582556, + "grad_norm": 1.7823485136032104, + "kl": 0.3515625, + "learning_rate": 5.791044776119404e-06, + "loss": -0.1518, + "reward": 0.5083705633878708, + "reward_std": 0.19355186261236668, + "rewards/accuracy_reward": 0.07366071827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4347098469734192, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completion_length": 846.2455902099609, + "epoch": 0.029273392577104027, + "grad_norm": 56.451053619384766, + "kl": 1.380859375, + "learning_rate": 5.850746268656718e-06, + "loss": -0.1092, + "reward": 0.612723246216774, + "reward_std": 0.19649419654160738, + "rewards/accuracy_reward": 0.1718750111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4408482387661934, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 875.8728179931641, + "epoch": 0.029572100664625494, + "grad_norm": 2.5618855953216553, + "kl": 0.339599609375, + "learning_rate": 5.91044776119403e-06, + "loss": -0.0958, + "reward": 0.4698661044239998, + "reward_std": 0.11474025249481201, + "rewards/accuracy_reward": 0.020089286379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4497767984867096, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completion_length": 900.5893249511719, + "epoch": 0.029870808752146965, + "grad_norm": 52.60367965698242, + "kl": 1.82275390625, + "learning_rate": 5.970149253731343e-06, + "loss": -0.0202, + "reward": 0.545200914144516, + "reward_std": 0.10669242963194847, + "rewards/accuracy_reward": 0.07812500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4670759066939354, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 916.8013763427734, + "epoch": 0.030169516839668432, + "grad_norm": 2.604782819747925, + "kl": 0.47998046875, + "learning_rate": 6.029850746268657e-06, + "loss": -0.1136, + "reward": 0.5597098544239998, + "reward_std": 0.17138748988509178, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4525669887661934, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completion_length": 901.0268402099609, + "epoch": 0.030468224927189903, + "grad_norm": 1.3898565769195557, + "kl": 0.35888671875, + "learning_rate": 6.08955223880597e-06, + "loss": -0.1029, + "reward": 0.5435268133878708, + "reward_std": 0.1559096109122038, + "rewards/accuracy_reward": 0.09598214458674192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.447544664144516, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 896.8326263427734, + "epoch": 0.030766933014711374, + "grad_norm": 1.6913673877716064, + "kl": 0.246337890625, + "learning_rate": 6.149253731343284e-06, + "loss": -0.0519, + "reward": 0.517299123108387, + "reward_std": 0.14624115824699402, + "rewards/accuracy_reward": 0.06919643119908869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4481026977300644, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completion_length": 894.279052734375, + "epoch": 0.03106564110223284, + "grad_norm": 3.8205807209014893, + "kl": 0.29296875, + "learning_rate": 6.2089552238805975e-06, + "loss": -0.0826, + "reward": 0.5675223469734192, + "reward_std": 0.16375590674579144, + "rewards/accuracy_reward": 0.12276786309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4447544887661934, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 896.2255096435547, + "epoch": 0.03136434918975431, + "grad_norm": 5.120176315307617, + "kl": 0.30517578125, + "learning_rate": 6.2686567164179116e-06, + "loss": -0.1061, + "reward": 0.5424107536673546, + "reward_std": 0.1553029716014862, + "rewards/accuracy_reward": 0.1049107164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4375000223517418, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 890.310302734375, + "epoch": 0.03166305727727578, + "grad_norm": 18.864768981933594, + "kl": 0.875, + "learning_rate": 6.328358208955224e-06, + "loss": -0.1028, + "reward": 0.4687500149011612, + "reward_std": 0.17162402719259262, + "rewards/accuracy_reward": 0.029017859371379018, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.439732164144516, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 882.5714569091797, + "epoch": 0.03196176536479725, + "grad_norm": 11.068634033203125, + "kl": 0.9296875, + "learning_rate": 6.388059701492538e-06, + "loss": -0.0979, + "reward": 0.5033482313156128, + "reward_std": 0.13748246990144253, + "rewards/accuracy_reward": 0.06026786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4430803880095482, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completion_length": 862.6049499511719, + "epoch": 0.03226047345231872, + "grad_norm": 46.697425842285156, + "kl": 2.14453125, + "learning_rate": 6.447761194029851e-06, + "loss": -0.0865, + "reward": 0.4821428805589676, + "reward_std": 0.21524933725595474, + "rewards/accuracy_reward": 0.0602678582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4218750074505806, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 838.7879943847656, + "epoch": 0.03255918153984019, + "grad_norm": 8.438027381896973, + "kl": 0.9580078125, + "learning_rate": 6.507462686567164e-06, + "loss": -0.109, + "reward": 0.611607164144516, + "reward_std": 0.2075663022696972, + "rewards/accuracy_reward": 0.17633929336443543, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4352678805589676, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completion_length": 845.8973693847656, + "epoch": 0.032857889627361664, + "grad_norm": 4.530308246612549, + "kl": 0.755859375, + "learning_rate": 6.567164179104478e-06, + "loss": -0.1136, + "reward": 0.4464285895228386, + "reward_std": 0.16633978858590126, + "rewards/accuracy_reward": 0.020089286845177412, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4263393059372902, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 869.4933471679688, + "epoch": 0.03315659771488313, + "grad_norm": 5.754236698150635, + "kl": 0.62158203125, + "learning_rate": 6.6268656716417915e-06, + "loss": -0.1181, + "reward": 0.4369419813156128, + "reward_std": 0.18428738042712212, + "rewards/accuracy_reward": 0.02008928661234677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4168526977300644, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completion_length": 875.8192443847656, + "epoch": 0.0334553058024046, + "grad_norm": 5.605788707733154, + "kl": 0.8515625, + "learning_rate": 6.6865671641791055e-06, + "loss": -0.0628, + "reward": 0.454241082072258, + "reward_std": 0.1680654212832451, + "rewards/accuracy_reward": 0.05803571571595967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3962053805589676, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 898.7120971679688, + "epoch": 0.03375401388992607, + "grad_norm": 9.850069999694824, + "kl": 1.1025390625, + "learning_rate": 6.746268656716418e-06, + "loss": -0.0395, + "reward": 0.534598246216774, + "reward_std": 0.11646133661270142, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4229910969734192, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completion_length": 892.2857513427734, + "epoch": 0.03405272197744754, + "grad_norm": 8.856060028076172, + "kl": 1.0478515625, + "learning_rate": 6.805970149253732e-06, + "loss": -0.0183, + "reward": 0.5424107387661934, + "reward_std": 0.17208773083984852, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4151785969734192, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 922.8482666015625, + "epoch": 0.03435143006496901, + "grad_norm": 21.9963436126709, + "kl": 1.4765625, + "learning_rate": 6.865671641791045e-06, + "loss": -0.0348, + "reward": 0.4263393059372902, + "reward_std": 0.14104333519935608, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4174107387661934, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 937.4620971679688, + "epoch": 0.034650138152490476, + "grad_norm": 25.102123260498047, + "kl": 0.572265625, + "learning_rate": 6.925373134328359e-06, + "loss": -0.0089, + "reward": 0.4659598469734192, + "reward_std": 0.15194939076900482, + "rewards/accuracy_reward": 0.03125000069849193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4347098469734192, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.5558471679688, + "epoch": 0.034948846240011947, + "grad_norm": 35.761070251464844, + "kl": 1.0390625, + "learning_rate": 6.985074626865672e-06, + "loss": -0.0041, + "reward": 0.4905134066939354, + "reward_std": 0.1544114276766777, + "rewards/accuracy_reward": 0.08035714528523386, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4101562649011612, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completion_length": 967.7232513427734, + "epoch": 0.03524755432753342, + "grad_norm": 18.315874099731445, + "kl": 0.61767578125, + "learning_rate": 7.044776119402985e-06, + "loss": 0.0213, + "reward": 0.583147332072258, + "reward_std": 0.11867288500070572, + "rewards/accuracy_reward": 0.1495535783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4335937649011612, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.6495819091797, + "epoch": 0.03554626241505489, + "grad_norm": 54.376068115234375, + "kl": 2.318359375, + "learning_rate": 7.1044776119402994e-06, + "loss": 0.093, + "reward": 0.5546875298023224, + "reward_std": 0.1974365897476673, + "rewards/accuracy_reward": 0.12276786379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4319196566939354, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.9040679931641, + "epoch": 0.03584497050257636, + "grad_norm": 33.7416877746582, + "kl": 1.58984375, + "learning_rate": 7.164179104477612e-06, + "loss": 0.055, + "reward": 0.501674123108387, + "reward_std": 0.1619513500481844, + "rewards/accuracy_reward": 0.07812500419095159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.423549123108387, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.3839721679688, + "epoch": 0.03614367859009783, + "grad_norm": 95.91656494140625, + "kl": 3.1015625, + "learning_rate": 7.223880597014926e-06, + "loss": 0.1312, + "reward": 0.4693080559372902, + "reward_std": 0.16546879708766937, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4335937723517418, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.5000610351562, + "epoch": 0.036442386677619294, + "grad_norm": 11.819601058959961, + "kl": 1.4033203125, + "learning_rate": 7.283582089552239e-06, + "loss": 0.0505, + "reward": 0.4151785895228386, + "reward_std": 0.12369075417518616, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4062500149011612, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.1161193847656, + "epoch": 0.036741094765140765, + "grad_norm": 20.342744827270508, + "kl": 0.504638671875, + "learning_rate": 7.343283582089553e-06, + "loss": 0.0171, + "reward": 0.5111607313156128, + "reward_std": 0.19900008291006088, + "rewards/accuracy_reward": 0.08705357578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.424107164144516, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.8393249511719, + "epoch": 0.037039802852662236, + "grad_norm": 25.30986976623535, + "kl": 0.5791015625, + "learning_rate": 7.402985074626866e-06, + "loss": 0.0228, + "reward": 0.529575914144516, + "reward_std": 0.1701529398560524, + "rewards/accuracy_reward": 0.08928571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4402901977300644, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.6049499511719, + "epoch": 0.03733851094018371, + "grad_norm": 21.2097110748291, + "kl": 0.61328125, + "learning_rate": 7.46268656716418e-06, + "loss": 0.0253, + "reward": 0.5016741305589676, + "reward_std": 0.1154823824763298, + "rewards/accuracy_reward": 0.07589286053553224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4257812723517418, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.3906860351562, + "epoch": 0.03763721902770518, + "grad_norm": 16.9224910736084, + "kl": 1.0986328125, + "learning_rate": 7.522388059701493e-06, + "loss": 0.0317, + "reward": 0.5351562723517418, + "reward_std": 0.11430800706148148, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4257812723517418, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.1786041259766, + "epoch": 0.03793592711522664, + "grad_norm": 55.054832458496094, + "kl": 2.587890625, + "learning_rate": 7.582089552238806e-06, + "loss": 0.1152, + "reward": 0.5390625298023224, + "reward_std": 0.14916206151247025, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.407366082072258, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.9152221679688, + "epoch": 0.03823463520274811, + "grad_norm": 51.532997131347656, + "kl": 2.255859375, + "learning_rate": 7.64179104477612e-06, + "loss": 0.0909, + "reward": 0.4603794887661934, + "reward_std": 0.14697221666574478, + "rewards/accuracy_reward": 0.06250000349245965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3978794813156128, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.9955749511719, + "epoch": 0.038533343290269584, + "grad_norm": 59.899314880371094, + "kl": 2.41015625, + "learning_rate": 7.701492537313433e-06, + "loss": 0.1044, + "reward": 0.5825893133878708, + "reward_std": 0.16638080775737762, + "rewards/accuracy_reward": 0.17857143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4040178805589676, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.0848693847656, + "epoch": 0.038832051377791055, + "grad_norm": 23.343751907348633, + "kl": 1.7119140625, + "learning_rate": 7.761194029850747e-06, + "loss": 0.0704, + "reward": 0.4743303805589676, + "reward_std": 0.16018992103636265, + "rewards/accuracy_reward": 0.06919643119908869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4051339402794838, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.4487152099609, + "epoch": 0.039130759465312526, + "grad_norm": 4.048102855682373, + "kl": 1.2314453125, + "learning_rate": 7.82089552238806e-06, + "loss": 0.0444, + "reward": 0.4525669813156128, + "reward_std": 0.19185778126120567, + "rewards/accuracy_reward": 0.058035717345774174, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3945312649011612, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.5625305175781, + "epoch": 0.03942946755283399, + "grad_norm": 14.837838172912598, + "kl": 0.8046875, + "learning_rate": 7.880597014925373e-06, + "loss": 0.0244, + "reward": 0.4129464402794838, + "reward_std": 0.15276310592889786, + "rewards/accuracy_reward": 0.022321428870782256, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3906250149011612, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.4152221679688, + "epoch": 0.03972817564035546, + "grad_norm": 15.199385643005371, + "kl": 1.29296875, + "learning_rate": 7.940298507462687e-06, + "loss": 0.0349, + "reward": 0.5156250223517418, + "reward_std": 0.13716034032404423, + "rewards/accuracy_reward": 0.12723215157166123, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3883928805589676, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.4107513427734, + "epoch": 0.04002688372787693, + "grad_norm": 17.1458740234375, + "kl": 1.3291015625, + "learning_rate": 8.000000000000001e-06, + "loss": 0.0494, + "reward": 0.4503348469734192, + "reward_std": 0.11851304769515991, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3789062649011612, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 994.4710235595703, + "epoch": 0.0403255918153984, + "grad_norm": 9.233489036560059, + "kl": 1.228515625, + "learning_rate": 8.059701492537314e-06, + "loss": 0.0585, + "reward": 0.4464285969734192, + "reward_std": 0.14439214766025543, + "rewards/accuracy_reward": 0.05133928684517741, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3950893133878708, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.5736999511719, + "epoch": 0.04062429990291987, + "grad_norm": 22.39585304260254, + "kl": 1.623046875, + "learning_rate": 8.119402985074628e-06, + "loss": 0.0694, + "reward": 0.3906250223517418, + "reward_std": 0.14390075765550137, + "rewards/accuracy_reward": 0.01785714295692742, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3727678805589676, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.8549499511719, + "epoch": 0.040923007990441344, + "grad_norm": 2.6861603260040283, + "kl": 0.8251953125, + "learning_rate": 8.179104477611942e-06, + "loss": 0.0329, + "reward": 0.4196428805589676, + "reward_std": 0.11932878009974957, + "rewards/accuracy_reward": 0.0022321429569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4174107313156128, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.2053985595703, + "epoch": 0.04122171607796281, + "grad_norm": 17.06727409362793, + "kl": 1.34375, + "learning_rate": 8.238805970149254e-06, + "loss": 0.0571, + "reward": 0.4235491305589676, + "reward_std": 0.13081580586731434, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3833705559372902, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.0245971679688, + "epoch": 0.04152042416548428, + "grad_norm": 2.7180771827697754, + "kl": 1.0390625, + "learning_rate": 8.298507462686568e-06, + "loss": 0.0434, + "reward": 0.5580357387661934, + "reward_std": 0.13637143932282925, + "rewards/accuracy_reward": 0.1562500107102096, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4017857313156128, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.1205902099609, + "epoch": 0.04181913225300575, + "grad_norm": 4.912252902984619, + "kl": 1.064453125, + "learning_rate": 8.35820895522388e-06, + "loss": 0.0385, + "reward": 0.3989955559372902, + "reward_std": 0.13793889805674553, + "rewards/accuracy_reward": 0.02008928661234677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3789062649011612, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.591552734375, + "epoch": 0.04211784034052722, + "grad_norm": 12.057145118713379, + "kl": 0.9150390625, + "learning_rate": 8.417910447761194e-06, + "loss": 0.0323, + "reward": 0.4151785895228386, + "reward_std": 0.16175905242562294, + "rewards/accuracy_reward": 0.022321430267766118, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.392857164144516, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.404052734375, + "epoch": 0.04241654842804869, + "grad_norm": 7.339812278747559, + "kl": 0.9765625, + "learning_rate": 8.477611940298508e-06, + "loss": 0.0473, + "reward": 0.4224330559372902, + "reward_std": 0.11807778477668762, + "rewards/accuracy_reward": 0.004464285913854837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4179687723517418, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.4308624267578, + "epoch": 0.042715256515570156, + "grad_norm": 12.732094764709473, + "kl": 0.9375, + "learning_rate": 8.537313432835822e-06, + "loss": 0.0413, + "reward": 0.4888393059372902, + "reward_std": 0.1263376884162426, + "rewards/accuracy_reward": 0.08035714412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4084821566939354, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completion_length": 994.5112152099609, + "epoch": 0.04301396460309163, + "grad_norm": 10.209531784057617, + "kl": 0.7158203125, + "learning_rate": 8.597014925373135e-06, + "loss": 0.0338, + "reward": 0.412388414144516, + "reward_std": 0.13936122506856918, + "rewards/accuracy_reward": 0.011160715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4012276902794838, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.7053985595703, + "epoch": 0.0433126726906131, + "grad_norm": 14.567824363708496, + "kl": 0.87109375, + "learning_rate": 8.656716417910447e-06, + "loss": 0.0449, + "reward": 0.4525669887661934, + "reward_std": 0.13644271157681942, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4079241305589676, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.2277221679688, + "epoch": 0.04361138077813457, + "grad_norm": 7.994652271270752, + "kl": 0.62548828125, + "learning_rate": 8.716417910447761e-06, + "loss": 0.0295, + "reward": 0.4464285969734192, + "reward_std": 0.13929898291826248, + "rewards/accuracy_reward": 0.046875003492459655, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3995535895228386, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.3393096923828, + "epoch": 0.04391008886565604, + "grad_norm": 3.357189893722534, + "kl": 1.2666015625, + "learning_rate": 8.776119402985075e-06, + "loss": 0.0531, + "reward": 0.478794664144516, + "reward_std": 0.18317292630672455, + "rewards/accuracy_reward": 0.08035714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3984375149011612, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.6473846435547, + "epoch": 0.04420879695317751, + "grad_norm": 11.469348907470703, + "kl": 0.96875, + "learning_rate": 8.83582089552239e-06, + "loss": 0.0277, + "reward": 0.4458705559372902, + "reward_std": 0.11754808202385902, + "rewards/accuracy_reward": 0.0379464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.407924123108387, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.4464721679688, + "epoch": 0.044507505040698975, + "grad_norm": 19.941659927368164, + "kl": 1.4599609375, + "learning_rate": 8.895522388059702e-06, + "loss": 0.0632, + "reward": 0.4720982313156128, + "reward_std": 0.14117381907999516, + "rewards/accuracy_reward": 0.05580357392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4162946566939354, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.3616485595703, + "epoch": 0.044806213128220446, + "grad_norm": 61.0506706237793, + "kl": 2.6953125, + "learning_rate": 8.955223880597016e-06, + "loss": 0.1088, + "reward": 0.4347098395228386, + "reward_std": 0.15587732940912247, + "rewards/accuracy_reward": 0.022321429569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4123884066939354, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 993.1741485595703, + "epoch": 0.04510492121574192, + "grad_norm": 44.74599075317383, + "kl": 4.40234375, + "learning_rate": 9.01492537313433e-06, + "loss": 0.1837, + "reward": 0.4146205484867096, + "reward_std": 0.12106269225478172, + "rewards/accuracy_reward": 0.0066964291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4079241156578064, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.5156555175781, + "epoch": 0.04540362930326339, + "grad_norm": 149.33062744140625, + "kl": 0.908203125, + "learning_rate": 9.074626865671644e-06, + "loss": 0.0504, + "reward": 0.4654018059372902, + "reward_std": 0.13799849338829517, + "rewards/accuracy_reward": 0.082589291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3828125074505806, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.5893249511719, + "epoch": 0.04570233739078486, + "grad_norm": 131.094482421875, + "kl": 0.63720703125, + "learning_rate": 9.134328358208956e-06, + "loss": 0.0323, + "reward": 0.4955357313156128, + "reward_std": 0.12579014152288437, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3861607313156128, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.9308471679688, + "epoch": 0.04600104547830632, + "grad_norm": 114.0064926147461, + "kl": 0.8564453125, + "learning_rate": 9.194029850746268e-06, + "loss": 0.0399, + "reward": 0.4062500223517418, + "reward_std": 0.14617467671632767, + "rewards/accuracy_reward": 0.013392857741564512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3928571566939354, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.9085235595703, + "epoch": 0.04629975356582779, + "grad_norm": 80.49903106689453, + "kl": 2.03125, + "learning_rate": 9.253731343283582e-06, + "loss": 0.0898, + "reward": 0.5117187798023224, + "reward_std": 0.1621025651693344, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3889509066939354, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.0268249511719, + "epoch": 0.046598461653349264, + "grad_norm": 19.588335037231445, + "kl": 2.81640625, + "learning_rate": 9.313432835820896e-06, + "loss": 0.1037, + "reward": 0.4810268059372902, + "reward_std": 0.11174653470516205, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3738839477300644, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 988.1652221679688, + "epoch": 0.046897169740870735, + "grad_norm": 11.881416320800781, + "kl": 1.908203125, + "learning_rate": 9.37313432835821e-06, + "loss": 0.0578, + "reward": 0.4419643059372902, + "reward_std": 0.12489903531968594, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4017857387661934, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.6161193847656, + "epoch": 0.047195877828392206, + "grad_norm": 56.063629150390625, + "kl": 3.53125, + "learning_rate": 9.432835820895523e-06, + "loss": 0.1333, + "reward": 0.3917410895228386, + "reward_std": 0.14296012558043003, + "rewards/accuracy_reward": 0.011160715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3805803656578064, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.0937957763672, + "epoch": 0.04749458591591367, + "grad_norm": 76.87750244140625, + "kl": 4.58984375, + "learning_rate": 9.492537313432837e-06, + "loss": 0.1846, + "reward": 0.4760044887661934, + "reward_std": 0.12241644039750099, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3934151977300644, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.7902221679688, + "epoch": 0.04779329400343514, + "grad_norm": 23.742570877075195, + "kl": 2.91015625, + "learning_rate": 9.552238805970149e-06, + "loss": 0.124, + "reward": 0.572544664144516, + "reward_std": 0.12264516577124596, + "rewards/accuracy_reward": 0.15178572130389512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.420758955180645, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.0111999511719, + "epoch": 0.04809200209095661, + "grad_norm": 58.59557342529297, + "kl": 4.578125, + "learning_rate": 9.611940298507465e-06, + "loss": 0.1829, + "reward": 0.4207589477300644, + "reward_std": 0.13993500545620918, + "rewards/accuracy_reward": 0.015625, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4051339477300644, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.7701416015625, + "epoch": 0.04839071017847808, + "grad_norm": 6.296003341674805, + "kl": 3.9921875, + "learning_rate": 9.671641791044777e-06, + "loss": 0.1618, + "reward": 0.4765625149011612, + "reward_std": 0.14240018650889397, + "rewards/accuracy_reward": 0.0580357164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4185268059372902, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.3861999511719, + "epoch": 0.048689418265999554, + "grad_norm": 45.102054595947266, + "kl": 2.681640625, + "learning_rate": 9.73134328358209e-06, + "loss": 0.1033, + "reward": 0.4687500298023224, + "reward_std": 0.11666610836982727, + "rewards/accuracy_reward": 0.04017857206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4285714477300644, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.888427734375, + "epoch": 0.048988126353521025, + "grad_norm": 36.31329345703125, + "kl": 3.755859375, + "learning_rate": 9.791044776119403e-06, + "loss": 0.1494, + "reward": 0.487165205180645, + "reward_std": 0.11919216066598892, + "rewards/accuracy_reward": 0.0513392873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.435825914144516, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.3460235595703, + "epoch": 0.04928683444104249, + "grad_norm": 11.566018104553223, + "kl": 2.8515625, + "learning_rate": 9.850746268656717e-06, + "loss": 0.1163, + "reward": 0.4380580559372902, + "reward_std": 0.10823113098740578, + "rewards/accuracy_reward": 0.004464285913854837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4335937723517418, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.0312957763672, + "epoch": 0.04958554252856396, + "grad_norm": 4.776526927947998, + "kl": 2.0419921875, + "learning_rate": 9.910447761194031e-06, + "loss": 0.086, + "reward": 0.5435268059372902, + "reward_std": 0.13900111429393291, + "rewards/accuracy_reward": 0.09375000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4497768059372902, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.0067443847656, + "epoch": 0.04988425061608543, + "grad_norm": 51.19269561767578, + "kl": 2.154296875, + "learning_rate": 9.970149253731344e-06, + "loss": 0.0925, + "reward": 0.549665205180645, + "reward_std": 0.11131735146045685, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4358259066939354, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.5469207763672, + "epoch": 0.0501829587036069, + "grad_norm": 25.84944725036621, + "kl": 1.33203125, + "learning_rate": 1.0029850746268656e-05, + "loss": 0.0555, + "reward": 0.4213169813156128, + "reward_std": 0.11720035411417484, + "rewards/accuracy_reward": 0.0022321429569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4190848395228386, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.341552734375, + "epoch": 0.05048166679112837, + "grad_norm": 8.393462181091309, + "kl": 0.8232421875, + "learning_rate": 1.008955223880597e-05, + "loss": 0.0337, + "reward": 0.4486607313156128, + "reward_std": 0.1001157071441412, + "rewards/accuracy_reward": 0.004464285913854837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4441964477300644, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.1339569091797, + "epoch": 0.05078037487864984, + "grad_norm": 4.59559965133667, + "kl": 1.2802734375, + "learning_rate": 1.0149253731343284e-05, + "loss": 0.0529, + "reward": 0.478794664144516, + "reward_std": 0.1304942723363638, + "rewards/accuracy_reward": 0.07812500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4006696566939354, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.5156707763672, + "epoch": 0.05107908296617131, + "grad_norm": 25.832996368408203, + "kl": 1.416015625, + "learning_rate": 1.0208955223880598e-05, + "loss": 0.0641, + "reward": 0.4430803805589676, + "reward_std": 0.12092806026339531, + "rewards/accuracy_reward": 0.03794643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4051339477300644, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.6094055175781, + "epoch": 0.05137779105369278, + "grad_norm": 15.057541847229004, + "kl": 1.384765625, + "learning_rate": 1.026865671641791e-05, + "loss": 0.0635, + "reward": 0.3967634066939354, + "reward_std": 0.11627230793237686, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3967634066939354, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.3393249511719, + "epoch": 0.05167649914121425, + "grad_norm": 4.131141662597656, + "kl": 0.8955078125, + "learning_rate": 1.0328358208955225e-05, + "loss": 0.0418, + "reward": 0.416294664144516, + "reward_std": 0.13009744882583618, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3761160895228386, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.0625457763672, + "epoch": 0.05197520722873572, + "grad_norm": 6.466559410095215, + "kl": 0.98828125, + "learning_rate": 1.0388059701492539e-05, + "loss": 0.0413, + "reward": 0.3794643059372902, + "reward_std": 0.13988961465656757, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3705357313156128, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.9174499511719, + "epoch": 0.052273915316257184, + "grad_norm": 4.978199005126953, + "kl": 1.275390625, + "learning_rate": 1.0447761194029851e-05, + "loss": 0.0525, + "reward": 0.3900669887661934, + "reward_std": 0.1264602392911911, + "rewards/accuracy_reward": 0.03794643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3521205559372902, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.2120971679688, + "epoch": 0.052572623403778655, + "grad_norm": 13.837759971618652, + "kl": 1.59375, + "learning_rate": 1.0507462686567165e-05, + "loss": 0.0655, + "reward": 0.3956473469734192, + "reward_std": 0.16387009248137474, + "rewards/accuracy_reward": 0.05357143026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.342075914144516, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 988.7879791259766, + "epoch": 0.052871331491300126, + "grad_norm": 18.908859252929688, + "kl": 1.1240234375, + "learning_rate": 1.0567164179104479e-05, + "loss": 0.0396, + "reward": 0.4877232238650322, + "reward_std": 0.14212634041905403, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3738839477300644, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completion_length": 970.2054138183594, + "epoch": 0.0531700395788216, + "grad_norm": 44.44590759277344, + "kl": 0.888671875, + "learning_rate": 1.0626865671641793e-05, + "loss": 0.0202, + "reward": 0.4564732313156128, + "reward_std": 0.14227548241615295, + "rewards/accuracy_reward": 0.07366071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3828125149011612, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.1987152099609, + "epoch": 0.05346874766634307, + "grad_norm": 21.000389099121094, + "kl": 0.7294921875, + "learning_rate": 1.0686567164179105e-05, + "loss": -0.0157, + "reward": 0.466517873108387, + "reward_std": 0.14674829505383968, + "rewards/accuracy_reward": 0.0803571492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3861607313156128, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completion_length": 930.888427734375, + "epoch": 0.05376745575386454, + "grad_norm": 7.594728946685791, + "kl": 0.546875, + "learning_rate": 1.074626865671642e-05, + "loss": -0.0409, + "reward": 0.4715401977300644, + "reward_std": 0.16212094575166702, + "rewards/accuracy_reward": 0.0691964328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4023437649011612, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.9732513427734, + "epoch": 0.054066163841386, + "grad_norm": 6.875792980194092, + "kl": 0.556640625, + "learning_rate": 1.0805970149253733e-05, + "loss": -0.0434, + "reward": 0.4481026977300644, + "reward_std": 0.1302205976098776, + "rewards/accuracy_reward": 0.04017857206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4079241305589676, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completion_length": 929.8303985595703, + "epoch": 0.054364871928907474, + "grad_norm": 7.0643181800842285, + "kl": 0.8974609375, + "learning_rate": 1.0865671641791044e-05, + "loss": -0.0121, + "reward": 0.4681919738650322, + "reward_std": 0.1092926375567913, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.361049123108387, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.4219207763672, + "epoch": 0.054663580016428945, + "grad_norm": 8.97321605682373, + "kl": 0.888671875, + "learning_rate": 1.0925373134328358e-05, + "loss": -0.0079, + "reward": 0.4179687649011612, + "reward_std": 0.13002351857721806, + "rewards/accuracy_reward": 0.03125000116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3867187649011612, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.872802734375, + "epoch": 0.054962288103950416, + "grad_norm": 12.08703899383545, + "kl": 1.318359375, + "learning_rate": 1.0985074626865672e-05, + "loss": 0.0342, + "reward": 0.456473246216774, + "reward_std": 0.1307181641459465, + "rewards/accuracy_reward": 0.0803571455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3761160895228386, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.5312805175781, + "epoch": 0.05526099619147189, + "grad_norm": 15.324989318847656, + "kl": 1.259765625, + "learning_rate": 1.1044776119402986e-05, + "loss": 0.0198, + "reward": 0.4458705484867096, + "reward_std": 0.12792900949716568, + "rewards/accuracy_reward": 0.04241071501746774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4034598395228386, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.5960235595703, + "epoch": 0.05555970427899335, + "grad_norm": 34.17826461791992, + "kl": 1.26171875, + "learning_rate": 1.1104477611940298e-05, + "loss": 0.0425, + "reward": 0.4893973395228386, + "reward_std": 0.12333940155804157, + "rewards/accuracy_reward": 0.0892857201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.400111623108387, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.5937957763672, + "epoch": 0.05585841236651482, + "grad_norm": 18.674142837524414, + "kl": 1.494140625, + "learning_rate": 1.1164179104477612e-05, + "loss": 0.0673, + "reward": 0.4838169813156128, + "reward_std": 0.1404181495308876, + "rewards/accuracy_reward": 0.08258928824216127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4012276977300644, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.6428985595703, + "epoch": 0.05615712045403629, + "grad_norm": 38.527103424072266, + "kl": 2.623046875, + "learning_rate": 1.1223880597014926e-05, + "loss": 0.1227, + "reward": 0.486049123108387, + "reward_std": 0.10906369611620903, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4123884066939354, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.2254943847656, + "epoch": 0.056455828541557763, + "grad_norm": 11.32400894165039, + "kl": 1.1748046875, + "learning_rate": 1.128358208955224e-05, + "loss": 0.0462, + "reward": 0.5279017984867096, + "reward_std": 0.1207624152302742, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4430803805589676, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.2121124267578, + "epoch": 0.056754536629079234, + "grad_norm": 141.17724609375, + "kl": 6.52734375, + "learning_rate": 1.1343283582089553e-05, + "loss": 0.2596, + "reward": 0.479352705180645, + "reward_std": 0.12518282234668732, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4391741305589676, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.1607666015625, + "epoch": 0.057053244716600705, + "grad_norm": 10.421899795532227, + "kl": 1.3125, + "learning_rate": 1.1402985074626867e-05, + "loss": 0.0539, + "reward": 0.4486607387661934, + "reward_std": 0.13734684884548187, + "rewards/accuracy_reward": 0.011160715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4375000223517418, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.0826416015625, + "epoch": 0.05735195280412217, + "grad_norm": 12.770522117614746, + "kl": 2.08984375, + "learning_rate": 1.1462686567164181e-05, + "loss": 0.0809, + "reward": 0.5318080633878708, + "reward_std": 0.1537629049271345, + "rewards/accuracy_reward": 0.09151786123402417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4402901977300644, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.9978179931641, + "epoch": 0.05765066089164364, + "grad_norm": 13.724736213684082, + "kl": 2.3046875, + "learning_rate": 1.1522388059701493e-05, + "loss": 0.0989, + "reward": 0.506138414144516, + "reward_std": 0.16056888736784458, + "rewards/accuracy_reward": 0.08258928824216127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.423549123108387, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.9844207763672, + "epoch": 0.05794936897916511, + "grad_norm": 6.908210754394531, + "kl": 2.30859375, + "learning_rate": 1.1582089552238807e-05, + "loss": 0.1006, + "reward": 0.4592634215950966, + "reward_std": 0.14959662966430187, + "rewards/accuracy_reward": 0.04464285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4146205559372902, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.6585083007812, + "epoch": 0.05824807706668658, + "grad_norm": 80.83358764648438, + "kl": 4.640625, + "learning_rate": 1.1641791044776121e-05, + "loss": 0.1947, + "reward": 0.4056919813156128, + "reward_std": 0.2114727571606636, + "rewards/accuracy_reward": 0.026785715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3789062649011612, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.1897735595703, + "epoch": 0.05854678515420805, + "grad_norm": 80.42219543457031, + "kl": 4.8515625, + "learning_rate": 1.1701492537313435e-05, + "loss": 0.2093, + "reward": 0.4296875149011612, + "reward_std": 0.16256752610206604, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3582589477300644, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.7120971679688, + "epoch": 0.05884549324172952, + "grad_norm": 9.038883209228516, + "kl": 2.13671875, + "learning_rate": 1.1761194029850746e-05, + "loss": 0.0885, + "reward": 0.3621651902794838, + "reward_std": 0.1617821753025055, + "rewards/accuracy_reward": 0.0022321429569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3599330484867096, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.1250457763672, + "epoch": 0.05914420132925099, + "grad_norm": 6.674751281738281, + "kl": 2.484375, + "learning_rate": 1.182089552238806e-05, + "loss": 0.1068, + "reward": 0.3900669813156128, + "reward_std": 0.18047020584344864, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3454241305589676, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.5937957763672, + "epoch": 0.05944290941677246, + "grad_norm": 8.222381591796875, + "kl": 1.88671875, + "learning_rate": 1.1880597014925374e-05, + "loss": 0.0874, + "reward": 0.3733259066939354, + "reward_std": 0.1916801854968071, + "rewards/accuracy_reward": 0.015625, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3577009066939354, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.7478179931641, + "epoch": 0.05974161750429393, + "grad_norm": 4.286056995391846, + "kl": 1.81640625, + "learning_rate": 1.1940298507462686e-05, + "loss": 0.0843, + "reward": 0.3409598395228386, + "reward_std": 0.22732551395893097, + "rewards/accuracy_reward": 0.022321430267766118, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3186384066939354, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.5960388183594, + "epoch": 0.0600403255918154, + "grad_norm": 6.299842834472656, + "kl": 2.09375, + "learning_rate": 1.2e-05, + "loss": 0.0911, + "reward": 0.3716518059372902, + "reward_std": 0.1910732015967369, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3270089402794838, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.7500305175781, + "epoch": 0.060339033679336865, + "grad_norm": 18.595163345336914, + "kl": 1.603515625, + "learning_rate": 1.2059701492537314e-05, + "loss": 0.0878, + "reward": 0.4246652126312256, + "reward_std": 0.17368673160672188, + "rewards/accuracy_reward": 0.07812500488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3465401977300644, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.5156707763672, + "epoch": 0.060637741766858336, + "grad_norm": 9.142512321472168, + "kl": 1.43359375, + "learning_rate": 1.2119402985074628e-05, + "loss": 0.0748, + "reward": 0.4045759066939354, + "reward_std": 0.20187678188085556, + "rewards/accuracy_reward": 0.04241071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3621651977300644, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completion_length": 967.9085388183594, + "epoch": 0.06093644985437981, + "grad_norm": 16.84723663330078, + "kl": 1.720703125, + "learning_rate": 1.217910447761194e-05, + "loss": 0.0962, + "reward": 0.4575893133878708, + "reward_std": 0.16252248361706734, + "rewards/accuracy_reward": 0.09151786309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3660714477300644, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.3393402099609, + "epoch": 0.06123515794190128, + "grad_norm": 37.39816665649414, + "kl": 1.787109375, + "learning_rate": 1.2238805970149255e-05, + "loss": 0.1063, + "reward": 0.4525669887661934, + "reward_std": 0.16749456152319908, + "rewards/accuracy_reward": 0.055803572526201606, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.396763414144516, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 958.9933471679688, + "epoch": 0.06153386602942275, + "grad_norm": 43.13191223144531, + "kl": 3.25, + "learning_rate": 1.2298507462686569e-05, + "loss": 0.1879, + "reward": 0.4185267984867096, + "reward_std": 0.17312980443239212, + "rewards/accuracy_reward": 0.0290178582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3895089402794838, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.154052734375, + "epoch": 0.06183257411694422, + "grad_norm": 59.97184753417969, + "kl": 7.6875, + "learning_rate": 1.2358208955223883e-05, + "loss": 0.3748, + "reward": 0.5027901902794838, + "reward_std": 0.19650321453809738, + "rewards/accuracy_reward": 0.11830357555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.384486623108387, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completion_length": 937.7120819091797, + "epoch": 0.06213128220446568, + "grad_norm": 68.30619049072266, + "kl": 6.25, + "learning_rate": 1.2417910447761195e-05, + "loss": 0.3095, + "reward": 0.4029018059372902, + "reward_std": 0.16209018975496292, + "rewards/accuracy_reward": 0.02455357275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3783482313156128, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 915.466552734375, + "epoch": 0.062429990291987154, + "grad_norm": 52.50307846069336, + "kl": 5.6640625, + "learning_rate": 1.2477611940298509e-05, + "loss": 0.3076, + "reward": 0.4475446566939354, + "reward_std": 0.14089280925691128, + "rewards/accuracy_reward": 0.05133928800933063, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.396205373108387, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completion_length": 931.8147583007812, + "epoch": 0.06272869837950862, + "grad_norm": 33.17428207397461, + "kl": 3.18359375, + "learning_rate": 1.2537313432835823e-05, + "loss": 0.19, + "reward": 0.4241071566939354, + "reward_std": 0.1594633385539055, + "rewards/accuracy_reward": 0.04687500209547579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3772321566939354, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 893.1339874267578, + "epoch": 0.06302740646703009, + "grad_norm": 99.62578582763672, + "kl": 1.427734375, + "learning_rate": 1.2597014925373134e-05, + "loss": 0.1392, + "reward": 0.4815848395228386, + "reward_std": 0.1511471774429083, + "rewards/accuracy_reward": 0.08705357578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3945312723517418, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completion_length": 905.6875305175781, + "epoch": 0.06332611455455156, + "grad_norm": 73.0406265258789, + "kl": 1.76953125, + "learning_rate": 1.2656716417910448e-05, + "loss": 0.1689, + "reward": 0.4681919813156128, + "reward_std": 0.1448311023414135, + "rewards/accuracy_reward": 0.08482143376022577, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3833705484867096, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 908.6652069091797, + "epoch": 0.06362482264207303, + "grad_norm": 29.058643341064453, + "kl": 3.09765625, + "learning_rate": 1.2716417910447762e-05, + "loss": 0.2138, + "reward": 0.4335937649011612, + "reward_std": 0.17419232428073883, + "rewards/accuracy_reward": 0.053571430034935474, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3800223395228386, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completion_length": 895.0178985595703, + "epoch": 0.0639235307295945, + "grad_norm": 68.38675689697266, + "kl": 6.359375, + "learning_rate": 1.2776119402985076e-05, + "loss": 0.3548, + "reward": 0.4893973395228386, + "reward_std": 0.18311858922243118, + "rewards/accuracy_reward": 0.10714286309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3822544813156128, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completion_length": 919.7277069091797, + "epoch": 0.06422223881711597, + "grad_norm": 71.74022674560547, + "kl": 6.9453125, + "learning_rate": 1.2835820895522388e-05, + "loss": 0.3794, + "reward": 0.416294664144516, + "reward_std": 0.13506715185940266, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3761160969734192, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 888.8393402099609, + "epoch": 0.06452094690463744, + "grad_norm": 41.10442352294922, + "kl": 5.6796875, + "learning_rate": 1.2895522388059702e-05, + "loss": 0.3384, + "reward": 0.4023437574505806, + "reward_std": 0.1772681325674057, + "rewards/accuracy_reward": 0.02455357275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3777901977300644, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 924.4620971679688, + "epoch": 0.06481965499215891, + "grad_norm": 23.740840911865234, + "kl": 3.056640625, + "learning_rate": 1.2955223880597016e-05, + "loss": 0.1957, + "reward": 0.4944196566939354, + "reward_std": 0.14039370976388454, + "rewards/accuracy_reward": 0.11607143399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3783482313156128, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completion_length": 887.9576263427734, + "epoch": 0.06511836307968039, + "grad_norm": 29.743833541870117, + "kl": 1.96484375, + "learning_rate": 1.3014925373134329e-05, + "loss": 0.1428, + "reward": 0.4257812649011612, + "reward_std": 0.16779473423957825, + "rewards/accuracy_reward": 0.026785715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3989955484867096, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.2812957763672, + "epoch": 0.06541707116720186, + "grad_norm": 25.796064376831055, + "kl": 3.01953125, + "learning_rate": 1.3074626865671643e-05, + "loss": 0.1884, + "reward": 0.4302455484867096, + "reward_std": 0.1335033681243658, + "rewards/accuracy_reward": 0.0424107164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.387834832072258, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.7187957763672, + "epoch": 0.06571577925472333, + "grad_norm": 55.545021057128906, + "kl": 5.6171875, + "learning_rate": 1.3134328358208957e-05, + "loss": 0.2931, + "reward": 0.4704241305589676, + "reward_std": 0.13174151442945004, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3900669813156128, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.7567596435547, + "epoch": 0.06601448734224478, + "grad_norm": 68.98950958251953, + "kl": 6.5703125, + "learning_rate": 1.319402985074627e-05, + "loss": 0.3231, + "reward": 0.4257812723517418, + "reward_std": 0.12365815229713917, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3856026902794838, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.7924499511719, + "epoch": 0.06631319542976626, + "grad_norm": 48.47696304321289, + "kl": 2.06640625, + "learning_rate": 1.3253731343283583e-05, + "loss": 0.1355, + "reward": 0.4899553805589676, + "reward_std": 0.15557361766695976, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.416294664144516, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.529052734375, + "epoch": 0.06661190351728773, + "grad_norm": 56.46611785888672, + "kl": 1.359375, + "learning_rate": 1.3313432835820897e-05, + "loss": 0.0865, + "reward": 0.5435268208384514, + "reward_std": 0.1462959609925747, + "rewards/accuracy_reward": 0.10044643329456449, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.443080373108387, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.3861999511719, + "epoch": 0.0669106116048092, + "grad_norm": 34.48484802246094, + "kl": 2.55859375, + "learning_rate": 1.3373134328358211e-05, + "loss": 0.1305, + "reward": 0.5552455708384514, + "reward_std": 0.13658787868916988, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4347098395228386, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.2143402099609, + "epoch": 0.06720931969233067, + "grad_norm": 115.91703033447266, + "kl": 7.1796875, + "learning_rate": 1.3432835820895525e-05, + "loss": 0.3005, + "reward": 0.510044664144516, + "reward_std": 0.1731738466769457, + "rewards/accuracy_reward": 0.09375000465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4162946566939354, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.8661193847656, + "epoch": 0.06750802777985214, + "grad_norm": 160.4628448486328, + "kl": 8.96875, + "learning_rate": 1.3492537313432836e-05, + "loss": 0.3625, + "reward": 0.4893973395228386, + "reward_std": 0.12758938036859035, + "rewards/accuracy_reward": 0.049107146449387074, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4402901977300644, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.6362152099609, + "epoch": 0.06780673586737361, + "grad_norm": 236.97592163085938, + "kl": 12.109375, + "learning_rate": 1.355223880597015e-05, + "loss": 0.4886, + "reward": 0.5273437798023224, + "reward_std": 0.17186901718378067, + "rewards/accuracy_reward": 0.09151786309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4358259215950966, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.8951416015625, + "epoch": 0.06810544395489508, + "grad_norm": 81.32587432861328, + "kl": 7.0234375, + "learning_rate": 1.3611940298507464e-05, + "loss": 0.2837, + "reward": 0.4843750223517418, + "reward_std": 0.17621351778507233, + "rewards/accuracy_reward": 0.06026786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.424107164144516, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.5893402099609, + "epoch": 0.06840415204241655, + "grad_norm": 49.69251251220703, + "kl": 1.17578125, + "learning_rate": 1.3671641791044776e-05, + "loss": 0.0464, + "reward": 0.4799107387661934, + "reward_std": 0.14626395516097546, + "rewards/accuracy_reward": 0.03348214481957257, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4464285895228386, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.0535888671875, + "epoch": 0.06870286012993802, + "grad_norm": 46.567710876464844, + "kl": 1.4228515625, + "learning_rate": 1.373134328358209e-05, + "loss": 0.0573, + "reward": 0.478794664144516, + "reward_std": 0.1307935044169426, + "rewards/accuracy_reward": 0.0513392873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.427455373108387, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.372802734375, + "epoch": 0.0690015682174595, + "grad_norm": 44.05610275268555, + "kl": 1.740234375, + "learning_rate": 1.3791044776119404e-05, + "loss": 0.0691, + "reward": 0.4832589477300644, + "reward_std": 0.14532428793609142, + "rewards/accuracy_reward": 0.06026786123402417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4229910895228386, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.4040222167969, + "epoch": 0.06930027630498095, + "grad_norm": 38.29716491699219, + "kl": 2.375, + "learning_rate": 1.3850746268656718e-05, + "loss": 0.0864, + "reward": 0.513950914144516, + "reward_std": 0.11799372360110283, + "rewards/accuracy_reward": 0.082589291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4313616305589676, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.8303833007812, + "epoch": 0.06959898439250242, + "grad_norm": 17.79549217224121, + "kl": 4.28515625, + "learning_rate": 1.391044776119403e-05, + "loss": 0.1681, + "reward": 0.5111607313156128, + "reward_std": 0.13445835933089256, + "rewards/accuracy_reward": 0.09151786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.419642873108387, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.7276916503906, + "epoch": 0.06989769248002389, + "grad_norm": 17.710920333862305, + "kl": 4.28125, + "learning_rate": 1.3970149253731344e-05, + "loss": 0.168, + "reward": 0.5011160895228386, + "reward_std": 0.12847845442593098, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4185267984867096, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.7053833007812, + "epoch": 0.07019640056754536, + "grad_norm": 12.201936721801758, + "kl": 4.46484375, + "learning_rate": 1.4029850746268658e-05, + "loss": 0.1776, + "reward": 0.4603794887661934, + "reward_std": 0.14092325046658516, + "rewards/accuracy_reward": 0.04687500116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4135044887661934, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.013427734375, + "epoch": 0.07049510865506683, + "grad_norm": 9.173975944519043, + "kl": 3.98828125, + "learning_rate": 1.408955223880597e-05, + "loss": 0.1565, + "reward": 0.5580357387661934, + "reward_std": 0.17449573427438736, + "rewards/accuracy_reward": 0.14508929336443543, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4129464477300644, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.0707938167425883, + "grad_norm": 5.598355770111084, + "kl": 2.0390625, + "learning_rate": 1.4149253731343285e-05, + "loss": 0.0814, + "reward": 0.4665178805589676, + "reward_std": 0.12797541730105877, + "rewards/accuracy_reward": 0.04687500209547579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.419642873108387, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07109252483010978, + "grad_norm": 6.011465072631836, + "kl": 0.646484375, + "learning_rate": 1.4208955223880599e-05, + "loss": 0.0258, + "reward": 0.5273437798023224, + "reward_std": 0.14172394201159477, + "rewards/accuracy_reward": 0.06919643376022577, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4581473395228386, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07139123291763125, + "grad_norm": 7.991024494171143, + "kl": 0.4296875, + "learning_rate": 1.4268656716417913e-05, + "loss": 0.0172, + "reward": 0.5541294887661934, + "reward_std": 0.13676651753485203, + "rewards/accuracy_reward": 0.09598214994184673, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4581473469734192, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07168994100515272, + "grad_norm": 5.961028099060059, + "kl": 0.75390625, + "learning_rate": 1.4328358208955224e-05, + "loss": 0.0302, + "reward": 0.5089285895228386, + "reward_std": 0.12210430856794119, + "rewards/accuracy_reward": 0.05580357578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4531250149011612, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07198864909267419, + "grad_norm": 1.6196869611740112, + "kl": 0.521484375, + "learning_rate": 1.4388059701492538e-05, + "loss": 0.0209, + "reward": 0.557477705180645, + "reward_std": 0.12993866857141256, + "rewards/accuracy_reward": 0.10044643585570157, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4570312723517418, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07228735718019566, + "grad_norm": 4.197913646697998, + "kl": 0.6396484375, + "learning_rate": 1.4447761194029852e-05, + "loss": 0.0256, + "reward": 0.5864955559372902, + "reward_std": 0.1247691921889782, + "rewards/accuracy_reward": 0.10937500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4771205559372902, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07258606526771712, + "grad_norm": 4.300390720367432, + "kl": 0.56591796875, + "learning_rate": 1.4507462686567166e-05, + "loss": 0.0226, + "reward": 0.5390625298023224, + "reward_std": 0.09267349913716316, + "rewards/accuracy_reward": 0.06250000279396772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4765625223517418, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.5201110839844, + "epoch": 0.07288477335523859, + "grad_norm": 1.1552999019622803, + "kl": 0.212646484375, + "learning_rate": 1.4567164179104478e-05, + "loss": 0.0085, + "reward": 0.6590402126312256, + "reward_std": 0.06539032235741615, + "rewards/accuracy_reward": 0.16741072200238705, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07318348144276006, + "grad_norm": 1.492193579673767, + "kl": 0.351806640625, + "learning_rate": 1.4626865671641792e-05, + "loss": 0.0141, + "reward": 0.499441996216774, + "reward_std": 0.06860044156201184, + "rewards/accuracy_reward": 0.015625000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169887661934, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07348218953028153, + "grad_norm": 0.9581654667854309, + "kl": 0.208740234375, + "learning_rate": 1.4686567164179106e-05, + "loss": 0.0084, + "reward": 0.7165178954601288, + "reward_std": 0.15417575649917126, + "rewards/accuracy_reward": 0.2321428693830967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750223517418, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.073780897617803, + "grad_norm": 1.5160243511199951, + "kl": 0.36669921875, + "learning_rate": 1.4746268656716418e-05, + "loss": 0.0147, + "reward": 0.572544664144516, + "reward_std": 0.11243942845612764, + "rewards/accuracy_reward": 0.10937500861473382, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.463169664144516, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07407960570532447, + "grad_norm": 1.9926069974899292, + "kl": 0.748046875, + "learning_rate": 1.4805970149253732e-05, + "loss": 0.0299, + "reward": 0.4324776977300644, + "reward_std": 0.11914189718663692, + "rewards/accuracy_reward": 0.0066964291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4257812723517418, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07437831379284594, + "grad_norm": 2.3986759185791016, + "kl": 0.962890625, + "learning_rate": 1.4865671641791046e-05, + "loss": 0.0385, + "reward": 0.4893973544239998, + "reward_std": 0.12053784355521202, + "rewards/accuracy_reward": 0.07589285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4135044813156128, + "step": 249 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07467702188036741, + "grad_norm": 3.232987880706787, + "kl": 1.1015625, + "learning_rate": 1.492537313432836e-05, + "loss": 0.044, + "reward": 0.4280134215950966, + "reward_std": 0.14958371967077255, + "rewards/accuracy_reward": 0.022321430267766118, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4056919887661934, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07497572996788888, + "grad_norm": 3.2763867378234863, + "kl": 0.7099609375, + "learning_rate": 1.4985074626865673e-05, + "loss": 0.0284, + "reward": 0.4497767984867096, + "reward_std": 0.17807012796401978, + "rewards/accuracy_reward": 0.05357143213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.396205373108387, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07527443805541036, + "grad_norm": 2.6948933601379395, + "kl": 1.126953125, + "learning_rate": 1.5044776119402987e-05, + "loss": 0.0451, + "reward": 0.4229910969734192, + "reward_std": 0.15340816415846348, + "rewards/accuracy_reward": 0.04910714412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3738839477300644, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07557314614293181, + "grad_norm": 14.417381286621094, + "kl": 2.16015625, + "learning_rate": 1.51044776119403e-05, + "loss": 0.0863, + "reward": 0.5412946790456772, + "reward_std": 0.22873074933886528, + "rewards/accuracy_reward": 0.14062500931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4006696566939354, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07587185423045328, + "grad_norm": 17.179765701293945, + "kl": 2.4921875, + "learning_rate": 1.5164179104477611e-05, + "loss": 0.0997, + "reward": 0.4263393059372902, + "reward_std": 0.17665772140026093, + "rewards/accuracy_reward": 0.05580357322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3705357387661934, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.3058166503906, + "epoch": 0.07617056231797475, + "grad_norm": 8.729497909545898, + "kl": 1.609375, + "learning_rate": 1.5223880597014925e-05, + "loss": 0.0638, + "reward": 0.4581473544239998, + "reward_std": 0.17730459943413734, + "rewards/accuracy_reward": 0.08482143515720963, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3733259066939354, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.4442138671875, + "epoch": 0.07646927040549623, + "grad_norm": 4.335572242736816, + "kl": 1.142578125, + "learning_rate": 1.528358208955224e-05, + "loss": 0.045, + "reward": 0.3984375223517418, + "reward_std": 0.13764513842761517, + "rewards/accuracy_reward": 0.04464285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3537946566939354, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.0767679784930177, + "grad_norm": 5.5548930168151855, + "kl": 0.892578125, + "learning_rate": 1.5343283582089555e-05, + "loss": 0.0357, + "reward": 0.4363839477300644, + "reward_std": 0.17951035499572754, + "rewards/accuracy_reward": 0.06250000302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3738839402794838, + "step": 257 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07706668658053917, + "grad_norm": 9.46628189086914, + "kl": 0.796875, + "learning_rate": 1.5402985074626866e-05, + "loss": 0.0319, + "reward": 0.5295759215950966, + "reward_std": 0.1364874318242073, + "rewards/accuracy_reward": 0.14732143515720963, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3822544887661934, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07736539466806064, + "grad_norm": 12.482342720031738, + "kl": 1.619140625, + "learning_rate": 1.546268656716418e-05, + "loss": 0.0647, + "reward": 0.5027901902794838, + "reward_std": 0.20010271295905113, + "rewards/accuracy_reward": 0.10044643515720963, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4023437649011612, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07766410275558211, + "grad_norm": 19.698942184448242, + "kl": 1.884765625, + "learning_rate": 1.5522388059701494e-05, + "loss": 0.0754, + "reward": 0.4994419887661934, + "reward_std": 0.1738654337823391, + "rewards/accuracy_reward": 0.07589286100119352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4235491305589676, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.9553680419922, + "epoch": 0.07796281084310358, + "grad_norm": 3.571814775466919, + "kl": 0.7861328125, + "learning_rate": 1.5582089552238808e-05, + "loss": 0.0315, + "reward": 0.4386161044239998, + "reward_std": 0.1646421067416668, + "rewards/accuracy_reward": 0.02232142980210483, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4162946566939354, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07826151893062505, + "grad_norm": 5.900186538696289, + "kl": 0.783203125, + "learning_rate": 1.564179104477612e-05, + "loss": 0.0314, + "reward": 0.506138414144516, + "reward_std": 0.16562099754810333, + "rewards/accuracy_reward": 0.06919643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4369419887661934, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07856022701814652, + "grad_norm": 3.1799721717834473, + "kl": 0.36572265625, + "learning_rate": 1.5701492537313433e-05, + "loss": 0.0147, + "reward": 0.5345982313156128, + "reward_std": 0.12756041157990694, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4430803805589676, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07885893510566798, + "grad_norm": 2.9641880989074707, + "kl": 0.353515625, + "learning_rate": 1.5761194029850747e-05, + "loss": 0.0141, + "reward": 0.5072544887661934, + "reward_std": 0.17674772441387177, + "rewards/accuracy_reward": 0.07366071920841932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4335937723517418, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07915764319318945, + "grad_norm": 2.200047731399536, + "kl": 0.44921875, + "learning_rate": 1.582089552238806e-05, + "loss": 0.018, + "reward": 0.467075914144516, + "reward_std": 0.10860721580684185, + "rewards/accuracy_reward": 0.03571428847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4313616305589676, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.1093902587891, + "epoch": 0.07945635128071092, + "grad_norm": 2.2852725982666016, + "kl": 0.4716796875, + "learning_rate": 1.5880597014925375e-05, + "loss": 0.0171, + "reward": 0.4799107387661934, + "reward_std": 0.13030445389449596, + "rewards/accuracy_reward": 0.020089286845177412, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4598214477300644, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.07975505936823239, + "grad_norm": 11.849265098571777, + "kl": 1.298828125, + "learning_rate": 1.594029850746269e-05, + "loss": 0.0519, + "reward": 0.4760044813156128, + "reward_std": 0.13737953826785088, + "rewards/accuracy_reward": 0.049107145285233855, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4268973395228386, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.08005376745575386, + "grad_norm": 38.535850524902344, + "kl": 2.7734375, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.111, + "reward": 0.459263414144516, + "reward_std": 0.13682260550558567, + "rewards/accuracy_reward": 0.05580357392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4034598395228386, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.9353179931641, + "epoch": 0.08035247554327533, + "grad_norm": 8.838988304138184, + "kl": 1.3056640625, + "learning_rate": 1.6059701492537313e-05, + "loss": 0.0528, + "reward": 0.5563616305589676, + "reward_std": 0.1662580631673336, + "rewards/accuracy_reward": 0.12276786542497575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4335937723517418, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.5290222167969, + "epoch": 0.0806511836307968, + "grad_norm": 2.9227540493011475, + "kl": 0.66650390625, + "learning_rate": 1.6119402985074627e-05, + "loss": 0.0325, + "reward": 0.6015625223517418, + "reward_std": 0.12103069666773081, + "rewards/accuracy_reward": 0.13839286309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4631696566939354, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.6361999511719, + "epoch": 0.08094989171831828, + "grad_norm": 3.8717312812805176, + "kl": 0.578125, + "learning_rate": 1.617910447761194e-05, + "loss": 0.023, + "reward": 0.5608259215950966, + "reward_std": 0.14726759679615498, + "rewards/accuracy_reward": 0.09151786030270159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4693080559372902, + "step": 271 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.2656402587891, + "epoch": 0.08124859980583975, + "grad_norm": 4.838365077972412, + "kl": 0.978515625, + "learning_rate": 1.6238805970149255e-05, + "loss": 0.0426, + "reward": 0.5267857313156128, + "reward_std": 0.1316747535020113, + "rewards/accuracy_reward": 0.0647321455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4620535895228386, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.3058471679688, + "epoch": 0.08154730789336122, + "grad_norm": 4.207040786743164, + "kl": 1.2255859375, + "learning_rate": 1.629850746268657e-05, + "loss": 0.0508, + "reward": 0.4754464626312256, + "reward_std": 0.1388342585414648, + "rewards/accuracy_reward": 0.026785715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.448660746216774, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.4576263427734, + "epoch": 0.08184601598088269, + "grad_norm": 16.915494918823242, + "kl": 1.234375, + "learning_rate": 1.6358208955223883e-05, + "loss": 0.0558, + "reward": 0.6529018133878708, + "reward_std": 0.15058407932519913, + "rewards/accuracy_reward": 0.207589291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4453125223517418, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.1495971679688, + "epoch": 0.08214472406840415, + "grad_norm": 20.423328399658203, + "kl": 1.748046875, + "learning_rate": 1.6417910447761197e-05, + "loss": 0.0793, + "reward": 0.5206473395228386, + "reward_std": 0.18143665418028831, + "rewards/accuracy_reward": 0.08482143189758062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4358259215950966, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.4130096435547, + "epoch": 0.08244343215592562, + "grad_norm": 15.560894012451172, + "kl": 2.185546875, + "learning_rate": 1.6477611940298508e-05, + "loss": 0.0921, + "reward": 0.5072544813156128, + "reward_std": 0.1609533280134201, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4291294813156128, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.0245819091797, + "epoch": 0.08274214024344709, + "grad_norm": 10.355707168579102, + "kl": 2.00390625, + "learning_rate": 1.6537313432835822e-05, + "loss": 0.0874, + "reward": 0.5312500298023224, + "reward_std": 0.1587973367422819, + "rewards/accuracy_reward": 0.08928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4419643059372902, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.8013916015625, + "epoch": 0.08304084833096856, + "grad_norm": 28.170976638793945, + "kl": 1.400390625, + "learning_rate": 1.6597014925373136e-05, + "loss": 0.0605, + "reward": 0.486607164144516, + "reward_std": 0.1581554338335991, + "rewards/accuracy_reward": 0.04910714505240321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4375000149011612, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.3750305175781, + "epoch": 0.08333955641849003, + "grad_norm": 19.208927154541016, + "kl": 1.4609375, + "learning_rate": 1.665671641791045e-05, + "loss": 0.0684, + "reward": 0.487165205180645, + "reward_std": 0.14077411033213139, + "rewards/accuracy_reward": 0.0602678582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4268973469734192, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.4375457763672, + "epoch": 0.0836382645060115, + "grad_norm": 28.153240203857422, + "kl": 2.76953125, + "learning_rate": 1.671641791044776e-05, + "loss": 0.1179, + "reward": 0.4542410895228386, + "reward_std": 0.13080771267414093, + "rewards/accuracy_reward": 0.046875000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4073660895228386, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.5357666015625, + "epoch": 0.08393697259353297, + "grad_norm": 5.308176517486572, + "kl": 1.626953125, + "learning_rate": 1.6776119402985075e-05, + "loss": 0.0719, + "reward": 0.511160746216774, + "reward_std": 0.181702621281147, + "rewards/accuracy_reward": 0.08928571874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4218750149011612, + "step": 281 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.5067443847656, + "epoch": 0.08423568068105444, + "grad_norm": 27.18752670288086, + "kl": 1.978515625, + "learning_rate": 1.683582089552239e-05, + "loss": 0.0903, + "reward": 0.5011160969734192, + "reward_std": 0.17068877257406712, + "rewards/accuracy_reward": 0.17187500931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3292410895228386, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.8393402099609, + "epoch": 0.08453438876857591, + "grad_norm": 8.62528133392334, + "kl": 0.9951171875, + "learning_rate": 1.6895522388059703e-05, + "loss": 0.0401, + "reward": 0.3353794738650322, + "reward_std": 0.13573652505874634, + "rewards/accuracy_reward": 0.04017857206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2952008992433548, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.0937805175781, + "epoch": 0.08483309685609738, + "grad_norm": 3.338029384613037, + "kl": 0.56298828125, + "learning_rate": 1.6955223880597017e-05, + "loss": 0.0374, + "reward": 0.4285714402794838, + "reward_std": 0.17420504242181778, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3147321566939354, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.0714721679688, + "epoch": 0.08513180494361886, + "grad_norm": 3.667935848236084, + "kl": 0.1982421875, + "learning_rate": 1.701492537313433e-05, + "loss": 0.022, + "reward": 0.4726562649011612, + "reward_std": 0.1425983477383852, + "rewards/accuracy_reward": 0.05357143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4190848395228386, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.8236999511719, + "epoch": 0.08543051303114031, + "grad_norm": 3.020571231842041, + "kl": 0.164794921875, + "learning_rate": 1.7074626865671645e-05, + "loss": 0.0103, + "reward": 0.6010044887661934, + "reward_std": 0.1364169605076313, + "rewards/accuracy_reward": 0.14062500488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4603794813156128, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.9821929931641, + "epoch": 0.08572922111866178, + "grad_norm": 3.1930997371673584, + "kl": 0.18359375, + "learning_rate": 1.7134328358208956e-05, + "loss": 0.0096, + "reward": 0.632812537252903, + "reward_std": 0.07826855406165123, + "rewards/accuracy_reward": 0.1517857222352177, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810268059372902, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.7924499511719, + "epoch": 0.08602792920618325, + "grad_norm": 4.238061428070068, + "kl": 0.23046875, + "learning_rate": 1.719402985074627e-05, + "loss": 0.0129, + "reward": 0.4748884215950966, + "reward_std": 0.09601925872266293, + "rewards/accuracy_reward": 0.008928572060540318, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4659598469734192, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.0491638183594, + "epoch": 0.08632663729370472, + "grad_norm": 7.937732219696045, + "kl": 0.342529296875, + "learning_rate": 1.7253731343283584e-05, + "loss": 0.0191, + "reward": 0.546316996216774, + "reward_std": 0.10327458009123802, + "rewards/accuracy_reward": 0.0982142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4481026977300644, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.9107666015625, + "epoch": 0.0866253453812262, + "grad_norm": 7.311333656311035, + "kl": 0.45458984375, + "learning_rate": 1.7313432835820894e-05, + "loss": 0.0227, + "reward": 0.5251116380095482, + "reward_std": 0.162435632199049, + "rewards/accuracy_reward": 0.09598215017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4291294813156128, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.0379943847656, + "epoch": 0.08692405346874767, + "grad_norm": 28.691984176635742, + "kl": 2.548828125, + "learning_rate": 1.7373134328358208e-05, + "loss": 0.1091, + "reward": 0.4888392984867096, + "reward_std": 0.1910441666841507, + "rewards/accuracy_reward": 0.10714286286383867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3816964477300644, + "step": 291 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.482177734375, + "epoch": 0.08722276155626914, + "grad_norm": 32.234458923339844, + "kl": 2.865234375, + "learning_rate": 1.7432835820895522e-05, + "loss": 0.1298, + "reward": 0.3822544813156128, + "reward_std": 0.17954860255122185, + "rewards/accuracy_reward": 0.03125000116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3510044813156128, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.0937805175781, + "epoch": 0.08752146964379061, + "grad_norm": 38.210205078125, + "kl": 3.44140625, + "learning_rate": 1.7492537313432836e-05, + "loss": 0.1436, + "reward": 0.522321455180645, + "reward_std": 0.17504198104143143, + "rewards/accuracy_reward": 0.1785714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3437500149011612, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.4531555175781, + "epoch": 0.08782017773131208, + "grad_norm": 15.753199577331543, + "kl": 2.134765625, + "learning_rate": 1.755223880597015e-05, + "loss": 0.0842, + "reward": 0.391741082072258, + "reward_std": 0.1593552641570568, + "rewards/accuracy_reward": 0.020089286845177412, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3716518059372902, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.3013763427734, + "epoch": 0.08811888581883355, + "grad_norm": 4.432027339935303, + "kl": 1.041015625, + "learning_rate": 1.7611940298507464e-05, + "loss": 0.0425, + "reward": 0.450892873108387, + "reward_std": 0.1540684662759304, + "rewards/accuracy_reward": 0.0558035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3950892984867096, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.1674499511719, + "epoch": 0.08841759390635502, + "grad_norm": 3.732396125793457, + "kl": 1.02734375, + "learning_rate": 1.767164179104478e-05, + "loss": 0.0432, + "reward": 0.4497768059372902, + "reward_std": 0.17469631880521774, + "rewards/accuracy_reward": 0.042410716181620955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4073660895228386, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.5803680419922, + "epoch": 0.08871630199387648, + "grad_norm": 6.423310279846191, + "kl": 0.4736328125, + "learning_rate": 1.7731343283582092e-05, + "loss": 0.019, + "reward": 0.5457589700818062, + "reward_std": 0.1290696281939745, + "rewards/accuracy_reward": 0.10044643469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4453125149011612, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.6339721679688, + "epoch": 0.08901501008139795, + "grad_norm": 3.9548232555389404, + "kl": 0.8740234375, + "learning_rate": 1.7791044776119403e-05, + "loss": 0.0353, + "reward": 0.5000000149011612, + "reward_std": 0.13821824081242085, + "rewards/accuracy_reward": 0.046875003492459655, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4531250223517418, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.9799346923828, + "epoch": 0.08931371816891942, + "grad_norm": 10.977415084838867, + "kl": 2.0546875, + "learning_rate": 1.7850746268656717e-05, + "loss": 0.0813, + "reward": 0.491071455180645, + "reward_std": 0.10143258981406689, + "rewards/accuracy_reward": 0.042410716181620955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4486607313156128, + "step": 299 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.7611999511719, + "epoch": 0.08961242625644089, + "grad_norm": 22.315181732177734, + "kl": 2.14453125, + "learning_rate": 1.791044776119403e-05, + "loss": 0.086, + "reward": 0.6623884364962578, + "reward_std": 0.09762909635901451, + "rewards/accuracy_reward": 0.19419643213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4681919887661934, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.6116333007812, + "epoch": 0.08991113434396236, + "grad_norm": 9.37828540802002, + "kl": 1.3974609375, + "learning_rate": 1.7970149253731345e-05, + "loss": 0.056, + "reward": 0.5267857313156128, + "reward_std": 0.07622705772519112, + "rewards/accuracy_reward": 0.04241071501746774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750223517418, + "step": 301 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.4888763427734, + "epoch": 0.09020984243148383, + "grad_norm": 2.1012940406799316, + "kl": 0.26904296875, + "learning_rate": 1.802985074626866e-05, + "loss": 0.0116, + "reward": 0.5507812798023224, + "reward_std": 0.054714429657906294, + "rewards/accuracy_reward": 0.05357143119908869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.8884124755859, + "epoch": 0.0905085505190053, + "grad_norm": 0.23605623841285706, + "kl": 0.1748046875, + "learning_rate": 1.8089552238805973e-05, + "loss": 0.0079, + "reward": 0.5792410969734192, + "reward_std": 0.09924923069775105, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 303 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.9576110839844, + "epoch": 0.09080725860652678, + "grad_norm": 0.16280560195446014, + "kl": 0.14306640625, + "learning_rate": 1.8149253731343287e-05, + "loss": 0.0057, + "reward": 0.5306919813156128, + "reward_std": 0.048331079073250294, + "rewards/accuracy_reward": 0.03125000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.5826263427734, + "epoch": 0.09110596669404825, + "grad_norm": 0.6321225166320801, + "kl": 0.154541015625, + "learning_rate": 1.8208955223880598e-05, + "loss": 0.0063, + "reward": 0.5312500298023224, + "reward_std": 0.09894461464136839, + "rewards/accuracy_reward": 0.033482145285233855, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.2946929931641, + "epoch": 0.09140467478156972, + "grad_norm": 0.9144079089164734, + "kl": 0.255615234375, + "learning_rate": 1.8268656716417912e-05, + "loss": 0.012, + "reward": 0.6037946790456772, + "reward_std": 0.09256977867335081, + "rewards/accuracy_reward": 0.10937500279396772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.6406707763672, + "epoch": 0.09170338286909117, + "grad_norm": 0.20014750957489014, + "kl": 0.174560546875, + "learning_rate": 1.8328358208955226e-05, + "loss": 0.007, + "reward": 0.5959821790456772, + "reward_std": 0.07262345030903816, + "rewards/accuracy_reward": 0.09821428777649999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 307 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.7254791259766, + "epoch": 0.09200209095661264, + "grad_norm": 0.5545385479927063, + "kl": 0.20458984375, + "learning_rate": 1.8388059701492537e-05, + "loss": 0.0082, + "reward": 0.5390625298023224, + "reward_std": 0.07098836032673717, + "rewards/accuracy_reward": 0.04241071501746774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.4151916503906, + "epoch": 0.09230079904413412, + "grad_norm": 3.2088406085968018, + "kl": 0.239013671875, + "learning_rate": 1.844776119402985e-05, + "loss": 0.006, + "reward": 0.5435268133878708, + "reward_std": 0.12782794050872326, + "rewards/accuracy_reward": 0.0691964328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4743303805589676, + "step": 309 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.5848693847656, + "epoch": 0.09259950713165559, + "grad_norm": 5.871523857116699, + "kl": 0.36376953125, + "learning_rate": 1.8507462686567165e-05, + "loss": 0.015, + "reward": 0.5753348544239998, + "reward_std": 0.09806687757372856, + "rewards/accuracy_reward": 0.11607143376022577, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4592634215950966, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.9263610839844, + "epoch": 0.09289821521917706, + "grad_norm": 4.840591907501221, + "kl": 0.44775390625, + "learning_rate": 1.856716417910448e-05, + "loss": 0.0205, + "reward": 0.471540205180645, + "reward_std": 0.13194252736866474, + "rewards/accuracy_reward": 0.017857143422588706, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4536830484867096, + "step": 311 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.7344055175781, + "epoch": 0.09319692330669853, + "grad_norm": 3.699049711227417, + "kl": 0.79296875, + "learning_rate": 1.8626865671641793e-05, + "loss": 0.0291, + "reward": 0.5641741454601288, + "reward_std": 0.14366153813898563, + "rewards/accuracy_reward": 0.10267857764847577, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4614955559372902, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.5469055175781, + "epoch": 0.09349563139422, + "grad_norm": 7.740135669708252, + "kl": 1.337890625, + "learning_rate": 1.8686567164179107e-05, + "loss": 0.0489, + "reward": 0.5256696566939354, + "reward_std": 0.10104528069496155, + "rewards/accuracy_reward": 0.0580357164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4676339477300644, + "step": 313 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.7589569091797, + "epoch": 0.09379433948174147, + "grad_norm": 6.602975368499756, + "kl": 1.25390625, + "learning_rate": 1.874626865671642e-05, + "loss": 0.0482, + "reward": 0.5418526977300644, + "reward_std": 0.10677672550082207, + "rewards/accuracy_reward": 0.0647321455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4771205633878708, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.763427734375, + "epoch": 0.09409304756926294, + "grad_norm": 3.541693687438965, + "kl": 0.5751953125, + "learning_rate": 1.8805970149253735e-05, + "loss": 0.0212, + "reward": 0.4916294738650322, + "reward_std": 0.12892271392047405, + "rewards/accuracy_reward": 0.026785715483129025, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4648437723517418, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.1205902099609, + "epoch": 0.09439175565678441, + "grad_norm": 2.140970468521118, + "kl": 0.6923828125, + "learning_rate": 1.8865671641791045e-05, + "loss": 0.0225, + "reward": 0.585379496216774, + "reward_std": 0.13350859470665455, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4559151977300644, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.4486999511719, + "epoch": 0.09469046374430588, + "grad_norm": 7.633982181549072, + "kl": 1.0546875, + "learning_rate": 1.892537313432836e-05, + "loss": 0.0402, + "reward": 0.5786830708384514, + "reward_std": 0.08756779041141272, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330633878708, + "step": 317 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.0915679931641, + "epoch": 0.09498917183182734, + "grad_norm": 7.456286907196045, + "kl": 0.90234375, + "learning_rate": 1.8985074626865673e-05, + "loss": 0.0216, + "reward": 0.4988839626312256, + "reward_std": 0.07789996825158596, + "rewards/accuracy_reward": 0.013392857741564512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.4196929931641, + "epoch": 0.09528787991934881, + "grad_norm": 0.7132242918014526, + "kl": 0.4306640625, + "learning_rate": 1.9044776119402984e-05, + "loss": 0.0145, + "reward": 0.557477705180645, + "reward_std": 0.08617981243878603, + "rewards/accuracy_reward": 0.06919642887078226, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 319 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8661193847656, + "epoch": 0.09558658800687028, + "grad_norm": 4.012878894805908, + "kl": 0.26171875, + "learning_rate": 1.9104477611940298e-05, + "loss": 0.0106, + "reward": 0.4799107313156128, + "reward_std": 0.1197678092867136, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4620535969734192, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.5536193847656, + "epoch": 0.09588529609439175, + "grad_norm": 3.1736581325531006, + "kl": 0.31787109375, + "learning_rate": 1.9164179104477612e-05, + "loss": 0.0142, + "reward": 0.5736607313156128, + "reward_std": 0.11025777459144592, + "rewards/accuracy_reward": 0.0937500074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4799107387661934, + "step": 321 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.9219360351562, + "epoch": 0.09618400418191322, + "grad_norm": 1.7500709295272827, + "kl": 0.521484375, + "learning_rate": 1.922388059701493e-05, + "loss": 0.0207, + "reward": 0.541294664144516, + "reward_std": 0.04874269338324666, + "rewards/accuracy_reward": 0.046875003492459655, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.310302734375, + "epoch": 0.0964827122694347, + "grad_norm": 2.341068744659424, + "kl": 0.46826171875, + "learning_rate": 1.928358208955224e-05, + "loss": 0.0192, + "reward": 0.592075914144516, + "reward_std": 0.06362787471152842, + "rewards/accuracy_reward": 0.1026785746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 323 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.3281555175781, + "epoch": 0.09678142035695617, + "grad_norm": 1.5262703895568848, + "kl": 0.771484375, + "learning_rate": 1.9343283582089554e-05, + "loss": 0.0344, + "reward": 0.5022321566939354, + "reward_std": 0.08567542303353548, + "rewards/accuracy_reward": 0.017857143888249993, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750298023224, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.5290374755859, + "epoch": 0.09708012844447764, + "grad_norm": 0.9800349473953247, + "kl": 0.9169921875, + "learning_rate": 1.9402985074626868e-05, + "loss": 0.042, + "reward": 0.5558036044239998, + "reward_std": 0.14325474109500647, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4732143059372902, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.8371124267578, + "epoch": 0.09737883653199911, + "grad_norm": 3.9833359718322754, + "kl": 0.44580078125, + "learning_rate": 1.946268656716418e-05, + "loss": 0.0189, + "reward": 0.5563616380095482, + "reward_std": 0.13147106021642685, + "rewards/accuracy_reward": 0.08258929057046771, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4737723469734192, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.9643402099609, + "epoch": 0.09767754461952058, + "grad_norm": 3.343451976776123, + "kl": 0.6005859375, + "learning_rate": 1.9522388059701493e-05, + "loss": 0.0309, + "reward": 0.5814732313156128, + "reward_std": 0.144991060718894, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.467633955180645, + "step": 327 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.1920166015625, + "epoch": 0.09797625270704205, + "grad_norm": 2.6696596145629883, + "kl": 0.7958984375, + "learning_rate": 1.9582089552238807e-05, + "loss": 0.0402, + "reward": 0.5758928954601288, + "reward_std": 0.10639673192054033, + "rewards/accuracy_reward": 0.09598214784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4799107387661934, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.4866333007812, + "epoch": 0.0982749607945635, + "grad_norm": 1.1600247621536255, + "kl": 0.8154296875, + "learning_rate": 1.964179104477612e-05, + "loss": 0.0419, + "reward": 0.5876116305589676, + "reward_std": 0.14721078053116798, + "rewards/accuracy_reward": 0.11160715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4760044887661934, + "step": 329 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.8036193847656, + "epoch": 0.09857366888208498, + "grad_norm": 2.430856704711914, + "kl": 0.388671875, + "learning_rate": 1.9701492537313435e-05, + "loss": 0.0174, + "reward": 0.5239955633878708, + "reward_std": 0.09855730272829533, + "rewards/accuracy_reward": 0.04017857275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169887661934, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.5245971679688, + "epoch": 0.09887237696960645, + "grad_norm": 1.2461105585098267, + "kl": 0.2919921875, + "learning_rate": 1.976119402985075e-05, + "loss": 0.0154, + "reward": 0.554129496216774, + "reward_std": 0.060772581258788705, + "rewards/accuracy_reward": 0.06026785937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 331 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.4219055175781, + "epoch": 0.09917108505712792, + "grad_norm": 0.6035804748535156, + "kl": 0.3642578125, + "learning_rate": 1.9820895522388063e-05, + "loss": 0.0159, + "reward": 0.5948660969734192, + "reward_std": 0.07703645946457982, + "rewards/accuracy_reward": 0.10267857694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.6473693847656, + "epoch": 0.09946979314464939, + "grad_norm": 2.328124523162842, + "kl": 0.54638671875, + "learning_rate": 1.9880597014925377e-05, + "loss": 0.0266, + "reward": 0.5675223469734192, + "reward_std": 0.12608890607953072, + "rewards/accuracy_reward": 0.07366071944124997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 333 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.5022735595703, + "epoch": 0.09976850123217086, + "grad_norm": 1.7465543746948242, + "kl": 0.54052734375, + "learning_rate": 1.9940298507462688e-05, + "loss": 0.0269, + "reward": 0.5541294813156128, + "reward_std": 0.09422783553600311, + "rewards/accuracy_reward": 0.06250000302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 334 + }, + { + "clip_ratio": 0.0, + "completion_length": 930.8103179931641, + "epoch": 0.10006720931969233, + "grad_norm": 0.6329307556152344, + "kl": 0.25341796875, + "learning_rate": 2e-05, + "loss": 0.0109, + "reward": 0.5625000149011612, + "reward_std": 0.07770087872631848, + "rewards/accuracy_reward": 0.06473214481957257, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.6830902099609, + "epoch": 0.1003659174072138, + "grad_norm": 0.8710017204284668, + "kl": 0.19384765625, + "learning_rate": 1.9999994560490055e-05, + "loss": 0.0155, + "reward": 0.5546875149011612, + "reward_std": 0.1155692208558321, + "rewards/accuracy_reward": 0.05580357322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.1428985595703, + "epoch": 0.10066462549473527, + "grad_norm": 0.6794142723083496, + "kl": 0.33203125, + "learning_rate": 1.999997824196613e-05, + "loss": 0.014, + "reward": 0.5396205484867096, + "reward_std": 0.037821981590241194, + "rewards/accuracy_reward": 0.04464285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 337 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.7299499511719, + "epoch": 0.10096333358225675, + "grad_norm": 0.9964808225631714, + "kl": 0.36474609375, + "learning_rate": 1.999995104444598e-05, + "loss": 0.0174, + "reward": 0.6110491380095482, + "reward_std": 0.09178328537382185, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.0603179931641, + "epoch": 0.10126204166977822, + "grad_norm": 0.8040034174919128, + "kl": 0.218017578125, + "learning_rate": 1.9999912967959197e-05, + "loss": 0.0093, + "reward": 0.561941996216774, + "reward_std": 0.07345138117671013, + "rewards/accuracy_reward": 0.06919643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 339 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.8549499511719, + "epoch": 0.10156074975729967, + "grad_norm": 19.88669204711914, + "kl": 1.84375, + "learning_rate": 1.99998640125472e-05, + "loss": 0.0789, + "reward": 0.4960937723517418, + "reward_std": 0.14425748772919178, + "rewards/accuracy_reward": 0.04241071571595967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4536830559372902, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.9732666015625, + "epoch": 0.10185945784482114, + "grad_norm": 2.4623231887817383, + "kl": 0.6259765625, + "learning_rate": 1.9999804178263253e-05, + "loss": 0.0258, + "reward": 0.6099330708384514, + "reward_std": 0.16189981251955032, + "rewards/accuracy_reward": 0.14732143888249993, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.462611623108387, + "step": 341 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.2210235595703, + "epoch": 0.10215816593234262, + "grad_norm": 2.8890833854675293, + "kl": 1.2763671875, + "learning_rate": 1.999973346517244e-05, + "loss": 0.053, + "reward": 0.5245535895228386, + "reward_std": 0.1923955176025629, + "rewards/accuracy_reward": 0.0982142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4263393059372902, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.4799346923828, + "epoch": 0.10245687401986409, + "grad_norm": 10.02528190612793, + "kl": 2.3984375, + "learning_rate": 1.99996518733517e-05, + "loss": 0.0936, + "reward": 0.5089285895228386, + "reward_std": 0.16551297903060913, + "rewards/accuracy_reward": 0.09821429033763707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4107143059372902, + "step": 343 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.9464721679688, + "epoch": 0.10275558210738556, + "grad_norm": 4.265324115753174, + "kl": 1.94921875, + "learning_rate": 1.9999559402889794e-05, + "loss": 0.0805, + "reward": 0.4620535895228386, + "reward_std": 0.1649486254900694, + "rewards/accuracy_reward": 0.06473214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3973214477300644, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.8661193847656, + "epoch": 0.10305429019490703, + "grad_norm": 6.596566677093506, + "kl": 1.09375, + "learning_rate": 1.9999456053887315e-05, + "loss": 0.0473, + "reward": 0.4542410969734192, + "reward_std": 0.14355365745723248, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.436383955180645, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.4085235595703, + "epoch": 0.1033529982824285, + "grad_norm": 6.300505638122559, + "kl": 0.9814453125, + "learning_rate": 1.9999341826456703e-05, + "loss": 0.0401, + "reward": 0.4905134066939354, + "reward_std": 0.12670683674514294, + "rewards/accuracy_reward": 0.044642860535532236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4458705559372902, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.4464569091797, + "epoch": 0.10365170636994997, + "grad_norm": 4.3031182289123535, + "kl": 0.9169921875, + "learning_rate": 1.999921672072223e-05, + "loss": 0.0368, + "reward": 0.510602705180645, + "reward_std": 0.14337029494345188, + "rewards/accuracy_reward": 0.06026785867288709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4503348395228386, + "step": 347 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.4397735595703, + "epoch": 0.10395041445747144, + "grad_norm": 2.154984951019287, + "kl": 1.556640625, + "learning_rate": 1.9999080736819986e-05, + "loss": 0.0671, + "reward": 0.5736607387661934, + "reward_std": 0.23084361478686333, + "rewards/accuracy_reward": 0.14062500675208867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4330357387661934, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.3683319091797, + "epoch": 0.10424912254499291, + "grad_norm": 6.515596866607666, + "kl": 2.73046875, + "learning_rate": 1.9998933874897922e-05, + "loss": 0.1218, + "reward": 0.4698660895228386, + "reward_std": 0.17164187133312225, + "rewards/accuracy_reward": 0.03794643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4319196715950966, + "step": 349 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.0268249511719, + "epoch": 0.10454783063251437, + "grad_norm": 6.345547199249268, + "kl": 3.15625, + "learning_rate": 1.99987761351158e-05, + "loss": 0.1374, + "reward": 0.5535714626312256, + "reward_std": 0.13770603202283382, + "rewards/accuracy_reward": 0.10714286309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4464285895228386, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.9777069091797, + "epoch": 0.10484653872003584, + "grad_norm": 78.58914184570312, + "kl": 8.46875, + "learning_rate": 1.9998607517645227e-05, + "loss": 0.3564, + "reward": 0.3554687686264515, + "reward_std": 0.017260090680792928, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2483259029686451, + "step": 351 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.5580902099609, + "epoch": 0.10514524680755731, + "grad_norm": 27.41887092590332, + "kl": 3.046875, + "learning_rate": 1.9998428022669646e-05, + "loss": 0.1227, + "reward": 0.2963169813156128, + "reward_std": 0.05443059653043747, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2606026902794838, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completion_length": 864.3326263427734, + "epoch": 0.10544395489507878, + "grad_norm": 2.0600898265838623, + "kl": 0.49072265625, + "learning_rate": 1.9998237650384324e-05, + "loss": -0.0037, + "reward": 0.4960937798023224, + "reward_std": 0.14095536060631275, + "rewards/accuracy_reward": 0.053571431431919336, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4425223395228386, + "step": 353 + }, + { + "clip_ratio": 0.0, + "completion_length": 903.3036041259766, + "epoch": 0.10574266298260025, + "grad_norm": 1.532135009765625, + "kl": 0.3193359375, + "learning_rate": 1.9998036400996374e-05, + "loss": 0.0253, + "reward": 0.6462053954601288, + "reward_std": 0.09021112322807312, + "rewards/accuracy_reward": 0.16517857578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810268133878708, + "step": 354 + }, + { + "clip_ratio": 0.0, + "completion_length": 913.9464874267578, + "epoch": 0.10604137107012172, + "grad_norm": 0.6485714912414551, + "kl": 0.261474609375, + "learning_rate": 1.9997824274724734e-05, + "loss": 0.0338, + "reward": 0.5468750298023224, + "reward_std": 0.08214028365910053, + "rewards/accuracy_reward": 0.06696428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4799107387661934, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.7835235595703, + "epoch": 0.1063400791576432, + "grad_norm": 0.45187726616859436, + "kl": 0.202880859375, + "learning_rate": 1.999760127180017e-05, + "loss": 0.0105, + "reward": 0.5357143133878708, + "reward_std": 0.06987208127975464, + "rewards/accuracy_reward": 0.05133928847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750223517418, + "step": 356 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.1004791259766, + "epoch": 0.10663878724516467, + "grad_norm": 0.3410225808620453, + "kl": 0.19384765625, + "learning_rate": 1.99973673924653e-05, + "loss": 0.006, + "reward": 0.5502232238650322, + "reward_std": 0.05979477637447417, + "rewards/accuracy_reward": 0.0602678619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 357 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.0469207763672, + "epoch": 0.10693749533268614, + "grad_norm": 0.6873849630355835, + "kl": 0.191162109375, + "learning_rate": 1.999712263697455e-05, + "loss": 0.0062, + "reward": 0.5145089626312256, + "reward_std": 0.08606773614883423, + "rewards/accuracy_reward": 0.024553572991862893, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 358 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.2344055175781, + "epoch": 0.10723620342020761, + "grad_norm": 1.1829609870910645, + "kl": 0.2021484375, + "learning_rate": 1.9996867005594193e-05, + "loss": 0.0091, + "reward": 0.5507812798023224, + "reward_std": 0.12156920321285725, + "rewards/accuracy_reward": 0.05803571571595967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 359 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.5714721679688, + "epoch": 0.10753491150772908, + "grad_norm": 0.40397801995277405, + "kl": 0.205322265625, + "learning_rate": 1.9996600498602334e-05, + "loss": 0.0054, + "reward": 0.560825914144516, + "reward_std": 0.07060469500720501, + "rewards/accuracy_reward": 0.06473214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.9286193847656, + "epoch": 0.10783361959525053, + "grad_norm": 1.5318493843078613, + "kl": 0.22119140625, + "learning_rate": 1.9996323116288906e-05, + "loss": 0.0078, + "reward": 0.5267857387661934, + "reward_std": 0.10431875381618738, + "rewards/accuracy_reward": 0.040178571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 361 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.9464721679688, + "epoch": 0.108132327682772, + "grad_norm": 1.1402748823165894, + "kl": 0.3037109375, + "learning_rate": 1.9996034858955667e-05, + "loss": 0.0139, + "reward": 0.5195312723517418, + "reward_std": 0.11351107526570559, + "rewards/accuracy_reward": 0.035714287078008056, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169813156128, + "step": 362 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.1205902099609, + "epoch": 0.10843103577029348, + "grad_norm": 0.9787477850914001, + "kl": 0.3486328125, + "learning_rate": 1.9995735726916223e-05, + "loss": 0.0315, + "reward": 0.6434152275323868, + "reward_std": 0.09674691315740347, + "rewards/accuracy_reward": 0.16517858067527413, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.478236623108387, + "step": 363 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.5045166015625, + "epoch": 0.10872974385781495, + "grad_norm": 2.162909984588623, + "kl": 0.6142578125, + "learning_rate": 1.9995425720495993e-05, + "loss": 0.0315, + "reward": 0.6417410969734192, + "reward_std": 0.11664261389523745, + "rewards/accuracy_reward": 0.1897321492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4520089477300644, + "step": 364 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.6763916015625, + "epoch": 0.10902845194533642, + "grad_norm": 5.111187934875488, + "kl": 0.806640625, + "learning_rate": 1.999510484003224e-05, + "loss": 0.0416, + "reward": 0.5591518133878708, + "reward_std": 0.1553235426545143, + "rewards/accuracy_reward": 0.11607143515720963, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4430803805589676, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.5067291259766, + "epoch": 0.10932716003285789, + "grad_norm": 1.9799141883850098, + "kl": 0.5546875, + "learning_rate": 1.9994773085874043e-05, + "loss": 0.0286, + "reward": 0.4614955559372902, + "reward_std": 0.12618290446698666, + "rewards/accuracy_reward": 0.015625000465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4458705633878708, + "step": 366 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.0870971679688, + "epoch": 0.10962586812037936, + "grad_norm": 19903.208984375, + "kl": 391.6455078125, + "learning_rate": 1.9994430458382323e-05, + "loss": 16.282, + "reward": 0.5524553805589676, + "reward_std": 0.1140974871814251, + "rewards/accuracy_reward": 0.07142857392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810268059372902, + "step": 367 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.0156555175781, + "epoch": 0.10992457620790083, + "grad_norm": 1.0206562280654907, + "kl": 0.26171875, + "learning_rate": 1.999407695792982e-05, + "loss": 0.0041, + "reward": 0.5044643133878708, + "reward_std": 0.07086404040455818, + "rewards/accuracy_reward": 0.013392857741564512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 368 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.7053985595703, + "epoch": 0.1102232842954223, + "grad_norm": 0.972594141960144, + "kl": 0.25244140625, + "learning_rate": 1.9993712584901116e-05, + "loss": 0.0112, + "reward": 0.4977678805589676, + "reward_std": 0.07697703596204519, + "rewards/accuracy_reward": 0.013392857741564512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750223517418, + "step": 369 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.5491638183594, + "epoch": 0.11052199238294377, + "grad_norm": 1.045257568359375, + "kl": 0.285400390625, + "learning_rate": 1.999333733969261e-05, + "loss": 0.0224, + "reward": 0.5842634215950966, + "reward_std": 0.1049301028251648, + "rewards/accuracy_reward": 0.1227678656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4614955559372902, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8928833007812, + "epoch": 0.11082070047046524, + "grad_norm": 17.9425106048584, + "kl": 0.66015625, + "learning_rate": 1.999295122271253e-05, + "loss": 0.0277, + "reward": 0.3041294813156128, + "reward_std": 0.10030540823936462, + "rewards/accuracy_reward": 0.0267857164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2773437649011612, + "step": 371 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.7902374267578, + "epoch": 0.1111194085579867, + "grad_norm": 1.1288520097732544, + "kl": 0.404296875, + "learning_rate": 1.999255423438093e-05, + "loss": 0.0185, + "reward": 0.4771205559372902, + "reward_std": 0.1337430588901043, + "rewards/accuracy_reward": 0.017857143422588706, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4592634066939354, + "step": 372 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.4018249511719, + "epoch": 0.11141811664550817, + "grad_norm": 0.35876885056495667, + "kl": 0.33544921875, + "learning_rate": 1.9992146375129703e-05, + "loss": 0.0235, + "reward": 0.5747768059372902, + "reward_std": 0.11416774615645409, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4520089402794838, + "step": 373 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.3348693847656, + "epoch": 0.11171682473302964, + "grad_norm": 0.35318273305892944, + "kl": 0.2841796875, + "learning_rate": 1.9991727645402556e-05, + "loss": 0.0192, + "reward": 0.538504496216774, + "reward_std": 0.15945106372237206, + "rewards/accuracy_reward": 0.08482143515720963, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4536830559372902, + "step": 374 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.8125610351562, + "epoch": 0.11201553282055111, + "grad_norm": 0.3191957473754883, + "kl": 0.2607421875, + "learning_rate": 1.9991298045655022e-05, + "loss": 0.0209, + "reward": 0.6026785969734192, + "reward_std": 0.1575213074684143, + "rewards/accuracy_reward": 0.12946428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4732143059372902, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.3861999511719, + "epoch": 0.11231424090807259, + "grad_norm": 0.7156631946563721, + "kl": 0.2431640625, + "learning_rate": 1.9990857576354466e-05, + "loss": 0.0235, + "reward": 0.578683078289032, + "reward_std": 0.1588970348238945, + "rewards/accuracy_reward": 0.10491072107106447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4737723395228386, + "step": 376 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.9397735595703, + "epoch": 0.11261294899559406, + "grad_norm": 16.100004196166992, + "kl": 0.3544921875, + "learning_rate": 1.999040623798008e-05, + "loss": 0.022, + "reward": 0.5011161044239998, + "reward_std": 0.12469187192618847, + "rewards/accuracy_reward": 0.0580357164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4430803805589676, + "step": 377 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.8772735595703, + "epoch": 0.11291165708311553, + "grad_norm": 34.27460861206055, + "kl": 0.47412109375, + "learning_rate": 1.998994403102287e-05, + "loss": 0.0373, + "reward": 0.5758928880095482, + "reward_std": 0.12124679982662201, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4754464477300644, + "step": 378 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.7522735595703, + "epoch": 0.113210365170637, + "grad_norm": 12.406760215759277, + "kl": 0.6142578125, + "learning_rate": 1.9989470955985674e-05, + "loss": 0.0246, + "reward": 0.5708705708384514, + "reward_std": 0.12972076702862978, + "rewards/accuracy_reward": 0.08928572200238705, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848469734192, + "step": 379 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.5491333007812, + "epoch": 0.11350907325815847, + "grad_norm": 21.338197708129883, + "kl": 1.31201171875, + "learning_rate": 1.9988987013383153e-05, + "loss": 0.0328, + "reward": 0.5066964477300644, + "reward_std": 0.1282604318112135, + "rewards/accuracy_reward": 0.0558035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4508928805589676, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.4174652099609, + "epoch": 0.11380778134567994, + "grad_norm": 40.14265060424805, + "kl": 1.8359375, + "learning_rate": 1.9988492203741783e-05, + "loss": 0.0504, + "reward": 0.5680803880095482, + "reward_std": 0.12190199457108974, + "rewards/accuracy_reward": 0.11830357578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4497768133878708, + "step": 381 + }, + { + "clip_ratio": 0.0, + "completion_length": 930.3348541259766, + "epoch": 0.11410648943320141, + "grad_norm": 10.432088851928711, + "kl": 1.0712890625, + "learning_rate": 1.9987986527599876e-05, + "loss": 0.0732, + "reward": 0.5764509290456772, + "reward_std": 0.1589381191879511, + "rewards/accuracy_reward": 0.10714286100119352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4693080633878708, + "step": 382 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.3728179931641, + "epoch": 0.11440519752072287, + "grad_norm": 5.6992034912109375, + "kl": 0.52587890625, + "learning_rate": 1.9987469985507553e-05, + "loss": 0.0346, + "reward": 0.6417410969734192, + "reward_std": 0.15374388033524156, + "rewards/accuracy_reward": 0.16071429289877415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810268059372902, + "step": 383 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.8236846923828, + "epoch": 0.11470390560824434, + "grad_norm": 6.33870792388916, + "kl": 0.61328125, + "learning_rate": 1.9986942578026767e-05, + "loss": 0.0258, + "reward": 0.5736607313156128, + "reward_std": 0.1439296454191208, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4732143059372902, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.1518402099609, + "epoch": 0.11500261369576581, + "grad_norm": 7.457898139953613, + "kl": 0.5126953125, + "learning_rate": 1.998640430573128e-05, + "loss": 0.0324, + "reward": 0.5396205633878708, + "reward_std": 0.1453549489378929, + "rewards/accuracy_reward": 0.060267860535532236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4793526977300644, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.3370971679688, + "epoch": 0.11530132178328728, + "grad_norm": 7.249351501464844, + "kl": 0.75390625, + "learning_rate": 1.9985855169206678e-05, + "loss": 0.0347, + "reward": 0.6489955633878708, + "reward_std": 0.12911575939506292, + "rewards/accuracy_reward": 0.1718750111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4771205559372902, + "step": 386 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.9621124267578, + "epoch": 0.11560002987080875, + "grad_norm": 4.034423828125, + "kl": 0.75830078125, + "learning_rate": 1.9985295169050374e-05, + "loss": 0.0469, + "reward": 0.5703125298023224, + "reward_std": 0.10518298670649529, + "rewards/accuracy_reward": 0.09375000488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4765625223517418, + "step": 387 + }, + { + "clip_ratio": 0.0, + "completion_length": 937.6786041259766, + "epoch": 0.11589873795833022, + "grad_norm": 6.118593692779541, + "kl": 0.9736328125, + "learning_rate": 1.998472430587159e-05, + "loss": 0.0532, + "reward": 0.6049107313156128, + "reward_std": 0.12443427927792072, + "rewards/accuracy_reward": 0.129464294295758, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4754464477300644, + "step": 388 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.7790679931641, + "epoch": 0.1161974460458517, + "grad_norm": 18.80058479309082, + "kl": 1.802734375, + "learning_rate": 1.9984142580291368e-05, + "loss": 0.0971, + "reward": 0.6183035969734192, + "reward_std": 0.1367051713168621, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.475446455180645, + "step": 389 + }, + { + "clip_ratio": 0.0, + "completion_length": 925.7991333007812, + "epoch": 0.11649615413337316, + "grad_norm": 28.462570190429688, + "kl": 3.71484375, + "learning_rate": 1.9983549992942572e-05, + "loss": 0.1964, + "reward": 0.553013414144516, + "reward_std": 0.13509854301810265, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.454799123108387, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.1786193847656, + "epoch": 0.11679486222089464, + "grad_norm": 31.30630874633789, + "kl": 4.421875, + "learning_rate": 1.9982946544469875e-05, + "loss": 0.2213, + "reward": 0.493861623108387, + "reward_std": 0.11518522538244724, + "rewards/accuracy_reward": 0.04687500116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4469866305589676, + "step": 391 + }, + { + "clip_ratio": 0.0, + "completion_length": 927.7522735595703, + "epoch": 0.1170935703084161, + "grad_norm": 30.764698028564453, + "kl": 2.056640625, + "learning_rate": 1.998233223552977e-05, + "loss": 0.1342, + "reward": 0.5636160895228386, + "reward_std": 0.09681148082017899, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4497768059372902, + "step": 392 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.5893249511719, + "epoch": 0.11739227839593756, + "grad_norm": 15.420845031738281, + "kl": 4.9375, + "learning_rate": 1.998170706679057e-05, + "loss": 0.2568, + "reward": 0.4229910895228386, + "reward_std": 0.10833907127380371, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4229910895228386, + "step": 393 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.2098541259766, + "epoch": 0.11769098648345903, + "grad_norm": 26.558917999267578, + "kl": 3.42578125, + "learning_rate": 1.998107103893239e-05, + "loss": 0.198, + "reward": 0.4654018133878708, + "reward_std": 0.13937841542065144, + "rewards/accuracy_reward": 0.04910714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.416294664144516, + "step": 394 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.0379943847656, + "epoch": 0.1179896945709805, + "grad_norm": 26.40386199951172, + "kl": 5.46875, + "learning_rate": 1.9980424152647174e-05, + "loss": 0.2659, + "reward": 0.4570312723517418, + "reward_std": 0.13502510637044907, + "rewards/accuracy_reward": 0.04687500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4101562723517418, + "step": 395 + }, + { + "clip_ratio": 0.0, + "completion_length": 922.5982513427734, + "epoch": 0.11828840265850198, + "grad_norm": 25.6286563873291, + "kl": 3.58203125, + "learning_rate": 1.9979766408638664e-05, + "loss": 0.224, + "reward": 0.502790205180645, + "reward_std": 0.15564481355249882, + "rewards/accuracy_reward": 0.09151786309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4112723395228386, + "step": 396 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.0268249511719, + "epoch": 0.11858711074602345, + "grad_norm": 23.64201545715332, + "kl": 5.5078125, + "learning_rate": 1.9979097807622424e-05, + "loss": 0.3018, + "reward": 0.5245535969734192, + "reward_std": 0.21128332987427711, + "rewards/accuracy_reward": 0.11383929220028222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4107142984867096, + "step": 397 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.8616485595703, + "epoch": 0.11888581883354492, + "grad_norm": 36.902870178222656, + "kl": 2.65625, + "learning_rate": 1.9978418350325825e-05, + "loss": 0.1566, + "reward": 0.5345982387661934, + "reward_std": 0.1681667771190405, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4229910969734192, + "step": 398 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.794677734375, + "epoch": 0.11918452692106639, + "grad_norm": 13.869124412536621, + "kl": 3.50390625, + "learning_rate": 1.9977728037488052e-05, + "loss": 0.1835, + "reward": 0.4882812649011612, + "reward_std": 0.15448465384542942, + "rewards/accuracy_reward": 0.06250000488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4257812649011612, + "step": 399 + }, + { + "clip_ratio": 0.0, + "completion_length": 958.6004943847656, + "epoch": 0.11948323500858786, + "grad_norm": 140.75125122070312, + "kl": 9.921875, + "learning_rate": 1.99770268698601e-05, + "loss": 0.4565, + "reward": 0.5117187723517418, + "reward_std": 0.11513019166886806, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4335937649011612, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.8080749511719, + "epoch": 0.11978194309610933, + "grad_norm": 202.34188842773438, + "kl": 13.828125, + "learning_rate": 1.9976314848204762e-05, + "loss": 0.5793, + "reward": 0.4838169813156128, + "reward_std": 0.11848198808729649, + "rewards/accuracy_reward": 0.07366071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4101562649011612, + "step": 401 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.1987152099609, + "epoch": 0.1200806511836308, + "grad_norm": 84.0167007446289, + "kl": 7.6796875, + "learning_rate": 1.9975591973296657e-05, + "loss": 0.3776, + "reward": 0.470982164144516, + "reward_std": 0.1261886227875948, + "rewards/accuracy_reward": 0.05133928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4196428805589676, + "step": 402 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.3281707763672, + "epoch": 0.12037935927115227, + "grad_norm": 31.841007232666016, + "kl": 2.712890625, + "learning_rate": 1.9974858245922192e-05, + "loss": 0.1247, + "reward": 0.4559151977300644, + "reward_std": 0.1308898627758026, + "rewards/accuracy_reward": 0.042410716181620955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4135044813156128, + "step": 403 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.7545166015625, + "epoch": 0.12067806735867373, + "grad_norm": 35.70283126831055, + "kl": 1.31640625, + "learning_rate": 1.99741136668796e-05, + "loss": 0.0705, + "reward": 0.4564732313156128, + "reward_std": 0.1417136024683714, + "rewards/accuracy_reward": 0.022321429569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4341518059372902, + "step": 404 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.3527374267578, + "epoch": 0.1209767754461952, + "grad_norm": 30.779603958129883, + "kl": 2.24609375, + "learning_rate": 1.997335823697891e-05, + "loss": 0.1105, + "reward": 0.4347098469734192, + "reward_std": 0.11755921877920628, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3945312649011612, + "step": 405 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.2254791259766, + "epoch": 0.12127548353371667, + "grad_norm": 9.413703918457031, + "kl": 3.322265625, + "learning_rate": 1.997259195704195e-05, + "loss": 0.1475, + "reward": 0.5239955633878708, + "reward_std": 0.14395921491086483, + "rewards/accuracy_reward": 0.09598214784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.428013414144516, + "step": 406 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.6741485595703, + "epoch": 0.12157419162123814, + "grad_norm": 121.7138900756836, + "kl": 8.953125, + "learning_rate": 1.997181482790236e-05, + "loss": 0.3712, + "reward": 0.5223214477300644, + "reward_std": 0.14385845698416233, + "rewards/accuracy_reward": 0.10491071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4174107313156128, + "step": 407 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.5580749511719, + "epoch": 0.12187289970875961, + "grad_norm": 96.19042205810547, + "kl": 7.3515625, + "learning_rate": 1.997102685040558e-05, + "loss": 0.3128, + "reward": 0.5156250223517418, + "reward_std": 0.14501065388321877, + "rewards/accuracy_reward": 0.08705357578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4285714402794838, + "step": 408 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.3460235595703, + "epoch": 0.12217160779628108, + "grad_norm": 50.37589645385742, + "kl": 6.1171875, + "learning_rate": 1.9970228025408854e-05, + "loss": 0.2493, + "reward": 0.470424123108387, + "reward_std": 0.1474767029285431, + "rewards/accuracy_reward": 0.0401785746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4302455559372902, + "step": 409 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.9241485595703, + "epoch": 0.12247031588380256, + "grad_norm": 31.723974227905273, + "kl": 1.705078125, + "learning_rate": 1.996941835378122e-05, + "loss": 0.0782, + "reward": 0.4921875149011612, + "reward_std": 0.15984809771180153, + "rewards/accuracy_reward": 0.060267860535532236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4319196566939354, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.6428985595703, + "epoch": 0.12276902397132403, + "grad_norm": 23.887569427490234, + "kl": 1.693359375, + "learning_rate": 1.9968597836403526e-05, + "loss": 0.0726, + "reward": 0.4296875223517418, + "reward_std": 0.12043316848576069, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4207589477300644, + "step": 411 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.8370819091797, + "epoch": 0.1230677320588455, + "grad_norm": 6.18964147567749, + "kl": 2.009765625, + "learning_rate": 1.9967766474168416e-05, + "loss": 0.0928, + "reward": 0.5027901902794838, + "reward_std": 0.13621743954718113, + "rewards/accuracy_reward": 0.06696428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.435825914144516, + "step": 412 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.1027221679688, + "epoch": 0.12336644014636697, + "grad_norm": 33.90864944458008, + "kl": 3.47265625, + "learning_rate": 1.9966924267980326e-05, + "loss": 0.1511, + "reward": 0.4575893059372902, + "reward_std": 0.13859626278281212, + "rewards/accuracy_reward": 0.020089287078008056, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4375000223517418, + "step": 413 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.5379791259766, + "epoch": 0.12366514823388844, + "grad_norm": 20.364564895629883, + "kl": 2.17578125, + "learning_rate": 1.9966071218755497e-05, + "loss": 0.1182, + "reward": 0.5530134215950966, + "reward_std": 0.14797229319810867, + "rewards/accuracy_reward": 0.10937500721774995, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4436384215950966, + "step": 414 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.6875457763672, + "epoch": 0.1239638563214099, + "grad_norm": 9.873104095458984, + "kl": 1.7421875, + "learning_rate": 1.9965207327421964e-05, + "loss": 0.0924, + "reward": 0.5658482387661934, + "reward_std": 0.14965256303548813, + "rewards/accuracy_reward": 0.1160714328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4497768059372902, + "step": 415 + }, + { + "clip_ratio": 0.0, + "completion_length": 988.2656707763672, + "epoch": 0.12426256440893137, + "grad_norm": 5.962564945220947, + "kl": 0.9853515625, + "learning_rate": 1.996433259491955e-05, + "loss": 0.0314, + "reward": 0.5011161044239998, + "reward_std": 0.10112164355814457, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4564732387661934, + "step": 416 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.7500305175781, + "epoch": 0.12456127249645284, + "grad_norm": 3.2848925590515137, + "kl": 0.51611328125, + "learning_rate": 1.9963447022199884e-05, + "loss": 0.0247, + "reward": 0.570312537252903, + "reward_std": 0.13277187291532755, + "rewards/accuracy_reward": 0.09151786286383867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.478794664144516, + "step": 417 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.0446929931641, + "epoch": 0.12485998058397431, + "grad_norm": 0.9341151118278503, + "kl": 0.52685546875, + "learning_rate": 1.9962550610226382e-05, + "loss": 0.0288, + "reward": 0.6132812798023224, + "reward_std": 0.19186223670840263, + "rewards/accuracy_reward": 0.14285715110599995, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.470424123108387, + "step": 418 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.0156707763672, + "epoch": 0.1251586886714958, + "grad_norm": 9.552392959594727, + "kl": 1.03955078125, + "learning_rate": 1.996164335997425e-05, + "loss": 0.0565, + "reward": 0.5200892984867096, + "reward_std": 0.15620861388742924, + "rewards/accuracy_reward": 0.06696428963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4531250223517418, + "step": 419 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.0335388183594, + "epoch": 0.12545739675901724, + "grad_norm": 3.1506245136260986, + "kl": 0.6640625, + "learning_rate": 1.9960725272430487e-05, + "loss": 0.0302, + "reward": 0.5781250223517418, + "reward_std": 0.10377108491957188, + "rewards/accuracy_reward": 0.11830357206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4598214477300644, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.2768249511719, + "epoch": 0.1257561048465387, + "grad_norm": 5.097635269165039, + "kl": 0.8935546875, + "learning_rate": 1.9959796348593886e-05, + "loss": 0.0356, + "reward": 0.4927455633878708, + "reward_std": 0.12349301017820835, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4659598469734192, + "step": 421 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.3236999511719, + "epoch": 0.12605481293406018, + "grad_norm": 5.174260139465332, + "kl": 1.4453125, + "learning_rate": 1.9958856589475018e-05, + "loss": 0.0625, + "reward": 0.513950914144516, + "reward_std": 0.1541624665260315, + "rewards/accuracy_reward": 0.06696428777649999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4469866305589676, + "step": 422 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.310302734375, + "epoch": 0.12635352102158165, + "grad_norm": 3.1990301609039307, + "kl": 1.103515625, + "learning_rate": 1.995790599609626e-05, + "loss": 0.0318, + "reward": 0.503348246216774, + "reward_std": 0.11821835301816463, + "rewards/accuracy_reward": 0.05580357275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.447544664144516, + "step": 423 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.1652069091797, + "epoch": 0.12665222910910312, + "grad_norm": 2.6953656673431396, + "kl": 0.978515625, + "learning_rate": 1.9956944569491756e-05, + "loss": 0.0386, + "reward": 0.5340401977300644, + "reward_std": 0.1479045059531927, + "rewards/accuracy_reward": 0.11383929033763707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4202009066939354, + "step": 424 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.2120666503906, + "epoch": 0.1269509371966246, + "grad_norm": 8.675219535827637, + "kl": 1.046875, + "learning_rate": 1.995597231070744e-05, + "loss": 0.0404, + "reward": 0.3510044813156128, + "reward_std": 0.10611462779343128, + "rewards/accuracy_reward": 0.0379464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3130580484867096, + "step": 425 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.5759429931641, + "epoch": 0.12724964528414606, + "grad_norm": 4.317501068115234, + "kl": 1.572265625, + "learning_rate": 1.9954989220801046e-05, + "loss": 0.0584, + "reward": 0.3917410895228386, + "reward_std": 0.12279958091676235, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.313616082072258, + "step": 426 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.4442291259766, + "epoch": 0.12754835337166753, + "grad_norm": 8.789463996887207, + "kl": 0.810546875, + "learning_rate": 1.9953995300842073e-05, + "loss": 0.0117, + "reward": 0.4810268059372902, + "reward_std": 0.16356783732771873, + "rewards/accuracy_reward": 0.06250000395812094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4185268059372902, + "step": 427 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8482513427734, + "epoch": 0.127847061459189, + "grad_norm": 3.3314085006713867, + "kl": 0.46142578125, + "learning_rate": 1.9952990551911808e-05, + "loss": 0.0117, + "reward": 0.5446428805589676, + "reward_std": 0.0773840369656682, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.466517873108387, + "step": 428 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.3080444335938, + "epoch": 0.12814576954671048, + "grad_norm": 1.586737036705017, + "kl": 0.3916015625, + "learning_rate": 1.995197497510332e-05, + "loss": 0.0071, + "reward": 0.4983259066939354, + "reward_std": 0.1221934761852026, + "rewards/accuracy_reward": 0.03125000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.467075914144516, + "step": 429 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.4375457763672, + "epoch": 0.12844447763423195, + "grad_norm": 1.4745851755142212, + "kl": 0.38818359375, + "learning_rate": 1.9950948571521458e-05, + "loss": 0.0089, + "reward": 0.5050223395228386, + "reward_std": 0.06827360671013594, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4648437723517418, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.0357208251953, + "epoch": 0.12874318572175342, + "grad_norm": 0.726200520992279, + "kl": 0.33544921875, + "learning_rate": 1.994991134228285e-05, + "loss": 0.0126, + "reward": 0.6104910969734192, + "reward_std": 0.10340302158147097, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4787946566939354, + "step": 431 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.4888763427734, + "epoch": 0.1290418938092749, + "grad_norm": 0.4221353828907013, + "kl": 0.380859375, + "learning_rate": 1.9948863288515895e-05, + "loss": 0.0107, + "reward": 0.5775669887661934, + "reward_std": 0.11448949202895164, + "rewards/accuracy_reward": 0.10491071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4726562649011612, + "step": 432 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.4732513427734, + "epoch": 0.12934060189679636, + "grad_norm": 1.5006263256072998, + "kl": 0.50390625, + "learning_rate": 1.9947804411360775e-05, + "loss": 0.015, + "reward": 0.474888414144516, + "reward_std": 0.11761917546391487, + "rewards/accuracy_reward": 0.022321429569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4525669887661934, + "step": 433 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.9978179931641, + "epoch": 0.12963930998431783, + "grad_norm": 0.5302844047546387, + "kl": 0.4287109375, + "learning_rate": 1.9946734711969447e-05, + "loss": 0.0172, + "reward": 0.5139509290456772, + "reward_std": 0.14837919920682907, + "rewards/accuracy_reward": 0.04910714481957257, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4648437723517418, + "step": 434 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.9576263427734, + "epoch": 0.1299380180718393, + "grad_norm": 2.593717575073242, + "kl": 0.61865234375, + "learning_rate": 1.994565419150564e-05, + "loss": 0.0316, + "reward": 0.6322544813156128, + "reward_std": 0.11504254397004843, + "rewards/accuracy_reward": 0.1696428656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4626116305589676, + "step": 435 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.8772583007812, + "epoch": 0.13023672615936077, + "grad_norm": 0.3364025950431824, + "kl": 0.36767578125, + "learning_rate": 1.9944562851144846e-05, + "loss": 0.0152, + "reward": 0.5172991305589676, + "reward_std": 0.08578462339937687, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4771205633878708, + "step": 436 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.1384124755859, + "epoch": 0.13053543424688224, + "grad_norm": 0.3741532862186432, + "kl": 0.39208984375, + "learning_rate": 1.9943460692074345e-05, + "loss": 0.0067, + "reward": 0.5239955633878708, + "reward_std": 0.10874446295201778, + "rewards/accuracy_reward": 0.05133928777649999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4726562723517418, + "step": 437 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.9888916015625, + "epoch": 0.1308341423344037, + "grad_norm": 202.39749145507812, + "kl": 9.9853515625, + "learning_rate": 1.994234771549317e-05, + "loss": 0.4081, + "reward": 0.558593787252903, + "reward_std": 0.13683762960135937, + "rewards/accuracy_reward": 0.13392857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.424665205180645, + "step": 438 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.6473388671875, + "epoch": 0.13113285042192518, + "grad_norm": 12.58808708190918, + "kl": 1.99609375, + "learning_rate": 1.9941223922612143e-05, + "loss": 0.0784, + "reward": 0.3264508992433548, + "reward_std": 0.06889492366462946, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2907366156578064, + "step": 439 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.0870666503906, + "epoch": 0.13143155850944666, + "grad_norm": 4.777479648590088, + "kl": 1.18359375, + "learning_rate": 1.9940089314653826e-05, + "loss": 0.045, + "reward": 0.5251116380095482, + "reward_std": 0.11672520823776722, + "rewards/accuracy_reward": 0.1294642873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.395647332072258, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.13173026659696813, + "grad_norm": 1.5939418077468872, + "kl": 0.42333984375, + "learning_rate": 1.9938943892852575e-05, + "loss": 0.0169, + "reward": 0.549665205180645, + "reward_std": 0.07421096600592136, + "rewards/accuracy_reward": 0.06473214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 441 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.8504638671875, + "epoch": 0.13202897468448957, + "grad_norm": 1.4882818460464478, + "kl": 0.47412109375, + "learning_rate": 1.9937787658454484e-05, + "loss": 0.019, + "reward": 0.5680803805589676, + "reward_std": 0.07694578263908625, + "rewards/accuracy_reward": 0.08482143003493547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589402794838, + "step": 442 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.4776916503906, + "epoch": 0.13232768277201104, + "grad_norm": 1.653355598449707, + "kl": 0.5791015625, + "learning_rate": 1.993662061271743e-05, + "loss": 0.0236, + "reward": 0.5597098395228386, + "reward_std": 0.084144726395607, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4748884066939354, + "step": 443 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.6473388671875, + "epoch": 0.1326263908595325, + "grad_norm": 2.599005937576294, + "kl": 0.47412109375, + "learning_rate": 1.9935442756911044e-05, + "loss": 0.019, + "reward": 0.5764509290456772, + "reward_std": 0.12123155407607555, + "rewards/accuracy_reward": 0.09375000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482700914144516, + "step": 444 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.3593902587891, + "epoch": 0.13292509894705398, + "grad_norm": 2.2561964988708496, + "kl": 0.3955078125, + "learning_rate": 1.9934254092316716e-05, + "loss": 0.0167, + "reward": 0.6210937798023224, + "reward_std": 0.06050761789083481, + "rewards/accuracy_reward": 0.12946429220028222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 445 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.7053833007812, + "epoch": 0.13322380703457545, + "grad_norm": 2.1112403869628906, + "kl": 0.37646484375, + "learning_rate": 1.9933054620227595e-05, + "loss": 0.0142, + "reward": 0.584263414144516, + "reward_std": 0.05580357555299997, + "rewards/accuracy_reward": 0.08928571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 446 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.7790374755859, + "epoch": 0.13352251512209692, + "grad_norm": 2.207418203353882, + "kl": 0.513671875, + "learning_rate": 1.9931844341948595e-05, + "loss": 0.0151, + "reward": 0.6612723618745804, + "reward_std": 0.07864931970834732, + "rewards/accuracy_reward": 0.1674107238650322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 447 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.0312652587891, + "epoch": 0.1338212232096184, + "grad_norm": 4.981668472290039, + "kl": 0.7822265625, + "learning_rate": 1.9930623258796373e-05, + "loss": 0.0315, + "reward": 0.508928582072258, + "reward_std": 0.03552421159110963, + "rewards/accuracy_reward": 0.01116071455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 448 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2745819091797, + "epoch": 0.13411993129713987, + "grad_norm": 14.117424011230469, + "kl": 1.314453125, + "learning_rate": 1.9929391372099352e-05, + "loss": 0.0524, + "reward": 0.6065848469734192, + "reward_std": 0.09382713073864579, + "rewards/accuracy_reward": 0.11160714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 449 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.2745819091797, + "epoch": 0.13441863938466134, + "grad_norm": 0.3671143054962158, + "kl": 0.2548828125, + "learning_rate": 1.9928148683197705e-05, + "loss": 0.0101, + "reward": 0.5513393133878708, + "reward_std": 0.05118321720510721, + "rewards/accuracy_reward": 0.05133928684517741, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 450 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8973388671875, + "epoch": 0.1347173474721828, + "grad_norm": 0.12445668876171112, + "kl": 0.245361328125, + "learning_rate": 1.9926895193443352e-05, + "loss": 0.0098, + "reward": 0.5703125298023224, + "reward_std": 0.06955798622220755, + "rewards/accuracy_reward": 0.07142857671715319, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 451 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.7812805175781, + "epoch": 0.13501605555970428, + "grad_norm": 0.11900437623262405, + "kl": 0.244140625, + "learning_rate": 1.992563090419997e-05, + "loss": 0.0102, + "reward": 0.5731026977300644, + "reward_std": 0.09974703565239906, + "rewards/accuracy_reward": 0.07366071734577417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 452 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.5290374755859, + "epoch": 0.13531476364722575, + "grad_norm": 0.15876485407352448, + "kl": 0.25, + "learning_rate": 1.992435581684298e-05, + "loss": 0.0082, + "reward": 0.642857164144516, + "reward_std": 0.09514959272928536, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 453 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.185302734375, + "epoch": 0.13561347173474722, + "grad_norm": 0.4164274036884308, + "kl": 0.254638671875, + "learning_rate": 1.9923069932759554e-05, + "loss": 0.0097, + "reward": 0.6143973469734192, + "reward_std": 0.07487392518669367, + "rewards/accuracy_reward": 0.1183035783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 454 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.5312652587891, + "epoch": 0.1359121798222687, + "grad_norm": 0.13591839373111725, + "kl": 0.243896484375, + "learning_rate": 1.9921773253348604e-05, + "loss": 0.0107, + "reward": 0.6875000447034836, + "reward_std": 0.07204253133386374, + "rewards/accuracy_reward": 0.1897321455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 455 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.3549346923828, + "epoch": 0.13621088790979016, + "grad_norm": 0.10595729202032089, + "kl": 0.244873046875, + "learning_rate": 1.9920465780020794e-05, + "loss": 0.0088, + "reward": 0.5396205633878708, + "reward_std": 0.08756340947002172, + "rewards/accuracy_reward": 0.04017857392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 456 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.7835083007812, + "epoch": 0.13650959599731163, + "grad_norm": 0.08758360892534256, + "kl": 0.23388671875, + "learning_rate": 1.9919147514198526e-05, + "loss": 0.0093, + "reward": 0.5172991305589676, + "reward_std": 0.04888306581415236, + "rewards/accuracy_reward": 0.017857143888249993, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 457 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.1116180419922, + "epoch": 0.1368083040848331, + "grad_norm": 0.09097584336996078, + "kl": 0.239990234375, + "learning_rate": 1.991781845731594e-05, + "loss": 0.0096, + "reward": 0.6651786118745804, + "reward_std": 0.06577018275856972, + "rewards/accuracy_reward": 0.165178582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 458 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.9687805175781, + "epoch": 0.13710701217235458, + "grad_norm": 0.09509167820215225, + "kl": 0.24267578125, + "learning_rate": 1.991647861081893e-05, + "loss": 0.0097, + "reward": 0.5546875149011612, + "reward_std": 0.062743806745857, + "rewards/accuracy_reward": 0.05580357206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 459 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.3214416503906, + "epoch": 0.13740572025987605, + "grad_norm": 0.128593310713768, + "kl": 0.240966796875, + "learning_rate": 1.9915127976165104e-05, + "loss": 0.0098, + "reward": 0.5993303805589676, + "reward_std": 0.09419977408833802, + "rewards/accuracy_reward": 0.10267857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.5335083007812, + "epoch": 0.13770442834739752, + "grad_norm": 0.14396043121814728, + "kl": 0.229248046875, + "learning_rate": 1.991376655482383e-05, + "loss": 0.0091, + "reward": 0.6969866454601288, + "reward_std": 0.06574545241892338, + "rewards/accuracy_reward": 0.2031250132713467, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 461 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.560302734375, + "epoch": 0.138003136434919, + "grad_norm": 0.14582453668117523, + "kl": 0.22705078125, + "learning_rate": 1.9912394348276197e-05, + "loss": 0.0088, + "reward": 0.5948660969734192, + "reward_std": 0.08445283211767673, + "rewards/accuracy_reward": 0.10044643376022577, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 462 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.4687805175781, + "epoch": 0.13830184452244043, + "grad_norm": 0.5493846535682678, + "kl": 0.232177734375, + "learning_rate": 1.9911011358015033e-05, + "loss": 0.0092, + "reward": 0.5870535969734192, + "reward_std": 0.0672232136130333, + "rewards/accuracy_reward": 0.08928571594879031, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 463 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.5245971679688, + "epoch": 0.1386005526099619, + "grad_norm": 0.6799615621566772, + "kl": 0.240478515625, + "learning_rate": 1.9909617585544894e-05, + "loss": 0.0108, + "reward": 0.6160714626312256, + "reward_std": 0.1665780022740364, + "rewards/accuracy_reward": 0.12276786239817739, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 464 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.13889926069748337, + "grad_norm": 0.7918491363525391, + "kl": 0.30224609375, + "learning_rate": 1.9908213032382072e-05, + "loss": 0.0121, + "reward": 0.6473214477300644, + "reward_std": 0.10972919873893261, + "rewards/accuracy_reward": 0.1562500111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 465 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.8102874755859, + "epoch": 0.13919796878500484, + "grad_norm": 4.436590194702148, + "kl": 0.44970703125, + "learning_rate": 1.9906797700054576e-05, + "loss": 0.0162, + "reward": 0.581473246216774, + "reward_std": 0.10535325482487679, + "rewards/accuracy_reward": 0.09821428963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.483258955180645, + "step": 466 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.7857208251953, + "epoch": 0.13949667687252632, + "grad_norm": 6.669743537902832, + "kl": 0.8076171875, + "learning_rate": 1.9905371590102157e-05, + "loss": 0.0284, + "reward": 0.5301339477300644, + "reward_std": 0.14502671919763088, + "rewards/accuracy_reward": 0.07366071827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4564732313156128, + "step": 467 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.4286041259766, + "epoch": 0.13979538496004779, + "grad_norm": 13.068184852600098, + "kl": 1.3154296875, + "learning_rate": 1.9903934704076273e-05, + "loss": 0.0515, + "reward": 0.4587053656578064, + "reward_std": 0.13309713080525398, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4140625149011612, + "step": 468 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.810302734375, + "epoch": 0.14009409304756926, + "grad_norm": 8.279438018798828, + "kl": 1.0546875, + "learning_rate": 1.9902487043540125e-05, + "loss": 0.0397, + "reward": 0.3956473395228386, + "reward_std": 0.15930379554629326, + "rewards/accuracy_reward": 0.017857144121080637, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3777901977300644, + "step": 469 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.0290679931641, + "epoch": 0.14039280113509073, + "grad_norm": 5.743220806121826, + "kl": 1.037109375, + "learning_rate": 1.990102861006862e-05, + "loss": 0.0356, + "reward": 0.467633955180645, + "reward_std": 0.1927950605750084, + "rewards/accuracy_reward": 0.08705357578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.380580373108387, + "step": 470 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.8817291259766, + "epoch": 0.1406915092226122, + "grad_norm": 4.383653163909912, + "kl": 0.947265625, + "learning_rate": 1.989955940524839e-05, + "loss": 0.0327, + "reward": 0.462611623108387, + "reward_std": 0.17276423424482346, + "rewards/accuracy_reward": 0.06919643399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3934151902794838, + "step": 471 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.6495971679688, + "epoch": 0.14099021731013367, + "grad_norm": 4.387705326080322, + "kl": 0.94921875, + "learning_rate": 1.9898079430677796e-05, + "loss": 0.0371, + "reward": 0.6188616454601288, + "reward_std": 0.19385996460914612, + "rewards/accuracy_reward": 0.18526786658912897, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4335937723517418, + "step": 472 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.8705444335938, + "epoch": 0.14128892539765514, + "grad_norm": 2.122002124786377, + "kl": 0.943359375, + "learning_rate": 1.989658868796689e-05, + "loss": 0.0363, + "reward": 0.4547991305589676, + "reward_std": 0.09575528651475906, + "rewards/accuracy_reward": 0.0022321429569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4525669887661934, + "step": 473 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.7210235595703, + "epoch": 0.1415876334851766, + "grad_norm": 3.873112440109253, + "kl": 0.859375, + "learning_rate": 1.9895087178737467e-05, + "loss": 0.0301, + "reward": 0.5691964477300644, + "reward_std": 0.16529704630374908, + "rewards/accuracy_reward": 0.11607143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4531250298023224, + "step": 474 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.6585235595703, + "epoch": 0.14188634157269808, + "grad_norm": 5.262368679046631, + "kl": 0.3603515625, + "learning_rate": 1.9893574904623013e-05, + "loss": 0.0135, + "reward": 0.4938616305589676, + "reward_std": 0.16050956025719643, + "rewards/accuracy_reward": 0.03794643050059676, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4559151977300644, + "step": 475 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.7366180419922, + "epoch": 0.14218504966021955, + "grad_norm": 3.013078212738037, + "kl": 0.233154296875, + "learning_rate": 1.989205186726874e-05, + "loss": 0.0095, + "reward": 0.4363839477300644, + "reward_std": 0.1580851413309574, + "rewards/accuracy_reward": 0.058035716181620955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3783482313156128, + "step": 476 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.4040374755859, + "epoch": 0.14248375774774102, + "grad_norm": 1.9902845621109009, + "kl": 0.2177734375, + "learning_rate": 1.9890518068331555e-05, + "loss": 0.0084, + "reward": 0.5306919813156128, + "reward_std": 0.19064449705183506, + "rewards/accuracy_reward": 0.14955357741564512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.381138414144516, + "step": 477 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.8214569091797, + "epoch": 0.1427824658352625, + "grad_norm": 5.537568092346191, + "kl": 0.27734375, + "learning_rate": 1.988897350948009e-05, + "loss": 0.0128, + "reward": 0.5546875298023224, + "reward_std": 0.1742106582969427, + "rewards/accuracy_reward": 0.12276786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4319196566939354, + "step": 478 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.8661041259766, + "epoch": 0.14308117392278397, + "grad_norm": 2.724141836166382, + "kl": 0.4833984375, + "learning_rate": 1.988741819239467e-05, + "loss": 0.0211, + "reward": 0.4827009215950966, + "reward_std": 0.11933119595050812, + "rewards/accuracy_reward": 0.0223214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4603794887661934, + "step": 479 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.8571624755859, + "epoch": 0.14337988201030544, + "grad_norm": 7.4187235832214355, + "kl": 0.8408203125, + "learning_rate": 1.988585211876733e-05, + "loss": 0.0347, + "reward": 0.5982143059372902, + "reward_std": 0.11874869931489229, + "rewards/accuracy_reward": 0.11830357648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4799107387661934, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.2254791259766, + "epoch": 0.1436785900978269, + "grad_norm": 11.932671546936035, + "kl": 1.1337890625, + "learning_rate": 1.98842752903018e-05, + "loss": 0.0456, + "reward": 0.5820312723517418, + "reward_std": 0.06238198606297374, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 481 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.6339721679688, + "epoch": 0.14397729818534838, + "grad_norm": 3.383366346359253, + "kl": 0.44873046875, + "learning_rate": 1.9882687708713514e-05, + "loss": 0.0172, + "reward": 0.6746652126312256, + "reward_std": 0.06887718522921205, + "rewards/accuracy_reward": 0.1763392947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 482 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.5536041259766, + "epoch": 0.14427600627286985, + "grad_norm": 0.46016547083854675, + "kl": 0.270263671875, + "learning_rate": 1.9881089375729614e-05, + "loss": 0.0126, + "reward": 0.6060267984867096, + "reward_std": 0.09271344100125134, + "rewards/accuracy_reward": 0.10937500977888703, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 483 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.7187957763672, + "epoch": 0.14457471436039132, + "grad_norm": 2.359785795211792, + "kl": 0.267578125, + "learning_rate": 1.987948029308892e-05, + "loss": 0.0129, + "reward": 0.6729910969734192, + "reward_std": 0.1366348061710596, + "rewards/accuracy_reward": 0.17633929289877415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 484 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.200927734375, + "epoch": 0.14487342244791276, + "grad_norm": 0.26119938492774963, + "kl": 0.28564453125, + "learning_rate": 1.9877860462541964e-05, + "loss": 0.0117, + "reward": 0.5837053954601288, + "reward_std": 0.09220228902995586, + "rewards/accuracy_reward": 0.08705357764847577, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.496651791036129, + "step": 485 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.4107513427734, + "epoch": 0.14517213053543424, + "grad_norm": 0.6320672631263733, + "kl": 0.240478515625, + "learning_rate": 1.9876229885850957e-05, + "loss": 0.0096, + "reward": 0.5117187798023224, + "reward_std": 0.047481851652264595, + "rewards/accuracy_reward": 0.015625000465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 486 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.7924194335938, + "epoch": 0.1454708386229557, + "grad_norm": 0.5399042963981628, + "kl": 0.283203125, + "learning_rate": 1.987458856478981e-05, + "loss": 0.0113, + "reward": 0.5797991305589676, + "reward_std": 0.08104685321450233, + "rewards/accuracy_reward": 0.08258928963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 487 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.3839569091797, + "epoch": 0.14576954671047718, + "grad_norm": 0.7278759479522705, + "kl": 0.298583984375, + "learning_rate": 1.987293650114412e-05, + "loss": 0.0157, + "reward": 0.5630580708384514, + "reward_std": 0.0663375873118639, + "rewards/accuracy_reward": 0.07142857206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 488 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8951416015625, + "epoch": 0.14606825479799865, + "grad_norm": 0.23716634511947632, + "kl": 0.23828125, + "learning_rate": 1.9871273696711166e-05, + "loss": 0.0107, + "reward": 0.5474330633878708, + "reward_std": 0.0816669873893261, + "rewards/accuracy_reward": 0.053571430034935474, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 489 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.8169860839844, + "epoch": 0.14636696288552012, + "grad_norm": 0.645960807800293, + "kl": 0.226318359375, + "learning_rate": 1.9869600153299916e-05, + "loss": 0.0092, + "reward": 0.5172991305589676, + "reward_std": 0.07000255631282926, + "rewards/accuracy_reward": 0.02455357206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 490 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.8102722167969, + "epoch": 0.1466656709730416, + "grad_norm": 0.8166766166687012, + "kl": 0.233154296875, + "learning_rate": 1.986791587273103e-05, + "loss": 0.0093, + "reward": 0.5898437723517418, + "reward_std": 0.04069845820777118, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 491 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.0714721679688, + "epoch": 0.14696437906056306, + "grad_norm": 1.1997902393341064, + "kl": 0.27001953125, + "learning_rate": 1.986622085683683e-05, + "loss": 0.0108, + "reward": 0.6311384290456772, + "reward_std": 0.11818994674831629, + "rewards/accuracy_reward": 0.15401786752045155, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4771205633878708, + "step": 492 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.8058319091797, + "epoch": 0.14726308714808453, + "grad_norm": 2.2412917613983154, + "kl": 0.291015625, + "learning_rate": 1.9864515107461332e-05, + "loss": 0.0121, + "reward": 0.5435267984867096, + "reward_std": 0.13339055515825748, + "rewards/accuracy_reward": 0.10267857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4408482387661934, + "step": 493 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.8772735595703, + "epoch": 0.147561795235606, + "grad_norm": 0.8709274530410767, + "kl": 0.4384765625, + "learning_rate": 1.9862798626460225e-05, + "loss": 0.0219, + "reward": 0.4854910969734192, + "reward_std": 0.15781499072909355, + "rewards/accuracy_reward": 0.09598214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3895089477300644, + "step": 494 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.5803985595703, + "epoch": 0.14786050332312747, + "grad_norm": 7.8553338050842285, + "kl": 0.689453125, + "learning_rate": 1.9861071415700866e-05, + "loss": 0.0343, + "reward": 0.381138414144516, + "reward_std": 0.189052052795887, + "rewards/accuracy_reward": 0.03348214412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3476562649011612, + "step": 495 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.591552734375, + "epoch": 0.14815921141064894, + "grad_norm": 8.384472846984863, + "kl": 0.8203125, + "learning_rate": 1.98593334770623e-05, + "loss": 0.0429, + "reward": 0.405133955180645, + "reward_std": 0.150318905711174, + "rewards/accuracy_reward": 0.051339289639145136, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.353794664144516, + "step": 496 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.8214721679688, + "epoch": 0.14845791949817042, + "grad_norm": 3.8744919300079346, + "kl": 0.56982421875, + "learning_rate": 1.985758481243523e-05, + "loss": 0.0403, + "reward": 0.5284598469734192, + "reward_std": 0.14430204592645168, + "rewards/accuracy_reward": 0.15625000675208867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3722098395228386, + "step": 497 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.0915679931641, + "epoch": 0.1487566275856919, + "grad_norm": 0.5076402425765991, + "kl": 0.3154296875, + "learning_rate": 1.9855825423722027e-05, + "loss": 0.0273, + "reward": 0.4503348395228386, + "reward_std": 0.15386656112968922, + "rewards/accuracy_reward": 0.022321430034935474, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4280134215950966, + "step": 498 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.8884429931641, + "epoch": 0.14905533567321336, + "grad_norm": 0.5749093294143677, + "kl": 0.2578125, + "learning_rate": 1.9854055312836742e-05, + "loss": 0.0281, + "reward": 0.513392873108387, + "reward_std": 0.12214204296469688, + "rewards/accuracy_reward": 0.05357143213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4598214477300644, + "step": 499 + }, + { + "clip_ratio": 0.0, + "completion_length": 923.0089721679688, + "epoch": 0.14935404376073483, + "grad_norm": 5.581171035766602, + "kl": 0.35546875, + "learning_rate": 1.9852274481705078e-05, + "loss": 0.0209, + "reward": 0.5485491156578064, + "reward_std": 0.07874064007773995, + "rewards/accuracy_reward": 0.053571431431919336, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 500 + }, + { + "clip_ratio": 0.0, + "completion_length": 875.5379791259766, + "epoch": 0.1496527518482563, + "grad_norm": 0.17678576707839966, + "kl": 0.27685546875, + "learning_rate": 1.98504829322644e-05, + "loss": 0.0091, + "reward": 0.5731026977300644, + "reward_std": 0.05869755707681179, + "rewards/accuracy_reward": 0.07366071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 501 + }, + { + "clip_ratio": 0.0, + "completion_length": 860.6562957763672, + "epoch": 0.14995145993577777, + "grad_norm": 0.2671925723552704, + "kl": 0.2861328125, + "learning_rate": 1.9848680666463748e-05, + "loss": 0.0141, + "reward": 0.5234375298023224, + "reward_std": 0.052919947542250156, + "rewards/accuracy_reward": 0.024553572991862893, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 502 + }, + { + "clip_ratio": 0.0, + "completion_length": 883.419677734375, + "epoch": 0.15025016802329924, + "grad_norm": 0.13520494103431702, + "kl": 0.2470703125, + "learning_rate": 1.98468676862638e-05, + "loss": 0.0095, + "reward": 0.6004464477300644, + "reward_std": 0.05382233951240778, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 503 + }, + { + "clip_ratio": 0.0, + "completion_length": 888.1049499511719, + "epoch": 0.1505488761108207, + "grad_norm": 0.2891036570072174, + "kl": 0.248779296875, + "learning_rate": 1.984504399363691e-05, + "loss": 0.0059, + "reward": 0.566964328289032, + "reward_std": 0.06639490090310574, + "rewards/accuracy_reward": 0.06696429057046771, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 504 + }, + { + "clip_ratio": 0.0, + "completion_length": 861.1027221679688, + "epoch": 0.15084758419834218, + "grad_norm": 0.18862655758857727, + "kl": 0.25146484375, + "learning_rate": 1.9843209590567073e-05, + "loss": 0.008, + "reward": 0.6316964626312256, + "reward_std": 0.10716621484607458, + "rewards/accuracy_reward": 0.131696434924379, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 505 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.7366638183594, + "epoch": 0.15114629228586363, + "grad_norm": 0.08895101398229599, + "kl": 0.224609375, + "learning_rate": 1.9841364479049937e-05, + "loss": 0.0099, + "reward": 0.5641741156578064, + "reward_std": 0.027132629416882992, + "rewards/accuracy_reward": 0.06473214295692742, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 506 + }, + { + "clip_ratio": 0.0, + "completion_length": 940.4353332519531, + "epoch": 0.1514450003733851, + "grad_norm": 1.140365719795227, + "kl": 0.272705078125, + "learning_rate": 1.983950866109281e-05, + "loss": 0.0093, + "reward": 0.5982143133878708, + "reward_std": 0.0760837011039257, + "rewards/accuracy_reward": 0.09821428591385484, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 507 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.0491485595703, + "epoch": 0.15174370846090657, + "grad_norm": 0.1314384639263153, + "kl": 0.2294921875, + "learning_rate": 1.983764213871463e-05, + "loss": 0.0021, + "reward": 0.7092634290456772, + "reward_std": 0.06611709576100111, + "rewards/accuracy_reward": 0.20982144260779023, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 508 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.9732513427734, + "epoch": 0.15204241654842804, + "grad_norm": 1.2272250652313232, + "kl": 0.27294921875, + "learning_rate": 1.9835764913945998e-05, + "loss": 0.0249, + "reward": 0.5691964626312256, + "reward_std": 0.07425143383443356, + "rewards/accuracy_reward": 0.06919643050059676, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 509 + }, + { + "clip_ratio": 0.0, + "completion_length": 914.2411041259766, + "epoch": 0.1523411246359495, + "grad_norm": 1.6109284162521362, + "kl": 0.29638671875, + "learning_rate": 1.9833876988829147e-05, + "loss": 0.0126, + "reward": 0.5267857313156128, + "reward_std": 0.06935327220708132, + "rewards/accuracy_reward": 0.026785715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 510 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.7478179931641, + "epoch": 0.15263983272347098, + "grad_norm": 3.5408949851989746, + "kl": 0.7705078125, + "learning_rate": 1.9831978365417958e-05, + "loss": 0.0376, + "reward": 0.685825914144516, + "reward_std": 0.0712634208612144, + "rewards/accuracy_reward": 0.1941964365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491629496216774, + "step": 511 + }, + { + "clip_ratio": 0.0, + "completion_length": 933.5446929931641, + "epoch": 0.15293854081099245, + "grad_norm": 13.050299644470215, + "kl": 1.71875, + "learning_rate": 1.9830069045777943e-05, + "loss": 0.0785, + "reward": 0.6545759290456772, + "reward_std": 0.11417525075376034, + "rewards/accuracy_reward": 0.1674107201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 512 + }, + { + "clip_ratio": 0.0, + "completion_length": 928.1920166015625, + "epoch": 0.15323724889851392, + "grad_norm": 1.3568668365478516, + "kl": 0.28271484375, + "learning_rate": 1.9828149031986256e-05, + "loss": -0.0078, + "reward": 0.5318080559372902, + "reward_std": 0.030652948655188084, + "rewards/accuracy_reward": 0.0379464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 513 + }, + { + "clip_ratio": 0.0, + "completion_length": 903.7857666015625, + "epoch": 0.1535359569860354, + "grad_norm": 4.914193630218506, + "kl": 0.31787109375, + "learning_rate": 1.982621832613169e-05, + "loss": 0.0314, + "reward": 0.5636161118745804, + "reward_std": 0.10569724440574646, + "rewards/accuracy_reward": 0.06919643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 514 + }, + { + "clip_ratio": 0.0, + "completion_length": 899.2768249511719, + "epoch": 0.15383466507355686, + "grad_norm": 0.1051035225391388, + "kl": 0.253173828125, + "learning_rate": 1.982427693031465e-05, + "loss": 0.0054, + "reward": 0.5329241305589676, + "reward_std": 0.06302161235362291, + "rewards/accuracy_reward": 0.03348214412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 515 + }, + { + "clip_ratio": 0.0, + "completion_length": 916.2299499511719, + "epoch": 0.15413337316107834, + "grad_norm": 2.2747879028320312, + "kl": 0.2900390625, + "learning_rate": 1.9822324846647195e-05, + "loss": 0.0151, + "reward": 0.5189732313156128, + "reward_std": 0.05121962702833116, + "rewards/accuracy_reward": 0.022321429569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 516 + }, + { + "clip_ratio": 0.0, + "completion_length": 912.9598541259766, + "epoch": 0.1544320812485998, + "grad_norm": 1.0016824007034302, + "kl": 0.2998046875, + "learning_rate": 1.9820362077253e-05, + "loss": 0.01, + "reward": 0.6250000298023224, + "reward_std": 0.10013723745942116, + "rewards/accuracy_reward": 0.12946429196745157, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357238650322, + "step": 517 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.0379791259766, + "epoch": 0.15473078933612128, + "grad_norm": 1.0009006261825562, + "kl": 0.30029296875, + "learning_rate": 1.9818388624267362e-05, + "loss": 0.0166, + "reward": 0.5731027275323868, + "reward_std": 0.10540123376995325, + "rewards/accuracy_reward": 0.08482143329456449, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 518 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.5223541259766, + "epoch": 0.15502949742364275, + "grad_norm": 0.42472487688064575, + "kl": 0.279296875, + "learning_rate": 1.9816404489837205e-05, + "loss": 0.0269, + "reward": 0.5848214626312256, + "reward_std": 0.1450909674167633, + "rewards/accuracy_reward": 0.1026785746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4821428805589676, + "step": 519 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.7678985595703, + "epoch": 0.15532820551116422, + "grad_norm": 0.3777104318141937, + "kl": 0.287109375, + "learning_rate": 1.981440967612108e-05, + "loss": 0.0194, + "reward": 0.5463169813156128, + "reward_std": 0.10423572920262814, + "rewards/accuracy_reward": 0.06696428777649999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4793526977300644, + "step": 520 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.8303833007812, + "epoch": 0.1556269135986857, + "grad_norm": 0.8031876087188721, + "kl": 0.3046875, + "learning_rate": 1.981240418528914e-05, + "loss": 0.023, + "reward": 0.4681919887661934, + "reward_std": 0.12322927638888359, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4280134215950966, + "step": 521 + }, + { + "clip_ratio": 0.0, + "completion_length": 993.0402221679688, + "epoch": 0.15592562168620716, + "grad_norm": 0.42932406067848206, + "kl": 0.32861328125, + "learning_rate": 1.981038801952316e-05, + "loss": 0.0289, + "reward": 0.553013414144516, + "reward_std": 0.13807966373860836, + "rewards/accuracy_reward": 0.08928571455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4637276977300644, + "step": 522 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.5491333007812, + "epoch": 0.15622432977372863, + "grad_norm": 0.4340602159500122, + "kl": 0.341796875, + "learning_rate": 1.9808361181016543e-05, + "loss": 0.0242, + "reward": 0.568638414144516, + "reward_std": 0.14294905215501785, + "rewards/accuracy_reward": 0.1049107201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4637276902794838, + "step": 523 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.2299499511719, + "epoch": 0.1565230378612501, + "grad_norm": 1.7642405033111572, + "kl": 0.3154296875, + "learning_rate": 1.980632367197428e-05, + "loss": 0.0244, + "reward": 0.5636160969734192, + "reward_std": 0.12394430674612522, + "rewards/accuracy_reward": 0.09598214854486287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4676339402794838, + "step": 524 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.4241638183594, + "epoch": 0.15682174594877157, + "grad_norm": 17.119173049926758, + "kl": 0.43115234375, + "learning_rate": 1.9804275494612988e-05, + "loss": 0.0182, + "reward": 0.5926339700818062, + "reward_std": 0.09344205167144537, + "rewards/accuracy_reward": 0.10714286309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 525 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.4799346923828, + "epoch": 0.15712045403629304, + "grad_norm": 1002.1270751953125, + "kl": 27.875, + "learning_rate": 1.980221665116088e-05, + "loss": 1.1267, + "reward": 0.5401785969734192, + "reward_std": 0.08122059889137745, + "rewards/accuracy_reward": 0.05133928661234677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393133878708, + "step": 526 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.1183471679688, + "epoch": 0.15741916212381452, + "grad_norm": 716.1602783203125, + "kl": 21.1044921875, + "learning_rate": 1.9800147143857774e-05, + "loss": 0.8446, + "reward": 0.5518973395228386, + "reward_std": 0.060744138434529305, + "rewards/accuracy_reward": 0.0580357164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 527 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.5335388183594, + "epoch": 0.15771787021133596, + "grad_norm": 17.282676696777344, + "kl": 1.337890625, + "learning_rate": 1.979806697495509e-05, + "loss": 0.0536, + "reward": 0.631696455180645, + "reward_std": 0.08988063503056765, + "rewards/accuracy_reward": 0.1473214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750298023224, + "step": 528 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.6495971679688, + "epoch": 0.15801657829885743, + "grad_norm": 1.3293757438659668, + "kl": 0.35546875, + "learning_rate": 1.979597614671586e-05, + "loss": 0.0139, + "reward": 0.5301339477300644, + "reward_std": 0.1450254898518324, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4520089477300644, + "step": 529 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.3571929931641, + "epoch": 0.1583152863863789, + "grad_norm": 3.2705302238464355, + "kl": 0.34130859375, + "learning_rate": 1.9793874661414682e-05, + "loss": 0.0143, + "reward": 0.5128348469734192, + "reward_std": 0.12646752037107944, + "rewards/accuracy_reward": 0.0625000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4503348469734192, + "step": 530 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.0156555175781, + "epoch": 0.15861399447390037, + "grad_norm": 10.322676658630371, + "kl": 0.3173828125, + "learning_rate": 1.979176252133778e-05, + "loss": 0.0161, + "reward": 0.5122768059372902, + "reward_std": 0.12831770814955235, + "rewards/accuracy_reward": 0.05580357206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4564732387661934, + "step": 531 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.7678833007812, + "epoch": 0.15891270256142184, + "grad_norm": 6369.7021484375, + "kl": 40.1796875, + "learning_rate": 1.978963972878295e-05, + "loss": 1.6092, + "reward": 0.5117187723517418, + "reward_std": 0.0828961618244648, + "rewards/accuracy_reward": 0.04017857206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.471540205180645, + "step": 532 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.7366333007812, + "epoch": 0.1592114106489433, + "grad_norm": 95.72207641601562, + "kl": 2.169921875, + "learning_rate": 1.9787506286059584e-05, + "loss": 0.0882, + "reward": 0.6132812649011612, + "reward_std": 0.11036421917378902, + "rewards/accuracy_reward": 0.13169643771834671, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848395228386, + "step": 533 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.2143249511719, + "epoch": 0.15951011873646478, + "grad_norm": 5.309536457061768, + "kl": 0.3310546875, + "learning_rate": 1.9785362195488656e-05, + "loss": 0.0144, + "reward": 0.5279018059372902, + "reward_std": 0.07026119017973542, + "rewards/accuracy_reward": 0.044642857974395156, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589477300644, + "step": 534 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.6228179931641, + "epoch": 0.15980882682398626, + "grad_norm": 13.140158653259277, + "kl": 1.166015625, + "learning_rate": 1.978320745940273e-05, + "loss": 0.0478, + "reward": 0.517299123108387, + "reward_std": 0.08000995917245746, + "rewards/accuracy_reward": 0.026785715715959668, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 535 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.6986999511719, + "epoch": 0.16010753491150773, + "grad_norm": 9.988635063171387, + "kl": 0.38916015625, + "learning_rate": 1.978104208014594e-05, + "loss": 0.0184, + "reward": 0.6863839626312256, + "reward_std": 0.10739694349467754, + "rewards/accuracy_reward": 0.20089286426082253, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910969734192, + "step": 536 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.8103179931641, + "epoch": 0.1604062429990292, + "grad_norm": 4.057969093322754, + "kl": 0.70849609375, + "learning_rate": 1.9778866060074014e-05, + "loss": 0.0339, + "reward": 0.6356026977300644, + "reward_std": 0.09937281534075737, + "rewards/accuracy_reward": 0.14062500279396772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 537 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.0156555175781, + "epoch": 0.16070495108655067, + "grad_norm": 1.1508798599243164, + "kl": 0.34716796875, + "learning_rate": 1.977667940155425e-05, + "loss": 0.0141, + "reward": 0.5429687649011612, + "reward_std": 0.03130731591954827, + "rewards/accuracy_reward": 0.04687500186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 538 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.4107666015625, + "epoch": 0.16100365917407214, + "grad_norm": 4.79181432723999, + "kl": 0.41259765625, + "learning_rate": 1.9774482106965512e-05, + "loss": 0.0137, + "reward": 0.6116071790456772, + "reward_std": 0.06312049110420048, + "rewards/accuracy_reward": 0.12053571874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 539 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.9643249511719, + "epoch": 0.1613023672615936, + "grad_norm": 25.906322479248047, + "kl": 1.30908203125, + "learning_rate": 1.9772274178698245e-05, + "loss": 0.0522, + "reward": 0.6238839477300644, + "reward_std": 0.07695346884429455, + "rewards/accuracy_reward": 0.12723214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 540 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.4263763427734, + "epoch": 0.16160107534911508, + "grad_norm": 0.1824873834848404, + "kl": 0.34912109375, + "learning_rate": 1.9770055619154456e-05, + "loss": 0.0187, + "reward": 0.5496651977300644, + "reward_std": 0.05374849610961974, + "rewards/accuracy_reward": 0.053571431431919336, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 541 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.8750457763672, + "epoch": 0.16189978343663655, + "grad_norm": 1.0174477100372314, + "kl": 0.35205078125, + "learning_rate": 1.9767826430747724e-05, + "loss": 0.0147, + "reward": 0.561941996216774, + "reward_std": 0.05428311415016651, + "rewards/accuracy_reward": 0.06250000488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 542 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.7879791259766, + "epoch": 0.16219849152415802, + "grad_norm": 0.26182296872138977, + "kl": 0.34716796875, + "learning_rate": 1.9765586615903183e-05, + "loss": 0.0134, + "reward": 0.5758928656578064, + "reward_std": 0.04935094341635704, + "rewards/accuracy_reward": 0.0758928582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 543 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.7299499511719, + "epoch": 0.1624971996116795, + "grad_norm": 0.15222543478012085, + "kl": 0.38037109375, + "learning_rate": 1.9763336177057536e-05, + "loss": 0.0138, + "reward": 0.6640625298023224, + "reward_std": 0.09112629410810769, + "rewards/accuracy_reward": 0.1651785783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 544 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.4286193847656, + "epoch": 0.16279590769920096, + "grad_norm": 0.3223598301410675, + "kl": 0.40087890625, + "learning_rate": 1.9761075116659037e-05, + "loss": 0.0164, + "reward": 0.5418527126312256, + "reward_std": 0.023359465412795544, + "rewards/accuracy_reward": 0.042410716181620955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 545 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.9710235595703, + "epoch": 0.16309461578672244, + "grad_norm": 0.4422331750392914, + "kl": 0.38037109375, + "learning_rate": 1.97588034371675e-05, + "loss": 0.0117, + "reward": 0.550223246216774, + "reward_std": 0.07005987223237753, + "rewards/accuracy_reward": 0.05133928777649999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 546 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.6428985595703, + "epoch": 0.1633933238742439, + "grad_norm": 0.16696399450302124, + "kl": 0.341796875, + "learning_rate": 1.9756521141054286e-05, + "loss": 0.0137, + "reward": 0.572544664144516, + "reward_std": 0.07143330201506615, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 547 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8750305175781, + "epoch": 0.16369203196176538, + "grad_norm": 0.5118518471717834, + "kl": 0.345703125, + "learning_rate": 1.9754228230802317e-05, + "loss": 0.0138, + "reward": 0.5390625298023224, + "reward_std": 0.022321429336443543, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 548 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.1406555175781, + "epoch": 0.16399074004928682, + "grad_norm": 0.13661125302314758, + "kl": 0.341796875, + "learning_rate": 1.9751924708906047e-05, + "loss": 0.0125, + "reward": 0.560825914144516, + "reward_std": 0.07849856000393629, + "rewards/accuracy_reward": 0.06250000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 549 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.2678833007812, + "epoch": 0.1642894481368083, + "grad_norm": 0.28241240978240967, + "kl": 0.384765625, + "learning_rate": 1.9749610577871486e-05, + "loss": 0.0171, + "reward": 0.5809151977300644, + "reward_std": 0.04805200663395226, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937574505806, + "step": 550 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.1852874755859, + "epoch": 0.16458815622432976, + "grad_norm": 2.0939292907714844, + "kl": 0.42822265625, + "learning_rate": 1.974728584021618e-05, + "loss": 0.0176, + "reward": 0.6205357238650322, + "reward_std": 0.05586799536831677, + "rewards/accuracy_reward": 0.12723214784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 551 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.4420013427734, + "epoch": 0.16488686431185123, + "grad_norm": 1.1499838829040527, + "kl": 0.59619140625, + "learning_rate": 1.9744950498469218e-05, + "loss": 0.0242, + "reward": 0.584821455180645, + "reward_std": 0.04623043001629412, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 552 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.5893249511719, + "epoch": 0.1651855723993727, + "grad_norm": 1.616790771484375, + "kl": 0.8896484375, + "learning_rate": 1.9742604555171222e-05, + "loss": 0.0388, + "reward": 0.5608259215950966, + "reward_std": 0.08407806046307087, + "rewards/accuracy_reward": 0.0803571455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687723517418, + "step": 553 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.9263763427734, + "epoch": 0.16548428048689418, + "grad_norm": 3.962466239929199, + "kl": 0.81640625, + "learning_rate": 1.9740248012874344e-05, + "loss": 0.0297, + "reward": 0.4966518059372902, + "reward_std": 0.14639180339872837, + "rewards/accuracy_reward": 0.037946430034935474, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.458705373108387, + "step": 554 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.3772430419922, + "epoch": 0.16578298857441565, + "grad_norm": 4.467741012573242, + "kl": 0.9033203125, + "learning_rate": 1.973788087414228e-05, + "loss": 0.0364, + "reward": 0.5915178805589676, + "reward_std": 0.14993767626583576, + "rewards/accuracy_reward": 0.13839286379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4531250149011612, + "step": 555 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.3951416015625, + "epoch": 0.16608169666193712, + "grad_norm": 5.068377494812012, + "kl": 1.95703125, + "learning_rate": 1.9735503141550233e-05, + "loss": 0.0783, + "reward": 0.4988839477300644, + "reward_std": 0.13568499125540257, + "rewards/accuracy_reward": 0.058035716181620955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4408482313156128, + "step": 556 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.9911193847656, + "epoch": 0.1663804047494586, + "grad_norm": 1.6202384233474731, + "kl": 1.8125, + "learning_rate": 1.9733114817684957e-05, + "loss": 0.0738, + "reward": 0.4726562798023224, + "reward_std": 0.12528140470385551, + "rewards/accuracy_reward": 0.04241071501746774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4302455559372902, + "step": 557 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.8370971679688, + "epoch": 0.16667911283698006, + "grad_norm": 14.27087116241455, + "kl": 0.921875, + "learning_rate": 1.9730715905144705e-05, + "loss": 0.0384, + "reward": 0.4681919813156128, + "reward_std": 0.16398174315690994, + "rewards/accuracy_reward": 0.033482145285233855, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4347098469734192, + "step": 558 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.6785888671875, + "epoch": 0.16697782092450153, + "grad_norm": 14.665558815002441, + "kl": 1.0439453125, + "learning_rate": 1.972830640653926e-05, + "loss": 0.0409, + "reward": 0.455357164144516, + "reward_std": 0.12739547155797482, + "rewards/accuracy_reward": 0.015625000465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4397321715950966, + "step": 559 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.9576263427734, + "epoch": 0.167276529012023, + "grad_norm": 2.42655348777771, + "kl": 1.703125, + "learning_rate": 1.972588632448992e-05, + "loss": 0.0696, + "reward": 0.513392873108387, + "reward_std": 0.13281137309968472, + "rewards/accuracy_reward": 0.060267861699685454, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4531250223517418, + "step": 560 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.200927734375, + "epoch": 0.16757523709954447, + "grad_norm": 35.78474807739258, + "kl": 4.48046875, + "learning_rate": 1.97234556616295e-05, + "loss": 0.1838, + "reward": 0.5239955633878708, + "reward_std": 0.14427533373236656, + "rewards/accuracy_reward": 0.06473214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.459263414144516, + "step": 561 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.8661041259766, + "epoch": 0.16787394518706594, + "grad_norm": 38.58867263793945, + "kl": 4.3203125, + "learning_rate": 1.972101442060232e-05, + "loss": 0.1797, + "reward": 0.4910714402794838, + "reward_std": 0.12004372105002403, + "rewards/accuracy_reward": 0.02008928661234677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.470982164144516, + "step": 562 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.8705902099609, + "epoch": 0.1681726532745874, + "grad_norm": 23.633464813232422, + "kl": 3.6328125, + "learning_rate": 1.9718562604064213e-05, + "loss": 0.1486, + "reward": 0.554129496216774, + "reward_std": 0.11371793411672115, + "rewards/accuracy_reward": 0.08705357275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.467075914144516, + "step": 563 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.0000457763672, + "epoch": 0.16847136136210888, + "grad_norm": 1.9356738328933716, + "kl": 2.5234375, + "learning_rate": 1.9716100214682516e-05, + "loss": 0.1145, + "reward": 0.4938616380095482, + "reward_std": 0.08463205955922604, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4581473469734192, + "step": 564 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.3192291259766, + "epoch": 0.16877006944963036, + "grad_norm": 11.137552261352539, + "kl": 1.2294921875, + "learning_rate": 1.9713627255136062e-05, + "loss": 0.0591, + "reward": 0.5524553805589676, + "reward_std": 0.0921557629480958, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4720982387661934, + "step": 565 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.9554138183594, + "epoch": 0.16906877753715183, + "grad_norm": 4.700808525085449, + "kl": 1.216796875, + "learning_rate": 1.9711143728115196e-05, + "loss": 0.0559, + "reward": 0.532924123108387, + "reward_std": 0.10339085105806589, + "rewards/accuracy_reward": 0.058035716880112886, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.474888414144516, + "step": 566 + }, + { + "clip_ratio": 0.0, + "completion_length": 993.7790679931641, + "epoch": 0.1693674856246733, + "grad_norm": 9.361956596374512, + "kl": 2.2197265625, + "learning_rate": 1.9708649636321745e-05, + "loss": 0.0991, + "reward": 0.4827009215950966, + "reward_std": 0.06847202964127064, + "rewards/accuracy_reward": 0.004464285913854837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4782366380095482, + "step": 567 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.3951416015625, + "epoch": 0.16966619371219477, + "grad_norm": 9.251808166503906, + "kl": 1.58984375, + "learning_rate": 1.970614498246904e-05, + "loss": 0.0654, + "reward": 0.5898437947034836, + "reward_std": 0.13683726079761982, + "rewards/accuracy_reward": 0.10267857555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 568 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.4487152099609, + "epoch": 0.16996490179971624, + "grad_norm": 1.058424472808838, + "kl": 0.8408203125, + "learning_rate": 1.97036297692819e-05, + "loss": 0.0389, + "reward": 0.5385044887661934, + "reward_std": 0.0801535602658987, + "rewards/accuracy_reward": 0.04687500116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 569 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.3973693847656, + "epoch": 0.1702636098872377, + "grad_norm": 1.0127662420272827, + "kl": 0.52587890625, + "learning_rate": 1.970110399949663e-05, + "loss": 0.0284, + "reward": 0.632254496216774, + "reward_std": 0.0753267128020525, + "rewards/accuracy_reward": 0.1383928619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 570 + }, + { + "clip_ratio": 0.0, + "completion_length": 971.1138916015625, + "epoch": 0.17056231797475915, + "grad_norm": 0.4965631663799286, + "kl": 0.4716796875, + "learning_rate": 1.9698567675861017e-05, + "loss": 0.0243, + "reward": 0.5195312798023224, + "reward_std": 0.07936059241183102, + "rewards/accuracy_reward": 0.024553572526201606, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 571 + }, + { + "clip_ratio": 0.0, + "completion_length": 973.3259429931641, + "epoch": 0.17086102606228062, + "grad_norm": 0.18551811575889587, + "kl": 0.28759765625, + "learning_rate": 1.9696020801134333e-05, + "loss": 0.0135, + "reward": 0.5691964477300644, + "reward_std": 0.07093437481671572, + "rewards/accuracy_reward": 0.07142857415601611, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 572 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.3103179931641, + "epoch": 0.1711597341498021, + "grad_norm": 0.18553966283798218, + "kl": 0.243896484375, + "learning_rate": 1.969346337808733e-05, + "loss": 0.0113, + "reward": 0.5708705633878708, + "reward_std": 0.07165654189884663, + "rewards/accuracy_reward": 0.07366071734577417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 573 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.4107513427734, + "epoch": 0.17145844223732357, + "grad_norm": 0.24659675359725952, + "kl": 0.2333984375, + "learning_rate": 1.9690895409502237e-05, + "loss": 0.0105, + "reward": 0.544084832072258, + "reward_std": 0.03593921009451151, + "rewards/accuracy_reward": 0.04687500186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 574 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.1518249511719, + "epoch": 0.17175715032484504, + "grad_norm": 0.18105627596378326, + "kl": 0.216796875, + "learning_rate": 1.9688316898172744e-05, + "loss": 0.0114, + "reward": 0.5295759290456772, + "reward_std": 0.0749765601940453, + "rewards/accuracy_reward": 0.03348214412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 575 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.1875457763672, + "epoch": 0.1720558584123665, + "grad_norm": 0.11177831143140793, + "kl": 0.211181640625, + "learning_rate": 1.9685727846904026e-05, + "loss": 0.005, + "reward": 0.5524553805589676, + "reward_std": 0.03836447210051119, + "rewards/accuracy_reward": 0.05357143026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 576 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.9063110351562, + "epoch": 0.17235456649988798, + "grad_norm": 0.2078200876712799, + "kl": 0.205078125, + "learning_rate": 1.9683128258512712e-05, + "loss": 0.0069, + "reward": 0.5446428805589676, + "reward_std": 0.06532285967841744, + "rewards/accuracy_reward": 0.0468750037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 577 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.3973541259766, + "epoch": 0.17265327458740945, + "grad_norm": 0.2132413685321808, + "kl": 0.2099609375, + "learning_rate": 1.96805181358269e-05, + "loss": 0.0095, + "reward": 0.5652902126312256, + "reward_std": 0.09937713667750359, + "rewards/accuracy_reward": 0.0691964328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 578 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.7522888183594, + "epoch": 0.17295198267493092, + "grad_norm": 0.342558354139328, + "kl": 0.207763671875, + "learning_rate": 1.967789748168615e-05, + "loss": 0.0092, + "reward": 0.5005580484867096, + "reward_std": 0.03166804322972894, + "rewards/accuracy_reward": 0.004464285913854837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 579 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.5022888183594, + "epoch": 0.1732506907624524, + "grad_norm": 0.3149016499519348, + "kl": 0.219970703125, + "learning_rate": 1.967526629894148e-05, + "loss": 0.0116, + "reward": 0.5597098469734192, + "reward_std": 0.09472163859754801, + "rewards/accuracy_reward": 0.06026785844005644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 580 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.1116485595703, + "epoch": 0.17354939884997386, + "grad_norm": 0.14980658888816833, + "kl": 0.228515625, + "learning_rate": 1.967262459045535e-05, + "loss": 0.0079, + "reward": 0.5245535969734192, + "reward_std": 0.06917762057855725, + "rewards/accuracy_reward": 0.026785715483129025, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 581 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.3995971679688, + "epoch": 0.17384810693749533, + "grad_norm": 0.1409674733877182, + "kl": 0.24267578125, + "learning_rate": 1.9669972359101685e-05, + "loss": 0.01, + "reward": 0.5725446790456772, + "reward_std": 0.013392857741564512, + "rewards/accuracy_reward": 0.07366071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 582 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.7254943847656, + "epoch": 0.1741468150250168, + "grad_norm": 0.2267979383468628, + "kl": 0.23681640625, + "learning_rate": 1.9667309607765857e-05, + "loss": 0.0093, + "reward": 0.6060268133878708, + "reward_std": 0.07292639021761715, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 583 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.2433471679688, + "epoch": 0.17444552311253828, + "grad_norm": 0.1515013575553894, + "kl": 0.235595703125, + "learning_rate": 1.9664636339344668e-05, + "loss": 0.0094, + "reward": 0.6171875298023224, + "reward_std": 0.03597625717520714, + "rewards/accuracy_reward": 0.11830357648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 584 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.4219207763672, + "epoch": 0.17474423120005975, + "grad_norm": 0.30086052417755127, + "kl": 0.23583984375, + "learning_rate": 1.966195255674638e-05, + "loss": 0.0094, + "reward": 0.5591518133878708, + "reward_std": 0.04891707003116608, + "rewards/accuracy_reward": 0.060267860535532236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 585 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.4487152099609, + "epoch": 0.17504293928758122, + "grad_norm": 0.43346962332725525, + "kl": 0.23291015625, + "learning_rate": 1.9659258262890683e-05, + "loss": 0.0131, + "reward": 0.5485491305589676, + "reward_std": 0.10955240204930305, + "rewards/accuracy_reward": 0.05133928847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 586 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.3973693847656, + "epoch": 0.1753416473751027, + "grad_norm": 0.1504615992307663, + "kl": 0.222412109375, + "learning_rate": 1.9656553460708707e-05, + "loss": 0.0089, + "reward": 0.528459832072258, + "reward_std": 0.06534610083326697, + "rewards/accuracy_reward": 0.031250000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 587 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.3906555175781, + "epoch": 0.17564035546262416, + "grad_norm": 0.37903323769569397, + "kl": 0.2666015625, + "learning_rate": 1.9653838153143007e-05, + "loss": 0.0124, + "reward": 0.5937500149011612, + "reward_std": 0.10620336094871163, + "rewards/accuracy_reward": 0.09821429336443543, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 588 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.732177734375, + "epoch": 0.17593906355014563, + "grad_norm": 0.3822634816169739, + "kl": 0.254638671875, + "learning_rate": 1.9651112343147577e-05, + "loss": 0.0101, + "reward": 0.5585937649011612, + "reward_std": 0.0728506469167769, + "rewards/accuracy_reward": 0.06250000209547579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 589 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.6094207763672, + "epoch": 0.1762377716376671, + "grad_norm": 0.2764602303504944, + "kl": 0.27392578125, + "learning_rate": 1.964837603368783e-05, + "loss": 0.0087, + "reward": 0.5385044813156128, + "reward_std": 0.04443926922976971, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 590 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.0045013427734, + "epoch": 0.17653647972518857, + "grad_norm": 0.9502437114715576, + "kl": 0.3408203125, + "learning_rate": 1.9645629227740596e-05, + "loss": 0.0127, + "reward": 0.561941996216774, + "reward_std": 0.07247133553028107, + "rewards/accuracy_reward": 0.06473214481957257, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 591 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.6518096923828, + "epoch": 0.17683518781271004, + "grad_norm": 0.6568501591682434, + "kl": 0.2900390625, + "learning_rate": 1.9642871928294136e-05, + "loss": 0.0106, + "reward": 0.5357143133878708, + "reward_std": 0.04222711408510804, + "rewards/accuracy_reward": 0.03794643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 592 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.8616485595703, + "epoch": 0.17713389590023149, + "grad_norm": 0.3026214838027954, + "kl": 0.28125, + "learning_rate": 1.9640104138348124e-05, + "loss": 0.0106, + "reward": 0.5546875298023224, + "reward_std": 0.08517747186124325, + "rewards/accuracy_reward": 0.06250000116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875298023224, + "step": 593 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.9643249511719, + "epoch": 0.17743260398775296, + "grad_norm": 0.40166452527046204, + "kl": 0.269287109375, + "learning_rate": 1.963732586091364e-05, + "loss": 0.0125, + "reward": 0.647879496216774, + "reward_std": 0.1214418075978756, + "rewards/accuracy_reward": 0.15848214901052415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 594 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.0111694335938, + "epoch": 0.17773131207527443, + "grad_norm": 0.82562655210495, + "kl": 0.2421875, + "learning_rate": 1.963453709901318e-05, + "loss": 0.0099, + "reward": 0.5686384215950966, + "reward_std": 0.11076832469552755, + "rewards/accuracy_reward": 0.0870535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848395228386, + "step": 595 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.9754943847656, + "epoch": 0.1780300201627959, + "grad_norm": 0.2718622386455536, + "kl": 0.2744140625, + "learning_rate": 1.963173785568064e-05, + "loss": 0.0108, + "reward": 0.564732164144516, + "reward_std": 0.10079828323796391, + "rewards/accuracy_reward": 0.07142857369035482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 596 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.9643249511719, + "epoch": 0.17832872825031737, + "grad_norm": 0.5699124336242676, + "kl": 0.2763671875, + "learning_rate": 1.9628928133961324e-05, + "loss": 0.0111, + "reward": 0.588169664144516, + "reward_std": 0.039349609753116965, + "rewards/accuracy_reward": 0.09151786123402417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 597 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.7433471679688, + "epoch": 0.17862743633783884, + "grad_norm": 0.42298275232315063, + "kl": 0.29345703125, + "learning_rate": 1.9626107936911936e-05, + "loss": 0.0124, + "reward": 0.5959821790456772, + "reward_std": 0.06821557250805199, + "rewards/accuracy_reward": 0.10044643399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 598 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.7433624267578, + "epoch": 0.1789261444253603, + "grad_norm": 0.5626869201660156, + "kl": 0.29052734375, + "learning_rate": 1.9623277267600574e-05, + "loss": 0.0097, + "reward": 0.5502232313156128, + "reward_std": 0.09001796063967049, + "rewards/accuracy_reward": 0.053571431897580624, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 599 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.5424346923828, + "epoch": 0.17922485251288178, + "grad_norm": 0.4419832229614258, + "kl": 0.239013671875, + "learning_rate": 1.9620436129106725e-05, + "loss": 0.0105, + "reward": 0.577008955180645, + "reward_std": 0.07058041542768478, + "rewards/accuracy_reward": 0.0870535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 600 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.310302734375, + "epoch": 0.17952356060040325, + "grad_norm": 1.135029911994934, + "kl": 0.23681640625, + "learning_rate": 1.9617584524521273e-05, + "loss": 0.0095, + "reward": 0.5189732387661934, + "reward_std": 0.14425440318882465, + "rewards/accuracy_reward": 0.049107146449387074, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4698660895228386, + "step": 601 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.8437957763672, + "epoch": 0.17982226868792472, + "grad_norm": 0.2720204293727875, + "kl": 0.267578125, + "learning_rate": 1.9614722456946483e-05, + "loss": 0.0098, + "reward": 0.5881696790456772, + "reward_std": 0.09814235474914312, + "rewards/accuracy_reward": 0.10044643376022577, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 602 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.3303985595703, + "epoch": 0.1801209767754462, + "grad_norm": 0.44808173179626465, + "kl": 0.28369140625, + "learning_rate": 1.9611849929496004e-05, + "loss": 0.0095, + "reward": 0.5111607313156128, + "reward_std": 0.042638681130483747, + "rewards/accuracy_reward": 0.013392857974395156, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 603 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.9152221679688, + "epoch": 0.18041968486296767, + "grad_norm": 0.4677877128124237, + "kl": 0.302734375, + "learning_rate": 1.9608966945294863e-05, + "loss": 0.0126, + "reward": 0.5736607313156128, + "reward_std": 0.06995524372905493, + "rewards/accuracy_reward": 0.07812500488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 604 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.435302734375, + "epoch": 0.18071839295048914, + "grad_norm": 0.30795782804489136, + "kl": 0.31689453125, + "learning_rate": 1.9606073507479466e-05, + "loss": 0.012, + "reward": 0.572544664144516, + "reward_std": 0.10895088128745556, + "rewards/accuracy_reward": 0.07812500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 605 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.466552734375, + "epoch": 0.1810171010380106, + "grad_norm": 0.4766331911087036, + "kl": 0.392578125, + "learning_rate": 1.960316961919759e-05, + "loss": 0.0142, + "reward": 0.5066964402794838, + "reward_std": 0.07536467537283897, + "rewards/accuracy_reward": 0.020089287078008056, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 606 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.3393402099609, + "epoch": 0.18131580912553208, + "grad_norm": 0.9489984512329102, + "kl": 0.484375, + "learning_rate": 1.960025528360838e-05, + "loss": 0.0205, + "reward": 0.647879496216774, + "reward_std": 0.16504586301743984, + "rewards/accuracy_reward": 0.16294643515720963, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 607 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.7388763427734, + "epoch": 0.18161451721305355, + "grad_norm": 1.5724332332611084, + "kl": 0.67431640625, + "learning_rate": 1.9597330503882345e-05, + "loss": 0.0265, + "reward": 0.548549123108387, + "reward_std": 0.14723935909569263, + "rewards/accuracy_reward": 0.07142857392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4771205559372902, + "step": 608 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.2232666015625, + "epoch": 0.18191322530057502, + "grad_norm": 1.3036589622497559, + "kl": 0.56005859375, + "learning_rate": 1.9594395283201362e-05, + "loss": 0.0229, + "reward": 0.5133928805589676, + "reward_std": 0.12179959379136562, + "rewards/accuracy_reward": 0.031250000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4821428805589676, + "step": 609 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.2120819091797, + "epoch": 0.1822119333880965, + "grad_norm": 1.0423318147659302, + "kl": 0.408203125, + "learning_rate": 1.959144962475867e-05, + "loss": 0.0189, + "reward": 0.4882812798023224, + "reward_std": 0.16122385300695896, + "rewards/accuracy_reward": 0.037946430733427405, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4503348395228386, + "step": 610 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.0111999511719, + "epoch": 0.18251064147561796, + "grad_norm": 1.4969007968902588, + "kl": 0.55126953125, + "learning_rate": 1.9588493531758843e-05, + "loss": 0.0277, + "reward": 0.650669664144516, + "reward_std": 0.1753799133002758, + "rewards/accuracy_reward": 0.18750000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.463169664144516, + "step": 611 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.5245971679688, + "epoch": 0.18280934956313943, + "grad_norm": 2.619720458984375, + "kl": 0.46435546875, + "learning_rate": 1.9585527007417825e-05, + "loss": 0.0189, + "reward": 0.4559151977300644, + "reward_std": 0.15252678096294403, + "rewards/accuracy_reward": 0.024553572991862893, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4313616305589676, + "step": 612 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.8415679931641, + "epoch": 0.1831080576506609, + "grad_norm": 2.296921491622925, + "kl": 0.873046875, + "learning_rate": 1.958255005496291e-05, + "loss": 0.0391, + "reward": 0.559709832072258, + "reward_std": 0.13905826210975647, + "rewards/accuracy_reward": 0.09375000675208867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4659598395228386, + "step": 613 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.6495971679688, + "epoch": 0.18340676573818235, + "grad_norm": 6.382227420806885, + "kl": 1.0986328125, + "learning_rate": 1.9579562677632725e-05, + "loss": 0.0469, + "reward": 0.5691964477300644, + "reward_std": 0.12343759834766388, + "rewards/accuracy_reward": 0.08482143213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750223517418, + "step": 614 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.9553985595703, + "epoch": 0.18370547382570382, + "grad_norm": 5.119771957397461, + "kl": 0.7236328125, + "learning_rate": 1.957656487867724e-05, + "loss": 0.0269, + "reward": 0.5385044813156128, + "reward_std": 0.11165977572090924, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 615 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.4799652099609, + "epoch": 0.1840041819132253, + "grad_norm": 1.342289924621582, + "kl": 0.46923828125, + "learning_rate": 1.9573556661357777e-05, + "loss": 0.0195, + "reward": 0.553013414144516, + "reward_std": 0.07886981219053268, + "rewards/accuracy_reward": 0.06026785867288709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 616 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.1964721679688, + "epoch": 0.18430289000074676, + "grad_norm": 0.8254843354225159, + "kl": 0.38671875, + "learning_rate": 1.9570538028946974e-05, + "loss": 0.0159, + "reward": 0.6093750298023224, + "reward_std": 0.0742235267534852, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 617 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.7076416015625, + "epoch": 0.18460159808826823, + "grad_norm": 1.1089051961898804, + "kl": 0.32958984375, + "learning_rate": 1.956750898472881e-05, + "loss": 0.0145, + "reward": 0.5898437798023224, + "reward_std": 0.062089079059660435, + "rewards/accuracy_reward": 0.09598214412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 618 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.0871124267578, + "epoch": 0.1849003061757897, + "grad_norm": 1.1675289869308472, + "kl": 0.30517578125, + "learning_rate": 1.9564469531998586e-05, + "loss": 0.0163, + "reward": 0.5106026977300644, + "reward_std": 0.05152327846735716, + "rewards/accuracy_reward": 0.01562500116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 619 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.7924346923828, + "epoch": 0.18519901426331117, + "grad_norm": 0.6659085750579834, + "kl": 0.3564453125, + "learning_rate": 1.9561419674062928e-05, + "loss": 0.0142, + "reward": 0.5904017984867096, + "reward_std": 0.05121962702833116, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 620 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.2611999511719, + "epoch": 0.18549772235083264, + "grad_norm": 4.862548351287842, + "kl": 0.78125, + "learning_rate": 1.9558359414239786e-05, + "loss": 0.0354, + "reward": 0.540736623108387, + "reward_std": 0.126004284247756, + "rewards/accuracy_reward": 0.05580357555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330484867096, + "step": 621 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.6339721679688, + "epoch": 0.18579643043835412, + "grad_norm": 2.35941481590271, + "kl": 0.41015625, + "learning_rate": 1.9555288755858425e-05, + "loss": 0.0208, + "reward": 0.6227678954601288, + "reward_std": 0.1237190030515194, + "rewards/accuracy_reward": 0.12723215110599995, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 622 + }, + { + "clip_ratio": 0.0, + "completion_length": 898.6741485595703, + "epoch": 0.1860951385258756, + "grad_norm": 0.8034854531288147, + "kl": 0.36181640625, + "learning_rate": 1.9552207702259412e-05, + "loss": 0.0176, + "reward": 0.6277902126312256, + "reward_std": 0.14310340024530888, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 623 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.9955749511719, + "epoch": 0.18639384661339706, + "grad_norm": 1.4715462923049927, + "kl": 0.28076171875, + "learning_rate": 1.9549116256794636e-05, + "loss": 0.0147, + "reward": 0.604910746216774, + "reward_std": 0.08934829756617546, + "rewards/accuracy_reward": 0.10937500488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 624 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.3303985595703, + "epoch": 0.18669255470091853, + "grad_norm": 0.23076246678829193, + "kl": 0.249267578125, + "learning_rate": 1.9546014422827287e-05, + "loss": 0.0091, + "reward": 0.561941996216774, + "reward_std": 0.09254478709772229, + "rewards/accuracy_reward": 0.06473214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 625 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.9732666015625, + "epoch": 0.18699126278844, + "grad_norm": 0.3807874917984009, + "kl": 0.21728515625, + "learning_rate": 1.954290220373186e-05, + "loss": 0.0149, + "reward": 0.5786830484867096, + "reward_std": 0.0846599128562957, + "rewards/accuracy_reward": 0.08258929010480642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 626 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.6049652099609, + "epoch": 0.18728997087596147, + "grad_norm": 0.9326808452606201, + "kl": 0.2451171875, + "learning_rate": 1.9539779602894136e-05, + "loss": 0.0114, + "reward": 0.5948660969734192, + "reward_std": 0.10562057420611382, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 627 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.2344055175781, + "epoch": 0.18758867896348294, + "grad_norm": 2.329702615737915, + "kl": 0.41748046875, + "learning_rate": 1.9536646623711204e-05, + "loss": 0.0219, + "reward": 0.4994420036673546, + "reward_std": 0.0844681840389967, + "rewards/accuracy_reward": 0.024553573224693537, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.474888414144516, + "step": 628 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.4330749511719, + "epoch": 0.1878873870510044, + "grad_norm": 3.3101754188537598, + "kl": 0.44091796875, + "learning_rate": 1.9533503269591438e-05, + "loss": 0.0267, + "reward": 0.5954241454601288, + "reward_std": 0.08094887156039476, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4726562649011612, + "step": 629 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.5223541259766, + "epoch": 0.18818609513852588, + "grad_norm": 15.194128036499023, + "kl": 1.08447265625, + "learning_rate": 1.9530349543954495e-05, + "loss": 0.0733, + "reward": 0.435825914144516, + "reward_std": 0.13223367184400558, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4179687723517418, + "step": 630 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.1674652099609, + "epoch": 0.18848480322604735, + "grad_norm": 111.3593521118164, + "kl": 1.4052734375, + "learning_rate": 1.9527185450231328e-05, + "loss": 0.0972, + "reward": 0.4386160895228386, + "reward_std": 0.14389366284012794, + "rewards/accuracy_reward": 0.0290178582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4095982313156128, + "step": 631 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.0915679931641, + "epoch": 0.18878351131356882, + "grad_norm": 9.636686325073242, + "kl": 0.802734375, + "learning_rate": 1.9524010991864152e-05, + "loss": 0.054, + "reward": 0.5005580633878708, + "reward_std": 0.15562492795288563, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3934151977300644, + "step": 632 + }, + { + "clip_ratio": 0.0, + "completion_length": 979.8460388183594, + "epoch": 0.1890822194010903, + "grad_norm": 2.6506357192993164, + "kl": 0.6142578125, + "learning_rate": 1.952082617230647e-05, + "loss": 0.0421, + "reward": 0.446428582072258, + "reward_std": 0.12106983549892902, + "rewards/accuracy_reward": 0.013392857974395156, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4330357313156128, + "step": 633 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.8393249511719, + "epoch": 0.18938092748861177, + "grad_norm": 1.4905191659927368, + "kl": 0.470703125, + "learning_rate": 1.9517630995023057e-05, + "loss": 0.0383, + "reward": 0.584263414144516, + "reward_std": 0.1547522097826004, + "rewards/accuracy_reward": 0.11160715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4726562723517418, + "step": 634 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.2879943847656, + "epoch": 0.18967963557613324, + "grad_norm": 4.891307353973389, + "kl": 0.39453125, + "learning_rate": 1.9514425463489946e-05, + "loss": 0.0387, + "reward": 0.451450914144516, + "reward_std": 0.11431140638887882, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.442522332072258, + "step": 635 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.6027221679688, + "epoch": 0.18997834366365468, + "grad_norm": 11.234376907348633, + "kl": 0.46630859375, + "learning_rate": 1.9511209581194447e-05, + "loss": 0.0494, + "reward": 0.470982164144516, + "reward_std": 0.13873201981186867, + "rewards/accuracy_reward": 0.02232142980210483, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4486607387661934, + "step": 636 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.4420166015625, + "epoch": 0.19027705175117615, + "grad_norm": 13.011640548706055, + "kl": 0.453125, + "learning_rate": 1.9507983351635124e-05, + "loss": 0.0462, + "reward": 0.4804687723517418, + "reward_std": 0.1256442591547966, + "rewards/accuracy_reward": 0.022321429569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4581473469734192, + "step": 637 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.3683624267578, + "epoch": 0.19057575983869762, + "grad_norm": 0.8248875737190247, + "kl": 0.235107421875, + "learning_rate": 1.9504746778321793e-05, + "loss": 0.0393, + "reward": 0.5485491380095482, + "reward_std": 0.09646591730415821, + "rewards/accuracy_reward": 0.08482143399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4637276977300644, + "step": 638 + }, + { + "clip_ratio": 0.0, + "completion_length": 911.5268249511719, + "epoch": 0.1908744679262191, + "grad_norm": 4.751067161560059, + "kl": 0.274169921875, + "learning_rate": 1.9501499864775536e-05, + "loss": 0.0406, + "reward": 0.5753348544239998, + "reward_std": 0.09156756941229105, + "rewards/accuracy_reward": 0.0959821492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4793526977300644, + "step": 639 + }, + { + "clip_ratio": 0.0, + "completion_length": 915.4531707763672, + "epoch": 0.19117317601374056, + "grad_norm": 0.12738536298274994, + "kl": 0.24169921875, + "learning_rate": 1.9498242614528672e-05, + "loss": 0.0104, + "reward": 0.595982164144516, + "reward_std": 0.07249427773058414, + "rewards/accuracy_reward": 0.0959821492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 640 + }, + { + "clip_ratio": 0.0, + "completion_length": 926.5870819091797, + "epoch": 0.19147188410126204, + "grad_norm": 0.16550853848457336, + "kl": 0.26220703125, + "learning_rate": 1.9494975031124768e-05, + "loss": 0.0122, + "reward": 0.5691964626312256, + "reward_std": 0.06962599884718657, + "rewards/accuracy_reward": 0.0691964307334274, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 641 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.2946929931641, + "epoch": 0.1917705921887835, + "grad_norm": 0.28752753138542175, + "kl": 0.2880859375, + "learning_rate": 1.9491697118118643e-05, + "loss": 0.0177, + "reward": 0.565848246216774, + "reward_std": 0.07085918635129929, + "rewards/accuracy_reward": 0.06696428824216127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 642 + }, + { + "clip_ratio": 0.0, + "completion_length": 923.2344207763672, + "epoch": 0.19206930027630498, + "grad_norm": 0.9742460250854492, + "kl": 0.30517578125, + "learning_rate": 1.9488408879076336e-05, + "loss": 0.0157, + "reward": 0.6199777126312256, + "reward_std": 0.051334453048184514, + "rewards/accuracy_reward": 0.12500000861473382, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 643 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.2656707763672, + "epoch": 0.19236800836382645, + "grad_norm": 0.8899763822555542, + "kl": 0.2841796875, + "learning_rate": 1.9485110317575134e-05, + "loss": 0.0134, + "reward": 0.5993303656578064, + "reward_std": 0.08189507015049458, + "rewards/accuracy_reward": 0.10491071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196492433548, + "step": 644 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.0826263427734, + "epoch": 0.19266671645134792, + "grad_norm": 0.5775842666625977, + "kl": 0.28076171875, + "learning_rate": 1.9481801437203547e-05, + "loss": 0.0112, + "reward": 0.5652901977300644, + "reward_std": 0.0882484526373446, + "rewards/accuracy_reward": 0.0669642873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 645 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.6674652099609, + "epoch": 0.1929654245388694, + "grad_norm": 0.7362959384918213, + "kl": 0.3359375, + "learning_rate": 1.9478482241561312e-05, + "loss": 0.005, + "reward": 0.6010044813156128, + "reward_std": 0.10509405937045813, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687649011612, + "step": 646 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.2857513427734, + "epoch": 0.19326413262639086, + "grad_norm": 2.649111747741699, + "kl": 0.4150390625, + "learning_rate": 1.947515273425939e-05, + "loss": 0.0181, + "reward": 0.5050223544239998, + "reward_std": 0.10717353876680136, + "rewards/accuracy_reward": 0.04687500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4581473469734192, + "step": 647 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.9420013427734, + "epoch": 0.19356284071391233, + "grad_norm": 2.450873374938965, + "kl": 0.50341796875, + "learning_rate": 1.9471812918919958e-05, + "loss": 0.0209, + "reward": 0.5792411044239998, + "reward_std": 0.10371054895222187, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4720982313156128, + "step": 648 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.0335235595703, + "epoch": 0.1938615488014338, + "grad_norm": 0.7489220499992371, + "kl": 0.3974609375, + "learning_rate": 1.9468462799176407e-05, + "loss": 0.0161, + "reward": 0.6026785969734192, + "reward_std": 0.126133406534791, + "rewards/accuracy_reward": 0.11830357951112092, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750149011612, + "step": 649 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.2455596923828, + "epoch": 0.19416025688895527, + "grad_norm": 0.8324055671691895, + "kl": 0.37890625, + "learning_rate": 1.946510237867334e-05, + "loss": 0.0159, + "reward": 0.5926339626312256, + "reward_std": 0.12784786708652973, + "rewards/accuracy_reward": 0.11383928963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4787946715950966, + "step": 650 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.0424346923828, + "epoch": 0.19445896497647674, + "grad_norm": 0.8108119964599609, + "kl": 0.42529296875, + "learning_rate": 1.9461731661066564e-05, + "loss": 0.0172, + "reward": 0.5781250298023224, + "reward_std": 0.10726405866444111, + "rewards/accuracy_reward": 0.10044643213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4776785895228386, + "step": 651 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.3973388671875, + "epoch": 0.19475767306399822, + "grad_norm": 0.3904898166656494, + "kl": 0.49267578125, + "learning_rate": 1.9458350650023092e-05, + "loss": 0.0197, + "reward": 0.5703125298023224, + "reward_std": 0.07355655264109373, + "rewards/accuracy_reward": 0.08482143399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 652 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.0268096923828, + "epoch": 0.1950563811515197, + "grad_norm": 0.593852698802948, + "kl": 0.533203125, + "learning_rate": 1.945495934922113e-05, + "loss": 0.0213, + "reward": 0.538504496216774, + "reward_std": 0.10099872387945652, + "rewards/accuracy_reward": 0.058035715483129025, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687798023224, + "step": 653 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.4308319091797, + "epoch": 0.19535508923904116, + "grad_norm": 0.43533244729042053, + "kl": 0.4736328125, + "learning_rate": 1.945155776235008e-05, + "loss": 0.0192, + "reward": 0.4921875149011612, + "reward_std": 0.06434649787843227, + "rewards/accuracy_reward": 0.006696428870782256, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 654 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.5870666503906, + "epoch": 0.19565379732656263, + "grad_norm": 0.41510939598083496, + "kl": 0.4814453125, + "learning_rate": 1.944814589311054e-05, + "loss": 0.0193, + "reward": 0.4983259215950966, + "reward_std": 0.06705737859010696, + "rewards/accuracy_reward": 0.011160714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 655 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.6451263427734, + "epoch": 0.1959525054140841, + "grad_norm": 0.4333215057849884, + "kl": 0.40185546875, + "learning_rate": 1.9444723745214285e-05, + "loss": 0.0164, + "reward": 0.5864955484867096, + "reward_std": 0.07016349025070667, + "rewards/accuracy_reward": 0.09598214854486287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 656 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.4420318603516, + "epoch": 0.19625121350160554, + "grad_norm": 0.44700387120246887, + "kl": 0.35595703125, + "learning_rate": 1.9441291322384275e-05, + "loss": 0.015, + "reward": 0.5691964402794838, + "reward_std": 0.06659717694856226, + "rewards/accuracy_reward": 0.0758928619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 657 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8861999511719, + "epoch": 0.196549921589127, + "grad_norm": 0.6631210446357727, + "kl": 0.36865234375, + "learning_rate": 1.9437848628354655e-05, + "loss": 0.0078, + "reward": 0.5831473469734192, + "reward_std": 0.055217358749359846, + "rewards/accuracy_reward": 0.08705357438884676, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937574505806, + "step": 658 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.8192443847656, + "epoch": 0.19684862967664848, + "grad_norm": 2.8098201751708984, + "kl": 0.30908203125, + "learning_rate": 1.9434395666870735e-05, + "loss": 0.0124, + "reward": 0.5703125298023224, + "reward_std": 0.02090683183632791, + "rewards/accuracy_reward": 0.07366071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 659 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.513427734375, + "epoch": 0.19714733776416996, + "grad_norm": 1.786787986755371, + "kl": 0.3505859375, + "learning_rate": 1.9430932441688998e-05, + "loss": 0.0066, + "reward": 0.540178582072258, + "reward_std": 0.1004242617636919, + "rewards/accuracy_reward": 0.042410716181620955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 660 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.1585235595703, + "epoch": 0.19744604585169143, + "grad_norm": 1.5336588621139526, + "kl": 0.329833984375, + "learning_rate": 1.9427458956577098e-05, + "loss": 0.0053, + "reward": 0.7254464477300644, + "reward_std": 0.08515418786555529, + "rewards/accuracy_reward": 0.2276785783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 661 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.2857666015625, + "epoch": 0.1977447539392129, + "grad_norm": 0.22425924241542816, + "kl": 0.246826171875, + "learning_rate": 1.942397521531384e-05, + "loss": 0.0095, + "reward": 0.5396205484867096, + "reward_std": 0.053728269413113594, + "rewards/accuracy_reward": 0.0401785746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 662 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.2366485595703, + "epoch": 0.19804346202673437, + "grad_norm": 0.6081909537315369, + "kl": 0.2509765625, + "learning_rate": 1.9420481221689203e-05, + "loss": 0.0107, + "reward": 0.5435267984867096, + "reward_std": 0.06815916486084461, + "rewards/accuracy_reward": 0.04464285774156451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 663 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.9129943847656, + "epoch": 0.19834217011425584, + "grad_norm": 3.0976786613464355, + "kl": 0.429443359375, + "learning_rate": 1.9416976979504297e-05, + "loss": 0.0182, + "reward": 0.5362723469734192, + "reward_std": 0.015625000465661287, + "rewards/accuracy_reward": 0.03794643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 664 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.7545166015625, + "epoch": 0.1986408782017773, + "grad_norm": 0.10430505871772766, + "kl": 0.20654296875, + "learning_rate": 1.9413462492571403e-05, + "loss": 0.0096, + "reward": 0.5558035969734192, + "reward_std": 0.08641109615564346, + "rewards/accuracy_reward": 0.0558035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 665 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.3504943847656, + "epoch": 0.19893958628929878, + "grad_norm": 0.09554779529571533, + "kl": 0.218505859375, + "learning_rate": 1.940993776471393e-05, + "loss": 0.009, + "reward": 0.6445312798023224, + "reward_std": 0.07023734040558338, + "rewards/accuracy_reward": 0.145089291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 666 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.3214721679688, + "epoch": 0.19923829437682025, + "grad_norm": 1.4165154695510864, + "kl": 0.284423828125, + "learning_rate": 1.9406402799766452e-05, + "loss": 0.0114, + "reward": 0.5301339477300644, + "reward_std": 0.06910851155407727, + "rewards/accuracy_reward": 0.03125000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 667 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.0022735595703, + "epoch": 0.19953700246434172, + "grad_norm": 0.3996261656284332, + "kl": 0.2236328125, + "learning_rate": 1.940285760157465e-05, + "loss": 0.0127, + "reward": 0.6757812798023224, + "reward_std": 0.07932299748063087, + "rewards/accuracy_reward": 0.1785714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 668 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.5670013427734, + "epoch": 0.1998357105518632, + "grad_norm": 0.32040175795555115, + "kl": 0.225341796875, + "learning_rate": 1.9399302173995354e-05, + "loss": 0.0092, + "reward": 0.6579241305589676, + "reward_std": 0.04281733650714159, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 669 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.4152221679688, + "epoch": 0.20013441863938466, + "grad_norm": 0.34570303559303284, + "kl": 0.218994140625, + "learning_rate": 1.9395736520896528e-05, + "loss": 0.0118, + "reward": 0.5474330484867096, + "reward_std": 0.0509735606610775, + "rewards/accuracy_reward": 0.05133928684517741, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 670 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.1295013427734, + "epoch": 0.20043312672690614, + "grad_norm": 0.17571312189102173, + "kl": 0.21484375, + "learning_rate": 1.9392160646157242e-05, + "loss": 0.0081, + "reward": 0.6121652126312256, + "reward_std": 0.04926398349925876, + "rewards/accuracy_reward": 0.11383929336443543, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 671 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.1652069091797, + "epoch": 0.2007318348144276, + "grad_norm": 0.1660800576210022, + "kl": 0.2138671875, + "learning_rate": 1.938857455366771e-05, + "loss": 0.0088, + "reward": 0.6450893133878708, + "reward_std": 0.07802686281502247, + "rewards/accuracy_reward": 0.1473214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 672 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.6629943847656, + "epoch": 0.20103054290194908, + "grad_norm": 0.21119655668735504, + "kl": 0.22607421875, + "learning_rate": 1.9384978247329238e-05, + "loss": 0.0133, + "reward": 0.6093750149011612, + "reward_std": 0.10158211551606655, + "rewards/accuracy_reward": 0.11160714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 673 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.0335235595703, + "epoch": 0.20132925098947055, + "grad_norm": 0.4884754717350006, + "kl": 0.238037109375, + "learning_rate": 1.9381371731054263e-05, + "loss": 0.0106, + "reward": 0.5440848469734192, + "reward_std": 0.08412562310695648, + "rewards/accuracy_reward": 0.049107145983725786, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 674 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.7210388183594, + "epoch": 0.20162795907699202, + "grad_norm": 0.21077343821525574, + "kl": 0.23779296875, + "learning_rate": 1.9377755008766316e-05, + "loss": 0.0113, + "reward": 0.5106026977300644, + "reward_std": 0.06436354713514447, + "rewards/accuracy_reward": 0.015625000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 675 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.7835235595703, + "epoch": 0.2019266671645135, + "grad_norm": 0.46964001655578613, + "kl": 0.25927734375, + "learning_rate": 1.9374128084400038e-05, + "loss": 0.0096, + "reward": 0.601004496216774, + "reward_std": 0.14055018685758114, + "rewards/accuracy_reward": 0.10937500488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 676 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.0781555175781, + "epoch": 0.20222537525203496, + "grad_norm": 0.8845958113670349, + "kl": 0.27734375, + "learning_rate": 1.937049096190117e-05, + "loss": 0.0109, + "reward": 0.5892857387661934, + "reward_std": 0.09094086103141308, + "rewards/accuracy_reward": 0.10267857555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 677 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.5625610351562, + "epoch": 0.20252408333955643, + "grad_norm": 1.1504783630371094, + "kl": 0.42626953125, + "learning_rate": 1.936684364522654e-05, + "loss": 0.0306, + "reward": 0.5842634215950966, + "reward_std": 0.10099867172539234, + "rewards/accuracy_reward": 0.12053571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.463727705180645, + "step": 678 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.9263916015625, + "epoch": 0.20282279142707788, + "grad_norm": 3.11769437789917, + "kl": 0.9619140625, + "learning_rate": 1.9363186138344075e-05, + "loss": 0.0658, + "reward": 0.4693080559372902, + "reward_std": 0.13881143927574158, + "rewards/accuracy_reward": 0.044642857974395156, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4246651977300644, + "step": 679 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.8460388183594, + "epoch": 0.20312149951459935, + "grad_norm": 7.4138617515563965, + "kl": 1.3046875, + "learning_rate": 1.9359518445232778e-05, + "loss": 0.0903, + "reward": 0.5351562649011612, + "reward_std": 0.13678933307528496, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4101562723517418, + "step": 680 + }, + { + "clip_ratio": 0.0, + "completion_length": 888.2745971679688, + "epoch": 0.20342020760212082, + "grad_norm": 9.435812950134277, + "kl": 0.9912109375, + "learning_rate": 1.935584056988275e-05, + "loss": 0.1282, + "reward": 0.5100446715950966, + "reward_std": 0.15398170053958893, + "rewards/accuracy_reward": 0.07589286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4341518059372902, + "step": 681 + }, + { + "clip_ratio": 0.0, + "completion_length": 864.0156555175781, + "epoch": 0.2037189156896423, + "grad_norm": 8.553121566772461, + "kl": 0.97265625, + "learning_rate": 1.935215251629515e-05, + "loss": 0.132, + "reward": 0.5401785895228386, + "reward_std": 0.1802242361009121, + "rewards/accuracy_reward": 0.10044643399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4397321566939354, + "step": 682 + }, + { + "clip_ratio": 0.0, + "completion_length": 827.1830749511719, + "epoch": 0.20401762377716376, + "grad_norm": 4.247196674346924, + "kl": 0.7744140625, + "learning_rate": 1.934845428848222e-05, + "loss": 0.1463, + "reward": 0.541852705180645, + "reward_std": 0.13188962638378143, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.448102705180645, + "step": 683 + }, + { + "clip_ratio": 0.0, + "completion_length": 776.6629943847656, + "epoch": 0.20431633186468523, + "grad_norm": 9.282243728637695, + "kl": 0.703125, + "learning_rate": 1.9344745890467273e-05, + "loss": 0.1402, + "reward": 0.5011160969734192, + "reward_std": 0.0885338094085455, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4609375223517418, + "step": 684 + }, + { + "clip_ratio": 0.0, + "completion_length": 747.5491333007812, + "epoch": 0.2046150399522067, + "grad_norm": 9.94737434387207, + "kl": 0.97265625, + "learning_rate": 1.934102732628468e-05, + "loss": 0.1683, + "reward": 0.5206473395228386, + "reward_std": 0.16818701103329659, + "rewards/accuracy_reward": 0.06250000302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.458147332072258, + "step": 685 + }, + { + "clip_ratio": 0.0, + "completion_length": 718.9375305175781, + "epoch": 0.20491374803972817, + "grad_norm": 14.322884559631348, + "kl": 1.63671875, + "learning_rate": 1.9337298599979877e-05, + "loss": 0.2343, + "reward": 0.521205373108387, + "reward_std": 0.13876307010650635, + "rewards/accuracy_reward": 0.06026786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4609375223517418, + "step": 686 + }, + { + "clip_ratio": 0.0, + "completion_length": 718.6428833007812, + "epoch": 0.20521245612724964, + "grad_norm": 10.40839672088623, + "kl": 1.3720703125, + "learning_rate": 1.933355971560935e-05, + "loss": 0.2062, + "reward": 0.493861623108387, + "reward_std": 0.1493488885462284, + "rewards/accuracy_reward": 0.029017859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4648437723517418, + "step": 687 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.0692138671875, + "epoch": 0.2055111642147711, + "grad_norm": 15.63669204711914, + "kl": 0.974609375, + "learning_rate": 1.9329810677240643e-05, + "loss": 0.2155, + "reward": 0.558035746216774, + "reward_std": 0.14045224338769913, + "rewards/accuracy_reward": 0.08482143399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4732143059372902, + "step": 688 + }, + { + "clip_ratio": 0.0, + "completion_length": 622.8995819091797, + "epoch": 0.20580987230229258, + "grad_norm": 138.53558349609375, + "kl": 4.04296875, + "learning_rate": 1.9326051488952334e-05, + "loss": 0.3739, + "reward": 0.537946455180645, + "reward_std": 0.10639236681163311, + "rewards/accuracy_reward": 0.05580357299186289, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482142873108387, + "step": 689 + }, + { + "clip_ratio": 0.0, + "completion_length": 614.4576110839844, + "epoch": 0.20610858038981406, + "grad_norm": 37.627105712890625, + "kl": 2.513671875, + "learning_rate": 1.9322282154834055e-05, + "loss": 0.2829, + "reward": 0.6049107313156128, + "reward_std": 0.14356680028140545, + "rewards/accuracy_reward": 0.13169643771834671, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4732143059372902, + "step": 690 + }, + { + "clip_ratio": 0.0, + "completion_length": 628.5669860839844, + "epoch": 0.20640728847733553, + "grad_norm": 200.51248168945312, + "kl": 5.1220703125, + "learning_rate": 1.9318502678986476e-05, + "loss": 0.4499, + "reward": 0.6277902126312256, + "reward_std": 0.07522956561297178, + "rewards/accuracy_reward": 0.1495535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4782366305589676, + "step": 691 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.3549346923828, + "epoch": 0.206705996564857, + "grad_norm": 10.682424545288086, + "kl": 1.26171875, + "learning_rate": 1.9314713065521294e-05, + "loss": 0.2472, + "reward": 0.5027901977300644, + "reward_std": 0.08748153410851955, + "rewards/accuracy_reward": 0.022321429569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687649011612, + "step": 692 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.4910888671875, + "epoch": 0.20700470465237847, + "grad_norm": 14.099264144897461, + "kl": 1.591796875, + "learning_rate": 1.9310913318561235e-05, + "loss": 0.1715, + "reward": 0.5574777126312256, + "reward_std": 0.03928318666294217, + "rewards/accuracy_reward": 0.0669642873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 693 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.6361999511719, + "epoch": 0.20730341273989994, + "grad_norm": 1.6668970584869385, + "kl": 0.77734375, + "learning_rate": 1.9307103442240054e-05, + "loss": 0.073, + "reward": 0.5876116305589676, + "reward_std": 0.0563204069621861, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 694 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.2031555175781, + "epoch": 0.2076021208274214, + "grad_norm": 17.97898292541504, + "kl": 0.80078125, + "learning_rate": 1.9303283440702524e-05, + "loss": 0.0632, + "reward": 0.6752232313156128, + "reward_std": 0.07494255271740258, + "rewards/accuracy_reward": 0.1763392947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 695 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.2366180419922, + "epoch": 0.20790082891494288, + "grad_norm": 24.51415252685547, + "kl": 0.8818359375, + "learning_rate": 1.9299453318104428e-05, + "loss": 0.0684, + "reward": 0.5435268133878708, + "reward_std": 0.09122738055884838, + "rewards/accuracy_reward": 0.04464285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 696 + }, + { + "clip_ratio": 0.0, + "completion_length": 616.513427734375, + "epoch": 0.20819953700246435, + "grad_norm": 0.8897382020950317, + "kl": 0.38916015625, + "learning_rate": 1.9295613078612566e-05, + "loss": 0.0349, + "reward": 0.5747768133878708, + "reward_std": 0.022321430034935474, + "rewards/accuracy_reward": 0.07589285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 697 + }, + { + "clip_ratio": 0.0, + "completion_length": 623.5870819091797, + "epoch": 0.20849824508998582, + "grad_norm": 1.594877004623413, + "kl": 0.390625, + "learning_rate": 1.9291762726404742e-05, + "loss": 0.0144, + "reward": 0.5937500149011612, + "reward_std": 0.0604401770979166, + "rewards/accuracy_reward": 0.09375000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 698 + }, + { + "clip_ratio": 0.0, + "completion_length": 690.9531555175781, + "epoch": 0.2087969531775073, + "grad_norm": 0.8921732306480408, + "kl": 0.2978515625, + "learning_rate": 1.9287902265669764e-05, + "loss": 0.023, + "reward": 0.5435268133878708, + "reward_std": 0.020436199847608805, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 699 + }, + { + "clip_ratio": 0.0, + "completion_length": 755.8393249511719, + "epoch": 0.20909566126502874, + "grad_norm": 0.9147152900695801, + "kl": 0.297607421875, + "learning_rate": 1.9284031700607434e-05, + "loss": 0.0179, + "reward": 0.6021205633878708, + "reward_std": 0.06471499986946583, + "rewards/accuracy_reward": 0.10267857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 700 + }, + { + "clip_ratio": 0.0, + "completion_length": 765.8906555175781, + "epoch": 0.2093943693525502, + "grad_norm": 0.7356348633766174, + "kl": 0.291015625, + "learning_rate": 1.9280151035428544e-05, + "loss": 0.0151, + "reward": 0.6590402126312256, + "reward_std": 0.11298741400241852, + "rewards/accuracy_reward": 0.1607142947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 701 + }, + { + "clip_ratio": 0.0, + "completion_length": 880.7678985595703, + "epoch": 0.20969307744007168, + "grad_norm": 0.894266664981842, + "kl": 0.314453125, + "learning_rate": 1.9276260274354884e-05, + "loss": 0.013, + "reward": 0.5005580484867096, + "reward_std": 0.01562500069849193, + "rewards/accuracy_reward": 0.0022321429569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 702 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.6562957763672, + "epoch": 0.20999178552759315, + "grad_norm": 0.11563654243946075, + "kl": 0.206787109375, + "learning_rate": 1.927235942161921e-05, + "loss": 0.0144, + "reward": 0.5915178954601288, + "reward_std": 0.09067067038267851, + "rewards/accuracy_reward": 0.09151786426082253, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 703 + }, + { + "clip_ratio": 0.0, + "completion_length": 994.7634429931641, + "epoch": 0.21029049361511462, + "grad_norm": 1.8831990957260132, + "kl": 0.53271484375, + "learning_rate": 1.9268448481465282e-05, + "loss": 0.0235, + "reward": 0.5697544813156128, + "reward_std": 0.052724237786605954, + "rewards/accuracy_reward": 0.07142857415601611, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 704 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.357177734375, + "epoch": 0.2105892017026361, + "grad_norm": 4.036041259765625, + "kl": 0.8916015625, + "learning_rate": 1.9264527458147807e-05, + "loss": 0.0334, + "reward": 0.525669664144516, + "reward_std": 0.08825564384460449, + "rewards/accuracy_reward": 0.029017857974395156, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 705 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.0737152099609, + "epoch": 0.21088790979015756, + "grad_norm": 0.2969801425933838, + "kl": 0.293701171875, + "learning_rate": 1.926059635593248e-05, + "loss": 0.016, + "reward": 0.569754496216774, + "reward_std": 0.10155838658101857, + "rewards/accuracy_reward": 0.07812500488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 706 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.0223693847656, + "epoch": 0.21118661787767903, + "grad_norm": 0.7787396907806396, + "kl": 0.3408203125, + "learning_rate": 1.9256655179095954e-05, + "loss": 0.0146, + "reward": 0.494977705180645, + "reward_std": 0.05191523628309369, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 707 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.1183319091797, + "epoch": 0.2114853259652005, + "grad_norm": 0.470742791891098, + "kl": 0.3603515625, + "learning_rate": 1.9252703931925843e-05, + "loss": 0.0124, + "reward": 0.5446428805589676, + "reward_std": 0.09686651080846786, + "rewards/accuracy_reward": 0.0602678582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750223517418, + "step": 708 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.3661041259766, + "epoch": 0.21178403405272198, + "grad_norm": 2.137824058532715, + "kl": 0.65234375, + "learning_rate": 1.9248742618720714e-05, + "loss": 0.0259, + "reward": 0.5859375223517418, + "reward_std": 0.11687659937888384, + "rewards/accuracy_reward": 0.10267857694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.483258955180645, + "step": 709 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.8192138671875, + "epoch": 0.21208274214024345, + "grad_norm": 0.6474362015724182, + "kl": 0.55517578125, + "learning_rate": 1.9244771243790092e-05, + "loss": 0.0222, + "reward": 0.5937500298023224, + "reward_std": 0.06842645118013024, + "rewards/accuracy_reward": 0.10714285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4866071566939354, + "step": 710 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.5156402587891, + "epoch": 0.21238145022776492, + "grad_norm": 1.39689302444458, + "kl": 0.681640625, + "learning_rate": 1.9240789811454443e-05, + "loss": 0.0273, + "reward": 0.556919664144516, + "reward_std": 0.0973666300997138, + "rewards/accuracy_reward": 0.06696428940631449, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 711 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.4866638183594, + "epoch": 0.2126801583152864, + "grad_norm": 9.243525505065918, + "kl": 1.7333984375, + "learning_rate": 1.9236798326045173e-05, + "loss": 0.0692, + "reward": 0.6210937798023224, + "reward_std": 0.13169757463037968, + "rewards/accuracy_reward": 0.13392858020961285, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487165205180645, + "step": 712 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.966552734375, + "epoch": 0.21297886640280786, + "grad_norm": 1.5170968770980835, + "kl": 1.375, + "learning_rate": 1.9232796791904627e-05, + "loss": 0.055, + "reward": 0.6227678954601288, + "reward_std": 0.10817982070147991, + "rewards/accuracy_reward": 0.1383928619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750223517418, + "step": 713 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.21327757449032933, + "grad_norm": 1.951468825340271, + "kl": 0.685546875, + "learning_rate": 1.9228785213386082e-05, + "loss": 0.0274, + "reward": 0.5513392984867096, + "reward_std": 0.1312995869666338, + "rewards/accuracy_reward": 0.06026786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 714 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2135762825778508, + "grad_norm": 6.821241855621338, + "kl": 0.921875, + "learning_rate": 1.9224763594853747e-05, + "loss": 0.0369, + "reward": 0.5178571790456772, + "reward_std": 0.11279823072254658, + "rewards/accuracy_reward": 0.03125000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4866071566939354, + "step": 715 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.7611694335938, + "epoch": 0.21387499066537227, + "grad_norm": 3.825756072998047, + "kl": 1.474609375, + "learning_rate": 1.9220731940682738e-05, + "loss": 0.059, + "reward": 0.580357164144516, + "reward_std": 0.06854905374348164, + "rewards/accuracy_reward": 0.0959821455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750223517418, + "step": 716 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.21417369875289374, + "grad_norm": 28.013551712036133, + "kl": 3.59375, + "learning_rate": 1.9216690255259113e-05, + "loss": 0.1438, + "reward": 0.5485491454601288, + "reward_std": 0.10120485071092844, + "rewards/accuracy_reward": 0.0625000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 717 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.21447240684041521, + "grad_norm": 100.46196746826172, + "kl": 8.84375, + "learning_rate": 1.921263854297982e-05, + "loss": 0.3534, + "reward": 0.5345982387661934, + "reward_std": 0.0868649436160922, + "rewards/accuracy_reward": 0.0513392873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.483258955180645, + "step": 718 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.21477111492793668, + "grad_norm": 41.1817626953125, + "kl": 3.81640625, + "learning_rate": 1.9208576808252725e-05, + "loss": 0.1524, + "reward": 0.6138393208384514, + "reward_std": 0.11352726817131042, + "rewards/accuracy_reward": 0.12500000861473382, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888392984867096, + "step": 719 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.21506982301545816, + "grad_norm": 17.44343376159668, + "kl": 3.4296875, + "learning_rate": 1.9204505055496605e-05, + "loss": 0.1375, + "reward": 0.5279018208384514, + "reward_std": 0.13960541412234306, + "rewards/accuracy_reward": 0.0491071455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4787946715950966, + "step": 720 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.8950958251953, + "epoch": 0.21536853110297963, + "grad_norm": 12.620870590209961, + "kl": 1.349609375, + "learning_rate": 1.920042328914112e-05, + "loss": 0.0518, + "reward": 0.5636160969734192, + "reward_std": 0.1196541041135788, + "rewards/accuracy_reward": 0.09151786100119352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4720982387661934, + "step": 721 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.3705749511719, + "epoch": 0.21566723919050107, + "grad_norm": 12.561417579650879, + "kl": 0.728515625, + "learning_rate": 1.9196331513626836e-05, + "loss": 0.0247, + "reward": 0.5284598395228386, + "reward_std": 0.12688583135604858, + "rewards/accuracy_reward": 0.05357143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.474888414144516, + "step": 722 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.21596594727802254, + "grad_norm": 12.417606353759766, + "kl": 0.83203125, + "learning_rate": 1.9192229733405204e-05, + "loss": 0.0333, + "reward": 0.6049107313156128, + "reward_std": 0.09371221251785755, + "rewards/accuracy_reward": 0.12500000488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4799107387661934, + "step": 723 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.216264655365544, + "grad_norm": 10.435410499572754, + "kl": 1.56640625, + "learning_rate": 1.9188117952938557e-05, + "loss": 0.0626, + "reward": 0.5558035969734192, + "reward_std": 0.12284088134765625, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.475446455180645, + "step": 724 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.21656336345306548, + "grad_norm": 50.60006332397461, + "kl": 6.58203125, + "learning_rate": 1.918399617670011e-05, + "loss": 0.2635, + "reward": 0.5479911044239998, + "reward_std": 0.09395855665206909, + "rewards/accuracy_reward": 0.07812500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4698660969734192, + "step": 725 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.21686207154058695, + "grad_norm": 47.849082946777344, + "kl": 6.421875, + "learning_rate": 1.9179864409173947e-05, + "loss": 0.2573, + "reward": 0.5518973544239998, + "reward_std": 0.09004351496696472, + "rewards/accuracy_reward": 0.07812500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4737723469734192, + "step": 726 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.0513458251953, + "epoch": 0.21716077962810842, + "grad_norm": 67.75761413574219, + "kl": 8.21875, + "learning_rate": 1.9175722654855033e-05, + "loss": 0.3276, + "reward": 0.5418526977300644, + "reward_std": 0.1338912695646286, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4704241305589676, + "step": 727 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2589569091797, + "epoch": 0.2174594877156299, + "grad_norm": 1.8416246175765991, + "kl": 1.919921875, + "learning_rate": 1.917157091824919e-05, + "loss": 0.0768, + "reward": 0.537388414144516, + "reward_std": 0.0750106479972601, + "rewards/accuracy_reward": 0.05357143026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169813156128, + "step": 728 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.21775819580315137, + "grad_norm": 1.6852625608444214, + "kl": 2.59765625, + "learning_rate": 1.9167409203873095e-05, + "loss": 0.104, + "reward": 0.5870535969734192, + "reward_std": 0.13253043591976166, + "rewards/accuracy_reward": 0.11383928963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4732143133878708, + "step": 729 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.9486694335938, + "epoch": 0.21805690389067284, + "grad_norm": 6.766231536865234, + "kl": 2.6484375, + "learning_rate": 1.916323751625429e-05, + "loss": 0.1062, + "reward": 0.5424107313156128, + "reward_std": 0.1182341855019331, + "rewards/accuracy_reward": 0.06026786123402417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4821428805589676, + "step": 730 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2183556119781943, + "grad_norm": 3.7828783988952637, + "kl": 2.328125, + "learning_rate": 1.9159055859931163e-05, + "loss": 0.093, + "reward": 0.5446428805589676, + "reward_std": 0.10318935592658818, + "rewards/accuracy_reward": 0.0647321455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4799107313156128, + "step": 731 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.21865432006571578, + "grad_norm": 4.9503889083862305, + "kl": 1.712890625, + "learning_rate": 1.915486423945294e-05, + "loss": 0.0685, + "reward": 0.5753348469734192, + "reward_std": 0.1247345432639122, + "rewards/accuracy_reward": 0.09375000116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848395228386, + "step": 732 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.21895302815323725, + "grad_norm": 4.212091445922852, + "kl": 3.044921875, + "learning_rate": 1.9150662659379705e-05, + "loss": 0.1218, + "reward": 0.5518973395228386, + "reward_std": 0.1277256105095148, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4737723469734192, + "step": 733 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.21925173624075872, + "grad_norm": 4.248727321624756, + "kl": 1.58203125, + "learning_rate": 1.914645112428235e-05, + "loss": 0.0632, + "reward": 0.5697544887661934, + "reward_std": 0.12547817453742027, + "rewards/accuracy_reward": 0.0892857201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687723517418, + "step": 734 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2195504443282802, + "grad_norm": 6.611210346221924, + "kl": 1.18359375, + "learning_rate": 1.9142229638742623e-05, + "loss": 0.0474, + "reward": 0.5970982313156128, + "reward_std": 0.09159395284950733, + "rewards/accuracy_reward": 0.11160714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 735 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.9196472167969, + "epoch": 0.21984915241580166, + "grad_norm": 23.785200119018555, + "kl": 3.619140625, + "learning_rate": 1.913799820735309e-05, + "loss": 0.1447, + "reward": 0.5518973544239998, + "reward_std": 0.1248176321387291, + "rewards/accuracy_reward": 0.06696428754366934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 736 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.3817138671875, + "epoch": 0.22014786050332313, + "grad_norm": 37.6086311340332, + "kl": 5.0, + "learning_rate": 1.9133756834717118e-05, + "loss": 0.1936, + "reward": 0.5256696566939354, + "reward_std": 0.08569269720464945, + "rewards/accuracy_reward": 0.04464285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810267984867096, + "step": 737 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.9241333007812, + "epoch": 0.2204465685908446, + "grad_norm": 61.38288116455078, + "kl": 5.623046875, + "learning_rate": 1.9129505525448917e-05, + "loss": 0.2268, + "reward": 0.4960937649011612, + "reward_std": 0.07180869020521641, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487165205180645, + "step": 738 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.22074527667836608, + "grad_norm": 7.251669883728027, + "kl": 1.2353515625, + "learning_rate": 1.9125244284173497e-05, + "loss": 0.0493, + "reward": 0.5189732387661934, + "reward_std": 0.11716912779957056, + "rewards/accuracy_reward": 0.03348214412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910969734192, + "step": 739 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.8147430419922, + "epoch": 0.22104398476588755, + "grad_norm": 9.424324989318848, + "kl": 0.990234375, + "learning_rate": 1.912097311552666e-05, + "loss": 0.0396, + "reward": 0.5987723395228386, + "reward_std": 0.0581556586548686, + "rewards/accuracy_reward": 0.11160714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487165205180645, + "step": 740 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.8236694335938, + "epoch": 0.22134269285340902, + "grad_norm": 2.316167116165161, + "kl": 0.7314453125, + "learning_rate": 1.9116692024155026e-05, + "loss": 0.0278, + "reward": 0.529575914144516, + "reward_std": 0.0935511402785778, + "rewards/accuracy_reward": 0.03571428684517741, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 741 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2216414009409305, + "grad_norm": 13.48326301574707, + "kl": 2.42578125, + "learning_rate": 1.9112401014716004e-05, + "loss": 0.0969, + "reward": 0.5613839477300644, + "reward_std": 0.08409876935184002, + "rewards/accuracy_reward": 0.07366071734577417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 742 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.22194010902845193, + "grad_norm": 12.017850875854492, + "kl": 1.951171875, + "learning_rate": 1.9108100091877787e-05, + "loss": 0.078, + "reward": 0.5178571790456772, + "reward_std": 0.08331916900351644, + "rewards/accuracy_reward": 0.0267857164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 743 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.9151916503906, + "epoch": 0.2222388171159734, + "grad_norm": 2.6548731327056885, + "kl": 1.5859375, + "learning_rate": 1.9103789260319362e-05, + "loss": 0.0634, + "reward": 0.6361607387661934, + "reward_std": 0.058256207033991814, + "rewards/accuracy_reward": 0.14508929220028222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 744 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.2812652587891, + "epoch": 0.22253752520349487, + "grad_norm": 4.8434672355651855, + "kl": 0.84130859375, + "learning_rate": 1.9099468524730485e-05, + "loss": 0.0311, + "reward": 0.509486623108387, + "reward_std": 0.08994589652866125, + "rewards/accuracy_reward": 0.020089287078008056, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 745 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2768096923828, + "epoch": 0.22283623329101634, + "grad_norm": 4.985926628112793, + "kl": 0.7197265625, + "learning_rate": 1.90951378898117e-05, + "loss": 0.0286, + "reward": 0.5340402126312256, + "reward_std": 0.10301979538053274, + "rewards/accuracy_reward": 0.0424107164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 746 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.4709930419922, + "epoch": 0.22313494137853782, + "grad_norm": 1.6942225694656372, + "kl": 1.0029296875, + "learning_rate": 1.909079736027431e-05, + "loss": 0.042, + "reward": 0.589285746216774, + "reward_std": 0.0767946969717741, + "rewards/accuracy_reward": 0.09821428824216127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 747 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2234336494660593, + "grad_norm": 14.680274963378906, + "kl": 2.0732421875, + "learning_rate": 1.9086446940840386e-05, + "loss": 0.0829, + "reward": 0.5825893133878708, + "reward_std": 0.0738157439045608, + "rewards/accuracy_reward": 0.08928571734577417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 748 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.9553833007812, + "epoch": 0.22373235755358076, + "grad_norm": 13.678045272827148, + "kl": 1.6796875, + "learning_rate": 1.9082086636242757e-05, + "loss": 0.0677, + "reward": 0.5329241305589676, + "reward_std": 0.0915296939201653, + "rewards/accuracy_reward": 0.03794643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 749 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.2455596923828, + "epoch": 0.22403106564110223, + "grad_norm": 8.959420204162598, + "kl": 1.1474609375, + "learning_rate": 1.9077716451225007e-05, + "loss": 0.038, + "reward": 0.5625000149011612, + "reward_std": 0.09211359824985266, + "rewards/accuracy_reward": 0.06696428847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357238650322, + "step": 750 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.3460083007812, + "epoch": 0.2243297737286237, + "grad_norm": 0.6668829917907715, + "kl": 0.6025390625, + "learning_rate": 1.9073336390541472e-05, + "loss": 0.0221, + "reward": 0.5552455633878708, + "reward_std": 0.10671877674758434, + "rewards/accuracy_reward": 0.06026785937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 751 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.5736846923828, + "epoch": 0.22462848181614517, + "grad_norm": 1.415365219116211, + "kl": 0.3515625, + "learning_rate": 1.9068946458957225e-05, + "loss": 0.0141, + "reward": 0.5591517984867096, + "reward_std": 0.05334699316881597, + "rewards/accuracy_reward": 0.0625000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 752 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2455596923828, + "epoch": 0.22492718990366664, + "grad_norm": 0.7201195955276489, + "kl": 0.3154296875, + "learning_rate": 1.9064546661248084e-05, + "loss": 0.0125, + "reward": 0.6054687798023224, + "reward_std": 0.06791124120354652, + "rewards/accuracy_reward": 0.10714286146685481, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 753 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.8861846923828, + "epoch": 0.2252258979911881, + "grad_norm": 1.2880821228027344, + "kl": 0.37744140625, + "learning_rate": 1.9060137002200597e-05, + "loss": 0.0149, + "reward": 0.5931920111179352, + "reward_std": 0.07724380679428577, + "rewards/accuracy_reward": 0.09598214505240321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 754 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.8750152587891, + "epoch": 0.22552460607870958, + "grad_norm": 1.2805230617523193, + "kl": 0.51513671875, + "learning_rate": 1.905571748661204e-05, + "loss": 0.0206, + "reward": 0.547991082072258, + "reward_std": 0.09959492087364197, + "rewards/accuracy_reward": 0.051339289639145136, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 755 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.5625305175781, + "epoch": 0.22582331416623105, + "grad_norm": 5.473662376403809, + "kl": 0.96484375, + "learning_rate": 1.9051288119290414e-05, + "loss": 0.0356, + "reward": 0.5440848469734192, + "reward_std": 0.13247237913310528, + "rewards/accuracy_reward": 0.0558035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 756 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.6495819091797, + "epoch": 0.22612202225375252, + "grad_norm": 2.2164695262908936, + "kl": 0.62109375, + "learning_rate": 1.9046848905054433e-05, + "loss": 0.0251, + "reward": 0.6160714626312256, + "reward_std": 0.10812770947813988, + "rewards/accuracy_reward": 0.12500000349245965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 757 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.1830749511719, + "epoch": 0.226420730341274, + "grad_norm": 2.0726852416992188, + "kl": 0.53515625, + "learning_rate": 1.904239984873353e-05, + "loss": 0.0215, + "reward": 0.575334832072258, + "reward_std": 0.10165448300540447, + "rewards/accuracy_reward": 0.09151786309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169887661934, + "step": 758 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.7902069091797, + "epoch": 0.22671943842879547, + "grad_norm": 29.375459671020508, + "kl": 3.69140625, + "learning_rate": 1.9037940955167845e-05, + "loss": 0.1532, + "reward": 0.4737723395228386, + "reward_std": 0.179183566942811, + "rewards/accuracy_reward": 0.05357143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.420200914144516, + "step": 759 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.7187957763672, + "epoch": 0.22701814651631694, + "grad_norm": 13.606019020080566, + "kl": 2.08203125, + "learning_rate": 1.9033472229208213e-05, + "loss": 0.0867, + "reward": 0.6054687723517418, + "reward_std": 0.18180176615715027, + "rewards/accuracy_reward": 0.18080358300358057, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4246651977300644, + "step": 760 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.1897888183594, + "epoch": 0.2273168546038384, + "grad_norm": 3.207282304763794, + "kl": 0.9765625, + "learning_rate": 1.902899367571617e-05, + "loss": 0.0409, + "reward": 0.4888393133878708, + "reward_std": 0.15113691054284573, + "rewards/accuracy_reward": 0.05133928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4375000223517418, + "step": 761 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.0960083007812, + "epoch": 0.22761556269135988, + "grad_norm": 0.5582742094993591, + "kl": 0.6318359375, + "learning_rate": 1.902450529956395e-05, + "loss": 0.026, + "reward": 0.5825893133878708, + "reward_std": 0.17799319326877594, + "rewards/accuracy_reward": 0.14732143399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4352678805589676, + "step": 762 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.1518249511719, + "epoch": 0.22791427077888135, + "grad_norm": 0.26021748781204224, + "kl": 0.544921875, + "learning_rate": 1.902000710563445e-05, + "loss": 0.0218, + "reward": 0.5474330633878708, + "reward_std": 0.16847065091133118, + "rewards/accuracy_reward": 0.10491071688011289, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4425223469734192, + "step": 763 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.6607513427734, + "epoch": 0.22821297886640282, + "grad_norm": 0.2972263693809509, + "kl": 0.509765625, + "learning_rate": 1.9015499098821283e-05, + "loss": 0.0208, + "reward": 0.5641741380095482, + "reward_std": 0.1407075971364975, + "rewards/accuracy_reward": 0.1250000069849193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4391741305589676, + "step": 764 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.513427734375, + "epoch": 0.22851168695392426, + "grad_norm": 0.29660528898239136, + "kl": 0.46630859375, + "learning_rate": 1.901098128402871e-05, + "loss": 0.0188, + "reward": 0.5396205484867096, + "reward_std": 0.173086391761899, + "rewards/accuracy_reward": 0.10044643189758062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4391741305589676, + "step": 765 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.794677734375, + "epoch": 0.22881039504144574, + "grad_norm": 0.4060265123844147, + "kl": 0.43017578125, + "learning_rate": 1.900645366617167e-05, + "loss": 0.0165, + "reward": 0.573660746216774, + "reward_std": 0.1346972230821848, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4642857313156128, + "step": 766 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.8504943847656, + "epoch": 0.2291091031289672, + "grad_norm": 0.19650907814502716, + "kl": 0.298828125, + "learning_rate": 1.9001916250175764e-05, + "loss": 0.0128, + "reward": 0.592075914144516, + "reward_std": 0.058586302446201444, + "rewards/accuracy_reward": 0.0959821455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 767 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.22940781121648868, + "grad_norm": 0.08946015685796738, + "kl": 0.22119140625, + "learning_rate": 1.8997369040977266e-05, + "loss": 0.0088, + "reward": 0.550223246216774, + "reward_std": 0.03375995112583041, + "rewards/accuracy_reward": 0.051339289639145136, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 768 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.9062652587891, + "epoch": 0.22970651930401015, + "grad_norm": 0.24060720205307007, + "kl": 0.2216796875, + "learning_rate": 1.899281204352309e-05, + "loss": 0.0088, + "reward": 0.6467634290456772, + "reward_std": 0.09453641623258591, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 769 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.4843902587891, + "epoch": 0.23000522739153162, + "grad_norm": 0.4101909399032593, + "kl": 0.227294921875, + "learning_rate": 1.8988245262770795e-05, + "loss": 0.0093, + "reward": 0.6015625149011612, + "reward_std": 0.08293784968554974, + "rewards/accuracy_reward": 0.1183035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.483258955180645, + "step": 770 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.6026916503906, + "epoch": 0.2303039354790531, + "grad_norm": 2331.345703125, + "kl": 107.125, + "learning_rate": 1.8983668703688598e-05, + "loss": 4.282, + "reward": 0.5463169813156128, + "reward_std": 0.08464733697474003, + "rewards/accuracy_reward": 0.05580357299186289, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 771 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.23060264356657456, + "grad_norm": 14.35774040222168, + "kl": 0.55078125, + "learning_rate": 1.8979082371255347e-05, + "loss": 0.022, + "reward": 0.598214328289032, + "reward_std": 0.06699470477178693, + "rewards/accuracy_reward": 0.1049107201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 772 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.5803985595703, + "epoch": 0.23090135165409603, + "grad_norm": 0.34338945150375366, + "kl": 0.265625, + "learning_rate": 1.8974486270460518e-05, + "loss": 0.0081, + "reward": 0.5429687798023224, + "reward_std": 0.11333895102143288, + "rewards/accuracy_reward": 0.05803571664728224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 773 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.8080444335938, + "epoch": 0.2312000597416175, + "grad_norm": 0.37863755226135254, + "kl": 0.232666015625, + "learning_rate": 1.8969880406304227e-05, + "loss": 0.0089, + "reward": 0.5641741305589676, + "reward_std": 0.08255671989172697, + "rewards/accuracy_reward": 0.08258928847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848395228386, + "step": 774 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8036041259766, + "epoch": 0.23149876782913897, + "grad_norm": 0.7262576222419739, + "kl": 0.25732421875, + "learning_rate": 1.8965264783797192e-05, + "loss": 0.0049, + "reward": 0.6467634290456772, + "reward_std": 0.2025115080177784, + "rewards/accuracy_reward": 0.2031250149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.443638414144516, + "step": 775 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.2746124267578, + "epoch": 0.23179747591666044, + "grad_norm": 1.247390866279602, + "kl": 0.32080078125, + "learning_rate": 1.8960639407960764e-05, + "loss": 0.0061, + "reward": 0.4603794813156128, + "reward_std": 0.2073315754532814, + "rewards/accuracy_reward": 0.07812500419095159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3822544738650322, + "step": 776 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.7366485595703, + "epoch": 0.23209618400418192, + "grad_norm": 2.438589096069336, + "kl": 0.41064453125, + "learning_rate": 1.8956004283826897e-05, + "loss": 0.0116, + "reward": 0.4179687649011612, + "reward_std": 0.15649725124239922, + "rewards/accuracy_reward": 0.051339287078008056, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3666294813156128, + "step": 777 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.5245819091797, + "epoch": 0.2323948920917034, + "grad_norm": 2.085062265396118, + "kl": 0.6171875, + "learning_rate": 1.8951359416438152e-05, + "loss": 0.0154, + "reward": 0.4626116305589676, + "reward_std": 0.17872554436326027, + "rewards/accuracy_reward": 0.07812500325962901, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3844866305589676, + "step": 778 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.9687957763672, + "epoch": 0.23269360017922486, + "grad_norm": 3.1912879943847656, + "kl": 1.5859375, + "learning_rate": 1.894670481084769e-05, + "loss": 0.06, + "reward": 0.4146205559372902, + "reward_std": 0.18584304675459862, + "rewards/accuracy_reward": 0.04017857299186289, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3744419813156128, + "step": 779 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.185302734375, + "epoch": 0.23299230826674633, + "grad_norm": 3.181189775466919, + "kl": 2.57421875, + "learning_rate": 1.8942040472119263e-05, + "loss": 0.0979, + "reward": 0.4626116305589676, + "reward_std": 0.16692664846777916, + "rewards/accuracy_reward": 0.06919643119908869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3934151977300644, + "step": 780 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.7902069091797, + "epoch": 0.2332910163542678, + "grad_norm": 1.4188387393951416, + "kl": 1.341796875, + "learning_rate": 1.8937366405327217e-05, + "loss": 0.0485, + "reward": 0.5039062723517418, + "reward_std": 0.1266882512718439, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.423549123108387, + "step": 781 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2388610839844, + "epoch": 0.23358972444178927, + "grad_norm": 9.876374244689941, + "kl": 2.431640625, + "learning_rate": 1.8932682615556478e-05, + "loss": 0.0959, + "reward": 0.552455373108387, + "reward_std": 0.16196111217141151, + "rewards/accuracy_reward": 0.10937500675208867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4430803805589676, + "step": 782 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.7388458251953, + "epoch": 0.23388843252931074, + "grad_norm": 1.4308191537857056, + "kl": 1.259765625, + "learning_rate": 1.8927989107902554e-05, + "loss": 0.0505, + "reward": 0.5262276828289032, + "reward_std": 0.13059073872864246, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4481026977300644, + "step": 783 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.6942138671875, + "epoch": 0.2341871406168322, + "grad_norm": 1.3306459188461304, + "kl": 1.798828125, + "learning_rate": 1.8923285887471514e-05, + "loss": 0.0717, + "reward": 0.4793526977300644, + "reward_std": 0.18097754754126072, + "rewards/accuracy_reward": 0.05133928591385484, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.428013414144516, + "step": 784 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.3437652587891, + "epoch": 0.23448584870435368, + "grad_norm": 11.818349838256836, + "kl": 3.35546875, + "learning_rate": 1.8918572959380005e-05, + "loss": 0.1333, + "reward": 0.534040205180645, + "reward_std": 0.13607310317456722, + "rewards/accuracy_reward": 0.09598214784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4380580559372902, + "step": 785 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.5647583007812, + "epoch": 0.23478455679187513, + "grad_norm": 3.1851119995117188, + "kl": 1.5078125, + "learning_rate": 1.891385032875523e-05, + "loss": 0.0584, + "reward": 0.5111607387661934, + "reward_std": 0.16221912950277328, + "rewards/accuracy_reward": 0.0736607201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4375000149011612, + "step": 786 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.0960083007812, + "epoch": 0.2350832648793966, + "grad_norm": 16.740480422973633, + "kl": 2.876953125, + "learning_rate": 1.890911800073495e-05, + "loss": 0.1131, + "reward": 0.4799107313156128, + "reward_std": 0.1371930167078972, + "rewards/accuracy_reward": 0.026785715483129025, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4531250149011612, + "step": 787 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.9776916503906, + "epoch": 0.23538197296691807, + "grad_norm": 3.372645139694214, + "kl": 1.81640625, + "learning_rate": 1.8904375980467474e-05, + "loss": 0.0722, + "reward": 0.628348246216774, + "reward_std": 0.16089612990617752, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4453125149011612, + "step": 788 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.5714569091797, + "epoch": 0.23568068105443954, + "grad_norm": 8.574797630310059, + "kl": 2.01953125, + "learning_rate": 1.889962427311165e-05, + "loss": 0.0788, + "reward": 0.516741082072258, + "reward_std": 0.0969607075676322, + "rewards/accuracy_reward": 0.04910714481957257, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4676339477300644, + "step": 789 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.235979389141961, + "grad_norm": 30.110551834106445, + "kl": 5.26171875, + "learning_rate": 1.8894862883836875e-05, + "loss": 0.2109, + "reward": 0.6183035969734192, + "reward_std": 0.16113417595624924, + "rewards/accuracy_reward": 0.1495535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4687500223517418, + "step": 790 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.4888610839844, + "epoch": 0.23627809722948248, + "grad_norm": 6.353398323059082, + "kl": 0.8154296875, + "learning_rate": 1.8890091817823073e-05, + "loss": 0.0293, + "reward": 0.6333705633878708, + "reward_std": 0.08845151402056217, + "rewards/accuracy_reward": 0.15625000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4771205559372902, + "step": 791 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2321472167969, + "epoch": 0.23657680531700395, + "grad_norm": 2.6906111240386963, + "kl": 1.623046875, + "learning_rate": 1.8885311080260695e-05, + "loss": 0.0633, + "reward": 0.534598246216774, + "reward_std": 0.09773872280493379, + "rewards/accuracy_reward": 0.0580357164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4765625223517418, + "step": 792 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.9531402587891, + "epoch": 0.23687551340452542, + "grad_norm": 130.83889770507812, + "kl": 19.53125, + "learning_rate": 1.8880520676350717e-05, + "loss": 0.7763, + "reward": 0.503348246216774, + "reward_std": 0.11100805085152388, + "rewards/accuracy_reward": 0.0267857164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4765625223517418, + "step": 793 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.6585083007812, + "epoch": 0.2371742214920469, + "grad_norm": 64.39842987060547, + "kl": 10.9609375, + "learning_rate": 1.8875720611304628e-05, + "loss": 0.4384, + "reward": 0.5703125298023224, + "reward_std": 0.09128244873136282, + "rewards/accuracy_reward": 0.09598214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4743303805589676, + "step": 794 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.6897430419922, + "epoch": 0.23747292957956836, + "grad_norm": 31.126543045043945, + "kl": 0.830078125, + "learning_rate": 1.887091089034443e-05, + "loss": 0.0321, + "reward": 0.584263414144516, + "reward_std": 0.14493203163146973, + "rewards/accuracy_reward": 0.11383929289877415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.470424123108387, + "step": 795 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.5491485595703, + "epoch": 0.23777163766708984, + "grad_norm": 4.479785442352295, + "kl": 0.931640625, + "learning_rate": 1.8866091518702622e-05, + "loss": 0.0348, + "reward": 0.4966518208384514, + "reward_std": 0.11632258631289005, + "rewards/accuracy_reward": 0.024553572991862893, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4720982387661934, + "step": 796 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.2991333007812, + "epoch": 0.2380703457546113, + "grad_norm": 4.055196762084961, + "kl": 1.3828125, + "learning_rate": 1.8861262501622213e-05, + "loss": 0.0537, + "reward": 0.5033482536673546, + "reward_std": 0.1157107725739479, + "rewards/accuracy_reward": 0.0267857164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4765625298023224, + "step": 797 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.7500305175781, + "epoch": 0.23836905384213278, + "grad_norm": 1282.543701171875, + "kl": 83.1875, + "learning_rate": 1.88564238443567e-05, + "loss": 3.3314, + "reward": 0.4793526977300644, + "reward_std": 0.09853614494204521, + "rewards/accuracy_reward": 0.008928572060540318, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4704241305589676, + "step": 798 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.2969055175781, + "epoch": 0.23866776192965425, + "grad_norm": 831.8845825195312, + "kl": 112.84375, + "learning_rate": 1.8851575552170064e-05, + "loss": 4.5286, + "reward": 0.573660746216774, + "reward_std": 0.11562490742653608, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4732143059372902, + "step": 799 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.9911041259766, + "epoch": 0.23896647001717572, + "grad_norm": 98.67723083496094, + "kl": 22.78125, + "learning_rate": 1.884671763033678e-05, + "loss": 0.8792, + "reward": 0.5753348395228386, + "reward_std": 0.08639596775174141, + "rewards/accuracy_reward": 0.0982142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4771205559372902, + "step": 800 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.8594055175781, + "epoch": 0.2392651781046972, + "grad_norm": 7.608706474304199, + "kl": 3.00390625, + "learning_rate": 1.8841850084141783e-05, + "loss": 0.121, + "reward": 0.5535714477300644, + "reward_std": 0.1165941096842289, + "rewards/accuracy_reward": 0.07589286030270159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4776785895228386, + "step": 801 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.3817138671875, + "epoch": 0.23956388619221866, + "grad_norm": 1.2870513200759888, + "kl": 0.48681640625, + "learning_rate": 1.883697291888049e-05, + "loss": 0.0202, + "reward": 0.5552455559372902, + "reward_std": 0.12635097932070494, + "rewards/accuracy_reward": 0.0647321455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 802 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.5201110839844, + "epoch": 0.23986259427974013, + "grad_norm": 1.4604252576828003, + "kl": 0.5166015625, + "learning_rate": 1.8832086139858777e-05, + "loss": 0.0206, + "reward": 0.604352705180645, + "reward_std": 0.10523173864930868, + "rewards/accuracy_reward": 0.1160714328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 803 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.8995666503906, + "epoch": 0.2401613023672616, + "grad_norm": 1.2227705717086792, + "kl": 0.5947265625, + "learning_rate": 1.8827189752392982e-05, + "loss": 0.0234, + "reward": 0.5507812798023224, + "reward_std": 0.13773047737777233, + "rewards/accuracy_reward": 0.06696428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169887661934, + "step": 804 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.8772430419922, + "epoch": 0.24046001045478307, + "grad_norm": 1.455676555633545, + "kl": 0.978515625, + "learning_rate": 1.882228376180989e-05, + "loss": 0.0394, + "reward": 0.597098246216774, + "reward_std": 0.1033830028027296, + "rewards/accuracy_reward": 0.12276786309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4743303805589676, + "step": 805 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.9821472167969, + "epoch": 0.24075871854230455, + "grad_norm": 1.2886136770248413, + "kl": 0.697265625, + "learning_rate": 1.881736817344675e-05, + "loss": 0.0279, + "reward": 0.59765625, + "reward_std": 0.13364559598267078, + "rewards/accuracy_reward": 0.1160714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848395228386, + "step": 806 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.2968902587891, + "epoch": 0.24105742662982602, + "grad_norm": 0.558358907699585, + "kl": 0.49462890625, + "learning_rate": 1.8812442992651224e-05, + "loss": 0.0133, + "reward": 0.5468750223517418, + "reward_std": 0.09113525971770287, + "rewards/accuracy_reward": 0.06026785867288709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 807 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.8236694335938, + "epoch": 0.24135613471734746, + "grad_norm": 0.2408507913351059, + "kl": 0.431640625, + "learning_rate": 1.880750822478144e-05, + "loss": 0.0173, + "reward": 0.5117187649011612, + "reward_std": 0.05279536498710513, + "rewards/accuracy_reward": 0.013392857974395156, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 808 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.24165484280486893, + "grad_norm": 0.15125730633735657, + "kl": 0.38232421875, + "learning_rate": 1.880256387520593e-05, + "loss": 0.0153, + "reward": 0.5094866305589676, + "reward_std": 0.04386679199524224, + "rewards/accuracy_reward": 0.011160714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 809 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2419535508923904, + "grad_norm": 0.6183080077171326, + "kl": 0.392578125, + "learning_rate": 1.8797609949303674e-05, + "loss": 0.0157, + "reward": 0.5736607313156128, + "reward_std": 0.008928571827709675, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 810 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.24225225897991187, + "grad_norm": 0.16105253994464874, + "kl": 0.37548828125, + "learning_rate": 1.879264645246405e-05, + "loss": 0.015, + "reward": 0.5714285969734192, + "reward_std": 0.08487515803426504, + "rewards/accuracy_reward": 0.07142857578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 811 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.24255096706743334, + "grad_norm": 0.13228006660938263, + "kl": 0.39453125, + "learning_rate": 1.8787673390086857e-05, + "loss": 0.0158, + "reward": 0.5239955484867096, + "reward_std": 0.05907375295646489, + "rewards/accuracy_reward": 0.02455357275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 812 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2428496751549548, + "grad_norm": 0.11041919142007828, + "kl": 0.3603515625, + "learning_rate": 1.8782690767582295e-05, + "loss": 0.0144, + "reward": 0.553013414144516, + "reward_std": 0.0415295185521245, + "rewards/accuracy_reward": 0.053571430034935474, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 813 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.24314838324247628, + "grad_norm": 0.10428806394338608, + "kl": 0.3525390625, + "learning_rate": 1.8777698590370983e-05, + "loss": 0.0141, + "reward": 0.5200892984867096, + "reward_std": 0.0482259476557374, + "rewards/accuracy_reward": 0.02008928661234677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 814 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.9151916503906, + "epoch": 0.24344709132999776, + "grad_norm": 0.5636239647865295, + "kl": 0.34033203125, + "learning_rate": 1.8772696863883905e-05, + "loss": 0.0136, + "reward": 0.5820312947034836, + "reward_std": 0.0805139858275652, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 815 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.24374579941751923, + "grad_norm": 0.5248473882675171, + "kl": 0.359375, + "learning_rate": 1.876768559356246e-05, + "loss": 0.0144, + "reward": 0.6227678954601288, + "reward_std": 0.0916601587086916, + "rewards/accuracy_reward": 0.12276786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 816 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2440445075050407, + "grad_norm": 0.09236925095319748, + "kl": 0.34521484375, + "learning_rate": 1.8762664784858412e-05, + "loss": 0.0138, + "reward": 0.5602678805589676, + "reward_std": 0.0395701015368104, + "rewards/accuracy_reward": 0.06026785937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 817 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.24434321559256217, + "grad_norm": 0.20103706419467926, + "kl": 0.36181640625, + "learning_rate": 1.875763444323391e-05, + "loss": 0.0145, + "reward": 0.5239955484867096, + "reward_std": 0.0643519964069128, + "rewards/accuracy_reward": 0.02455357275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 818 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.24464192368008364, + "grad_norm": 0.0912068784236908, + "kl": 0.36328125, + "learning_rate": 1.875259457416148e-05, + "loss": 0.0145, + "reward": 0.6316964775323868, + "reward_std": 0.04526757914572954, + "rewards/accuracy_reward": 0.13169643515720963, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 819 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.6473388671875, + "epoch": 0.2449406317676051, + "grad_norm": 0.11641153693199158, + "kl": 0.3876953125, + "learning_rate": 1.8747545183123996e-05, + "loss": 0.0155, + "reward": 0.5468750149011612, + "reward_std": 0.0782818403095007, + "rewards/accuracy_reward": 0.046875000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 820 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.24523933985512658, + "grad_norm": 3412.322509765625, + "kl": 60.30712890625, + "learning_rate": 1.8742486275614706e-05, + "loss": 2.4159, + "reward": 0.5055803656578064, + "reward_std": 0.03125000186264515, + "rewards/accuracy_reward": 0.006696428870782256, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 821 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.7388610839844, + "epoch": 0.24553804794264805, + "grad_norm": 0.10382084548473358, + "kl": 0.39697265625, + "learning_rate": 1.8737417857137204e-05, + "loss": 0.0156, + "reward": 0.5613839626312256, + "reward_std": 0.06249528471380472, + "rewards/accuracy_reward": 0.06250000302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 822 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.4174346923828, + "epoch": 0.24583675603016952, + "grad_norm": 0.12200701236724854, + "kl": 0.39453125, + "learning_rate": 1.873233993320543e-05, + "loss": 0.0182, + "reward": 0.555803582072258, + "reward_std": 0.054956382140517235, + "rewards/accuracy_reward": 0.0558035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 823 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.6049346923828, + "epoch": 0.246135464117691, + "grad_norm": 1.4175945520401, + "kl": 0.41748046875, + "learning_rate": 1.872725250934367e-05, + "loss": 0.0129, + "reward": 0.554129496216774, + "reward_std": 0.10002200771123171, + "rewards/accuracy_reward": 0.05580357392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 824 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.419677734375, + "epoch": 0.24643417220521247, + "grad_norm": 0.13394922018051147, + "kl": 0.38525390625, + "learning_rate": 1.8722155591086545e-05, + "loss": 0.0161, + "reward": 0.505022332072258, + "reward_std": 0.027823751559481025, + "rewards/accuracy_reward": 0.0066964291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 825 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.6786041259766, + "epoch": 0.24673288029273394, + "grad_norm": 0.11164297163486481, + "kl": 0.3447265625, + "learning_rate": 1.8717049183979e-05, + "loss": 0.0138, + "reward": 0.6562500298023224, + "reward_std": 0.05168620287440717, + "rewards/accuracy_reward": 0.15848215529695153, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 826 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.6875457763672, + "epoch": 0.2470315883802554, + "grad_norm": 0.10266058146953583, + "kl": 0.33984375, + "learning_rate": 1.8711933293576303e-05, + "loss": 0.0128, + "reward": 0.608816996216774, + "reward_std": 0.07944084610790014, + "rewards/accuracy_reward": 0.10937500419095159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 827 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.2478332519531, + "epoch": 0.24733029646777688, + "grad_norm": 1.4865108728408813, + "kl": 0.33056640625, + "learning_rate": 1.8706807925444045e-05, + "loss": 0.0132, + "reward": 0.5161830484867096, + "reward_std": 0.061149825574830174, + "rewards/accuracy_reward": 0.017857144121080637, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 828 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.8392944335938, + "epoch": 0.24762900455529832, + "grad_norm": 0.10072160512208939, + "kl": 0.31591796875, + "learning_rate": 1.870167308515812e-05, + "loss": 0.0127, + "reward": 0.5781250298023224, + "reward_std": 0.044642859138548374, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 829 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.607177734375, + "epoch": 0.2479277126428198, + "grad_norm": 22.148332595825195, + "kl": 0.8251953125, + "learning_rate": 1.869652877830474e-05, + "loss": 0.0331, + "reward": 0.5524553954601288, + "reward_std": 0.06574365310370922, + "rewards/accuracy_reward": 0.05580357322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 830 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.6763610839844, + "epoch": 0.24822642073034126, + "grad_norm": 0.12319738417863846, + "kl": 0.3154296875, + "learning_rate": 1.8691375010480397e-05, + "loss": 0.0126, + "reward": 0.5630580633878708, + "reward_std": 0.07717469613999128, + "rewards/accuracy_reward": 0.06473214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 831 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.6339569091797, + "epoch": 0.24852512881786273, + "grad_norm": 0.18024007976055145, + "kl": 0.2958984375, + "learning_rate": 1.868621178729189e-05, + "loss": 0.0121, + "reward": 0.6010044813156128, + "reward_std": 0.08440811978653073, + "rewards/accuracy_reward": 0.1026785746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 832 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.7589416503906, + "epoch": 0.2488238369053842, + "grad_norm": 0.0994255542755127, + "kl": 0.30712890625, + "learning_rate": 1.8681039114356298e-05, + "loss": 0.0081, + "reward": 0.5039062649011612, + "reward_std": 0.030873439274728298, + "rewards/accuracy_reward": 0.0066964291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 833 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.0133972167969, + "epoch": 0.24912254499290568, + "grad_norm": 0.10127174854278564, + "kl": 0.2861328125, + "learning_rate": 1.867585699730098e-05, + "loss": 0.0114, + "reward": 0.5736607313156128, + "reward_std": 0.07066834578290582, + "rewards/accuracy_reward": 0.07589286053553224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 834 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.7701110839844, + "epoch": 0.24942125308042715, + "grad_norm": 0.11694991588592529, + "kl": 0.291015625, + "learning_rate": 1.867066544176358e-05, + "loss": 0.0115, + "reward": 0.5781250149011612, + "reward_std": 0.07558072032406926, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 835 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.9062805175781, + "epoch": 0.24971996116794862, + "grad_norm": 0.21769076585769653, + "kl": 0.2880859375, + "learning_rate": 1.8665464453391994e-05, + "loss": 0.0111, + "reward": 0.577566996216774, + "reward_std": 0.08820465113967657, + "rewards/accuracy_reward": 0.08035714598372579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 836 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.7366180419922, + "epoch": 0.2500186692554701, + "grad_norm": 0.32051852345466614, + "kl": 0.29345703125, + "learning_rate": 1.866025403784439e-05, + "loss": 0.0107, + "reward": 0.5619419813156128, + "reward_std": 0.0678214740473777, + "rewards/accuracy_reward": 0.06696428707800806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 837 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.8549194335938, + "epoch": 0.2503173773429916, + "grad_norm": 0.14015957713127136, + "kl": 0.2880859375, + "learning_rate": 1.8655034200789187e-05, + "loss": 0.0115, + "reward": 0.6171875298023224, + "reward_std": 0.09436129592359066, + "rewards/accuracy_reward": 0.12053572130389512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 838 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.7879638671875, + "epoch": 0.25061608543051306, + "grad_norm": 0.17800332605838776, + "kl": 0.3017578125, + "learning_rate": 1.8649804947905057e-05, + "loss": 0.0121, + "reward": 0.5440848618745804, + "reward_std": 0.07851846888661385, + "rewards/accuracy_reward": 0.051339288242161274, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 839 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.6272583007812, + "epoch": 0.2509147935180345, + "grad_norm": 0.21721568703651428, + "kl": 0.29833984375, + "learning_rate": 1.864456628488092e-05, + "loss": 0.0069, + "reward": 0.6344866454601288, + "reward_std": 0.10535254888236523, + "rewards/accuracy_reward": 0.1406250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 840 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.25121350160555594, + "grad_norm": 0.2974444627761841, + "kl": 0.28271484375, + "learning_rate": 1.8639318217415918e-05, + "loss": 0.0113, + "reward": 0.5602678805589676, + "reward_std": 0.12067177519202232, + "rewards/accuracy_reward": 0.0691964328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 841 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.1161041259766, + "epoch": 0.2515122096930774, + "grad_norm": 0.49861449003219604, + "kl": 0.29638671875, + "learning_rate": 1.8634060751219442e-05, + "loss": 0.0073, + "reward": 0.5412946715950966, + "reward_std": 0.06610901979729533, + "rewards/accuracy_reward": 0.05133928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553880095482, + "step": 842 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.6250152587891, + "epoch": 0.2518109177805989, + "grad_norm": 0.590794026851654, + "kl": 0.3447265625, + "learning_rate": 1.8628793892011103e-05, + "loss": 0.0177, + "reward": 0.5463169813156128, + "reward_std": 0.08972704317420721, + "rewards/accuracy_reward": 0.06696428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4793526977300644, + "step": 843 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.5268096923828, + "epoch": 0.25210962586812036, + "grad_norm": 0.364703506231308, + "kl": 0.47509765625, + "learning_rate": 1.8623517645520714e-05, + "loss": 0.0192, + "reward": 0.5357143059372902, + "reward_std": 0.16041192784905434, + "rewards/accuracy_reward": 0.08258928963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4531250223517418, + "step": 844 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.4420166015625, + "epoch": 0.25240833395564183, + "grad_norm": 0.8169168829917908, + "kl": 0.6845703125, + "learning_rate": 1.861823201748833e-05, + "loss": 0.0277, + "reward": 0.5714285969734192, + "reward_std": 0.13622992299497128, + "rewards/accuracy_reward": 0.12500000605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4464285969734192, + "step": 845 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.560302734375, + "epoch": 0.2527070420431633, + "grad_norm": 0.7074524760246277, + "kl": 0.583984375, + "learning_rate": 1.861293701366418e-05, + "loss": 0.0224, + "reward": 0.5558036044239998, + "reward_std": 0.21337048150599003, + "rewards/accuracy_reward": 0.10044643399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.455357164144516, + "step": 846 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.0045166015625, + "epoch": 0.25300575013068477, + "grad_norm": 0.2605844736099243, + "kl": 0.4111328125, + "learning_rate": 1.8607632639808724e-05, + "loss": 0.0106, + "reward": 0.5228794887661934, + "reward_std": 0.1036860877647996, + "rewards/accuracy_reward": 0.042410717345774174, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687649011612, + "step": 847 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.7812652587891, + "epoch": 0.25330445821820624, + "grad_norm": 0.2430507093667984, + "kl": 0.37841796875, + "learning_rate": 1.8602318901692592e-05, + "loss": 0.015, + "reward": 0.5786830708384514, + "reward_std": 0.07326417695730925, + "rewards/accuracy_reward": 0.09151786286383867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 848 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.7768096923828, + "epoch": 0.2536031663057277, + "grad_norm": 0.19629159569740295, + "kl": 0.35986328125, + "learning_rate": 1.8596995805096615e-05, + "loss": 0.0145, + "reward": 0.5189732313156128, + "reward_std": 0.08284328505396843, + "rewards/accuracy_reward": 0.026785716181620955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 849 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.1094055175781, + "epoch": 0.2539018743932492, + "grad_norm": 0.1816451996564865, + "kl": 0.3525390625, + "learning_rate": 1.8591663355811794e-05, + "loss": 0.0105, + "reward": 0.6383928880095482, + "reward_std": 0.07399087399244308, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 850 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.6138610839844, + "epoch": 0.25420058248077065, + "grad_norm": 0.4337615668773651, + "kl": 0.3486328125, + "learning_rate": 1.8586321559639316e-05, + "loss": 0.0104, + "reward": 0.6774553805589676, + "reward_std": 0.12276740558445454, + "rewards/accuracy_reward": 0.1852678693830967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 851 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.7210235595703, + "epoch": 0.2544992905682921, + "grad_norm": 0.2858084738254547, + "kl": 0.3369140625, + "learning_rate": 1.8580970422390535e-05, + "loss": 0.0136, + "reward": 0.571986623108387, + "reward_std": 0.04973461525514722, + "rewards/accuracy_reward": 0.0803571455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 852 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.6093902587891, + "epoch": 0.2547979986558136, + "grad_norm": 0.19390204548835754, + "kl": 0.36083984375, + "learning_rate": 1.8575609949886955e-05, + "loss": 0.0163, + "reward": 0.611607164144516, + "reward_std": 0.11373101733624935, + "rewards/accuracy_reward": 0.12500000465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 853 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.8817138671875, + "epoch": 0.25509670674333507, + "grad_norm": 0.2980157136917114, + "kl": 0.3603515625, + "learning_rate": 1.8570240147960254e-05, + "loss": 0.0144, + "reward": 0.5725446790456772, + "reward_std": 0.10920927207916975, + "rewards/accuracy_reward": 0.08482143259607255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 854 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.25539541483085654, + "grad_norm": 0.3094604015350342, + "kl": 0.3466796875, + "learning_rate": 1.8564861022452244e-05, + "loss": 0.0139, + "reward": 0.494977705180645, + "reward_std": 0.08170208148658276, + "rewards/accuracy_reward": 0.01562500116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4793526977300644, + "step": 855 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.255694122918378, + "grad_norm": 0.3986174166202545, + "kl": 0.37353515625, + "learning_rate": 1.855947257921489e-05, + "loss": 0.0149, + "reward": 0.482142873108387, + "reward_std": 0.08558742888271809, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4732143059372902, + "step": 856 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.5670013427734, + "epoch": 0.2559928310058995, + "grad_norm": 0.2624119818210602, + "kl": 0.3369140625, + "learning_rate": 1.8554074824110285e-05, + "loss": 0.0167, + "reward": 0.5669643133878708, + "reward_std": 0.13181814551353455, + "rewards/accuracy_reward": 0.0959821455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.470982164144516, + "step": 857 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.9196472167969, + "epoch": 0.25629153909342095, + "grad_norm": 0.3215493857860565, + "kl": 0.33056640625, + "learning_rate": 1.8548667763010664e-05, + "loss": 0.0123, + "reward": 0.5329241305589676, + "reward_std": 0.1369304396212101, + "rewards/accuracy_reward": 0.07142857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4614955484867096, + "step": 858 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2565902471809424, + "grad_norm": 0.3239182233810425, + "kl": 0.31396484375, + "learning_rate": 1.8543251401798374e-05, + "loss": 0.0126, + "reward": 0.5329241305589676, + "reward_std": 0.14699793606996536, + "rewards/accuracy_reward": 0.07366071874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.459263414144516, + "step": 859 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2568889552684639, + "grad_norm": 0.38156992197036743, + "kl": 0.33642578125, + "learning_rate": 1.853782574636589e-05, + "loss": 0.0135, + "reward": 0.548549123108387, + "reward_std": 0.13958392851054668, + "rewards/accuracy_reward": 0.09598214644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4525669887661934, + "step": 860 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.25718766335598536, + "grad_norm": 0.5311645269393921, + "kl": 0.3212890625, + "learning_rate": 1.8532390802615788e-05, + "loss": 0.0128, + "reward": 0.5898437798023224, + "reward_std": 0.14226817712187767, + "rewards/accuracy_reward": 0.12946429336443543, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4603794813156128, + "step": 861 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.7633972167969, + "epoch": 0.25748637144350683, + "grad_norm": 1.393953800201416, + "kl": 0.3505859375, + "learning_rate": 1.8526946576460757e-05, + "loss": 0.0094, + "reward": 0.5664062798023224, + "reward_std": 0.12946580164134502, + "rewards/accuracy_reward": 0.12500000488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4414062798023224, + "step": 862 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2577850795310283, + "grad_norm": 0.2605609893798828, + "kl": 0.26806640625, + "learning_rate": 1.8521493073823583e-05, + "loss": 0.0107, + "reward": 0.5345982313156128, + "reward_std": 0.11083475686609745, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4609375223517418, + "step": 863 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2580837876185498, + "grad_norm": 12.613091468811035, + "kl": 0.4951171875, + "learning_rate": 1.8516030300637142e-05, + "loss": 0.0198, + "reward": 0.5039062649011612, + "reward_std": 0.07386254146695137, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4681919813156128, + "step": 864 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.25838249570607125, + "grad_norm": 0.3393840789794922, + "kl": 0.231689453125, + "learning_rate": 1.851055826284439e-05, + "loss": 0.0093, + "reward": 0.5055803805589676, + "reward_std": 0.07945056725293398, + "rewards/accuracy_reward": 0.04017857206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4654018059372902, + "step": 865 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2586812037935927, + "grad_norm": 0.6545014977455139, + "kl": 0.26611328125, + "learning_rate": 1.850507696639838e-05, + "loss": 0.0106, + "reward": 0.4642857387661934, + "reward_std": 0.08793427422642708, + "rewards/accuracy_reward": 0.0022321429569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4620535969734192, + "step": 866 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2589799118811142, + "grad_norm": 0.2864055335521698, + "kl": 0.22021484375, + "learning_rate": 1.849958641726221e-05, + "loss": 0.0088, + "reward": 0.534598246216774, + "reward_std": 0.12641905061900616, + "rewards/accuracy_reward": 0.12276786379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4118303805589676, + "step": 867 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.25927861996863566, + "grad_norm": 0.2770010530948639, + "kl": 0.2216796875, + "learning_rate": 1.849408662140907e-05, + "loss": 0.0089, + "reward": 0.3750000223517418, + "reward_std": 0.10393085516989231, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3750000223517418, + "step": 868 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.25957732805615713, + "grad_norm": 0.19830471277236938, + "kl": 0.236572265625, + "learning_rate": 1.8488577584822197e-05, + "loss": 0.0095, + "reward": 0.4888393133878708, + "reward_std": 0.10335934348404408, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4107143059372902, + "step": 869 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2598760361436786, + "grad_norm": 0.29717257618904114, + "kl": 0.25830078125, + "learning_rate": 1.8483059313494877e-05, + "loss": 0.0103, + "reward": 0.4765625298023224, + "reward_std": 0.12656988389790058, + "rewards/accuracy_reward": 0.022321429336443543, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4542410969734192, + "step": 870 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2601747442312001, + "grad_norm": 0.7744084596633911, + "kl": 0.4326171875, + "learning_rate": 1.847753181343046e-05, + "loss": 0.0173, + "reward": 0.5435268208384514, + "reward_std": 0.08279741741716862, + "rewards/accuracy_reward": 0.07589286053553224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4676339477300644, + "step": 871 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.26047345231872154, + "grad_norm": 1.5673190355300903, + "kl": 0.28564453125, + "learning_rate": 1.8471995090642312e-05, + "loss": 0.0114, + "reward": 0.4614955559372902, + "reward_std": 0.10152365639805794, + "rewards/accuracy_reward": 0.011160715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4503348395228386, + "step": 872 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.8504638671875, + "epoch": 0.260772160406243, + "grad_norm": 0.21263036131858826, + "kl": 0.255859375, + "learning_rate": 1.8466449151153853e-05, + "loss": 0.0103, + "reward": 0.5094866305589676, + "reward_std": 0.09917352348566055, + "rewards/accuracy_reward": 0.07366071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.435825914144516, + "step": 873 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2610708684937645, + "grad_norm": 0.23215100169181824, + "kl": 0.27099609375, + "learning_rate": 1.8460894000998518e-05, + "loss": 0.0108, + "reward": 0.4698660895228386, + "reward_std": 0.1130804494023323, + "rewards/accuracy_reward": 0.017857143422588706, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4520089477300644, + "step": 874 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.3325958251953, + "epoch": 0.26136957658128596, + "grad_norm": 0.28904351592063904, + "kl": 0.30419921875, + "learning_rate": 1.8455329646219767e-05, + "loss": 0.0122, + "reward": 0.5725446715950966, + "reward_std": 0.06946448609232903, + "rewards/accuracy_reward": 0.0870535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910969734192, + "step": 875 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2616682846688074, + "grad_norm": 0.23653103411197662, + "kl": 0.283203125, + "learning_rate": 1.844975609287107e-05, + "loss": 0.0113, + "reward": 0.5608258992433548, + "reward_std": 0.12302083522081375, + "rewards/accuracy_reward": 0.08035714644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687649011612, + "step": 876 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2589416503906, + "epoch": 0.2619669927563289, + "grad_norm": 0.23934079706668854, + "kl": 0.250732421875, + "learning_rate": 1.8444173347015912e-05, + "loss": 0.01, + "reward": 0.5602678805589676, + "reward_std": 0.05206179525703192, + "rewards/accuracy_reward": 0.07366071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 877 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.3102722167969, + "epoch": 0.26226570084385037, + "grad_norm": 0.5978423357009888, + "kl": 0.26171875, + "learning_rate": 1.843858141472777e-05, + "loss": 0.0079, + "reward": 0.5334821566939354, + "reward_std": 0.08638379909098148, + "rewards/accuracy_reward": 0.049107146449387074, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750223517418, + "step": 878 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.4732208251953, + "epoch": 0.26256440893137184, + "grad_norm": 0.19301018118858337, + "kl": 0.24658203125, + "learning_rate": 1.8432980302090116e-05, + "loss": 0.0102, + "reward": 0.6160714626312256, + "reward_std": 0.05581104755401611, + "rewards/accuracy_reward": 0.12053571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 879 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2628631170188933, + "grad_norm": 0.13723477721214294, + "kl": 0.255126953125, + "learning_rate": 1.842737001519642e-05, + "loss": 0.0102, + "reward": 0.5474330633878708, + "reward_std": 0.04787903488613665, + "rewards/accuracy_reward": 0.04910714412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 880 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.8817138671875, + "epoch": 0.2631618251064148, + "grad_norm": 0.11535344272851944, + "kl": 0.25439453125, + "learning_rate": 1.8421750560150112e-05, + "loss": 0.0073, + "reward": 0.5502232313156128, + "reward_std": 0.030421036994084716, + "rewards/accuracy_reward": 0.05357143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 881 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.3370819091797, + "epoch": 0.26346053319393625, + "grad_norm": 0.08821763098239899, + "kl": 0.26953125, + "learning_rate": 1.841612194306462e-05, + "loss": 0.0006, + "reward": 0.5837053656578064, + "reward_std": 0.03829334513284266, + "rewards/accuracy_reward": 0.08482143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 882 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.8750305175781, + "epoch": 0.26375924128145767, + "grad_norm": 0.10223310440778732, + "kl": 0.267578125, + "learning_rate": 1.8410484170063317e-05, + "loss": -0.0065, + "reward": 0.5440848469734192, + "reward_std": 0.04341474827378988, + "rewards/accuracy_reward": 0.04687500116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 883 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.26405794936897914, + "grad_norm": 0.15204644203186035, + "kl": 0.2802734375, + "learning_rate": 1.8404837247279558e-05, + "loss": 0.0112, + "reward": 0.5078125298023224, + "reward_std": 0.04564689099788666, + "rewards/accuracy_reward": 0.011160715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 884 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.7232208251953, + "epoch": 0.2643566574565006, + "grad_norm": 0.09140574932098389, + "kl": 0.30419921875, + "learning_rate": 1.8399181180856635e-05, + "loss": 0.0122, + "reward": 0.6222098469734192, + "reward_std": 0.032600946724414825, + "rewards/accuracy_reward": 0.1227678656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 885 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.6495666503906, + "epoch": 0.2646553655440221, + "grad_norm": 0.1101953387260437, + "kl": 0.32080078125, + "learning_rate": 1.8393515976947795e-05, + "loss": 0.0071, + "reward": 0.6283482313156128, + "reward_std": 0.05381523258984089, + "rewards/accuracy_reward": 0.12946429220028222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 886 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.779052734375, + "epoch": 0.26495407363154355, + "grad_norm": 0.3626231253147125, + "kl": 0.3583984375, + "learning_rate": 1.8387841641716226e-05, + "loss": 0.008, + "reward": 0.567522332072258, + "reward_std": 0.01562500116415322, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 887 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.6741180419922, + "epoch": 0.265252781719065, + "grad_norm": 0.17966966331005096, + "kl": 0.3525390625, + "learning_rate": 1.8382158181335046e-05, + "loss": 0.0141, + "reward": 0.5703125298023224, + "reward_std": 0.09866187162697315, + "rewards/accuracy_reward": 0.07589286309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 888 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.0245971679688, + "epoch": 0.2655514898065865, + "grad_norm": 0.16343814134597778, + "kl": 0.37060546875, + "learning_rate": 1.8376465601987302e-05, + "loss": 0.0043, + "reward": 0.5145089477300644, + "reward_std": 0.07236650213599205, + "rewards/accuracy_reward": 0.02008928661234677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 889 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.3348541259766, + "epoch": 0.26585019789410796, + "grad_norm": 0.2606983184814453, + "kl": 0.3896484375, + "learning_rate": 1.837076390986597e-05, + "loss": 0.0139, + "reward": 0.5965402126312256, + "reward_std": 0.12146875075995922, + "rewards/accuracy_reward": 0.10937500186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 890 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.6406707763672, + "epoch": 0.26614890598162944, + "grad_norm": 0.3136255145072937, + "kl": 0.40966796875, + "learning_rate": 1.8365053111173924e-05, + "loss": 0.0134, + "reward": 0.6082589477300644, + "reward_std": 0.07850956916809082, + "rewards/accuracy_reward": 0.12053571874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 891 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.2366333007812, + "epoch": 0.2664476140691509, + "grad_norm": 0.5535473227500916, + "kl": 0.455078125, + "learning_rate": 1.8359333212123958e-05, + "loss": 0.0144, + "reward": 0.5786830484867096, + "reward_std": 0.08622976671904325, + "rewards/accuracy_reward": 0.09151786123402417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 892 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.7031402587891, + "epoch": 0.2667463221566724, + "grad_norm": 1.7689837217330933, + "kl": 0.6171875, + "learning_rate": 1.835360421893876e-05, + "loss": 0.0248, + "reward": 0.623325914144516, + "reward_std": 0.0968422219157219, + "rewards/accuracy_reward": 0.14062500605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482700914144516, + "step": 893 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.4486846923828, + "epoch": 0.26704503024419385, + "grad_norm": 162.85716247558594, + "kl": 27.71875, + "learning_rate": 1.834786613785091e-05, + "loss": 1.107, + "reward": 0.5228794887661934, + "reward_std": 0.05601565632969141, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482700914144516, + "step": 894 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.7053680419922, + "epoch": 0.2673437383317153, + "grad_norm": 0.62226802110672, + "kl": 0.38818359375, + "learning_rate": 1.8342118975102887e-05, + "loss": 0.0155, + "reward": 0.4871651977300644, + "reward_std": 0.06169608095660806, + "rewards/accuracy_reward": 0.004464285913854837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482700914144516, + "step": 895 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.8415374755859, + "epoch": 0.2676424464192368, + "grad_norm": 1.1815077066421509, + "kl": 0.53466796875, + "learning_rate": 1.833636273694703e-05, + "loss": 0.0155, + "reward": 0.5837053954601288, + "reward_std": 0.15417339280247688, + "rewards/accuracy_reward": 0.10044643469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589477300644, + "step": 896 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.1361846923828, + "epoch": 0.26794115450675826, + "grad_norm": 13.26549243927002, + "kl": 4.1015625, + "learning_rate": 1.8330597429645566e-05, + "loss": 0.1645, + "reward": 0.5987723469734192, + "reward_std": 0.06125330715440214, + "rewards/accuracy_reward": 0.10491072200238705, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 897 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2232208251953, + "epoch": 0.26823986259427973, + "grad_norm": 0.7000740766525269, + "kl": 0.4462890625, + "learning_rate": 1.8324823059470587e-05, + "loss": 0.0179, + "reward": 0.5825893059372902, + "reward_std": 0.057856707368046045, + "rewards/accuracy_reward": 0.08928571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 898 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.4866333007812, + "epoch": 0.2685385706818012, + "grad_norm": 0.2784978151321411, + "kl": 0.35546875, + "learning_rate": 1.8319039632704042e-05, + "loss": 0.0151, + "reward": 0.5664062723517418, + "reward_std": 0.04833107814192772, + "rewards/accuracy_reward": 0.07589285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 899 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.2053833007812, + "epoch": 0.2688372787693227, + "grad_norm": 0.2596333622932434, + "kl": 0.34912109375, + "learning_rate": 1.8313247155637725e-05, + "loss": 0.0112, + "reward": 0.5613839477300644, + "reward_std": 0.14759593084454536, + "rewards/accuracy_reward": 0.06919643189758062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875298023224, + "step": 900 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.26913598685684414, + "grad_norm": 0.16858890652656555, + "kl": 0.35400390625, + "learning_rate": 1.830744563457329e-05, + "loss": 0.0142, + "reward": 0.5864955633878708, + "reward_std": 0.11031625792384148, + "rewards/accuracy_reward": 0.09151786495931447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 901 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2694346949443656, + "grad_norm": 0.3757841885089874, + "kl": 0.45654296875, + "learning_rate": 1.8301635075822222e-05, + "loss": 0.0183, + "reward": 0.5998884290456772, + "reward_std": 0.1419885717332363, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 902 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2232360839844, + "epoch": 0.2697334030318871, + "grad_norm": 0.19043906033039093, + "kl": 0.33056640625, + "learning_rate": 1.8295815485705842e-05, + "loss": 0.0132, + "reward": 0.5206473395228386, + "reward_std": 0.09272394049912691, + "rewards/accuracy_reward": 0.029017858672887087, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 903 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.9397430419922, + "epoch": 0.27003211111940856, + "grad_norm": 0.19805000722408295, + "kl": 0.31396484375, + "learning_rate": 1.8289986870555287e-05, + "loss": 0.0125, + "reward": 0.612723246216774, + "reward_std": 0.06673437915742397, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 904 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.4910888671875, + "epoch": 0.27033081920693003, + "grad_norm": 0.14815479516983032, + "kl": 0.27880859375, + "learning_rate": 1.8284149236711527e-05, + "loss": 0.0113, + "reward": 0.5195312798023224, + "reward_std": 0.06835784576833248, + "rewards/accuracy_reward": 0.026785715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 905 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.5736694335938, + "epoch": 0.2706295272944515, + "grad_norm": 0.40556570887565613, + "kl": 0.380859375, + "learning_rate": 1.8278302590525326e-05, + "loss": 0.0152, + "reward": 0.5513393133878708, + "reward_std": 0.06877502659335732, + "rewards/accuracy_reward": 0.0602678582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 906 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.9151916503906, + "epoch": 0.27092823538197297, + "grad_norm": 0.16930773854255676, + "kl": 0.28515625, + "learning_rate": 1.8272446938357272e-05, + "loss": 0.0114, + "reward": 0.556361623108387, + "reward_std": 0.08331754803657532, + "rewards/accuracy_reward": 0.06473214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 907 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.27122694346949444, + "grad_norm": 1.7411341667175293, + "kl": 0.37744140625, + "learning_rate": 1.826658228657773e-05, + "loss": 0.0151, + "reward": 0.5457589477300644, + "reward_std": 0.052713218377903104, + "rewards/accuracy_reward": 0.05133928777649999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 908 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.8258972167969, + "epoch": 0.2715256515570159, + "grad_norm": 0.2241709977388382, + "kl": 0.30517578125, + "learning_rate": 1.826070864156688e-05, + "loss": 0.0102, + "reward": 0.5943080633878708, + "reward_std": 0.12463708780705929, + "rewards/accuracy_reward": 0.10267857555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491629496216774, + "step": 909 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.2718243596445374, + "grad_norm": 0.46406522393226624, + "kl": 0.38330078125, + "learning_rate": 1.8254826009714663e-05, + "loss": 0.0153, + "reward": 0.557477705180645, + "reward_std": 0.10618928261101246, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 910 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.0915374755859, + "epoch": 0.27212306773205885, + "grad_norm": 5.956707000732422, + "kl": 1.779296875, + "learning_rate": 1.8248934397420802e-05, + "loss": 0.0702, + "reward": 0.5228794887661934, + "reward_std": 0.1342071946710348, + "rewards/accuracy_reward": 0.04241071571595967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687649011612, + "step": 911 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.1919860839844, + "epoch": 0.2724217758195803, + "grad_norm": 0.657204806804657, + "kl": 0.6142578125, + "learning_rate": 1.82430338110948e-05, + "loss": 0.0243, + "reward": 0.5457589626312256, + "reward_std": 0.13943488337099552, + "rewards/accuracy_reward": 0.07812500186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.467633955180645, + "step": 912 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.4286041259766, + "epoch": 0.2727204839071018, + "grad_norm": 1.0467920303344727, + "kl": 1.0693359375, + "learning_rate": 1.8237124257155917e-05, + "loss": 0.0402, + "reward": 0.5128348395228386, + "reward_std": 0.15846779569983482, + "rewards/accuracy_reward": 0.06696429080329835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4458705559372902, + "step": 913 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.5669860839844, + "epoch": 0.27301919199462327, + "grad_norm": 808.0704345703125, + "kl": 125.5, + "learning_rate": 1.823120574203317e-05, + "loss": 5.0292, + "reward": 0.5200893208384514, + "reward_std": 0.14830957353115082, + "rewards/accuracy_reward": 0.08705357508733869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4330357313156128, + "step": 914 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.1227874755859, + "epoch": 0.27331790008214474, + "grad_norm": 15.16275691986084, + "kl": 6.421875, + "learning_rate": 1.822527827216532e-05, + "loss": 0.2557, + "reward": 0.4715401902794838, + "reward_std": 0.16837782226502895, + "rewards/accuracy_reward": 0.05803571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4135044813156128, + "step": 915 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.5692138671875, + "epoch": 0.2736166081696662, + "grad_norm": 0.8659901022911072, + "kl": 1.125, + "learning_rate": 1.8219341854000873e-05, + "loss": 0.0432, + "reward": 0.4626116305589676, + "reward_std": 0.183783620595932, + "rewards/accuracy_reward": 0.06919643399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3934151977300644, + "step": 916 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.8638610839844, + "epoch": 0.2739153162571877, + "grad_norm": 2.2980260848999023, + "kl": 1.73046875, + "learning_rate": 1.821339649399807e-05, + "loss": 0.0672, + "reward": 0.4810268133878708, + "reward_std": 0.16016803681850433, + "rewards/accuracy_reward": 0.08258928847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3984375223517418, + "step": 917 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.0870971679688, + "epoch": 0.27421402434470915, + "grad_norm": 2.0240345001220703, + "kl": 3.2578125, + "learning_rate": 1.8207442198624882e-05, + "loss": 0.1255, + "reward": 0.4107143059372902, + "reward_std": 0.171085674315691, + "rewards/accuracy_reward": 0.01785714295692742, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3928571566939354, + "step": 918 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.310302734375, + "epoch": 0.2745127324322306, + "grad_norm": 1.8269177675247192, + "kl": 2.984375, + "learning_rate": 1.8201478974358996e-05, + "loss": 0.1168, + "reward": 0.494977705180645, + "reward_std": 0.1983746662735939, + "rewards/accuracy_reward": 0.10044643143191934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3945312649011612, + "step": 919 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.4420166015625, + "epoch": 0.2748114405197521, + "grad_norm": 6.142223834991455, + "kl": 4.29296875, + "learning_rate": 1.8195506827687818e-05, + "loss": 0.1685, + "reward": 0.5256696715950966, + "reward_std": 0.1623934917151928, + "rewards/accuracy_reward": 0.11383929220028222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.411830373108387, + "step": 920 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2946472167969, + "epoch": 0.27511014860727356, + "grad_norm": 2.1122875213623047, + "kl": 0.6025390625, + "learning_rate": 1.8189525765108457e-05, + "loss": 0.0236, + "reward": 0.4955357313156128, + "reward_std": 0.19320551678538322, + "rewards/accuracy_reward": 0.08928571874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4062500149011612, + "step": 921 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.27540885669479503, + "grad_norm": 2.857938528060913, + "kl": 0.54248046875, + "learning_rate": 1.8183535793127722e-05, + "loss": 0.0217, + "reward": 0.498883955180645, + "reward_std": 0.15968188643455505, + "rewards/accuracy_reward": 0.055803572526201606, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4430803805589676, + "step": 922 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.7567291259766, + "epoch": 0.2757075647823165, + "grad_norm": 38.53578186035156, + "kl": 11.3828125, + "learning_rate": 1.817753691826212e-05, + "loss": 0.4578, + "reward": 0.5435268059372902, + "reward_std": 0.14508968219161034, + "rewards/accuracy_reward": 0.09151786286383867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.452008955180645, + "step": 923 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.4910888671875, + "epoch": 0.276006272869838, + "grad_norm": 19.65619659423828, + "kl": 6.265625, + "learning_rate": 1.8171529147037835e-05, + "loss": 0.2479, + "reward": 0.5351562798023224, + "reward_std": 0.12468541972339153, + "rewards/accuracy_reward": 0.06473214644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4704241305589676, + "step": 924 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.7188110351562, + "epoch": 0.27630498095735945, + "grad_norm": 3.8858156204223633, + "kl": 0.75390625, + "learning_rate": 1.8165512485990734e-05, + "loss": 0.0274, + "reward": 0.4832589626312256, + "reward_std": 0.11942490749061108, + "rewards/accuracy_reward": 0.017857144121080637, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4654018059372902, + "step": 925 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.27660368904488086, + "grad_norm": 3.215463876724243, + "kl": 0.7734375, + "learning_rate": 1.8159486941666354e-05, + "loss": 0.031, + "reward": 0.4882812723517418, + "reward_std": 0.08950760215520859, + "rewards/accuracy_reward": 0.0133928582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4748884066939354, + "step": 926 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.27690239713240233, + "grad_norm": 8.680713653564453, + "kl": 3.75390625, + "learning_rate": 1.8153452520619897e-05, + "loss": 0.15, + "reward": 0.553013414144516, + "reward_std": 0.11922962684184313, + "rewards/accuracy_reward": 0.07589286309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4771205559372902, + "step": 927 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2076110839844, + "epoch": 0.2772011052199238, + "grad_norm": 13.833993911743164, + "kl": 5.14453125, + "learning_rate": 1.814740922941622e-05, + "loss": 0.2056, + "reward": 0.551339328289032, + "reward_std": 0.14252197369933128, + "rewards/accuracy_reward": 0.07142857369035482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4799107313156128, + "step": 928 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.4866485595703, + "epoch": 0.2774998133074453, + "grad_norm": 1.7273428440093994, + "kl": 0.8955078125, + "learning_rate": 1.8141357074629838e-05, + "loss": 0.0292, + "reward": 0.5340402126312256, + "reward_std": 0.07820571446791291, + "rewards/accuracy_reward": 0.05357143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687723517418, + "step": 929 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.27779852139496675, + "grad_norm": 1.851991891860962, + "kl": 0.8671875, + "learning_rate": 1.8135296062844893e-05, + "loss": 0.0347, + "reward": 0.6116071715950966, + "reward_std": 0.08861804753541946, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 930 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.2411041259766, + "epoch": 0.2780972294824882, + "grad_norm": 2.7003700733184814, + "kl": 2.34375, + "learning_rate": 1.8129226200655177e-05, + "loss": 0.0859, + "reward": 0.537946455180645, + "reward_std": 0.08843808434903622, + "rewards/accuracy_reward": 0.051339287078008056, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 931 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.1227874755859, + "epoch": 0.2783959375700097, + "grad_norm": 2.5258541107177734, + "kl": 1.986328125, + "learning_rate": 1.8123147494664105e-05, + "loss": 0.0743, + "reward": 0.5262276977300644, + "reward_std": 0.08593632839620113, + "rewards/accuracy_reward": 0.035714288242161274, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 932 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.6518249511719, + "epoch": 0.27869464565753116, + "grad_norm": 0.7381093502044678, + "kl": 0.6455078125, + "learning_rate": 1.8117059951484714e-05, + "loss": 0.0255, + "reward": 0.588169664144516, + "reward_std": 0.06872989400289953, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 933 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.9107208251953, + "epoch": 0.27899335374505263, + "grad_norm": 0.9171417355537415, + "kl": 0.53515625, + "learning_rate": 1.8110963577739654e-05, + "loss": 0.0214, + "reward": 0.624441996216774, + "reward_std": 0.06930929655209184, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 934 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.3013610839844, + "epoch": 0.2792920618325741, + "grad_norm": 0.4029950201511383, + "kl": 1.142578125, + "learning_rate": 1.8104858380061178e-05, + "loss": 0.0456, + "reward": 0.5524553954601288, + "reward_std": 0.07784495875239372, + "rewards/accuracy_reward": 0.06026786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 935 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.8750305175781, + "epoch": 0.27959076992009557, + "grad_norm": 32.8082389831543, + "kl": 3.6796875, + "learning_rate": 1.809874436509115e-05, + "loss": 0.1386, + "reward": 0.5775669813156128, + "reward_std": 0.07068594358861446, + "rewards/accuracy_reward": 0.08705357322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 936 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.5803680419922, + "epoch": 0.27988947800761704, + "grad_norm": 1.4930793046951294, + "kl": 1.140625, + "learning_rate": 1.809262153948101e-05, + "loss": 0.0456, + "reward": 0.5563616305589676, + "reward_std": 0.06370984809473157, + "rewards/accuracy_reward": 0.06250000325962901, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 937 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.0089416503906, + "epoch": 0.2801881860951385, + "grad_norm": 1.796439528465271, + "kl": 1.70703125, + "learning_rate": 1.80864899098918e-05, + "loss": 0.0649, + "reward": 0.5396205559372902, + "reward_std": 0.10831330437213182, + "rewards/accuracy_reward": 0.04910714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 938 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.5803833007812, + "epoch": 0.28048689418266, + "grad_norm": 1.5879842042922974, + "kl": 0.5693359375, + "learning_rate": 1.8080349482994132e-05, + "loss": 0.023, + "reward": 0.513950914144516, + "reward_std": 0.06463339133188128, + "rewards/accuracy_reward": 0.020089286379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 939 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.3616333007812, + "epoch": 0.28078560227018146, + "grad_norm": 0.6882360577583313, + "kl": 0.51318359375, + "learning_rate": 1.8074200265468183e-05, + "loss": 0.0218, + "reward": 0.554129496216774, + "reward_std": 0.08174987509846687, + "rewards/accuracy_reward": 0.06250000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 940 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.5491333007812, + "epoch": 0.2810843103577029, + "grad_norm": 1.1652644872665405, + "kl": 0.4921875, + "learning_rate": 1.80680422640037e-05, + "loss": 0.0208, + "reward": 0.6233259290456772, + "reward_std": 0.116873973980546, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 941 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.7165679931641, + "epoch": 0.2813830184452244, + "grad_norm": 0.43815404176712036, + "kl": 0.955078125, + "learning_rate": 1.8061875485299987e-05, + "loss": 0.0384, + "reward": 0.530691996216774, + "reward_std": 0.11745443381369114, + "rewards/accuracy_reward": 0.037946430034935474, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 942 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.1205596923828, + "epoch": 0.28168172653274587, + "grad_norm": 2.6819887161254883, + "kl": 1.3740234375, + "learning_rate": 1.80556999360659e-05, + "loss": 0.0555, + "reward": 0.536272332072258, + "reward_std": 0.04018349130637944, + "rewards/accuracy_reward": 0.042410716181620955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 943 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.5335388183594, + "epoch": 0.28198043462026734, + "grad_norm": 2.8410181999206543, + "kl": 1.5234375, + "learning_rate": 1.804951562301982e-05, + "loss": 0.0608, + "reward": 0.5027902126312256, + "reward_std": 0.05394195485860109, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 944 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.7321929931641, + "epoch": 0.2822791427077888, + "grad_norm": 0.1924024075269699, + "kl": 0.42724609375, + "learning_rate": 1.8043322552889685e-05, + "loss": 0.0121, + "reward": 0.5842634290456772, + "reward_std": 0.162007924169302, + "rewards/accuracy_reward": 0.08928571944124997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 945 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.8304138183594, + "epoch": 0.2825778507953103, + "grad_norm": 0.34846213459968567, + "kl": 0.359375, + "learning_rate": 1.803712073241294e-05, + "loss": 0.0157, + "reward": 0.5507812649011612, + "reward_std": 0.11727066896855831, + "rewards/accuracy_reward": 0.053571431897580624, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 946 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.5268249511719, + "epoch": 0.28287655888283175, + "grad_norm": 0.21153737604618073, + "kl": 0.294921875, + "learning_rate": 1.8030910168336558e-05, + "loss": 0.0118, + "reward": 0.550223246216774, + "reward_std": 0.05457546189427376, + "rewards/accuracy_reward": 0.05133928684517741, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 947 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.1562805175781, + "epoch": 0.2831752669703532, + "grad_norm": 0.2850641906261444, + "kl": 0.29833984375, + "learning_rate": 1.802469086741703e-05, + "loss": 0.0119, + "reward": 0.545200914144516, + "reward_std": 0.03187747159972787, + "rewards/accuracy_reward": 0.0468750037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 948 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.5268249511719, + "epoch": 0.2834739750578747, + "grad_norm": 0.18845084309577942, + "kl": 0.28662109375, + "learning_rate": 1.801846283642034e-05, + "loss": 0.0115, + "reward": 0.5228794813156128, + "reward_std": 0.04626652970910072, + "rewards/accuracy_reward": 0.024553572526201606, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 949 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.1428985595703, + "epoch": 0.28377268314539617, + "grad_norm": 0.3170224726200104, + "kl": 0.2880859375, + "learning_rate": 1.801222608212198e-05, + "loss": 0.0113, + "reward": 0.5731026977300644, + "reward_std": 0.011160714784637094, + "rewards/accuracy_reward": 0.07366071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 950 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.6942443847656, + "epoch": 0.28407139123291764, + "grad_norm": 0.2503434121608734, + "kl": 0.28076171875, + "learning_rate": 1.8005980611306926e-05, + "loss": 0.0081, + "reward": 0.5719866454601288, + "reward_std": 0.09410080034285784, + "rewards/accuracy_reward": 0.07366071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 951 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.1786041259766, + "epoch": 0.2843700993204391, + "grad_norm": 0.34398001432418823, + "kl": 0.2958984375, + "learning_rate": 1.799972643076963e-05, + "loss": 0.0143, + "reward": 0.550223246216774, + "reward_std": 0.054154925514012575, + "rewards/accuracy_reward": 0.05357143026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 952 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.5692443847656, + "epoch": 0.2846688074079606, + "grad_norm": 0.311290442943573, + "kl": 0.28515625, + "learning_rate": 1.7993463547314044e-05, + "loss": 0.0118, + "reward": 0.5742187798023224, + "reward_std": 0.08161418698728085, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 953 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.6495971679688, + "epoch": 0.28496751549548205, + "grad_norm": 0.11540818214416504, + "kl": 0.281005859375, + "learning_rate": 1.798719196775356e-05, + "loss": 0.0123, + "reward": 0.5239955633878708, + "reward_std": 0.07688578218221664, + "rewards/accuracy_reward": 0.026785715715959668, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 954 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.8995819091797, + "epoch": 0.2852662235830035, + "grad_norm": 0.2772671580314636, + "kl": 0.39208984375, + "learning_rate": 1.7980911698911045e-05, + "loss": 0.0175, + "reward": 0.5747768133878708, + "reward_std": 0.12927423976361752, + "rewards/accuracy_reward": 0.08035714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 955 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.4464721679688, + "epoch": 0.285564931670525, + "grad_norm": 0.7886497378349304, + "kl": 0.61328125, + "learning_rate": 1.797462274761881e-05, + "loss": 0.0263, + "reward": 0.5658482313156128, + "reward_std": 0.11386740021407604, + "rewards/accuracy_reward": 0.07589286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 956 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.2611999511719, + "epoch": 0.28586363975804646, + "grad_norm": 0.23361562192440033, + "kl": 0.35107421875, + "learning_rate": 1.7968325120718624e-05, + "loss": 0.0156, + "reward": 0.5172991305589676, + "reward_std": 0.07165886368602514, + "rewards/accuracy_reward": 0.022321430267766118, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 957 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.7991485595703, + "epoch": 0.28616234784556793, + "grad_norm": 0.2810601592063904, + "kl": 0.38525390625, + "learning_rate": 1.796201882506169e-05, + "loss": 0.0167, + "reward": 0.5926339477300644, + "reward_std": 0.07716252747923136, + "rewards/accuracy_reward": 0.09821429289877415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 958 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.0290832519531, + "epoch": 0.2864610559330894, + "grad_norm": 0.08753642439842224, + "kl": 0.27392578125, + "learning_rate": 1.7955703867508634e-05, + "loss": 0.011, + "reward": 0.5725446492433548, + "reward_std": 0.02759577170945704, + "rewards/accuracy_reward": 0.07589286053553224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 959 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.9777221679688, + "epoch": 0.2867597640206109, + "grad_norm": 0.19468657672405243, + "kl": 0.2900390625, + "learning_rate": 1.794938025492951e-05, + "loss": 0.0122, + "reward": 0.5574776977300644, + "reward_std": 0.047315985430032015, + "rewards/accuracy_reward": 0.0602678582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 960 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.3928985595703, + "epoch": 0.28705847210813235, + "grad_norm": 0.1081666350364685, + "kl": 0.260009765625, + "learning_rate": 1.7943047994203796e-05, + "loss": 0.0095, + "reward": 0.5825893133878708, + "reward_std": 0.09994458453729749, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 961 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.4197082519531, + "epoch": 0.2873571801956538, + "grad_norm": 0.2243560254573822, + "kl": 0.252685546875, + "learning_rate": 1.7936707092220363e-05, + "loss": 0.0102, + "reward": 0.5708705484867096, + "reward_std": 0.020089286379516125, + "rewards/accuracy_reward": 0.07366071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 962 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.5781707763672, + "epoch": 0.2876558882831753, + "grad_norm": 0.25123319029808044, + "kl": 0.256591796875, + "learning_rate": 1.79303575558775e-05, + "loss": 0.0121, + "reward": 0.5792410969734192, + "reward_std": 0.10216925200074911, + "rewards/accuracy_reward": 0.08482143515720963, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 963 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.7947082519531, + "epoch": 0.28795459637069676, + "grad_norm": 0.13812938332557678, + "kl": 0.255859375, + "learning_rate": 1.792399939208287e-05, + "loss": 0.0077, + "reward": 0.5150669813156128, + "reward_std": 0.055043342523276806, + "rewards/accuracy_reward": 0.01785714295692742, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 964 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.3505096435547, + "epoch": 0.28825330445821823, + "grad_norm": 0.19728878140449524, + "kl": 0.243408203125, + "learning_rate": 1.791763260775354e-05, + "loss": 0.0102, + "reward": 0.5256696790456772, + "reward_std": 0.051804926712065935, + "rewards/accuracy_reward": 0.03125000116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 965 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.9665679931641, + "epoch": 0.2885520125457397, + "grad_norm": 0.18629160523414612, + "kl": 0.239501953125, + "learning_rate": 1.791125720981594e-05, + "loss": 0.008, + "reward": 0.6088169813156128, + "reward_std": 0.09604742471128702, + "rewards/accuracy_reward": 0.1160714328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 966 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.0937957763672, + "epoch": 0.28885072063326117, + "grad_norm": 0.32935211062431335, + "kl": 0.291259765625, + "learning_rate": 1.7904873205205886e-05, + "loss": 0.0068, + "reward": 0.5094866305589676, + "reward_std": 0.10034633427858353, + "rewards/accuracy_reward": 0.02232142980210483, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487165205180645, + "step": 967 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.8036193847656, + "epoch": 0.28914942872078264, + "grad_norm": 0.3116462826728821, + "kl": 0.30078125, + "learning_rate": 1.7898480600868544e-05, + "loss": 0.0169, + "reward": 0.5613839328289032, + "reward_std": 0.18593859300017357, + "rewards/accuracy_reward": 0.0803571455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810268059372902, + "step": 968 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.6094207763672, + "epoch": 0.28944813680830406, + "grad_norm": 0.2638584077358246, + "kl": 0.380859375, + "learning_rate": 1.7892079403758444e-05, + "loss": 0.0164, + "reward": 0.5558035969734192, + "reward_std": 0.15824768878519535, + "rewards/accuracy_reward": 0.09151786123402417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4642857313156128, + "step": 969 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.1228179931641, + "epoch": 0.28974684489582553, + "grad_norm": 0.6657543182373047, + "kl": 0.47900390625, + "learning_rate": 1.788566962083946e-05, + "loss": 0.0239, + "reward": 0.5396205633878708, + "reward_std": 0.11767464317381382, + "rewards/accuracy_reward": 0.06696428963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4726562723517418, + "step": 970 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.7611999511719, + "epoch": 0.290045552983347, + "grad_norm": 1.4187771081924438, + "kl": 0.5185546875, + "learning_rate": 1.7879251259084803e-05, + "loss": 0.0262, + "reward": 0.6026785969734192, + "reward_std": 0.13026322051882744, + "rewards/accuracy_reward": 0.13169643399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4709821566939354, + "step": 971 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.763427734375, + "epoch": 0.29034426107086847, + "grad_norm": 0.251737117767334, + "kl": 0.283447265625, + "learning_rate": 1.787282432547703e-05, + "loss": 0.0129, + "reward": 0.5066964477300644, + "reward_std": 0.11531908996403217, + "rewards/accuracy_reward": 0.026785715483129025, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4799107387661934, + "step": 972 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.8103179931641, + "epoch": 0.29064296915838994, + "grad_norm": 0.2509310245513916, + "kl": 0.248291015625, + "learning_rate": 1.786638882700801e-05, + "loss": 0.0131, + "reward": 0.5959821790456772, + "reward_std": 0.12443189695477486, + "rewards/accuracy_reward": 0.10937500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4866071715950966, + "step": 973 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.5335235595703, + "epoch": 0.2909416772459114, + "grad_norm": 0.2342822104692459, + "kl": 0.25634765625, + "learning_rate": 1.7859944770678933e-05, + "loss": 0.0109, + "reward": 0.5881696715950966, + "reward_std": 0.09201641846448183, + "rewards/accuracy_reward": 0.10044643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 974 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.3906707763672, + "epoch": 0.2912403853334329, + "grad_norm": 0.1977291852235794, + "kl": 0.243408203125, + "learning_rate": 1.7853492163500306e-05, + "loss": 0.0134, + "reward": 0.5837053805589676, + "reward_std": 0.07697974797338247, + "rewards/accuracy_reward": 0.0982142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 975 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.8147735595703, + "epoch": 0.29153909342095435, + "grad_norm": 0.1677996814250946, + "kl": 0.238037109375, + "learning_rate": 1.7847031012491925e-05, + "loss": 0.0059, + "reward": 0.5463169813156128, + "reward_std": 0.0785817401483655, + "rewards/accuracy_reward": 0.05580357415601611, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 976 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.7924652099609, + "epoch": 0.2918378015084758, + "grad_norm": 0.12132588028907776, + "kl": 0.2607421875, + "learning_rate": 1.78405613246829e-05, + "loss": 0.0126, + "reward": 0.556361623108387, + "reward_std": 0.07982286484912038, + "rewards/accuracy_reward": 0.0625000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 977 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.0803985595703, + "epoch": 0.2921365095959973, + "grad_norm": 0.20497556030750275, + "kl": 0.2998046875, + "learning_rate": 1.783408310711161e-05, + "loss": 0.0124, + "reward": 0.580357164144516, + "reward_std": 0.0789270419627428, + "rewards/accuracy_reward": 0.09151786123402417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 978 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.1049499511719, + "epoch": 0.29243521768351877, + "grad_norm": 0.1567801833152771, + "kl": 0.29052734375, + "learning_rate": 1.7827596366825718e-05, + "loss": 0.0109, + "reward": 0.6255580633878708, + "reward_std": 0.1745777539908886, + "rewards/accuracy_reward": 0.13616072200238705, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 979 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.1362152099609, + "epoch": 0.29273392577104024, + "grad_norm": 0.3735820949077606, + "kl": 0.33984375, + "learning_rate": 1.782110111088217e-05, + "loss": 0.0144, + "reward": 0.5597098618745804, + "reward_std": 0.09137984178960323, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 980 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.3862152099609, + "epoch": 0.2930326338585617, + "grad_norm": 0.13249407708644867, + "kl": 0.27783203125, + "learning_rate": 1.7814597346347163e-05, + "loss": 0.0124, + "reward": 0.6529018059372902, + "reward_std": 0.08888930385001004, + "rewards/accuracy_reward": 0.1607142947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 981 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.5491333007812, + "epoch": 0.2933313419460832, + "grad_norm": 0.421864777803421, + "kl": 0.3232421875, + "learning_rate": 1.7808085080296154e-05, + "loss": 0.0131, + "reward": 0.5546875447034836, + "reward_std": 0.088715142570436, + "rewards/accuracy_reward": 0.06696428940631449, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 982 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.2366485595703, + "epoch": 0.29363005003360465, + "grad_norm": 0.11753474175930023, + "kl": 0.2685546875, + "learning_rate": 1.7801564319813854e-05, + "loss": 0.0121, + "reward": 0.589285746216774, + "reward_std": 0.06134230550378561, + "rewards/accuracy_reward": 0.09375000419095159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 983 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.7835235595703, + "epoch": 0.2939287581211261, + "grad_norm": 0.10336526483297348, + "kl": 0.2578125, + "learning_rate": 1.779503507199421e-05, + "loss": 0.0123, + "reward": 0.5591518133878708, + "reward_std": 0.06302508432418108, + "rewards/accuracy_reward": 0.06473214784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 984 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.2946929931641, + "epoch": 0.2942274662086476, + "grad_norm": 0.11461610347032547, + "kl": 0.256591796875, + "learning_rate": 1.77884973439404e-05, + "loss": 0.0121, + "reward": 0.5680803805589676, + "reward_std": 0.0752025069668889, + "rewards/accuracy_reward": 0.06919643329456449, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 985 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.3058471679688, + "epoch": 0.29452617429616906, + "grad_norm": 0.13248060643672943, + "kl": 0.24951171875, + "learning_rate": 1.7781951142764838e-05, + "loss": 0.0105, + "reward": 0.628348246216774, + "reward_std": 0.11214131489396095, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 986 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.7277221679688, + "epoch": 0.29482488238369053, + "grad_norm": 0.11586283892393112, + "kl": 0.2451171875, + "learning_rate": 1.7775396475589144e-05, + "loss": 0.0109, + "reward": 0.6316964477300644, + "reward_std": 0.0700577343814075, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 987 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.1808471679688, + "epoch": 0.295123590471212, + "grad_norm": 0.10526672005653381, + "kl": 0.248291015625, + "learning_rate": 1.7768833349544157e-05, + "loss": 0.0107, + "reward": 0.5619419813156128, + "reward_std": 0.07592819817364216, + "rewards/accuracy_reward": 0.06696428707800806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 988 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.7076263427734, + "epoch": 0.2954222985587335, + "grad_norm": 0.17896351218223572, + "kl": 0.242431640625, + "learning_rate": 1.776226177176991e-05, + "loss": 0.0101, + "reward": 0.5697544813156128, + "reward_std": 0.03210598020814359, + "rewards/accuracy_reward": 0.07589286053553224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 989 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.4308471679688, + "epoch": 0.29572100664625495, + "grad_norm": 0.08925334364175797, + "kl": 0.235107421875, + "learning_rate": 1.7755681749415644e-05, + "loss": 0.0094, + "reward": 0.503348246216774, + "reward_std": 0.03836447047069669, + "rewards/accuracy_reward": 0.006696428870782256, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 990 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.4397735595703, + "epoch": 0.2960197147337764, + "grad_norm": 0.08329442888498306, + "kl": 0.236572265625, + "learning_rate": 1.774909328963977e-05, + "loss": 0.0095, + "reward": 0.619419664144516, + "reward_std": 0.03829334327019751, + "rewards/accuracy_reward": 0.12053571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 991 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.1339721679688, + "epoch": 0.2963184228212979, + "grad_norm": 0.11422885954380035, + "kl": 0.24462890625, + "learning_rate": 1.7742496399609888e-05, + "loss": 0.0123, + "reward": 0.5641741305589676, + "reward_std": 0.08508794801309705, + "rewards/accuracy_reward": 0.07142857555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 992 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.1004638671875, + "epoch": 0.29661713090881936, + "grad_norm": 0.1008947491645813, + "kl": 0.231689453125, + "learning_rate": 1.773589108650277e-05, + "loss": 0.0082, + "reward": 0.589285746216774, + "reward_std": 0.10071790078654885, + "rewards/accuracy_reward": 0.09151786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 993 + }, + { + "clip_ratio": 0.0, + "completion_length": 988.9062957763672, + "epoch": 0.29691583899634083, + "grad_norm": 0.1438131481409073, + "kl": 0.25927734375, + "learning_rate": 1.772927735750435e-05, + "loss": 0.0095, + "reward": 0.6060267984867096, + "reward_std": 0.09832330793142319, + "rewards/accuracy_reward": 0.10937500838190317, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 994 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.7433471679688, + "epoch": 0.2972145470838623, + "grad_norm": 0.09834718704223633, + "kl": 0.23681640625, + "learning_rate": 1.7722655219809718e-05, + "loss": 0.0112, + "reward": 0.6015625149011612, + "reward_std": 0.07025778037495911, + "rewards/accuracy_reward": 0.10491071920841932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 995 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.2812957763672, + "epoch": 0.2975132551713838, + "grad_norm": 0.13115136325359344, + "kl": 0.255126953125, + "learning_rate": 1.7716024680623106e-05, + "loss": 0.0102, + "reward": 0.6556920111179352, + "reward_std": 0.10292673576623201, + "rewards/accuracy_reward": 0.16294643771834671, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 996 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.4397888183594, + "epoch": 0.29781196325890524, + "grad_norm": 0.13237453997135162, + "kl": 0.251220703125, + "learning_rate": 1.770938574715789e-05, + "loss": 0.0114, + "reward": 0.5552455633878708, + "reward_std": 0.09777702018618584, + "rewards/accuracy_reward": 0.06696428824216127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 997 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.4531707763672, + "epoch": 0.2981106713464267, + "grad_norm": 0.1791224479675293, + "kl": 0.26953125, + "learning_rate": 1.7702738426636587e-05, + "loss": 0.0094, + "reward": 0.6127232313156128, + "reward_std": 0.062215379904955626, + "rewards/accuracy_reward": 0.12053572130389512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 998 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.1808471679688, + "epoch": 0.2984093794339482, + "grad_norm": 0.13859713077545166, + "kl": 0.27001953125, + "learning_rate": 1.7696082726290825e-05, + "loss": 0.0143, + "reward": 0.5279018133878708, + "reward_std": 0.05340482760220766, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 999 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.0803985595703, + "epoch": 0.29870808752146966, + "grad_norm": 0.17536461353302002, + "kl": 0.287109375, + "learning_rate": 1.7689418653361354e-05, + "loss": 0.0158, + "reward": 0.5719866380095482, + "reward_std": 0.08307026512920856, + "rewards/accuracy_reward": 0.0870535783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330633878708, + "step": 1000 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.0915679931641, + "epoch": 0.2990067956089911, + "grad_norm": 0.14756453037261963, + "kl": 0.28271484375, + "learning_rate": 1.768274621509803e-05, + "loss": 0.0144, + "reward": 0.5591518133878708, + "reward_std": 0.0881609283387661, + "rewards/accuracy_reward": 0.06919643213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1001 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.1719055175781, + "epoch": 0.2993055036965126, + "grad_norm": 0.5057372450828552, + "kl": 0.30712890625, + "learning_rate": 1.7676065418759814e-05, + "loss": 0.0134, + "reward": 0.5619420111179352, + "reward_std": 0.1127330381423235, + "rewards/accuracy_reward": 0.07589286053553224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 1002 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.6495971679688, + "epoch": 0.29960421178403407, + "grad_norm": 0.15847234427928925, + "kl": 0.26904296875, + "learning_rate": 1.7669376271614757e-05, + "loss": 0.0121, + "reward": 0.6350446790456772, + "reward_std": 0.11287215165793896, + "rewards/accuracy_reward": 0.14508929220028222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1003 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.0826416015625, + "epoch": 0.29990291987155554, + "grad_norm": 0.20962347090244293, + "kl": 0.30419921875, + "learning_rate": 1.7662678780939996e-05, + "loss": 0.0146, + "reward": 0.6316964700818062, + "reward_std": 0.13792529422789812, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888392984867096, + "step": 1004 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.7321929931641, + "epoch": 0.300201627959077, + "grad_norm": 0.1405963897705078, + "kl": 0.29052734375, + "learning_rate": 1.7655972954021745e-05, + "loss": 0.0073, + "reward": 0.6054687649011612, + "reward_std": 0.052362322341650724, + "rewards/accuracy_reward": 0.11607143399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 1005 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.6786193847656, + "epoch": 0.3005003360465985, + "grad_norm": 0.14180313050746918, + "kl": 0.2548828125, + "learning_rate": 1.764925879815529e-05, + "loss": 0.0123, + "reward": 0.511718787252903, + "reward_std": 0.09002960100769997, + "rewards/accuracy_reward": 0.022321430267766118, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 1006 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.1361999511719, + "epoch": 0.30079904413411995, + "grad_norm": 0.13520847260951996, + "kl": 0.254150390625, + "learning_rate": 1.7642536320644964e-05, + "loss": 0.0104, + "reward": 0.5837053954601288, + "reward_std": 0.12826774083077908, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1007 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.9308319091797, + "epoch": 0.3010977522216414, + "grad_norm": 0.18880732357501984, + "kl": 0.2392578125, + "learning_rate": 1.7635805528804175e-05, + "loss": 0.0114, + "reward": 0.6272321790456772, + "reward_std": 0.09232376841828227, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1008 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.7656555175781, + "epoch": 0.3013964603091629, + "grad_norm": 0.18613561987876892, + "kl": 0.234375, + "learning_rate": 1.7629066429955358e-05, + "loss": 0.0082, + "reward": 0.5669642984867096, + "reward_std": 0.09724946226924658, + "rewards/accuracy_reward": 0.06919642956927419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1009 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.7121124267578, + "epoch": 0.30169516839668437, + "grad_norm": 0.17258746922016144, + "kl": 0.23388671875, + "learning_rate": 1.7622319031429995e-05, + "loss": 0.0074, + "reward": 0.6132812798023224, + "reward_std": 0.10789241828024387, + "rewards/accuracy_reward": 0.12053571944124997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1010 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.0513763427734, + "epoch": 0.30199387648420584, + "grad_norm": 0.13413117825984955, + "kl": 0.225830078125, + "learning_rate": 1.7615563340568594e-05, + "loss": 0.0113, + "reward": 0.5742187649011612, + "reward_std": 0.040596613893285394, + "rewards/accuracy_reward": 0.07812500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1011 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.9978179931641, + "epoch": 0.30229258457172725, + "grad_norm": 0.13380618393421173, + "kl": 0.24658203125, + "learning_rate": 1.7608799364720685e-05, + "loss": 0.0113, + "reward": 0.5831473469734192, + "reward_std": 0.04684467753395438, + "rewards/accuracy_reward": 0.08928571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1012 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.4844055175781, + "epoch": 0.3025912926592487, + "grad_norm": 0.24116964638233185, + "kl": 0.22802734375, + "learning_rate": 1.7602027111244807e-05, + "loss": 0.0083, + "reward": 0.5558035969734192, + "reward_std": 0.09411949850618839, + "rewards/accuracy_reward": 0.06473214365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1013 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.6629943847656, + "epoch": 0.3028900007467702, + "grad_norm": 0.37910741567611694, + "kl": 0.2578125, + "learning_rate": 1.7595246587508513e-05, + "loss": 0.0113, + "reward": 0.6277901977300644, + "reward_std": 0.06862729717977345, + "rewards/accuracy_reward": 0.13616072130389512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 1014 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.1786346435547, + "epoch": 0.30318870883429166, + "grad_norm": 0.12444666028022766, + "kl": 0.245849609375, + "learning_rate": 1.7588457800888342e-05, + "loss": 0.0094, + "reward": 0.6082589775323868, + "reward_std": 0.09986336715519428, + "rewards/accuracy_reward": 0.11383929406292737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1015 + }, + { + "clip_ratio": 0.0, + "completion_length": 993.0424652099609, + "epoch": 0.30348741692181314, + "grad_norm": 0.1443783938884735, + "kl": 0.270263671875, + "learning_rate": 1.7581660758769836e-05, + "loss": 0.0172, + "reward": 0.615513414144516, + "reward_std": 0.12774321623146534, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1016 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.5179138183594, + "epoch": 0.3037861250093346, + "grad_norm": 0.21504957973957062, + "kl": 0.2900390625, + "learning_rate": 1.7574855468547503e-05, + "loss": 0.0217, + "reward": 0.6143973618745804, + "reward_std": 0.15214142762124538, + "rewards/accuracy_reward": 0.12723215157166123, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 1017 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.1674499511719, + "epoch": 0.3040848330968561, + "grad_norm": 0.22057805955410004, + "kl": 0.2919921875, + "learning_rate": 1.7568041937624843e-05, + "loss": 0.0178, + "reward": 0.7159598544239998, + "reward_std": 0.09885263163596392, + "rewards/accuracy_reward": 0.2232142984867096, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 1018 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.654052734375, + "epoch": 0.30438354118437755, + "grad_norm": 0.20389653742313385, + "kl": 0.29150390625, + "learning_rate": 1.7561220173414297e-05, + "loss": 0.0187, + "reward": 0.6116071715950966, + "reward_std": 0.059805192053318024, + "rewards/accuracy_reward": 0.1205357164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1019 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.5312957763672, + "epoch": 0.304682249271899, + "grad_norm": 0.15559183061122894, + "kl": 0.2431640625, + "learning_rate": 1.755439018333728e-05, + "loss": 0.0131, + "reward": 0.6065848469734192, + "reward_std": 0.04563337517902255, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1020 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.3549652099609, + "epoch": 0.3049809573594205, + "grad_norm": 0.16117966175079346, + "kl": 0.248291015625, + "learning_rate": 1.7547551974824158e-05, + "loss": 0.0155, + "reward": 0.5362723469734192, + "reward_std": 0.0616491143591702, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1021 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.9576416015625, + "epoch": 0.30527966544694196, + "grad_norm": 0.16911251842975616, + "kl": 0.260498046875, + "learning_rate": 1.7540705555314224e-05, + "loss": 0.0108, + "reward": 0.5479910969734192, + "reward_std": 0.09867309685796499, + "rewards/accuracy_reward": 0.053571431431919336, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1022 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.0558471679688, + "epoch": 0.30557837353446343, + "grad_norm": 0.1703730672597885, + "kl": 0.248291015625, + "learning_rate": 1.753385093225572e-05, + "loss": 0.0095, + "reward": 0.6261160969734192, + "reward_std": 0.11557511240243912, + "rewards/accuracy_reward": 0.1339285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 1023 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.1919860839844, + "epoch": 0.3058770816219849, + "grad_norm": 0.15134090185165405, + "kl": 0.247802734375, + "learning_rate": 1.7526988113105794e-05, + "loss": 0.0067, + "reward": 0.5345982313156128, + "reward_std": 0.06446054857224226, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196492433548, + "step": 1024 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.8928985595703, + "epoch": 0.3061757897095064, + "grad_norm": 0.17524300515651703, + "kl": 0.253662109375, + "learning_rate": 1.7520117105330524e-05, + "loss": 0.0142, + "reward": 0.5792410969734192, + "reward_std": 0.12597495131194592, + "rewards/accuracy_reward": 0.08928571734577417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1025 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.4509429931641, + "epoch": 0.30647449779702785, + "grad_norm": 0.1261157989501953, + "kl": 0.232666015625, + "learning_rate": 1.7513237916404896e-05, + "loss": 0.0093, + "reward": 0.6796875298023224, + "reward_std": 0.08220840897411108, + "rewards/accuracy_reward": 0.18750000488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1026 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.7790679931641, + "epoch": 0.3067732058845493, + "grad_norm": 0.23665615916252136, + "kl": 0.23388671875, + "learning_rate": 1.750635055381279e-05, + "loss": 0.0118, + "reward": 0.5323661118745804, + "reward_std": 0.1105874627828598, + "rewards/accuracy_reward": 0.042410717345774174, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1027 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.2545013427734, + "epoch": 0.3070719139720708, + "grad_norm": 0.2425909787416458, + "kl": 0.243408203125, + "learning_rate": 1.7499455025046982e-05, + "loss": 0.0126, + "reward": 0.5368303805589676, + "reward_std": 0.09897982701659203, + "rewards/accuracy_reward": 0.051339288242161274, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 1028 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.8683624267578, + "epoch": 0.30737062205959226, + "grad_norm": 0.14330674707889557, + "kl": 0.2861328125, + "learning_rate": 1.7492551337609134e-05, + "loss": 0.0175, + "reward": 0.5591518133878708, + "reward_std": 0.10058216005563736, + "rewards/accuracy_reward": 0.0758928619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.483258955180645, + "step": 1029 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.1562957763672, + "epoch": 0.30766933014711373, + "grad_norm": 0.2730143666267395, + "kl": 0.32080078125, + "learning_rate": 1.748563949900978e-05, + "loss": 0.0071, + "reward": 0.5541294887661934, + "reward_std": 0.09278821013867855, + "rewards/accuracy_reward": 0.06696428754366934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487165205180645, + "step": 1030 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.3594055175781, + "epoch": 0.3079680382346352, + "grad_norm": 0.18509584665298462, + "kl": 0.3193359375, + "learning_rate": 1.7478719516768324e-05, + "loss": 0.0126, + "reward": 0.5703125298023224, + "reward_std": 0.15533979050815105, + "rewards/accuracy_reward": 0.08928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810268059372902, + "step": 1031 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.8147735595703, + "epoch": 0.30826674632215667, + "grad_norm": 1.3853319883346558, + "kl": 0.42529296875, + "learning_rate": 1.7471791398413026e-05, + "loss": 0.0191, + "reward": 0.5747768208384514, + "reward_std": 0.13019253872334957, + "rewards/accuracy_reward": 0.09375000605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810268133878708, + "step": 1032 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.6339721679688, + "epoch": 0.30856545440967814, + "grad_norm": 0.24162901937961578, + "kl": 0.36083984375, + "learning_rate": 1.7464855151481e-05, + "loss": 0.0199, + "reward": 0.569196455180645, + "reward_std": 0.12756244651973248, + "rewards/accuracy_reward": 0.08258929150179029, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4866071715950966, + "step": 1033 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.2545013427734, + "epoch": 0.3088641624971996, + "grad_norm": 0.18494215607643127, + "kl": 0.33837890625, + "learning_rate": 1.7457910783518204e-05, + "loss": 0.0192, + "reward": 0.6367187798023224, + "reward_std": 0.09775767102837563, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 1034 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.1652221679688, + "epoch": 0.3091628705847211, + "grad_norm": 0.12540695071220398, + "kl": 0.29052734375, + "learning_rate": 1.7450958302079428e-05, + "loss": 0.0128, + "reward": 0.602678582072258, + "reward_std": 0.04556291596964002, + "rewards/accuracy_reward": 0.11160714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1035 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.7745971679688, + "epoch": 0.30946157867224255, + "grad_norm": 0.14613790810108185, + "kl": 0.28466796875, + "learning_rate": 1.7443997714728294e-05, + "loss": 0.0121, + "reward": 0.5585937649011612, + "reward_std": 0.12046443670988083, + "rewards/accuracy_reward": 0.07142857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487165205180645, + "step": 1036 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.0826416015625, + "epoch": 0.309760286759764, + "grad_norm": 0.17331932485103607, + "kl": 0.280517578125, + "learning_rate": 1.7437029029037233e-05, + "loss": 0.0099, + "reward": 0.6428571790456772, + "reward_std": 0.12380258180201054, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 1037 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.3683624267578, + "epoch": 0.3100589948472855, + "grad_norm": 0.16673126816749573, + "kl": 0.2646484375, + "learning_rate": 1.7430052252587498e-05, + "loss": 0.0174, + "reward": 0.6339285969734192, + "reward_std": 0.1490095667541027, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393133878708, + "step": 1038 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.5223693847656, + "epoch": 0.31035770293480697, + "grad_norm": 0.13919693231582642, + "kl": 0.2646484375, + "learning_rate": 1.7423067392969137e-05, + "loss": 0.0091, + "reward": 0.5781250298023224, + "reward_std": 0.06982398126274347, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1039 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.5692291259766, + "epoch": 0.31065641102232844, + "grad_norm": 0.1393003910779953, + "kl": 0.2744140625, + "learning_rate": 1.741607445778099e-05, + "loss": 0.011, + "reward": 0.5948660969734192, + "reward_std": 0.06588304601609707, + "rewards/accuracy_reward": 0.0982142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1040 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.8861999511719, + "epoch": 0.3109551191098499, + "grad_norm": 0.15342721343040466, + "kl": 0.27099609375, + "learning_rate": 1.7409073454630686e-05, + "loss": 0.0035, + "reward": 0.511160746216774, + "reward_std": 0.0692016773391515, + "rewards/accuracy_reward": 0.02008928661234677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1041 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.8638916015625, + "epoch": 0.3112538271973714, + "grad_norm": 0.1421797275543213, + "kl": 0.26904296875, + "learning_rate": 1.7402064391134626e-05, + "loss": 0.0114, + "reward": 0.5652901977300644, + "reward_std": 0.023138975026085973, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1042 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.3348541259766, + "epoch": 0.31155253528489285, + "grad_norm": 0.1965983361005783, + "kl": 0.264404296875, + "learning_rate": 1.7395047274917994e-05, + "loss": 0.0111, + "reward": 0.5507812798023224, + "reward_std": 0.08626285754144192, + "rewards/accuracy_reward": 0.060267860535532236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 1043 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.4107513427734, + "epoch": 0.3118512433724143, + "grad_norm": 0.14811187982559204, + "kl": 0.2763671875, + "learning_rate": 1.7388022113614722e-05, + "loss": 0.0138, + "reward": 0.5474330633878708, + "reward_std": 0.07472634920850396, + "rewards/accuracy_reward": 0.058035717345774174, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489397332072258, + "step": 1044 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.0245971679688, + "epoch": 0.3121499514599358, + "grad_norm": 0.1722760945558548, + "kl": 0.270263671875, + "learning_rate": 1.7380988914867488e-05, + "loss": 0.0128, + "reward": 0.5228794813156128, + "reward_std": 0.052276539616286755, + "rewards/accuracy_reward": 0.03794643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330633878708, + "step": 1045 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.0089721679688, + "epoch": 0.31244865954745726, + "grad_norm": 0.19931258261203766, + "kl": 0.28955078125, + "learning_rate": 1.7373947686327736e-05, + "loss": 0.0139, + "reward": 0.5597098469734192, + "reward_std": 0.09877336397767067, + "rewards/accuracy_reward": 0.07366071827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 1046 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.5513916015625, + "epoch": 0.31274736763497873, + "grad_norm": 0.18080851435661316, + "kl": 0.32421875, + "learning_rate": 1.736689843565562e-05, + "loss": 0.016, + "reward": 0.5926339477300644, + "reward_std": 0.10620656423270702, + "rewards/accuracy_reward": 0.10714285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 1047 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.1808471679688, + "epoch": 0.3130460757225002, + "grad_norm": 0.4474746882915497, + "kl": 0.408203125, + "learning_rate": 1.7359841170520043e-05, + "loss": 0.0224, + "reward": 0.5513393133878708, + "reward_std": 0.10806824639439583, + "rewards/accuracy_reward": 0.06473214784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 1048 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.2991638183594, + "epoch": 0.3133447838100217, + "grad_norm": 0.5866365432739258, + "kl": 0.4306640625, + "learning_rate": 1.7352775898598615e-05, + "loss": 0.0196, + "reward": 0.5625000223517418, + "reward_std": 0.10150550398975611, + "rewards/accuracy_reward": 0.07589286402799189, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 1049 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.2009429931641, + "epoch": 0.31364349189754315, + "grad_norm": 0.24028240144252777, + "kl": 0.30712890625, + "learning_rate": 1.7345702627577655e-05, + "loss": 0.0142, + "reward": 0.5513393133878708, + "reward_std": 0.09477246925234795, + "rewards/accuracy_reward": 0.06026786146685481, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1050 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.9152221679688, + "epoch": 0.3139421999850646, + "grad_norm": 0.1547701358795166, + "kl": 0.275390625, + "learning_rate": 1.7338621365152195e-05, + "loss": 0.0123, + "reward": 0.5982143133878708, + "reward_std": 0.08953498816117644, + "rewards/accuracy_reward": 0.1049107164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 1051 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.7232513427734, + "epoch": 0.3142409080725861, + "grad_norm": 0.11374294757843018, + "kl": 0.254150390625, + "learning_rate": 1.7331532119025953e-05, + "loss": 0.0108, + "reward": 0.5597098469734192, + "reward_std": 0.08355422038584948, + "rewards/accuracy_reward": 0.06473214412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1052 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.0156707763672, + "epoch": 0.31453961616010756, + "grad_norm": 0.10130824148654938, + "kl": 0.24267578125, + "learning_rate": 1.7324434896911332e-05, + "loss": 0.0095, + "reward": 0.684709832072258, + "reward_std": 0.03448617667891085, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1053 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.2455749511719, + "epoch": 0.31483832424762903, + "grad_norm": 0.18298472464084625, + "kl": 0.245849609375, + "learning_rate": 1.7317329706529413e-05, + "loss": 0.0093, + "reward": 0.5825892984867096, + "reward_std": 0.08951570466160774, + "rewards/accuracy_reward": 0.08482143329456449, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1054 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.8348693847656, + "epoch": 0.31513703233515045, + "grad_norm": 0.11225049197673798, + "kl": 0.2451171875, + "learning_rate": 1.731021655560995e-05, + "loss": 0.0105, + "reward": 0.534598246216774, + "reward_std": 0.08892471436411142, + "rewards/accuracy_reward": 0.03794643119908869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1055 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.9911041259766, + "epoch": 0.3154357404226719, + "grad_norm": 0.08796630799770355, + "kl": 0.234619140625, + "learning_rate": 1.7303095451891356e-05, + "loss": 0.0098, + "reward": 0.568638414144516, + "reward_std": 0.06782773230224848, + "rewards/accuracy_reward": 0.06919643143191934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 1056 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.2433471679688, + "epoch": 0.3157344485101934, + "grad_norm": 0.0955105870962143, + "kl": 0.23193359375, + "learning_rate": 1.7295966403120685e-05, + "loss": 0.008, + "reward": 0.5597098469734192, + "reward_std": 0.07128777168691158, + "rewards/accuracy_reward": 0.06250000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1057 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.3326263427734, + "epoch": 0.31603315659771486, + "grad_norm": 0.1624225378036499, + "kl": 0.2607421875, + "learning_rate": 1.728882941705365e-05, + "loss": 0.0042, + "reward": 0.5256696790456772, + "reward_std": 0.06897235149517655, + "rewards/accuracy_reward": 0.029017859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1058 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.9107513427734, + "epoch": 0.31633186468523633, + "grad_norm": 0.10515455156564713, + "kl": 0.2421875, + "learning_rate": 1.7281684501454595e-05, + "loss": 0.0094, + "reward": 0.602678582072258, + "reward_std": 0.049872099654749036, + "rewards/accuracy_reward": 0.10491071618162096, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1059 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.013427734375, + "epoch": 0.3166305727727578, + "grad_norm": 0.11112035810947418, + "kl": 0.233154296875, + "learning_rate": 1.727453166409648e-05, + "loss": 0.0095, + "reward": 0.5496652126312256, + "reward_std": 0.0902593694627285, + "rewards/accuracy_reward": 0.051339289639145136, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1060 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.1250305175781, + "epoch": 0.31692928086027927, + "grad_norm": 0.1367129385471344, + "kl": 0.233154296875, + "learning_rate": 1.72673709127609e-05, + "loss": 0.0096, + "reward": 0.5641741305589676, + "reward_std": 0.06763660279102623, + "rewards/accuracy_reward": 0.0691964328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 1061 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.1986999511719, + "epoch": 0.31722798894780074, + "grad_norm": 0.10896162688732147, + "kl": 0.227783203125, + "learning_rate": 1.726020225523804e-05, + "loss": 0.0111, + "reward": 0.5435267984867096, + "reward_std": 0.0609417837113142, + "rewards/accuracy_reward": 0.04910714505240321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1062 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.0938110351562, + "epoch": 0.3175266970353222, + "grad_norm": 0.08805756270885468, + "kl": 0.230712890625, + "learning_rate": 1.7253025699326706e-05, + "loss": 0.0055, + "reward": 0.6060268133878708, + "reward_std": 0.022321429569274187, + "rewards/accuracy_reward": 0.10937500861473382, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1063 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.2723693847656, + "epoch": 0.3178254051228437, + "grad_norm": 0.10097762942314148, + "kl": 0.22314453125, + "learning_rate": 1.7245841252834282e-05, + "loss": 0.0086, + "reward": 0.5781250149011612, + "reward_std": 0.08193191513419151, + "rewards/accuracy_reward": 0.08035714505240321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1064 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.7143402099609, + "epoch": 0.31812411321036516, + "grad_norm": 0.09400265663862228, + "kl": 0.231689453125, + "learning_rate": 1.723864892357675e-05, + "loss": 0.0098, + "reward": 0.6238839626312256, + "reward_std": 0.08655861672013998, + "rewards/accuracy_reward": 0.12723214412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1065 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.7701263427734, + "epoch": 0.3184228212978866, + "grad_norm": 0.13778595626354218, + "kl": 0.2353515625, + "learning_rate": 1.7231448719378645e-05, + "loss": 0.0103, + "reward": 0.5390625298023224, + "reward_std": 0.10928068123757839, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1066 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.9129791259766, + "epoch": 0.3187215293854081, + "grad_norm": 0.115340456366539, + "kl": 0.22607421875, + "learning_rate": 1.7224240648073097e-05, + "loss": 0.0088, + "reward": 0.6322544813156128, + "reward_std": 0.08111945330165327, + "rewards/accuracy_reward": 0.1383928619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1067 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.9062957763672, + "epoch": 0.31902023747292957, + "grad_norm": 0.14763310551643372, + "kl": 0.224853515625, + "learning_rate": 1.7217024717501772e-05, + "loss": 0.0066, + "reward": 0.583147332072258, + "reward_std": 0.07749558612704277, + "rewards/accuracy_reward": 0.09375000605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 1068 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.1428833007812, + "epoch": 0.31931894556045104, + "grad_norm": 0.14555369317531586, + "kl": 0.2236328125, + "learning_rate": 1.72098009355149e-05, + "loss": 0.007, + "reward": 0.6495535969734192, + "reward_std": 0.17520601861178875, + "rewards/accuracy_reward": 0.15848214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1069 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.2031555175781, + "epoch": 0.3196176536479725, + "grad_norm": 0.15450266003608704, + "kl": 0.2216796875, + "learning_rate": 1.7202569309971245e-05, + "loss": 0.0114, + "reward": 0.6037946715950966, + "reward_std": 0.06285429559648037, + "rewards/accuracy_reward": 0.11830357648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 1070 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.8728179931641, + "epoch": 0.319916361735494, + "grad_norm": 0.12685075402259827, + "kl": 0.2255859375, + "learning_rate": 1.7195329848738113e-05, + "loss": 0.0097, + "reward": 0.5976562649011612, + "reward_std": 0.08064711769111454, + "rewards/accuracy_reward": 0.10714286309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 1071 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.3527221679688, + "epoch": 0.32021506982301545, + "grad_norm": 0.18678654730319977, + "kl": 0.219970703125, + "learning_rate": 1.7188082559691318e-05, + "loss": 0.0098, + "reward": 0.5178571566939354, + "reward_std": 0.08517109043896198, + "rewards/accuracy_reward": 0.042410716181620955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4754464477300644, + "step": 1072 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.4754943847656, + "epoch": 0.3205137779105369, + "grad_norm": 0.18161945044994354, + "kl": 0.222412109375, + "learning_rate": 1.718082745071521e-05, + "loss": 0.0113, + "reward": 0.5881696715950966, + "reward_std": 0.123329047113657, + "rewards/accuracy_reward": 0.1138392947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4743303880095482, + "step": 1073 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.6272735595703, + "epoch": 0.3208124859980584, + "grad_norm": 0.16916069388389587, + "kl": 0.209228515625, + "learning_rate": 1.7173564529702627e-05, + "loss": 0.0103, + "reward": 0.5418526977300644, + "reward_std": 0.11254492029547691, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4793526902794838, + "step": 1074 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.4286193847656, + "epoch": 0.32111119408557987, + "grad_norm": 0.22010602056980133, + "kl": 0.210693359375, + "learning_rate": 1.716629380455493e-05, + "loss": 0.0123, + "reward": 0.5848214477300644, + "reward_std": 0.09943042322993279, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.475446455180645, + "step": 1075 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.3951416015625, + "epoch": 0.32140990217310134, + "grad_norm": 0.2900941073894501, + "kl": 0.213623046875, + "learning_rate": 1.715901528318194e-05, + "loss": 0.0123, + "reward": 0.5390625223517418, + "reward_std": 0.07972473558038473, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.467633955180645, + "step": 1076 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.3281707763672, + "epoch": 0.3217086102606228, + "grad_norm": 0.2477128803730011, + "kl": 0.211181640625, + "learning_rate": 1.715172897350198e-05, + "loss": 0.0161, + "reward": 0.5078125149011612, + "reward_std": 0.1281936839222908, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.463169664144516, + "step": 1077 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.7902374267578, + "epoch": 0.3220073183481443, + "grad_norm": 0.2720833420753479, + "kl": 0.220947265625, + "learning_rate": 1.7144434883441843e-05, + "loss": 0.0188, + "reward": 0.5546875223517418, + "reward_std": 0.15311339125037193, + "rewards/accuracy_reward": 0.08482143492437899, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4698660895228386, + "step": 1078 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.9933471679688, + "epoch": 0.32230602643566575, + "grad_norm": 0.14489024877548218, + "kl": 0.224853515625, + "learning_rate": 1.7137133020936783e-05, + "loss": 0.0097, + "reward": 0.6032366454601288, + "reward_std": 0.15849005803465843, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687723517418, + "step": 1079 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.7455902099609, + "epoch": 0.3226047345231872, + "grad_norm": 0.2052963227033615, + "kl": 0.260498046875, + "learning_rate": 1.712982339393051e-05, + "loss": 0.0272, + "reward": 0.5770089477300644, + "reward_std": 0.14897046331316233, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4676339477300644, + "step": 1080 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.3884429931641, + "epoch": 0.3229034426107087, + "grad_norm": 0.2580326497554779, + "kl": 0.33544921875, + "learning_rate": 1.7122506010375182e-05, + "loss": 0.0449, + "reward": 0.5301339626312256, + "reward_std": 0.15431207045912743, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4587053805589676, + "step": 1081 + }, + { + "clip_ratio": 0.0, + "completion_length": 935.8125610351562, + "epoch": 0.32320215069823016, + "grad_norm": 0.4800731837749481, + "kl": 0.40771484375, + "learning_rate": 1.7115180878231394e-05, + "loss": 0.0482, + "reward": 0.6216518133878708, + "reward_std": 0.12080533802509308, + "rewards/accuracy_reward": 0.1562500111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4654018059372902, + "step": 1082 + }, + { + "clip_ratio": 0.0, + "completion_length": 923.0402069091797, + "epoch": 0.32350085878575163, + "grad_norm": 0.38819342851638794, + "kl": 0.3984375, + "learning_rate": 1.7107848005468177e-05, + "loss": 0.0403, + "reward": 0.6316964626312256, + "reward_std": 0.14231932163238525, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4687500149011612, + "step": 1083 + }, + { + "clip_ratio": 0.0, + "completion_length": 888.3750305175781, + "epoch": 0.3237995668732731, + "grad_norm": 0.23248009383678436, + "kl": 0.3388671875, + "learning_rate": 1.710050740006297e-05, + "loss": 0.0482, + "reward": 0.5876116305589676, + "reward_std": 0.11220782715827227, + "rewards/accuracy_reward": 0.10491071827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482700914144516, + "step": 1084 + }, + { + "clip_ratio": 0.0, + "completion_length": 868.6696624755859, + "epoch": 0.3240982749607946, + "grad_norm": 0.2218216210603714, + "kl": 0.3330078125, + "learning_rate": 1.7093159070001637e-05, + "loss": 0.0299, + "reward": 0.5987723469734192, + "reward_std": 0.1564360186457634, + "rewards/accuracy_reward": 0.11160715157166123, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 1085 + }, + { + "clip_ratio": 0.0, + "completion_length": 871.3861999511719, + "epoch": 0.32439698304831605, + "grad_norm": 0.23458757996559143, + "kl": 0.29248046875, + "learning_rate": 1.7085803023278444e-05, + "loss": 0.0317, + "reward": 0.5714285969734192, + "reward_std": 0.10216698609292507, + "rewards/accuracy_reward": 0.08035714528523386, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 1086 + }, + { + "clip_ratio": 0.0, + "completion_length": 889.8683319091797, + "epoch": 0.3246956911358375, + "grad_norm": 0.1793205887079239, + "kl": 0.26513671875, + "learning_rate": 1.7078439267896042e-05, + "loss": 0.0245, + "reward": 0.5664062798023224, + "reward_std": 0.09018317796289921, + "rewards/accuracy_reward": 0.07366071734577417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 1087 + }, + { + "clip_ratio": 0.0, + "completion_length": 901.3750305175781, + "epoch": 0.324994399223359, + "grad_norm": 0.14552675187587738, + "kl": 0.262939453125, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.0307, + "reward": 0.573660746216774, + "reward_std": 0.10784158855676651, + "rewards/accuracy_reward": 0.08035714598372579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 1088 + }, + { + "clip_ratio": 0.0, + "completion_length": 886.7745971679688, + "epoch": 0.32529310731088046, + "grad_norm": 0.17114759981632233, + "kl": 0.259765625, + "learning_rate": 1.7063688663206172e-05, + "loss": 0.0227, + "reward": 0.6590402126312256, + "reward_std": 0.09371602023020387, + "rewards/accuracy_reward": 0.16294643841683865, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1089 + }, + { + "clip_ratio": 0.0, + "completion_length": 922.2902221679688, + "epoch": 0.32559181539840193, + "grad_norm": 0.18511751294136047, + "kl": 0.26171875, + "learning_rate": 1.705630182994592e-05, + "loss": 0.0119, + "reward": 0.5412946790456772, + "reward_std": 0.0887649916112423, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1090 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.1674652099609, + "epoch": 0.3258905234859234, + "grad_norm": 0.13980697095394135, + "kl": 0.24658203125, + "learning_rate": 1.7048907320120867e-05, + "loss": 0.0112, + "reward": 0.5552455484867096, + "reward_std": 0.07977192406542599, + "rewards/accuracy_reward": 0.06026786100119352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1091 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.7567443847656, + "epoch": 0.32618923157344487, + "grad_norm": 0.09707117825746536, + "kl": 0.248291015625, + "learning_rate": 1.7041505141775517e-05, + "loss": 0.0121, + "reward": 0.5513393133878708, + "reward_std": 0.041414158418774605, + "rewards/accuracy_reward": 0.05357143026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1092 + }, + { + "clip_ratio": 0.0, + "completion_length": 979.4263763427734, + "epoch": 0.32648793966096634, + "grad_norm": 0.1282774955034256, + "kl": 0.248779296875, + "learning_rate": 1.7034095302962716e-05, + "loss": 0.0053, + "reward": 0.5948661044239998, + "reward_std": 0.1189840491861105, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.496651791036129, + "step": 1093 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.4553985595703, + "epoch": 0.3267866477484878, + "grad_norm": 0.0750611424446106, + "kl": 0.2412109375, + "learning_rate": 1.7026677811743638e-05, + "loss": 0.0104, + "reward": 0.6735491305589676, + "reward_std": 0.018674688413739204, + "rewards/accuracy_reward": 0.1763392947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1094 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.5736999511719, + "epoch": 0.3270853558360093, + "grad_norm": 0.0924379751086235, + "kl": 0.22705078125, + "learning_rate": 1.701925267618779e-05, + "loss": 0.0089, + "reward": 0.6121651977300644, + "reward_std": 0.04997548833489418, + "rewards/accuracy_reward": 0.11383928963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1095 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.1964569091797, + "epoch": 0.32738406392353075, + "grad_norm": 0.16710762679576874, + "kl": 0.243896484375, + "learning_rate": 1.7011819904372992e-05, + "loss": 0.0102, + "reward": 0.666294664144516, + "reward_std": 0.12903784960508347, + "rewards/accuracy_reward": 0.1674107275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1096 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.3036041259766, + "epoch": 0.3276827720110522, + "grad_norm": 0.12108688056468964, + "kl": 0.23876953125, + "learning_rate": 1.700437950438537e-05, + "loss": 0.0131, + "reward": 0.5831473618745804, + "reward_std": 0.10147538525052369, + "rewards/accuracy_reward": 0.08482143469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1097 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.1451416015625, + "epoch": 0.32798148009857364, + "grad_norm": 0.10973917692899704, + "kl": 0.2373046875, + "learning_rate": 1.699693148431935e-05, + "loss": 0.0135, + "reward": 0.6356027126312256, + "reward_std": 0.1033286303281784, + "rewards/accuracy_reward": 0.1383928582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1098 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.4152221679688, + "epoch": 0.3282801881860951, + "grad_norm": 0.14621810615062714, + "kl": 0.2255859375, + "learning_rate": 1.698947585227765e-05, + "loss": 0.0108, + "reward": 0.5652902126312256, + "reward_std": 0.0697524193674326, + "rewards/accuracy_reward": 0.06919643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1099 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.4308319091797, + "epoch": 0.3285788962736166, + "grad_norm": 0.10613299161195755, + "kl": 0.234619140625, + "learning_rate": 1.6982012616371263e-05, + "loss": 0.0103, + "reward": 0.5697544813156128, + "reward_std": 0.08347293362021446, + "rewards/accuracy_reward": 0.07142857694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1100 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.5915679931641, + "epoch": 0.32887760436113805, + "grad_norm": 0.09064580500125885, + "kl": 0.23046875, + "learning_rate": 1.6974541784719458e-05, + "loss": 0.0076, + "reward": 0.599888414144516, + "reward_std": 0.05909706326201558, + "rewards/accuracy_reward": 0.1026785783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1101 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.1004943847656, + "epoch": 0.3291763124486595, + "grad_norm": 0.13685393333435059, + "kl": 0.23388671875, + "learning_rate": 1.6967063365449774e-05, + "loss": 0.0079, + "reward": 0.5329241305589676, + "reward_std": 0.03939946414902806, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1102 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.8906555175781, + "epoch": 0.329475020536181, + "grad_norm": 0.1228465810418129, + "kl": 0.234130859375, + "learning_rate": 1.695957736669799e-05, + "loss": 0.0093, + "reward": 0.5385044813156128, + "reward_std": 0.03533772681839764, + "rewards/accuracy_reward": 0.0424107164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1103 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.7009429931641, + "epoch": 0.32977372862370247, + "grad_norm": 0.09066915512084961, + "kl": 0.23388671875, + "learning_rate": 1.6952083796608144e-05, + "loss": 0.0036, + "reward": 0.5563616305589676, + "reward_std": 0.03776741703040898, + "rewards/accuracy_reward": 0.06026785937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1104 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.3951263427734, + "epoch": 0.33007243671122394, + "grad_norm": 0.11683665215969086, + "kl": 0.23681640625, + "learning_rate": 1.694458266333251e-05, + "loss": 0.0107, + "reward": 0.6534598618745804, + "reward_std": 0.09122015535831451, + "rewards/accuracy_reward": 0.15848214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1105 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.9866333007812, + "epoch": 0.3303711447987454, + "grad_norm": 0.12565165758132935, + "kl": 0.23681640625, + "learning_rate": 1.6937073975031576e-05, + "loss": 0.0103, + "reward": 0.6467634290456772, + "reward_std": 0.08728904603049159, + "rewards/accuracy_reward": 0.1540178693830967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 1106 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.8281707763672, + "epoch": 0.3306698528862669, + "grad_norm": 0.14961518347263336, + "kl": 0.234375, + "learning_rate": 1.6929557739874064e-05, + "loss": 0.0111, + "reward": 0.674107164144516, + "reward_std": 0.11380718648433685, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 1107 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.5759429931641, + "epoch": 0.33096856097378835, + "grad_norm": 0.20499388873577118, + "kl": 0.243896484375, + "learning_rate": 1.69220339660369e-05, + "loss": 0.0141, + "reward": 0.5909598544239998, + "reward_std": 0.060434792656451464, + "rewards/accuracy_reward": 0.10491071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 1108 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.7053985595703, + "epoch": 0.3312672690613098, + "grad_norm": 0.1522437036037445, + "kl": 0.251220703125, + "learning_rate": 1.6914502661705216e-05, + "loss": 0.0126, + "reward": 0.6607143133878708, + "reward_std": 0.0915249390527606, + "rewards/accuracy_reward": 0.17410714831203222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 1109 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.6875457763672, + "epoch": 0.3315659771488313, + "grad_norm": 0.16015756130218506, + "kl": 0.250244140625, + "learning_rate": 1.6906963835072325e-05, + "loss": 0.0063, + "reward": 0.5630580633878708, + "reward_std": 0.1610633172094822, + "rewards/accuracy_reward": 0.08258928963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687723517418, + "step": 1110 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.1674499511719, + "epoch": 0.33186468523635276, + "grad_norm": 0.14642877876758575, + "kl": 0.246337890625, + "learning_rate": 1.6899417494339737e-05, + "loss": 0.0122, + "reward": 0.6205357387661934, + "reward_std": 0.10221802443265915, + "rewards/accuracy_reward": 0.1406250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4799107313156128, + "step": 1111 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.8437957763672, + "epoch": 0.33216339332387423, + "grad_norm": 0.1771167367696762, + "kl": 0.265869140625, + "learning_rate": 1.6891863647717135e-05, + "loss": 0.0138, + "reward": 0.6250000223517418, + "reward_std": 0.15339933894574642, + "rewards/accuracy_reward": 0.14285715413279831, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482142873108387, + "step": 1112 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.7768402099609, + "epoch": 0.3324621014113957, + "grad_norm": 0.1914633810520172, + "kl": 0.279541015625, + "learning_rate": 1.688430230342236e-05, + "loss": 0.0165, + "reward": 0.5390625223517418, + "reward_std": 0.10441018734127283, + "rewards/accuracy_reward": 0.06250000209547579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4765625223517418, + "step": 1113 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.4754943847656, + "epoch": 0.3327608094989172, + "grad_norm": 0.213156059384346, + "kl": 0.264404296875, + "learning_rate": 1.6876733469681407e-05, + "loss": 0.0211, + "reward": 0.504464328289032, + "reward_std": 0.09143719542771578, + "rewards/accuracy_reward": 0.020089286845177412, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750298023224, + "step": 1114 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.3929138183594, + "epoch": 0.33305951758643865, + "grad_norm": 0.17321635782718658, + "kl": 0.251953125, + "learning_rate": 1.6869157154728437e-05, + "loss": 0.0144, + "reward": 0.6082589477300644, + "reward_std": 0.14645712822675705, + "rewards/accuracy_reward": 0.12946428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.478794664144516, + "step": 1115 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.5335235595703, + "epoch": 0.3333582256739601, + "grad_norm": 0.14786961674690247, + "kl": 0.252197265625, + "learning_rate": 1.686157336680573e-05, + "loss": 0.0128, + "reward": 0.6813616454601288, + "reward_std": 0.14006441179662943, + "rewards/accuracy_reward": 0.2008928656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687649011612, + "step": 1116 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.6429138183594, + "epoch": 0.3336569337614816, + "grad_norm": 0.18281829357147217, + "kl": 0.265380859375, + "learning_rate": 1.685398211416371e-05, + "loss": 0.0143, + "reward": 0.5011161044239998, + "reward_std": 0.1040105503052473, + "rewards/accuracy_reward": 0.020089287078008056, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810268133878708, + "step": 1117 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.3817596435547, + "epoch": 0.33395564184900306, + "grad_norm": 0.16028372943401337, + "kl": 0.2578125, + "learning_rate": 1.6846383405060905e-05, + "loss": 0.0153, + "reward": 0.5864955484867096, + "reward_std": 0.10100538213737309, + "rewards/accuracy_reward": 0.09598214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 1118 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.8616333007812, + "epoch": 0.33425434993652453, + "grad_norm": 0.19833898544311523, + "kl": 0.21728515625, + "learning_rate": 1.683877724776398e-05, + "loss": 0.0122, + "reward": 0.5401785969734192, + "reward_std": 0.07426655665040016, + "rewards/accuracy_reward": 0.051339287078008056, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 1119 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.5513916015625, + "epoch": 0.334553058024046, + "grad_norm": 0.12383292615413666, + "kl": 0.228515625, + "learning_rate": 1.6831163650547678e-05, + "loss": 0.0103, + "reward": 0.5373884215950966, + "reward_std": 0.056798521894961596, + "rewards/accuracy_reward": 0.04687500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 1120 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.9643249511719, + "epoch": 0.3348517661115675, + "grad_norm": 0.11756787449121475, + "kl": 0.2392578125, + "learning_rate": 1.6823542621694852e-05, + "loss": 0.0112, + "reward": 0.5373884290456772, + "reward_std": 0.05517753632739186, + "rewards/accuracy_reward": 0.044642857974395156, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1121 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.3058471679688, + "epoch": 0.33515047419908894, + "grad_norm": 0.1315733641386032, + "kl": 0.236328125, + "learning_rate": 1.681591416949643e-05, + "loss": 0.0109, + "reward": 0.5323661044239998, + "reward_std": 0.08326821308583021, + "rewards/accuracy_reward": 0.042410716181620955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1122 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.1317443847656, + "epoch": 0.3354491822866104, + "grad_norm": 0.12151264399290085, + "kl": 0.236083984375, + "learning_rate": 1.6808278302251425e-05, + "loss": 0.013, + "reward": 0.5613839477300644, + "reward_std": 0.0833771862089634, + "rewards/accuracy_reward": 0.0669642873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 1123 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.7723693847656, + "epoch": 0.3357478903741319, + "grad_norm": 0.12912923097610474, + "kl": 0.2236328125, + "learning_rate": 1.6800635028266908e-05, + "loss": 0.0116, + "reward": 0.5909598469734192, + "reward_std": 0.06838063942268491, + "rewards/accuracy_reward": 0.09821429057046771, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1124 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.2879791259766, + "epoch": 0.33604659846165336, + "grad_norm": 0.42561694979667664, + "kl": 0.239990234375, + "learning_rate": 1.679298435585802e-05, + "loss": 0.0115, + "reward": 0.5814732313156128, + "reward_std": 0.06130589474923909, + "rewards/accuracy_reward": 0.08482143026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1125 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.6629943847656, + "epoch": 0.3363453065491748, + "grad_norm": 0.20712244510650635, + "kl": 0.247314453125, + "learning_rate": 1.678532629334793e-05, + "loss": 0.0111, + "reward": 0.594308078289032, + "reward_std": 0.121413454413414, + "rewards/accuracy_reward": 0.09821428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1126 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.7433624267578, + "epoch": 0.3366440146366963, + "grad_norm": 0.19410333037376404, + "kl": 0.247314453125, + "learning_rate": 1.677766084906787e-05, + "loss": 0.016, + "reward": 0.5357143059372902, + "reward_std": 0.09162657801061869, + "rewards/accuracy_reward": 0.04687500186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 1127 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.3125457763672, + "epoch": 0.33694272272421777, + "grad_norm": 0.12173805385828018, + "kl": 0.27685546875, + "learning_rate": 1.6769988031357086e-05, + "loss": 0.0149, + "reward": 0.5429687723517418, + "reward_std": 0.06962088914588094, + "rewards/accuracy_reward": 0.0535714328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 1128 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.216552734375, + "epoch": 0.33724143081173924, + "grad_norm": 0.2637220621109009, + "kl": 0.267578125, + "learning_rate": 1.6762307848562858e-05, + "loss": 0.0117, + "reward": 0.537388414144516, + "reward_std": 0.03423754218965769, + "rewards/accuracy_reward": 0.0424107164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1129 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.7187957763672, + "epoch": 0.3375401388992607, + "grad_norm": 0.17960543930530548, + "kl": 0.242919921875, + "learning_rate": 1.6754620309040464e-05, + "loss": 0.0096, + "reward": 0.6143973469734192, + "reward_std": 0.11301467986777425, + "rewards/accuracy_reward": 0.11830358020961285, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1130 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.6853332519531, + "epoch": 0.3378388469867822, + "grad_norm": 0.1253282129764557, + "kl": 0.23486328125, + "learning_rate": 1.6746925421153196e-05, + "loss": 0.0106, + "reward": 0.6104911118745804, + "reward_std": 0.11761934496462345, + "rewards/accuracy_reward": 0.11607143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1131 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.9286041259766, + "epoch": 0.33813755507430365, + "grad_norm": 0.16088540852069855, + "kl": 0.239501953125, + "learning_rate": 1.6739223193272346e-05, + "loss": 0.012, + "reward": 0.5223214626312256, + "reward_std": 0.10322852339595556, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4866071715950966, + "step": 1132 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.3862152099609, + "epoch": 0.3384362631618251, + "grad_norm": 0.5324940085411072, + "kl": 0.2294921875, + "learning_rate": 1.6731513633777173e-05, + "loss": 0.0101, + "reward": 0.6333705633878708, + "reward_std": 0.07143450528383255, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 1133 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.5424499511719, + "epoch": 0.3387349712493466, + "grad_norm": 0.18895886838436127, + "kl": 0.224609375, + "learning_rate": 1.6723796751054925e-05, + "loss": 0.0041, + "reward": 0.5195312798023224, + "reward_std": 0.07984000630676746, + "rewards/accuracy_reward": 0.026785714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1134 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.6629943847656, + "epoch": 0.33903367933686807, + "grad_norm": 0.12202315777540207, + "kl": 0.22314453125, + "learning_rate": 1.6716072553500816e-05, + "loss": 0.0114, + "reward": 0.595982164144516, + "reward_std": 0.11986478976905346, + "rewards/accuracy_reward": 0.10491071827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1135 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.4040679931641, + "epoch": 0.33933238742438954, + "grad_norm": 0.14441193640232086, + "kl": 0.22216796875, + "learning_rate": 1.6708341049518016e-05, + "loss": 0.0117, + "reward": 0.537388414144516, + "reward_std": 0.10958543652668595, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 1136 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.7678985595703, + "epoch": 0.339631095511911, + "grad_norm": 0.1433192491531372, + "kl": 0.2216796875, + "learning_rate": 1.670060224751764e-05, + "loss": 0.0094, + "reward": 0.5390625149011612, + "reward_std": 0.05023663095198572, + "rewards/accuracy_reward": 0.04687500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1137 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.5670013427734, + "epoch": 0.3399298035994325, + "grad_norm": 0.12289411574602127, + "kl": 0.2138671875, + "learning_rate": 1.669285615591875e-05, + "loss": 0.009, + "reward": 0.5156250298023224, + "reward_std": 0.09328151494264603, + "rewards/accuracy_reward": 0.02455357206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1138 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.3080749511719, + "epoch": 0.34022851168695395, + "grad_norm": 0.3211354613304138, + "kl": 0.22216796875, + "learning_rate": 1.668510278314833e-05, + "loss": 0.0095, + "reward": 0.535714328289032, + "reward_std": 0.060141791589558125, + "rewards/accuracy_reward": 0.04687500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 1139 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.2254943847656, + "epoch": 0.3405272197744754, + "grad_norm": 0.14046789705753326, + "kl": 0.22021484375, + "learning_rate": 1.6677342137641294e-05, + "loss": 0.0123, + "reward": 0.603794664144516, + "reward_std": 0.11566056124866009, + "rewards/accuracy_reward": 0.11160714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1140 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.2455749511719, + "epoch": 0.3408259278619969, + "grad_norm": 0.1524757444858551, + "kl": 0.223876953125, + "learning_rate": 1.666957422784046e-05, + "loss": 0.0124, + "reward": 0.651785746216774, + "reward_std": 0.13160597532987595, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393133878708, + "step": 1141 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.0402374267578, + "epoch": 0.3411246359495183, + "grad_norm": 0.13124968111515045, + "kl": 0.2333984375, + "learning_rate": 1.666179906219656e-05, + "loss": 0.0103, + "reward": 0.5200892984867096, + "reward_std": 0.060150225879624486, + "rewards/accuracy_reward": 0.026785716181620955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1142 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.138427734375, + "epoch": 0.3414233440370398, + "grad_norm": 0.16341860592365265, + "kl": 0.235595703125, + "learning_rate": 1.6654016649168203e-05, + "loss": 0.0115, + "reward": 0.4983259215950966, + "reward_std": 0.051780270878225565, + "rewards/accuracy_reward": 0.006696428870782256, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1143 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.9822082519531, + "epoch": 0.34172205212456125, + "grad_norm": 0.13723155856132507, + "kl": 0.240234375, + "learning_rate": 1.66462269972219e-05, + "loss": 0.0158, + "reward": 0.5552455633878708, + "reward_std": 0.1183479018509388, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 1144 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.1942291259766, + "epoch": 0.3420207602120827, + "grad_norm": 0.22497116029262543, + "kl": 0.24462890625, + "learning_rate": 1.6638430114832015e-05, + "loss": 0.0119, + "reward": 0.6445312649011612, + "reward_std": 0.1035579051822424, + "rewards/accuracy_reward": 0.15178572107106447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 1145 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.185302734375, + "epoch": 0.3423194682996042, + "grad_norm": 0.17560715973377228, + "kl": 0.26025390625, + "learning_rate": 1.6630626010480807e-05, + "loss": 0.011, + "reward": 0.5758928805589676, + "reward_std": 0.08383504068478942, + "rewards/accuracy_reward": 0.0825892873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1146 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.0781707763672, + "epoch": 0.34261817638712566, + "grad_norm": 0.13795819878578186, + "kl": 0.251220703125, + "learning_rate": 1.662281469265837e-05, + "loss": 0.011, + "reward": 0.6422991380095482, + "reward_std": 0.04448678926564753, + "rewards/accuracy_reward": 0.1495535832364112, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1147 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.6741485595703, + "epoch": 0.34291688447464713, + "grad_norm": 0.2117270976305008, + "kl": 0.2353515625, + "learning_rate": 1.6614996169862654e-05, + "loss": 0.0133, + "reward": 0.5926339477300644, + "reward_std": 0.11431776825338602, + "rewards/accuracy_reward": 0.10491071944124997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 1148 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.7344207763672, + "epoch": 0.3432155925621686, + "grad_norm": 0.16363483667373657, + "kl": 0.237548828125, + "learning_rate": 1.6607170450599445e-05, + "loss": 0.0101, + "reward": 0.5786830633878708, + "reward_std": 0.09237025305628777, + "rewards/accuracy_reward": 0.08705357578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1149 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.3125457763672, + "epoch": 0.3435143006496901, + "grad_norm": 0.14692093431949615, + "kl": 0.22021484375, + "learning_rate": 1.6599337543382356e-05, + "loss": 0.0154, + "reward": 0.5775669813156128, + "reward_std": 0.10173472249880433, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1150 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.357177734375, + "epoch": 0.34381300873721155, + "grad_norm": 0.12003330886363983, + "kl": 0.220947265625, + "learning_rate": 1.6591497456732827e-05, + "loss": 0.0112, + "reward": 0.5641741380095482, + "reward_std": 0.0690289419144392, + "rewards/accuracy_reward": 0.07366071827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 1151 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.5469207763672, + "epoch": 0.344111716824733, + "grad_norm": 0.12349299341440201, + "kl": 0.208740234375, + "learning_rate": 1.6583650199180097e-05, + "loss": 0.0086, + "reward": 0.5435268133878708, + "reward_std": 0.05237732362002134, + "rewards/accuracy_reward": 0.04687500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1152 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.3370971679688, + "epoch": 0.3444104249122545, + "grad_norm": 0.16795849800109863, + "kl": 0.215576171875, + "learning_rate": 1.6575795779261222e-05, + "loss": 0.0111, + "reward": 0.697544664144516, + "reward_std": 0.06852221209555864, + "rewards/accuracy_reward": 0.20758929708972573, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1153 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.1272735595703, + "epoch": 0.34470913299977596, + "grad_norm": 0.12294846773147583, + "kl": 0.206298828125, + "learning_rate": 1.6567934205521036e-05, + "loss": 0.0109, + "reward": 0.6612723618745804, + "reward_std": 0.09730509482324123, + "rewards/accuracy_reward": 0.16517858300358057, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1154 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.9888763427734, + "epoch": 0.34500784108729743, + "grad_norm": 0.13302907347679138, + "kl": 0.215576171875, + "learning_rate": 1.656006548651216e-05, + "loss": 0.0155, + "reward": 0.6032366305589676, + "reward_std": 0.13118955679237843, + "rewards/accuracy_reward": 0.11160714598372579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491629496216774, + "step": 1155 + }, + { + "clip_ratio": 0.0, + "completion_length": 993.2254943847656, + "epoch": 0.3453065491748189, + "grad_norm": 0.2071894407272339, + "kl": 0.213134765625, + "learning_rate": 1.6552189630794987e-05, + "loss": 0.0142, + "reward": 0.599888414144516, + "reward_std": 0.08264228934422135, + "rewards/accuracy_reward": 0.1093750074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 1156 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.6183319091797, + "epoch": 0.34560525726234037, + "grad_norm": 0.34746724367141724, + "kl": 0.228515625, + "learning_rate": 1.6544306646937683e-05, + "loss": 0.0116, + "reward": 0.5396205633878708, + "reward_std": 0.07913130987435579, + "rewards/accuracy_reward": 0.0468750037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1157 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.5960235595703, + "epoch": 0.34590396534986184, + "grad_norm": 0.1403415948152542, + "kl": 0.23046875, + "learning_rate": 1.6536416543516157e-05, + "loss": 0.018, + "reward": 0.572544664144516, + "reward_std": 0.06451357621699572, + "rewards/accuracy_reward": 0.08258928963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1158 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.3170013427734, + "epoch": 0.3462026734373833, + "grad_norm": 0.3597828149795532, + "kl": 0.301025390625, + "learning_rate": 1.652851932911407e-05, + "loss": 0.0231, + "reward": 0.5569196790456772, + "reward_std": 0.098233537748456, + "rewards/accuracy_reward": 0.06919643143191934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 1159 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.0870971679688, + "epoch": 0.3465013815249048, + "grad_norm": 0.13880117237567902, + "kl": 0.239013671875, + "learning_rate": 1.6520615012322815e-05, + "loss": 0.0142, + "reward": 0.6121651977300644, + "reward_std": 0.03740669181570411, + "rewards/accuracy_reward": 0.1183035783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1160 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.3527069091797, + "epoch": 0.34680008961242625, + "grad_norm": 0.1407414823770523, + "kl": 0.24267578125, + "learning_rate": 1.6512703601741517e-05, + "loss": 0.0175, + "reward": 0.5312500298023224, + "reward_std": 0.08275718986988068, + "rewards/accuracy_reward": 0.03794643119908869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1161 + }, + { + "clip_ratio": 0.0, + "completion_length": 937.4665679931641, + "epoch": 0.3470987976999477, + "grad_norm": 0.1545347422361374, + "kl": 0.2314453125, + "learning_rate": 1.6504785105977012e-05, + "loss": 0.0122, + "reward": 0.6875000298023224, + "reward_std": 0.04036256507970393, + "rewards/accuracy_reward": 0.1919642947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1162 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.2812805175781, + "epoch": 0.3473975057874692, + "grad_norm": 0.19269677996635437, + "kl": 0.255615234375, + "learning_rate": 1.649685953364385e-05, + "loss": 0.017, + "reward": 0.5876116305589676, + "reward_std": 0.07208646275103092, + "rewards/accuracy_reward": 0.09375000419095159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1163 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.2545013427734, + "epoch": 0.34769621387499067, + "grad_norm": 0.17232006788253784, + "kl": 0.241943359375, + "learning_rate": 1.6488926893364276e-05, + "loss": 0.011, + "reward": 0.5775669813156128, + "reward_std": 0.08557257428765297, + "rewards/accuracy_reward": 0.08258928637951612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1164 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.1295013427734, + "epoch": 0.34799492196251214, + "grad_norm": 0.1550418734550476, + "kl": 0.22900390625, + "learning_rate": 1.6480987193768227e-05, + "loss": 0.0135, + "reward": 0.6049107313156128, + "reward_std": 0.07558072195388377, + "rewards/accuracy_reward": 0.10714286309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1165 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.8460083007812, + "epoch": 0.3482936300500336, + "grad_norm": 0.16698944568634033, + "kl": 0.223388671875, + "learning_rate": 1.6473040443493314e-05, + "loss": 0.011, + "reward": 0.741629496216774, + "reward_std": 0.12031181901693344, + "rewards/accuracy_reward": 0.2455357275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1166 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.9777221679688, + "epoch": 0.3485923381375551, + "grad_norm": 0.0938696563243866, + "kl": 0.20361328125, + "learning_rate": 1.6465086651184826e-05, + "loss": 0.0111, + "reward": 0.5781250298023224, + "reward_std": 0.06500995275564492, + "rewards/accuracy_reward": 0.0803571455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1167 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.5000305175781, + "epoch": 0.34889104622507655, + "grad_norm": 0.11565446853637695, + "kl": 0.216064453125, + "learning_rate": 1.645712582549571e-05, + "loss": 0.012, + "reward": 0.532366082072258, + "reward_std": 0.07575735077261925, + "rewards/accuracy_reward": 0.035714286379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1168 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.4732666015625, + "epoch": 0.349189754312598, + "grad_norm": 0.11242832243442535, + "kl": 0.2060546875, + "learning_rate": 1.644915797508656e-05, + "loss": 0.0117, + "reward": 0.541294664144516, + "reward_std": 0.07851662673056126, + "rewards/accuracy_reward": 0.04464286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1169 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.9620971679688, + "epoch": 0.3494884624001195, + "grad_norm": 0.10027069598436356, + "kl": 0.201171875, + "learning_rate": 1.6441183108625617e-05, + "loss": 0.0129, + "reward": 0.6060268133878708, + "reward_std": 0.06930232793092728, + "rewards/accuracy_reward": 0.10937500419095159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1170 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.5759429931641, + "epoch": 0.34978717048764096, + "grad_norm": 0.11730416864156723, + "kl": 0.189453125, + "learning_rate": 1.6433201234788758e-05, + "loss": 0.0069, + "reward": 0.5747768133878708, + "reward_std": 0.11144038196653128, + "rewards/accuracy_reward": 0.07812500465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1171 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.4844055175781, + "epoch": 0.35008587857516243, + "grad_norm": 0.08188337087631226, + "kl": 0.1923828125, + "learning_rate": 1.6425212362259474e-05, + "loss": 0.0098, + "reward": 0.6121651977300644, + "reward_std": 0.03348214412108064, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1172 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.6518249511719, + "epoch": 0.3503845866626839, + "grad_norm": 0.07000838220119476, + "kl": 0.1826171875, + "learning_rate": 1.641721649972888e-05, + "loss": 0.0077, + "reward": 0.564732164144516, + "reward_std": 0.03495405800640583, + "rewards/accuracy_reward": 0.06473214481957257, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1173 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.5223693847656, + "epoch": 0.3506832947502054, + "grad_norm": 0.09345728904008865, + "kl": 0.189453125, + "learning_rate": 1.640921365589569e-05, + "loss": 0.0074, + "reward": 0.537388414144516, + "reward_std": 0.09264109679497778, + "rewards/accuracy_reward": 0.04017857392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1174 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.6473693847656, + "epoch": 0.35098200283772685, + "grad_norm": 0.09763197600841522, + "kl": 0.19482421875, + "learning_rate": 1.6401203839466212e-05, + "loss": 0.0089, + "reward": 0.616629496216774, + "reward_std": 0.09308461379259825, + "rewards/accuracy_reward": 0.11830357694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1175 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.1250457763672, + "epoch": 0.3512807109252483, + "grad_norm": 0.11076629161834717, + "kl": 0.201904296875, + "learning_rate": 1.6393187059154344e-05, + "loss": 0.0086, + "reward": 0.5786830633878708, + "reward_std": 0.08824845147319138, + "rewards/accuracy_reward": 0.0825892873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1176 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.8527221679688, + "epoch": 0.3515794190127698, + "grad_norm": 0.10511521995067596, + "kl": 0.195068359375, + "learning_rate": 1.6385163323681554e-05, + "loss": 0.0082, + "reward": 0.5719866156578064, + "reward_std": 0.08510634489357471, + "rewards/accuracy_reward": 0.07366071734577417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1177 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.5558471679688, + "epoch": 0.35187812710029126, + "grad_norm": 0.12272246181964874, + "kl": 0.206787109375, + "learning_rate": 1.637713264177688e-05, + "loss": 0.0062, + "reward": 0.6601562798023224, + "reward_std": 0.09780178684741259, + "rewards/accuracy_reward": 0.16517858067527413, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1178 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.4263763427734, + "epoch": 0.35217683518781273, + "grad_norm": 0.12557841837406158, + "kl": 0.201171875, + "learning_rate": 1.636909502217692e-05, + "loss": 0.0083, + "reward": 0.5429687798023224, + "reward_std": 0.0649910718202591, + "rewards/accuracy_reward": 0.049107145285233855, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1179 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.6339569091797, + "epoch": 0.3524755432753342, + "grad_norm": 0.09028876572847366, + "kl": 0.20458984375, + "learning_rate": 1.6361050473625813e-05, + "loss": 0.0083, + "reward": 0.6623884290456772, + "reward_std": 0.05058616609312594, + "rewards/accuracy_reward": 0.1651785832364112, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1180 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.7522735595703, + "epoch": 0.3527742513628557, + "grad_norm": 0.12274350970983505, + "kl": 0.20263671875, + "learning_rate": 1.6352999004875242e-05, + "loss": 0.0083, + "reward": 0.5574776977300644, + "reward_std": 0.09193467535078526, + "rewards/accuracy_reward": 0.06473214644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1181 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.2098541259766, + "epoch": 0.35307295945037714, + "grad_norm": 0.1294180303812027, + "kl": 0.213134765625, + "learning_rate": 1.6344940624684413e-05, + "loss": 0.0103, + "reward": 0.6367187798023224, + "reward_std": 0.07369113527238369, + "rewards/accuracy_reward": 0.145089291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1182 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.0848693847656, + "epoch": 0.3533716675378986, + "grad_norm": 0.1340627819299698, + "kl": 0.197265625, + "learning_rate": 1.6336875341820052e-05, + "loss": 0.0096, + "reward": 0.5362723618745804, + "reward_std": 0.130474423058331, + "rewards/accuracy_reward": 0.04464286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1183 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.0245819091797, + "epoch": 0.3536703756254201, + "grad_norm": 0.11865610629320145, + "kl": 0.20166015625, + "learning_rate": 1.6328803165056405e-05, + "loss": 0.0051, + "reward": 0.583147332072258, + "reward_std": 0.06864528730511665, + "rewards/accuracy_reward": 0.08928571874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1184 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.685302734375, + "epoch": 0.3539690837129415, + "grad_norm": 0.10868717730045319, + "kl": 0.201171875, + "learning_rate": 1.63207241031752e-05, + "loss": 0.0089, + "reward": 0.5094866305589676, + "reward_std": 0.04992468934506178, + "rewards/accuracy_reward": 0.01562500116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1185 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.9107360839844, + "epoch": 0.35426779180046297, + "grad_norm": 0.09075311571359634, + "kl": 0.191650390625, + "learning_rate": 1.631263816496567e-05, + "loss": 0.0075, + "reward": 0.5262277126312256, + "reward_std": 0.07275036163628101, + "rewards/accuracy_reward": 0.029017859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1186 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.4866485595703, + "epoch": 0.35456649988798444, + "grad_norm": 0.10367558151483536, + "kl": 0.205322265625, + "learning_rate": 1.630454535922452e-05, + "loss": 0.0051, + "reward": 0.5379464626312256, + "reward_std": 0.09397775051183999, + "rewards/accuracy_reward": 0.0424107164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1187 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.872802734375, + "epoch": 0.3548652079755059, + "grad_norm": 0.10577020794153214, + "kl": 0.201171875, + "learning_rate": 1.6296445694755937e-05, + "loss": 0.008, + "reward": 0.5145089477300644, + "reward_std": 0.0631710549350828, + "rewards/accuracy_reward": 0.017857144121080637, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1188 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.372802734375, + "epoch": 0.3551639160630274, + "grad_norm": 0.09979157894849777, + "kl": 0.19482421875, + "learning_rate": 1.628833918037155e-05, + "loss": 0.0076, + "reward": 0.6199776828289032, + "reward_std": 0.05527094006538391, + "rewards/accuracy_reward": 0.12276786286383867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1189 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.3594207763672, + "epoch": 0.35546262415054886, + "grad_norm": 0.1272897720336914, + "kl": 0.197998046875, + "learning_rate": 1.628022582489046e-05, + "loss": 0.0107, + "reward": 0.6199777275323868, + "reward_std": 0.12060297094285488, + "rewards/accuracy_reward": 0.1250000058207661, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1190 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.7879943847656, + "epoch": 0.3557613322380703, + "grad_norm": 0.1266815960407257, + "kl": 0.207275390625, + "learning_rate": 1.6272105637139203e-05, + "loss": 0.0081, + "reward": 0.6339285969734192, + "reward_std": 0.1238986887037754, + "rewards/accuracy_reward": 0.1383928619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1191 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.2500457763672, + "epoch": 0.3560600403255918, + "grad_norm": 0.09671929478645325, + "kl": 0.189453125, + "learning_rate": 1.6263978625951743e-05, + "loss": 0.0086, + "reward": 0.6077009290456772, + "reward_std": 0.10089466348290443, + "rewards/accuracy_reward": 0.1093750074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1192 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.4419860839844, + "epoch": 0.35635874841311327, + "grad_norm": 0.1544579267501831, + "kl": 0.2138671875, + "learning_rate": 1.6255844800169472e-05, + "loss": 0.0062, + "reward": 0.599888414144516, + "reward_std": 0.07606650236994028, + "rewards/accuracy_reward": 0.10267857694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1193 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.3884429931641, + "epoch": 0.35665745650063474, + "grad_norm": 0.10793200880289078, + "kl": 0.197265625, + "learning_rate": 1.62477041686412e-05, + "loss": 0.0089, + "reward": 0.6060268133878708, + "reward_std": 0.07922783121466637, + "rewards/accuracy_reward": 0.10937500302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1194 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.8995971679688, + "epoch": 0.3569561645881562, + "grad_norm": 0.08887606114149094, + "kl": 0.19189453125, + "learning_rate": 1.6239556740223132e-05, + "loss": 0.0077, + "reward": 0.6802455633878708, + "reward_std": 0.02720375615172088, + "rewards/accuracy_reward": 0.1830357201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1195 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.8928833007812, + "epoch": 0.3572548726756777, + "grad_norm": 0.11130517721176147, + "kl": 0.189208984375, + "learning_rate": 1.6231402523778873e-05, + "loss": 0.0076, + "reward": 0.6049107313156128, + "reward_std": 0.10766016133129597, + "rewards/accuracy_reward": 0.10714286379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1196 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.1317443847656, + "epoch": 0.35755358076319915, + "grad_norm": 0.1100844219326973, + "kl": 0.194091796875, + "learning_rate": 1.6223241528179415e-05, + "loss": 0.0083, + "reward": 0.6093750298023224, + "reward_std": 0.08733610715717077, + "rewards/accuracy_reward": 0.11160714901052415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1197 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.7924499511719, + "epoch": 0.3578522888507206, + "grad_norm": 0.10482799261808395, + "kl": 0.1943359375, + "learning_rate": 1.6215073762303113e-05, + "loss": 0.0076, + "reward": 0.7148437798023224, + "reward_std": 0.10026448778808117, + "rewards/accuracy_reward": 0.2187500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937574505806, + "step": 1198 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.2411346435547, + "epoch": 0.3581509969382421, + "grad_norm": 0.11859654635190964, + "kl": 0.188232421875, + "learning_rate": 1.62068992350357e-05, + "loss": 0.0075, + "reward": 0.588169664144516, + "reward_std": 0.09564401675015688, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1199 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.9888763427734, + "epoch": 0.35844970502576357, + "grad_norm": 0.11078810691833496, + "kl": 0.1904296875, + "learning_rate": 1.6198717955270264e-05, + "loss": 0.0076, + "reward": 0.5926339477300644, + "reward_std": 0.07677482557483017, + "rewards/accuracy_reward": 0.09375000419095159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1200 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.341552734375, + "epoch": 0.35874841311328504, + "grad_norm": 0.1123708039522171, + "kl": 0.195556640625, + "learning_rate": 1.619052993190723e-05, + "loss": 0.0036, + "reward": 0.6344866305589676, + "reward_std": 0.07490782556124032, + "rewards/accuracy_reward": 0.13839286309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1201 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.450927734375, + "epoch": 0.3590471212008065, + "grad_norm": 0.08825306594371796, + "kl": 0.189208984375, + "learning_rate": 1.6182335173854368e-05, + "loss": 0.0076, + "reward": 0.5892857611179352, + "reward_std": 0.04954208713024855, + "rewards/accuracy_reward": 0.09375000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357238650322, + "step": 1202 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.8281555175781, + "epoch": 0.359345829288328, + "grad_norm": 0.1338329315185547, + "kl": 0.18896484375, + "learning_rate": 1.617413369002677e-05, + "loss": 0.0077, + "reward": 0.5396205633878708, + "reward_std": 0.10262008011341095, + "rewards/accuracy_reward": 0.04687500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1203 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.5893249511719, + "epoch": 0.35964453737584945, + "grad_norm": 0.11250364035367966, + "kl": 0.185546875, + "learning_rate": 1.616592548934685e-05, + "loss": 0.0009, + "reward": 0.5150669813156128, + "reward_std": 0.06635376368649304, + "rewards/accuracy_reward": 0.022321430267766118, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 1204 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2656555175781, + "epoch": 0.3599432454633709, + "grad_norm": 0.13241919875144958, + "kl": 0.187255859375, + "learning_rate": 1.6157710580744322e-05, + "loss": 0.0074, + "reward": 0.550781287252903, + "reward_std": 0.08340508863329887, + "rewards/accuracy_reward": 0.06026785937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 1205 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.0245819091797, + "epoch": 0.3602419535508924, + "grad_norm": 0.0898512527346611, + "kl": 0.18408203125, + "learning_rate": 1.61494889731562e-05, + "loss": 0.0083, + "reward": 0.5507812798023224, + "reward_std": 0.05739691015332937, + "rewards/accuracy_reward": 0.05357143213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1206 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.0692138671875, + "epoch": 0.36054066163841386, + "grad_norm": 0.16783414781093597, + "kl": 0.175048828125, + "learning_rate": 1.614126067552679e-05, + "loss": 0.0066, + "reward": 0.5580357313156128, + "reward_std": 0.12347032502293587, + "rewards/accuracy_reward": 0.06696428824216127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1207 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.044677734375, + "epoch": 0.36083936972593533, + "grad_norm": 0.10067892074584961, + "kl": 0.1796875, + "learning_rate": 1.6133025696807674e-05, + "loss": 0.0072, + "reward": 0.572544664144516, + "reward_std": 0.07786344666965306, + "rewards/accuracy_reward": 0.07366071827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1208 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.0379943847656, + "epoch": 0.3611380778134568, + "grad_norm": 0.12032534927129745, + "kl": 0.18408203125, + "learning_rate": 1.6124784045957705e-05, + "loss": 0.0073, + "reward": 0.597098246216774, + "reward_std": 0.10738882049918175, + "rewards/accuracy_reward": 0.10044643143191934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1209 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.1473693847656, + "epoch": 0.3614367859009783, + "grad_norm": 0.10991744697093964, + "kl": 0.17578125, + "learning_rate": 1.6116535731942982e-05, + "loss": 0.0071, + "reward": 0.5691964328289032, + "reward_std": 0.08644913719035685, + "rewards/accuracy_reward": 0.07589286053553224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1210 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2009124755859, + "epoch": 0.36173549398849975, + "grad_norm": 0.12783609330654144, + "kl": 0.181396484375, + "learning_rate": 1.610828076373687e-05, + "loss": 0.007, + "reward": 0.559151828289032, + "reward_std": 0.08217915752902627, + "rewards/accuracy_reward": 0.06696428847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1211 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.7879943847656, + "epoch": 0.3620342020760212, + "grad_norm": 0.12644025683403015, + "kl": 0.181884765625, + "learning_rate": 1.6100019150319966e-05, + "loss": 0.0086, + "reward": 0.6199777126312256, + "reward_std": 0.11988403834402561, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 1212 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.8214721679688, + "epoch": 0.3623329101635427, + "grad_norm": 0.09984870254993439, + "kl": 0.174072265625, + "learning_rate": 1.6091750900680088e-05, + "loss": 0.0075, + "reward": 0.6238839477300644, + "reward_std": 0.0906711108982563, + "rewards/accuracy_reward": 0.1272321529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1213 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.6741485595703, + "epoch": 0.36263161825106416, + "grad_norm": 0.08717255294322968, + "kl": 0.17333984375, + "learning_rate": 1.608347602381229e-05, + "loss": 0.0077, + "reward": 0.620535746216774, + "reward_std": 0.0693532694131136, + "rewards/accuracy_reward": 0.12053572107106447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1214 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.6920013427734, + "epoch": 0.36293032633858563, + "grad_norm": 0.1272563487291336, + "kl": 0.176513671875, + "learning_rate": 1.6075194528718818e-05, + "loss": 0.0083, + "reward": 0.6088170111179352, + "reward_std": 0.16256995126605034, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1215 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.2902221679688, + "epoch": 0.3632290344261071, + "grad_norm": 0.10534430295228958, + "kl": 0.175537109375, + "learning_rate": 1.6066906424409135e-05, + "loss": 0.0082, + "reward": 0.564732164144516, + "reward_std": 0.09485246171243489, + "rewards/accuracy_reward": 0.06696428847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1216 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.2902221679688, + "epoch": 0.36352774251362857, + "grad_norm": 0.10423535853624344, + "kl": 0.177001953125, + "learning_rate": 1.605861171989988e-05, + "loss": 0.0069, + "reward": 0.5474330484867096, + "reward_std": 0.08268622495234013, + "rewards/accuracy_reward": 0.051339288242161274, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937574505806, + "step": 1217 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.2165832519531, + "epoch": 0.36382645060115004, + "grad_norm": 0.1167050302028656, + "kl": 0.17578125, + "learning_rate": 1.6050310424214885e-05, + "loss": 0.0072, + "reward": 0.5859375298023224, + "reward_std": 0.10503839701414108, + "rewards/accuracy_reward": 0.0915178582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 1218 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.0022735595703, + "epoch": 0.3641251586886715, + "grad_norm": 0.10391870886087418, + "kl": 0.17333984375, + "learning_rate": 1.604200254638514e-05, + "loss": 0.007, + "reward": 0.5719866305589676, + "reward_std": 0.06968231708742678, + "rewards/accuracy_reward": 0.07589286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1219 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.2924499511719, + "epoch": 0.364423866776193, + "grad_norm": 0.1250186711549759, + "kl": 0.1728515625, + "learning_rate": 1.6033688095448808e-05, + "loss": 0.0065, + "reward": 0.6110491305589676, + "reward_std": 0.11682063899934292, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1220 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.5446929931641, + "epoch": 0.36472257486371445, + "grad_norm": 0.10288295149803162, + "kl": 0.170166015625, + "learning_rate": 1.602536708045119e-05, + "loss": 0.0071, + "reward": 0.5357143133878708, + "reward_std": 0.06940336292609572, + "rewards/accuracy_reward": 0.03794643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1221 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.1250457763672, + "epoch": 0.3650212829512359, + "grad_norm": 0.11612934619188309, + "kl": 0.169677734375, + "learning_rate": 1.6017039510444737e-05, + "loss": 0.0066, + "reward": 0.5864955633878708, + "reward_std": 0.10962270945310593, + "rewards/accuracy_reward": 0.09151786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1222 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.1406707763672, + "epoch": 0.3653199910387574, + "grad_norm": 0.12649483978748322, + "kl": 0.172607421875, + "learning_rate": 1.6008705394489032e-05, + "loss": 0.0086, + "reward": 0.5585937798023224, + "reward_std": 0.07790527679026127, + "rewards/accuracy_reward": 0.0625000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1223 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.7545013427734, + "epoch": 0.36561869912627887, + "grad_norm": 0.13330622017383575, + "kl": 0.17236328125, + "learning_rate": 1.6000364741650775e-05, + "loss": 0.0062, + "reward": 0.6523437798023224, + "reward_std": 0.14330579061061144, + "rewards/accuracy_reward": 0.1562500111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1224 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.0692291259766, + "epoch": 0.36591740721380034, + "grad_norm": 0.16621536016464233, + "kl": 0.16943359375, + "learning_rate": 1.5992017561003777e-05, + "loss": 0.0032, + "reward": 0.5189732313156128, + "reward_std": 0.04758668621070683, + "rewards/accuracy_reward": 0.0223214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1225 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.6562957763672, + "epoch": 0.3662161153013218, + "grad_norm": 133.70079040527344, + "kl": 3.516845703125, + "learning_rate": 1.598366386162895e-05, + "loss": 0.1421, + "reward": 0.5814732313156128, + "reward_std": 0.08826453750953078, + "rewards/accuracy_reward": 0.08482143213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1226 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.388427734375, + "epoch": 0.3665148233888433, + "grad_norm": 0.23917946219444275, + "kl": 0.180419921875, + "learning_rate": 1.597530365261431e-05, + "loss": 0.0075, + "reward": 0.6037946790456772, + "reward_std": 0.13028187677264214, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1227 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.904052734375, + "epoch": 0.3668135314763647, + "grad_norm": 0.12029887735843658, + "kl": 0.175537109375, + "learning_rate": 1.5966936943054933e-05, + "loss": 0.008, + "reward": 0.6205357313156128, + "reward_std": 0.052558270283043385, + "rewards/accuracy_reward": 0.12500000419095159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1228 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.7745971679688, + "epoch": 0.36711223956388617, + "grad_norm": 0.1369030475616455, + "kl": 0.17431640625, + "learning_rate": 1.5958563742052987e-05, + "loss": 0.0032, + "reward": 0.5613839626312256, + "reward_std": 0.14377871714532375, + "rewards/accuracy_reward": 0.0647321455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1229 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.0781555175781, + "epoch": 0.36741094765140764, + "grad_norm": 0.2221548706293106, + "kl": 0.1904296875, + "learning_rate": 1.5950184058717694e-05, + "loss": 0.0082, + "reward": 0.5864955484867096, + "reward_std": 0.14392489567399025, + "rewards/accuracy_reward": 0.10267857881262898, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169813156128, + "step": 1230 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.685302734375, + "epoch": 0.3677096557389291, + "grad_norm": 0.6016314625740051, + "kl": 0.251220703125, + "learning_rate": 1.5941797902165325e-05, + "loss": 0.0112, + "reward": 0.5580357387661934, + "reward_std": 0.12839603051543236, + "rewards/accuracy_reward": 0.09598214784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4620535969734192, + "step": 1231 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.3527221679688, + "epoch": 0.3680083638264506, + "grad_norm": 1.3638876676559448, + "kl": 0.264404296875, + "learning_rate": 1.5933405281519195e-05, + "loss": 0.0115, + "reward": 0.4838169887661934, + "reward_std": 0.14755673706531525, + "rewards/accuracy_reward": 0.042410716181620955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4414062649011612, + "step": 1232 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.7522735595703, + "epoch": 0.36830707191397205, + "grad_norm": 0.5763188600540161, + "kl": 0.33154296875, + "learning_rate": 1.5925006205909654e-05, + "loss": 0.0184, + "reward": 0.5574776977300644, + "reward_std": 0.20629455894231796, + "rewards/accuracy_reward": 0.15401786682195961, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4034598395228386, + "step": 1233 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.1964721679688, + "epoch": 0.3686057800014935, + "grad_norm": 1.040512204170227, + "kl": 0.393798828125, + "learning_rate": 1.5916600684474076e-05, + "loss": 0.0193, + "reward": 0.3872768059372902, + "reward_std": 0.22172397002577782, + "rewards/accuracy_reward": 0.058035718742758036, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3292410895228386, + "step": 1234 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.6183471679688, + "epoch": 0.368904488089015, + "grad_norm": 3.3814480304718018, + "kl": 0.841796875, + "learning_rate": 1.5908188726356843e-05, + "loss": 0.0361, + "reward": 0.4854910969734192, + "reward_std": 0.17893236503005028, + "rewards/accuracy_reward": 0.08482143399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4006696566939354, + "step": 1235 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.3661193847656, + "epoch": 0.36920319617653646, + "grad_norm": 1.8214540481567383, + "kl": 0.38671875, + "learning_rate": 1.589977034070934e-05, + "loss": 0.0195, + "reward": 0.4207589402794838, + "reward_std": 0.19359582662582397, + "rewards/accuracy_reward": 0.011160715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4095982313156128, + "step": 1236 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.2411193847656, + "epoch": 0.36950190426405793, + "grad_norm": 0.9431951642036438, + "kl": 0.49609375, + "learning_rate": 1.5891345536689943e-05, + "loss": 0.0233, + "reward": 0.5239955559372902, + "reward_std": 0.19382350891828537, + "rewards/accuracy_reward": 0.0982142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4257812649011612, + "step": 1237 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.6853179931641, + "epoch": 0.3698006123515794, + "grad_norm": 0.3105558753013611, + "kl": 0.33447265625, + "learning_rate": 1.5882914323464022e-05, + "loss": 0.0148, + "reward": 0.5736607387661934, + "reward_std": 0.13296221755445004, + "rewards/accuracy_reward": 0.11830357694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.455357164144516, + "step": 1238 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.2277221679688, + "epoch": 0.3700993204391009, + "grad_norm": 1.6925930976867676, + "kl": 0.6552734375, + "learning_rate": 1.5874476710203902e-05, + "loss": 0.0322, + "reward": 0.506696455180645, + "reward_std": 0.1497109942138195, + "rewards/accuracy_reward": 0.09151786169968545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4151785969734192, + "step": 1239 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.7835388183594, + "epoch": 0.37039802852662235, + "grad_norm": 0.8277014493942261, + "kl": 0.705078125, + "learning_rate": 1.586603270608888e-05, + "loss": 0.0416, + "reward": 0.4977678805589676, + "reward_std": 0.1819719709455967, + "rewards/accuracy_reward": 0.13839286426082253, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3593750223517418, + "step": 1240 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.7277221679688, + "epoch": 0.3706967366141438, + "grad_norm": 0.8165575265884399, + "kl": 0.5517578125, + "learning_rate": 1.5857582320305207e-05, + "loss": 0.0304, + "reward": 0.4146205559372902, + "reward_std": 0.1476014107465744, + "rewards/accuracy_reward": 0.0401785746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3744419738650322, + "step": 1241 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.9598846435547, + "epoch": 0.3709954447016653, + "grad_norm": 0.8409857153892517, + "kl": 0.5498046875, + "learning_rate": 1.5849125562046075e-05, + "loss": 0.0398, + "reward": 0.3984375223517418, + "reward_std": 0.183556467294693, + "rewards/accuracy_reward": 0.022321430267766118, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3761160895228386, + "step": 1242 + }, + { + "clip_ratio": 0.0, + "completion_length": 993.6004943847656, + "epoch": 0.37129415278918676, + "grad_norm": 0.4088018536567688, + "kl": 0.6474609375, + "learning_rate": 1.584066244051161e-05, + "loss": 0.0471, + "reward": 0.4196428805589676, + "reward_std": 0.17493632808327675, + "rewards/accuracy_reward": 0.026785715715959668, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.392857164144516, + "step": 1243 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.2835235595703, + "epoch": 0.37159286087670823, + "grad_norm": 0.5801388621330261, + "kl": 0.55126953125, + "learning_rate": 1.583219296490885e-05, + "loss": 0.0485, + "reward": 0.4335937723517418, + "reward_std": 0.1559055708348751, + "rewards/accuracy_reward": 0.020089287078008056, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4135044813156128, + "step": 1244 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.6183471679688, + "epoch": 0.3718915689642297, + "grad_norm": 0.9511351585388184, + "kl": 0.5322265625, + "learning_rate": 1.5823717144451768e-05, + "loss": 0.0477, + "reward": 0.4866071566939354, + "reward_std": 0.1412014663219452, + "rewards/accuracy_reward": 0.04910714412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4375000149011612, + "step": 1245 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.7299652099609, + "epoch": 0.3721902770517512, + "grad_norm": 1.3102378845214844, + "kl": 0.42626953125, + "learning_rate": 1.581523498836121e-05, + "loss": 0.0379, + "reward": 0.6043527126312256, + "reward_std": 0.17731167189776897, + "rewards/accuracy_reward": 0.15178572479635477, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4525669813156128, + "step": 1246 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.6518249511719, + "epoch": 0.37248898513927264, + "grad_norm": 1.516178011894226, + "kl": 0.4091796875, + "learning_rate": 1.5806746505864947e-05, + "loss": 0.0306, + "reward": 0.5312500298023224, + "reward_std": 0.11704436503350735, + "rewards/accuracy_reward": 0.0513392873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4799107387661934, + "step": 1247 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.8549652099609, + "epoch": 0.3727876932267941, + "grad_norm": 0.9798563718795776, + "kl": 0.35693359375, + "learning_rate": 1.5798251706197606e-05, + "loss": 0.0265, + "reward": 0.6049107313156128, + "reward_std": 0.16881178133189678, + "rewards/accuracy_reward": 0.13169643515720963, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4732143059372902, + "step": 1248 + }, + { + "clip_ratio": 0.0, + "completion_length": 973.1116485595703, + "epoch": 0.3730864013143156, + "grad_norm": 0.23038455843925476, + "kl": 0.328125, + "learning_rate": 1.5789750598600693e-05, + "loss": 0.028, + "reward": 0.5954241454601288, + "reward_std": 0.18439874611794949, + "rewards/accuracy_reward": 0.13616072107106447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4592634066939354, + "step": 1249 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.4442443847656, + "epoch": 0.37338510940183706, + "grad_norm": 0.226936474442482, + "kl": 0.31005859375, + "learning_rate": 1.578124319232259e-05, + "loss": 0.0313, + "reward": 0.490513414144516, + "reward_std": 0.14030161499977112, + "rewards/accuracy_reward": 0.04910714388824999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4414062723517418, + "step": 1250 + }, + { + "clip_ratio": 0.0, + "completion_length": 994.4620971679688, + "epoch": 0.3736838174893585, + "grad_norm": 0.27967581152915955, + "kl": 0.31640625, + "learning_rate": 1.577272949661852e-05, + "loss": 0.031, + "reward": 0.4302455484867096, + "reward_std": 0.13709397055208683, + "rewards/accuracy_reward": 0.013392857508733869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4168526902794838, + "step": 1251 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.0536041259766, + "epoch": 0.37398252557688, + "grad_norm": 0.22123539447784424, + "kl": 0.2900390625, + "learning_rate": 1.576420952075054e-05, + "loss": 0.0271, + "reward": 0.4921875149011612, + "reward_std": 0.1571044810116291, + "rewards/accuracy_reward": 0.06473214644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4274553805589676, + "step": 1252 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.1540679931641, + "epoch": 0.37428123366440147, + "grad_norm": 0.36574164032936096, + "kl": 0.30615234375, + "learning_rate": 1.5755683273987554e-05, + "loss": 0.0305, + "reward": 0.494977705180645, + "reward_std": 0.1377113237977028, + "rewards/accuracy_reward": 0.0535714291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4414062723517418, + "step": 1253 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.2902221679688, + "epoch": 0.37457994175192294, + "grad_norm": 0.20795786380767822, + "kl": 0.3203125, + "learning_rate": 1.5747150765605285e-05, + "loss": 0.0318, + "reward": 0.5463169887661934, + "reward_std": 0.14231491647660732, + "rewards/accuracy_reward": 0.07812500349245965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4681919887661934, + "step": 1254 + }, + { + "clip_ratio": 0.0, + "completion_length": 970.2120971679688, + "epoch": 0.3748786498394444, + "grad_norm": 0.19885434210300446, + "kl": 0.3671875, + "learning_rate": 1.5738612004886267e-05, + "loss": 0.028, + "reward": 0.490513414144516, + "reward_std": 0.07850657543167472, + "rewards/accuracy_reward": 0.011160714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.479352705180645, + "step": 1255 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.7768402099609, + "epoch": 0.3751773579269659, + "grad_norm": 0.39307403564453125, + "kl": 0.38037109375, + "learning_rate": 1.5730067001119832e-05, + "loss": 0.0142, + "reward": 0.6372768133878708, + "reward_std": 0.11432158201932907, + "rewards/accuracy_reward": 0.1450892873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875074505806, + "step": 1256 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.1004791259766, + "epoch": 0.37547606601448735, + "grad_norm": 0.2621963918209076, + "kl": 0.3310546875, + "learning_rate": 1.5721515763602106e-05, + "loss": 0.0138, + "reward": 0.5306919664144516, + "reward_std": 0.05831352435052395, + "rewards/accuracy_reward": 0.03125000046566129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 1257 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.0022888183594, + "epoch": 0.3757747741020088, + "grad_norm": 0.18970930576324463, + "kl": 0.337890625, + "learning_rate": 1.5712958301635993e-05, + "loss": 0.0145, + "reward": 0.6160714626312256, + "reward_std": 0.05205097235739231, + "rewards/accuracy_reward": 0.1160714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1258 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.7031707763672, + "epoch": 0.3760734821895303, + "grad_norm": 0.13822536170482635, + "kl": 0.31298828125, + "learning_rate": 1.5704394624531184e-05, + "loss": 0.0108, + "reward": 0.5714285969734192, + "reward_std": 0.030055894516408443, + "rewards/accuracy_reward": 0.07142857578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1259 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.8861999511719, + "epoch": 0.37637219027705177, + "grad_norm": 0.12192536145448685, + "kl": 0.29052734375, + "learning_rate": 1.5695824741604114e-05, + "loss": 0.0146, + "reward": 0.6049107313156128, + "reward_std": 0.058279518969357014, + "rewards/accuracy_reward": 0.10491071827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1260 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.1071624755859, + "epoch": 0.37667089836457324, + "grad_norm": 0.11045685410499573, + "kl": 0.26513671875, + "learning_rate": 1.568724866217797e-05, + "loss": 0.0092, + "reward": 0.633928582072258, + "reward_std": 0.0965799717232585, + "rewards/accuracy_reward": 0.13392857578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1261 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.7344360351562, + "epoch": 0.3769696064520947, + "grad_norm": 0.09166643023490906, + "kl": 0.25634765625, + "learning_rate": 1.56786663955827e-05, + "loss": 0.0141, + "reward": 0.6205357313156128, + "reward_std": 0.04225464630872011, + "rewards/accuracy_reward": 0.12053571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1262 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.9911193847656, + "epoch": 0.3772683145396162, + "grad_norm": 0.14183078706264496, + "kl": 0.261474609375, + "learning_rate": 1.5670077951154955e-05, + "loss": 0.0109, + "reward": 0.6004464477300644, + "reward_std": 0.07324656285345554, + "rewards/accuracy_reward": 0.10044643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1263 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.1540679931641, + "epoch": 0.37756702262713765, + "grad_norm": 0.09185778349637985, + "kl": 0.24658203125, + "learning_rate": 1.5661483338238127e-05, + "loss": 0.014, + "reward": 0.5781250298023224, + "reward_std": 0.06120040826499462, + "rewards/accuracy_reward": 0.07812500465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1264 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.6317443847656, + "epoch": 0.3778657307146591, + "grad_norm": 0.19738371670246124, + "kl": 0.245849609375, + "learning_rate": 1.5652882566182316e-05, + "loss": 0.0089, + "reward": 0.5669642984867096, + "reward_std": 0.03382905758917332, + "rewards/accuracy_reward": 0.06696428847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1265 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.7545013427734, + "epoch": 0.3781644388021806, + "grad_norm": 0.09315222501754761, + "kl": 0.235107421875, + "learning_rate": 1.5644275644344313e-05, + "loss": 0.0098, + "reward": 0.5379464477300644, + "reward_std": 0.07613666076213121, + "rewards/accuracy_reward": 0.03794643119908869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1266 + }, + { + "clip_ratio": 0.0, + "completion_length": 994.1719207763672, + "epoch": 0.37846314688970206, + "grad_norm": 0.07826796919107437, + "kl": 0.22900390625, + "learning_rate": 1.5635662582087604e-05, + "loss": 0.0103, + "reward": 0.5954241305589676, + "reward_std": 0.03838741313666105, + "rewards/accuracy_reward": 0.09598214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 1267 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.5357513427734, + "epoch": 0.37876185497722353, + "grad_norm": 0.06721828132867813, + "kl": 0.21728515625, + "learning_rate": 1.5627043388782365e-05, + "loss": 0.0096, + "reward": 0.6540178805589676, + "reward_std": 0.0433432636782527, + "rewards/accuracy_reward": 0.1540178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1268 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.2478179931641, + "epoch": 0.379060563064745, + "grad_norm": 0.07077130675315857, + "kl": 0.22021484375, + "learning_rate": 1.5618418073805425e-05, + "loss": 0.0076, + "reward": 0.5535714477300644, + "reward_std": 0.0382242389023304, + "rewards/accuracy_reward": 0.05357143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1269 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.7388916015625, + "epoch": 0.3793592711522665, + "grad_norm": 0.07915714383125305, + "kl": 0.211181640625, + "learning_rate": 1.560978664654029e-05, + "loss": 0.0097, + "reward": 0.6183035969734192, + "reward_std": 0.057466330006718636, + "rewards/accuracy_reward": 0.11830357694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1270 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.8862152099609, + "epoch": 0.3796579792397879, + "grad_norm": 0.08262507617473602, + "kl": 0.21826171875, + "learning_rate": 1.5601149116377095e-05, + "loss": 0.0083, + "reward": 0.589285746216774, + "reward_std": 0.05338135547935963, + "rewards/accuracy_reward": 0.08928571734577417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1271 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.3638916015625, + "epoch": 0.37995668732730936, + "grad_norm": 0.09530739486217499, + "kl": 0.21337890625, + "learning_rate": 1.5592505492712635e-05, + "loss": 0.0112, + "reward": 0.5625000298023224, + "reward_std": 0.06828013062477112, + "rewards/accuracy_reward": 0.06250000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1272 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.5937957763672, + "epoch": 0.38025539541483083, + "grad_norm": 0.08963415026664734, + "kl": 0.217041015625, + "learning_rate": 1.5583855784950323e-05, + "loss": 0.0092, + "reward": 0.660714328289032, + "reward_std": 0.07012897916138172, + "rewards/accuracy_reward": 0.16071428963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1273 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.6473541259766, + "epoch": 0.3805541035023523, + "grad_norm": 0.10387121140956879, + "kl": 0.21435546875, + "learning_rate": 1.5575200002500197e-05, + "loss": 0.0052, + "reward": 0.613839328289032, + "reward_std": 0.11078678071498871, + "rewards/accuracy_reward": 0.11383929289877415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1274 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.6027221679688, + "epoch": 0.3808528115898738, + "grad_norm": 0.05387106165289879, + "kl": 0.212158203125, + "learning_rate": 1.5566538154778894e-05, + "loss": 0.0078, + "reward": 0.5781250298023224, + "reward_std": 0.02112732268869877, + "rewards/accuracy_reward": 0.07812500488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1275 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.9933319091797, + "epoch": 0.38115151967739525, + "grad_norm": 0.08558900654315948, + "kl": 0.212158203125, + "learning_rate": 1.555787025120966e-05, + "loss": 0.0084, + "reward": 0.5915178805589676, + "reward_std": 0.0777943404391408, + "rewards/accuracy_reward": 0.09151785937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1276 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.9063110351562, + "epoch": 0.3814502277649167, + "grad_norm": 0.06815192848443985, + "kl": 0.20361328125, + "learning_rate": 1.554919630122232e-05, + "loss": 0.008, + "reward": 0.636160746216774, + "reward_std": 0.0433432636782527, + "rewards/accuracy_reward": 0.13616071734577417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1277 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.7143249511719, + "epoch": 0.3817489358524382, + "grad_norm": 0.1292756348848343, + "kl": 0.20361328125, + "learning_rate": 1.5540516314253284e-05, + "loss": 0.0064, + "reward": 0.6512277126312256, + "reward_std": 0.07841231860220432, + "rewards/accuracy_reward": 0.15178571874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 1278 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.6027221679688, + "epoch": 0.38204764393995966, + "grad_norm": 0.058186765760183334, + "kl": 0.202392578125, + "learning_rate": 1.553183029974553e-05, + "loss": 0.0081, + "reward": 0.5731026828289032, + "reward_std": 0.011160714784637094, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 1279 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.3661193847656, + "epoch": 0.38234635202748113, + "grad_norm": 0.0920458510518074, + "kl": 0.204345703125, + "learning_rate": 1.5523138267148582e-05, + "loss": 0.0054, + "reward": 0.5552455633878708, + "reward_std": 0.10352020245045424, + "rewards/accuracy_reward": 0.05580357392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 1280 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.919677734375, + "epoch": 0.3826450601150026, + "grad_norm": 0.11200057715177536, + "kl": 0.214111328125, + "learning_rate": 1.551444022591853e-05, + "loss": 0.0064, + "reward": 0.5959821790456772, + "reward_std": 0.12915214989334345, + "rewards/accuracy_reward": 0.09598214854486287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1281 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.1830749511719, + "epoch": 0.38294376820252407, + "grad_norm": 0.09585738182067871, + "kl": 0.222412109375, + "learning_rate": 1.5505736185517984e-05, + "loss": 0.0092, + "reward": 0.5825892984867096, + "reward_std": 0.08722589444369078, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1282 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.2299346923828, + "epoch": 0.38324247629004554, + "grad_norm": 0.0897521898150444, + "kl": 0.216796875, + "learning_rate": 1.5497026155416087e-05, + "loss": 0.0093, + "reward": 0.5904017984867096, + "reward_std": 0.08105097827501595, + "rewards/accuracy_reward": 0.09151786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1283 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.0491333007812, + "epoch": 0.383541184377567, + "grad_norm": 0.08448509871959686, + "kl": 0.20556640625, + "learning_rate": 1.5488310145088503e-05, + "loss": 0.0043, + "reward": 0.632254496216774, + "reward_std": 0.052516291849315166, + "rewards/accuracy_reward": 0.13392857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1284 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.7411041259766, + "epoch": 0.3838398924650885, + "grad_norm": 0.08931022137403488, + "kl": 0.203125, + "learning_rate": 1.547958816401739e-05, + "loss": 0.0076, + "reward": 0.633370578289032, + "reward_std": 0.07214025780558586, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 1285 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.3906707763672, + "epoch": 0.38413860055260995, + "grad_norm": 0.09367892891168594, + "kl": 0.217529296875, + "learning_rate": 1.5470860221691414e-05, + "loss": 0.0082, + "reward": 0.5446428805589676, + "reward_std": 0.08142284955829382, + "rewards/accuracy_reward": 0.04464285867288709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1286 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.060302734375, + "epoch": 0.3844373086401314, + "grad_norm": 0.10108836740255356, + "kl": 0.2177734375, + "learning_rate": 1.5462126327605717e-05, + "loss": 0.0088, + "reward": 0.5535714477300644, + "reward_std": 0.05211924063041806, + "rewards/accuracy_reward": 0.0558035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1287 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.3504943847656, + "epoch": 0.3847360167276529, + "grad_norm": 0.10129757970571518, + "kl": 0.216552734375, + "learning_rate": 1.5453386491261923e-05, + "loss": 0.0083, + "reward": 0.6294643133878708, + "reward_std": 0.06753916689194739, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1288 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.419677734375, + "epoch": 0.38503472481517437, + "grad_norm": 0.1242857277393341, + "kl": 0.2236328125, + "learning_rate": 1.5444640722168114e-05, + "loss": 0.0091, + "reward": 0.6512277126312256, + "reward_std": 0.13455131649971008, + "rewards/accuracy_reward": 0.15401786658912897, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1289 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.6964721679688, + "epoch": 0.38533343290269584, + "grad_norm": 0.09078065305948257, + "kl": 0.234375, + "learning_rate": 1.5435889029838832e-05, + "loss": 0.0094, + "reward": 0.5195312649011612, + "reward_std": 0.05597654543817043, + "rewards/accuracy_reward": 0.022321430034935474, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1290 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.5424499511719, + "epoch": 0.3856321409902173, + "grad_norm": 0.09695456922054291, + "kl": 0.21923828125, + "learning_rate": 1.542713142379506e-05, + "loss": 0.0086, + "reward": 0.5284598469734192, + "reward_std": 0.07076409365981817, + "rewards/accuracy_reward": 0.031250001629814506, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1291 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.5089721679688, + "epoch": 0.3859308490777388, + "grad_norm": 0.1378481686115265, + "kl": 0.23046875, + "learning_rate": 1.541836791356422e-05, + "loss": 0.0086, + "reward": 0.679129496216774, + "reward_std": 0.1223839558660984, + "rewards/accuracy_reward": 0.1852678693830967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1292 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.1741485595703, + "epoch": 0.38622955716526025, + "grad_norm": 0.10361068695783615, + "kl": 0.223388671875, + "learning_rate": 1.5409598508680138e-05, + "loss": 0.0053, + "reward": 0.5518973469734192, + "reward_std": 0.06725877826102078, + "rewards/accuracy_reward": 0.05580357415601611, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1293 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.1651916503906, + "epoch": 0.3865282652527817, + "grad_norm": 0.1304468810558319, + "kl": 0.22314453125, + "learning_rate": 1.5400823218683083e-05, + "loss": 0.01, + "reward": 0.5915178880095482, + "reward_std": 0.10742640029639006, + "rewards/accuracy_reward": 0.10044643399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1294 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.0826110839844, + "epoch": 0.3868269733403032, + "grad_norm": 0.11888450384140015, + "kl": 0.2236328125, + "learning_rate": 1.53920420531197e-05, + "loss": 0.0107, + "reward": 0.6724330633878708, + "reward_std": 0.08788568526506424, + "rewards/accuracy_reward": 0.1785714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1295 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.3928833007812, + "epoch": 0.38712568142782466, + "grad_norm": 0.12764044106006622, + "kl": 0.221435546875, + "learning_rate": 1.5383255021543042e-05, + "loss": 0.009, + "reward": 0.5909598469734192, + "reward_std": 0.10280568525195122, + "rewards/accuracy_reward": 0.09598214831203222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1296 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2053680419922, + "epoch": 0.38742438951534613, + "grad_norm": 0.12699563801288605, + "kl": 0.211669921875, + "learning_rate": 1.5374462133512534e-05, + "loss": 0.0088, + "reward": 0.569754496216774, + "reward_std": 0.10877786763012409, + "rewards/accuracy_reward": 0.0758928582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1297 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.341552734375, + "epoch": 0.3877230976028676, + "grad_norm": 0.1415906399488449, + "kl": 0.221923828125, + "learning_rate": 1.5365663398593982e-05, + "loss": 0.0083, + "reward": 0.6277901977300644, + "reward_std": 0.1661590039730072, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1298 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.0022583007812, + "epoch": 0.3880218056903891, + "grad_norm": 0.11033512651920319, + "kl": 0.20849609375, + "learning_rate": 1.5356858826359543e-05, + "loss": 0.0071, + "reward": 0.5669643133878708, + "reward_std": 0.08330807834863663, + "rewards/accuracy_reward": 0.06919643399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1299 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.0870971679688, + "epoch": 0.38832051377791055, + "grad_norm": 0.12185993045568466, + "kl": 0.216064453125, + "learning_rate": 1.534804842638773e-05, + "loss": 0.0083, + "reward": 0.5396205633878708, + "reward_std": 0.09452391974627972, + "rewards/accuracy_reward": 0.044642860535532236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1300 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.3013763427734, + "epoch": 0.388619221865432, + "grad_norm": 0.11552886664867401, + "kl": 0.20068359375, + "learning_rate": 1.5339232208263394e-05, + "loss": 0.0077, + "reward": 0.6199776977300644, + "reward_std": 0.09410164225846529, + "rewards/accuracy_reward": 0.12276786123402417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1301 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.9107513427734, + "epoch": 0.3889179299529535, + "grad_norm": 0.09635385125875473, + "kl": 0.20263671875, + "learning_rate": 1.533041018157771e-05, + "loss": 0.0081, + "reward": 0.5887277126312256, + "reward_std": 0.06234393268823624, + "rewards/accuracy_reward": 0.0892857201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 1302 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.1250305175781, + "epoch": 0.38921663804047496, + "grad_norm": 0.09703496098518372, + "kl": 0.197021484375, + "learning_rate": 1.532158235592819e-05, + "loss": 0.0086, + "reward": 0.5457589477300644, + "reward_std": 0.08538415213115513, + "rewards/accuracy_reward": 0.046875002793967724, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1303 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.7433624267578, + "epoch": 0.38951534612799643, + "grad_norm": 0.10595198720693588, + "kl": 0.2041015625, + "learning_rate": 1.5312748740918643e-05, + "loss": 0.0074, + "reward": 0.5647321492433548, + "reward_std": 0.07475176197476685, + "rewards/accuracy_reward": 0.07142857508733869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1304 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8795013427734, + "epoch": 0.3898140542155179, + "grad_norm": 0.11011716723442078, + "kl": 0.202880859375, + "learning_rate": 1.5303909346159166e-05, + "loss": 0.0062, + "reward": 0.5574776977300644, + "reward_std": 0.1059852831531316, + "rewards/accuracy_reward": 0.060267860535532236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1305 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.3772583007812, + "epoch": 0.3901127623030394, + "grad_norm": 0.10204757004976273, + "kl": 0.1943359375, + "learning_rate": 1.529506418126616e-05, + "loss": 0.0078, + "reward": 0.6199776977300644, + "reward_std": 0.09857119480147958, + "rewards/accuracy_reward": 0.12276786379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1306 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.2545013427734, + "epoch": 0.39041147039056084, + "grad_norm": 0.12306667119264603, + "kl": 0.195068359375, + "learning_rate": 1.5286213255862295e-05, + "loss": 0.0079, + "reward": 0.6618303805589676, + "reward_std": 0.07451973995193839, + "rewards/accuracy_reward": 0.1674107238650322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1307 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.4844207763672, + "epoch": 0.3907101784780823, + "grad_norm": 0.11319736391305923, + "kl": 0.194580078125, + "learning_rate": 1.527735657957651e-05, + "loss": 0.0079, + "reward": 0.5719866454601288, + "reward_std": 0.11120566353201866, + "rewards/accuracy_reward": 0.07366071734577417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1308 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.2768249511719, + "epoch": 0.3910088865656038, + "grad_norm": 0.13384689390659332, + "kl": 0.193115234375, + "learning_rate": 1.5268494162044008e-05, + "loss": 0.0074, + "reward": 0.5591518133878708, + "reward_std": 0.07224253099411726, + "rewards/accuracy_reward": 0.06026786030270159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1309 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.0223693847656, + "epoch": 0.39130759465312526, + "grad_norm": 0.12041939049959183, + "kl": 0.18798828125, + "learning_rate": 1.5259626012906227e-05, + "loss": 0.0015, + "reward": 0.5641741454601288, + "reward_std": 0.09667650796473026, + "rewards/accuracy_reward": 0.06919643119908869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1310 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8348693847656, + "epoch": 0.39160630274064673, + "grad_norm": 0.11658087372779846, + "kl": 0.18359375, + "learning_rate": 1.5250752141810839e-05, + "loss": 0.0046, + "reward": 0.5636160969734192, + "reward_std": 0.08544615190476179, + "rewards/accuracy_reward": 0.066964291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1311 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.2545166015625, + "epoch": 0.3919050108281682, + "grad_norm": 0.0969657227396965, + "kl": 0.18505859375, + "learning_rate": 1.524187255841175e-05, + "loss": 0.0072, + "reward": 0.5625000149011612, + "reward_std": 0.06531209778040648, + "rewards/accuracy_reward": 0.06473214388825, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1312 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.1473693847656, + "epoch": 0.39220371891568967, + "grad_norm": 0.09227337688207626, + "kl": 0.177978515625, + "learning_rate": 1.5232987272369076e-05, + "loss": 0.0078, + "reward": 0.6891741454601288, + "reward_std": 0.09371816273778677, + "rewards/accuracy_reward": 0.18973214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 1313 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.8549499511719, + "epoch": 0.3925024270032111, + "grad_norm": 0.08992331475019455, + "kl": 0.1826171875, + "learning_rate": 1.5224096293349137e-05, + "loss": 0.0069, + "reward": 0.5781250298023224, + "reward_std": 0.07268907688558102, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1314 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.3125457763672, + "epoch": 0.39280113509073256, + "grad_norm": 0.10117138177156448, + "kl": 0.1767578125, + "learning_rate": 1.5215199631024452e-05, + "loss": 0.006, + "reward": 0.6395089626312256, + "reward_std": 0.07561204861849546, + "rewards/accuracy_reward": 0.14285715413279831, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1315 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.6339721679688, + "epoch": 0.393099843178254, + "grad_norm": 0.10889749228954315, + "kl": 0.190185546875, + "learning_rate": 1.5206297295073706e-05, + "loss": 0.0067, + "reward": 0.590959832072258, + "reward_std": 0.07984000630676746, + "rewards/accuracy_reward": 0.09375000325962901, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1316 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.0000457763672, + "epoch": 0.3933985512657755, + "grad_norm": 0.09976642578840256, + "kl": 0.1845703125, + "learning_rate": 1.519738929518178e-05, + "loss": 0.0091, + "reward": 0.5652901977300644, + "reward_std": 0.09709859918802977, + "rewards/accuracy_reward": 0.06696428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1317 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.0535888671875, + "epoch": 0.39369725935329697, + "grad_norm": 0.10673259198665619, + "kl": 0.187255859375, + "learning_rate": 1.51884756410397e-05, + "loss": 0.0085, + "reward": 0.5965402126312256, + "reward_std": 0.11411860585212708, + "rewards/accuracy_reward": 0.09821428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1318 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.8303985595703, + "epoch": 0.39399596744081844, + "grad_norm": 0.07801935076713562, + "kl": 0.177001953125, + "learning_rate": 1.5179556342344643e-05, + "loss": 0.0069, + "reward": 0.6188616305589676, + "reward_std": 0.024553573224693537, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1319 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.6428985595703, + "epoch": 0.3942946755283399, + "grad_norm": 0.08723336458206177, + "kl": 0.17578125, + "learning_rate": 1.5170631408799938e-05, + "loss": 0.0083, + "reward": 0.599888414144516, + "reward_std": 0.07378355413675308, + "rewards/accuracy_reward": 0.10044643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 1320 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.0826416015625, + "epoch": 0.3945933836158614, + "grad_norm": 0.10325779020786285, + "kl": 0.178466796875, + "learning_rate": 1.516170085011504e-05, + "loss": 0.0096, + "reward": 0.5429687798023224, + "reward_std": 0.09863005671650171, + "rewards/accuracy_reward": 0.044642860535532236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1321 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.9888763427734, + "epoch": 0.39489209170338285, + "grad_norm": 0.11188855022192001, + "kl": 0.174560546875, + "learning_rate": 1.5152764676005518e-05, + "loss": 0.006, + "reward": 0.53125, + "reward_std": 0.08532518334686756, + "rewards/accuracy_reward": 0.03348214365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1322 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.6696929931641, + "epoch": 0.3951907997909043, + "grad_norm": 0.0822596475481987, + "kl": 0.1787109375, + "learning_rate": 1.514382289619305e-05, + "loss": 0.0076, + "reward": 0.5641741305589676, + "reward_std": 0.06032158527523279, + "rewards/accuracy_reward": 0.06473214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 1323 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.6696929931641, + "epoch": 0.3954895078784258, + "grad_norm": 0.10315360128879547, + "kl": 0.178466796875, + "learning_rate": 1.5134875520405423e-05, + "loss": 0.0075, + "reward": 0.615513414144516, + "reward_std": 0.04833107930608094, + "rewards/accuracy_reward": 0.11830357811413705, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1324 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.4442443847656, + "epoch": 0.39578821596594727, + "grad_norm": 0.08988931030035019, + "kl": 0.17529296875, + "learning_rate": 1.51259225583765e-05, + "loss": 0.0075, + "reward": 0.6372767984867096, + "reward_std": 0.06255572568625212, + "rewards/accuracy_reward": 0.13839286309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1325 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.2344207763672, + "epoch": 0.39608692405346874, + "grad_norm": 0.10925421118736267, + "kl": 0.1748046875, + "learning_rate": 1.511696401984623e-05, + "loss": 0.008, + "reward": 0.6651785969734192, + "reward_std": 0.09509873390197754, + "rewards/accuracy_reward": 0.16741072107106447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1326 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.7701263427734, + "epoch": 0.3963856321409902, + "grad_norm": 0.120707206428051, + "kl": 0.178466796875, + "learning_rate": 1.5107999914560618e-05, + "loss": 0.008, + "reward": 0.620535746216774, + "reward_std": 0.1159540768712759, + "rewards/accuracy_reward": 0.12500000675208867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1327 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.3951263427734, + "epoch": 0.3966843402285117, + "grad_norm": 0.12171304225921631, + "kl": 0.180908203125, + "learning_rate": 1.5099030252271742e-05, + "loss": 0.0075, + "reward": 0.627232164144516, + "reward_std": 0.06435074983164668, + "rewards/accuracy_reward": 0.13169643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1328 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.5491638183594, + "epoch": 0.39698304831603315, + "grad_norm": 0.10757122188806534, + "kl": 0.180419921875, + "learning_rate": 1.509005504273771e-05, + "loss": 0.0072, + "reward": 0.6411830633878708, + "reward_std": 0.07592052686959505, + "rewards/accuracy_reward": 0.14285715110599995, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1329 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8750305175781, + "epoch": 0.3972817564035546, + "grad_norm": 0.10543598234653473, + "kl": 0.180908203125, + "learning_rate": 1.5081074295722666e-05, + "loss": 0.0069, + "reward": 0.5496651977300644, + "reward_std": 0.08152991533279419, + "rewards/accuracy_reward": 0.05357143026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1330 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.1004791259766, + "epoch": 0.3975804644910761, + "grad_norm": 0.11983317881822586, + "kl": 0.182373046875, + "learning_rate": 1.5072088020996791e-05, + "loss": 0.0062, + "reward": 0.6523437798023224, + "reward_std": 0.047172361984848976, + "rewards/accuracy_reward": 0.15625000838190317, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1331 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.6384429931641, + "epoch": 0.39787917257859756, + "grad_norm": 0.12050566077232361, + "kl": 0.185546875, + "learning_rate": 1.5063096228336265e-05, + "loss": 0.0079, + "reward": 0.6640625447034836, + "reward_std": 0.10859970469027758, + "rewards/accuracy_reward": 0.16517857927829027, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1332 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.5134429931641, + "epoch": 0.39817788066611903, + "grad_norm": 0.09089052677154541, + "kl": 0.1826171875, + "learning_rate": 1.5054098927523281e-05, + "loss": 0.0073, + "reward": 0.5602678954601288, + "reward_std": 0.0750907314941287, + "rewards/accuracy_reward": 0.06250000209547579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1333 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.2388610839844, + "epoch": 0.3984765887536405, + "grad_norm": 0.12960848212242126, + "kl": 0.18017578125, + "learning_rate": 1.5045096128346017e-05, + "loss": 0.0088, + "reward": 0.6595982611179352, + "reward_std": 0.12707727774977684, + "rewards/accuracy_reward": 0.1607142877765, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1334 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.8928985595703, + "epoch": 0.398775296841162, + "grad_norm": 0.09380675107240677, + "kl": 0.183837890625, + "learning_rate": 1.503608784059864e-05, + "loss": 0.0114, + "reward": 0.6707589477300644, + "reward_std": 0.09468763624317944, + "rewards/accuracy_reward": 0.1718750074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1335 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.9486999511719, + "epoch": 0.39907400492868345, + "grad_norm": 0.14270751178264618, + "kl": 0.18896484375, + "learning_rate": 1.5027074074081282e-05, + "loss": 0.0078, + "reward": 0.6529018133878708, + "reward_std": 0.11145550874061882, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1336 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.1004791259766, + "epoch": 0.3993727130162049, + "grad_norm": 0.1149626299738884, + "kl": 0.189208984375, + "learning_rate": 1.5018054838600033e-05, + "loss": 0.008, + "reward": 0.647879496216774, + "reward_std": 0.08767829462885857, + "rewards/accuracy_reward": 0.14955358300358057, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1337 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.6317291259766, + "epoch": 0.3996714211037264, + "grad_norm": 0.3240753412246704, + "kl": 0.183837890625, + "learning_rate": 1.5009030143966948e-05, + "loss": 0.0085, + "reward": 0.6121652126312256, + "reward_std": 0.0931694507598877, + "rewards/accuracy_reward": 0.11607143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1338 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.0781707763672, + "epoch": 0.39997012919124786, + "grad_norm": 0.5875338912010193, + "kl": 0.205078125, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.0099, + "reward": 0.6328125298023224, + "reward_std": 0.07023851666599512, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1339 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.7455749511719, + "epoch": 0.40026883727876933, + "grad_norm": 5.144883632659912, + "kl": 0.9326171875, + "learning_rate": 1.4990964416523108e-05, + "loss": 0.0335, + "reward": 0.6160714626312256, + "reward_std": 0.10041971877217293, + "rewards/accuracy_reward": 0.13616072200238705, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4799107387661934, + "step": 1340 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.5000305175781, + "epoch": 0.4005675453662908, + "grad_norm": 0.2919211685657501, + "kl": 0.276123046875, + "learning_rate": 1.4981923403366096e-05, + "loss": 0.0107, + "reward": 0.5876116454601288, + "reward_std": 0.16337387822568417, + "rewards/accuracy_reward": 0.10714286379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687723517418, + "step": 1341 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.3795013427734, + "epoch": 0.40086625345381227, + "grad_norm": 0.20863960683345795, + "kl": 0.20068359375, + "learning_rate": 1.4972876970364703e-05, + "loss": 0.0076, + "reward": 0.6155134290456772, + "reward_std": 0.10057238303124905, + "rewards/accuracy_reward": 0.12500000838190317, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 1342 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.3482513427734, + "epoch": 0.40116496154133374, + "grad_norm": 0.3060239851474762, + "kl": 0.20068359375, + "learning_rate": 1.496382512736056e-05, + "loss": 0.0084, + "reward": 0.5005580559372902, + "reward_std": 0.05323330243118107, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 1343 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.7589721679688, + "epoch": 0.4014636696288552, + "grad_norm": 0.3026253283023834, + "kl": 0.21044921875, + "learning_rate": 1.4954767884201186e-05, + "loss": 0.0087, + "reward": 0.5686384364962578, + "reward_std": 0.1138030644506216, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812649011612, + "step": 1344 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.2611999511719, + "epoch": 0.4017623777163767, + "grad_norm": 0.2261921465396881, + "kl": 0.2138671875, + "learning_rate": 1.4945705250739972e-05, + "loss": 0.0061, + "reward": 0.5457589626312256, + "reward_std": 0.11150529328733683, + "rewards/accuracy_reward": 0.060267859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 1345 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.9419860839844, + "epoch": 0.40206108580389815, + "grad_norm": 0.2108086794614792, + "kl": 0.222900390625, + "learning_rate": 1.4936637236836178e-05, + "loss": 0.009, + "reward": 0.5390625298023224, + "reward_std": 0.08888729196041822, + "rewards/accuracy_reward": 0.060267860535532236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.478794664144516, + "step": 1346 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.7611846923828, + "epoch": 0.4023597938914196, + "grad_norm": 0.2820032835006714, + "kl": 0.2529296875, + "learning_rate": 1.492756385235491e-05, + "loss": 0.0087, + "reward": 0.5273437723517418, + "reward_std": 0.12462512589991093, + "rewards/accuracy_reward": 0.05580357322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4715401977300644, + "step": 1347 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.5647735595703, + "epoch": 0.4026585019789411, + "grad_norm": 0.3115713894367218, + "kl": 0.31982421875, + "learning_rate": 1.4918485107167127e-05, + "loss": 0.0107, + "reward": 0.5753348618745804, + "reward_std": 0.13191594276577234, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4748884066939354, + "step": 1348 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.1228179931641, + "epoch": 0.40295721006646257, + "grad_norm": 30.17136001586914, + "kl": 2.71728515625, + "learning_rate": 1.490940101114961e-05, + "loss": 0.108, + "reward": 0.6210937798023224, + "reward_std": 0.1363169513642788, + "rewards/accuracy_reward": 0.14955357206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.471540205180645, + "step": 1349 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2544708251953, + "epoch": 0.40325591815398404, + "grad_norm": 0.3776077628135681, + "kl": 0.28662109375, + "learning_rate": 1.4900311574184967e-05, + "loss": 0.0105, + "reward": 0.6183036044239998, + "reward_std": 0.10956014692783356, + "rewards/accuracy_reward": 0.14062500861473382, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4776786044239998, + "step": 1350 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.8325958251953, + "epoch": 0.4035546262415055, + "grad_norm": 0.40127676725387573, + "kl": 0.250244140625, + "learning_rate": 1.4891216806161613e-05, + "loss": 0.01, + "reward": 0.5580357238650322, + "reward_std": 0.16467429138720036, + "rewards/accuracy_reward": 0.07366071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750149011612, + "step": 1351 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.3638763427734, + "epoch": 0.403853334329027, + "grad_norm": 0.6858088374137878, + "kl": 0.28125, + "learning_rate": 1.488211671697376e-05, + "loss": 0.0124, + "reward": 0.4949776977300644, + "reward_std": 0.06840462330728769, + "rewards/accuracy_reward": 0.0133928582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848469734192, + "step": 1352 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.5982513427734, + "epoch": 0.40415204241654845, + "grad_norm": 0.36850839853286743, + "kl": 0.3603515625, + "learning_rate": 1.4873011316521421e-05, + "loss": 0.0144, + "reward": 0.5407366305589676, + "reward_std": 0.0600888489279896, + "rewards/accuracy_reward": 0.04687500209547579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1353 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.5089569091797, + "epoch": 0.4044507505040699, + "grad_norm": 1.1871821880340576, + "kl": 0.61083984375, + "learning_rate": 1.4863900614710379e-05, + "loss": 0.0224, + "reward": 0.6696428954601288, + "reward_std": 0.11598192434757948, + "rewards/accuracy_reward": 0.17410715413279831, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1354 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.0022735595703, + "epoch": 0.4047494585915914, + "grad_norm": 0.5564934611320496, + "kl": 0.50537109375, + "learning_rate": 1.4854784621452176e-05, + "loss": 0.0218, + "reward": 0.580357164144516, + "reward_std": 0.06625264370813966, + "rewards/accuracy_reward": 0.08705357508733869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 1355 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.5424499511719, + "epoch": 0.40504816667911286, + "grad_norm": 0.7310737371444702, + "kl": 0.267578125, + "learning_rate": 1.484566334666413e-05, + "loss": 0.0123, + "reward": 0.5318080633878708, + "reward_std": 0.06556158186867833, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1356 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.8683471679688, + "epoch": 0.4053468747666343, + "grad_norm": 0.16539964079856873, + "kl": 0.20947265625, + "learning_rate": 1.4836536800269288e-05, + "loss": 0.0084, + "reward": 0.5898437798023224, + "reward_std": 0.09268699144013226, + "rewards/accuracy_reward": 0.09375000488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1357 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.7567291259766, + "epoch": 0.40564558285415575, + "grad_norm": 0.42152905464172363, + "kl": 0.22607421875, + "learning_rate": 1.4827404992196436e-05, + "loss": 0.0096, + "reward": 0.573660746216774, + "reward_std": 0.0868384437635541, + "rewards/accuracy_reward": 0.08928571874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750149011612, + "step": 1358 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.3928985595703, + "epoch": 0.4059442909416772, + "grad_norm": 0.2453368753194809, + "kl": 0.2001953125, + "learning_rate": 1.481826793238009e-05, + "loss": 0.0096, + "reward": 0.6088169813156128, + "reward_std": 0.12174298986792564, + "rewards/accuracy_reward": 0.11830357415601611, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 1359 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.6518249511719, + "epoch": 0.4062429990291987, + "grad_norm": 0.17328602075576782, + "kl": 0.21337890625, + "learning_rate": 1.4809125630760477e-05, + "loss": 0.0085, + "reward": 0.5837053805589676, + "reward_std": 0.062469678930938244, + "rewards/accuracy_reward": 0.09151786053553224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 1360 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8705749511719, + "epoch": 0.40654170711672016, + "grad_norm": 0.5601662993431091, + "kl": 0.2783203125, + "learning_rate": 1.479997809728352e-05, + "loss": 0.0112, + "reward": 0.651227705180645, + "reward_std": 0.06187330046668649, + "rewards/accuracy_reward": 0.15848215413279831, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1361 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.4286041259766, + "epoch": 0.40684041520424163, + "grad_norm": 0.17647747695446014, + "kl": 0.22021484375, + "learning_rate": 1.4790825341900844e-05, + "loss": 0.0086, + "reward": 0.5864955633878708, + "reward_std": 0.10817372240126133, + "rewards/accuracy_reward": 0.08928572130389512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1362 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.4174346923828, + "epoch": 0.4071391232917631, + "grad_norm": 0.3968355357646942, + "kl": 0.245849609375, + "learning_rate": 1.4781667374569746e-05, + "loss": 0.0111, + "reward": 0.573660746216774, + "reward_std": 0.1295827105641365, + "rewards/accuracy_reward": 0.082589291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 1363 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.5179138183594, + "epoch": 0.4074378313792846, + "grad_norm": 0.5605382919311523, + "kl": 0.24072265625, + "learning_rate": 1.4772504205253197e-05, + "loss": 0.0092, + "reward": 0.6132812798023224, + "reward_std": 0.09991573728621006, + "rewards/accuracy_reward": 0.1183035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1364 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.2076263427734, + "epoch": 0.40773653946680605, + "grad_norm": 0.2861899435520172, + "kl": 0.23876953125, + "learning_rate": 1.476333584391983e-05, + "loss": 0.0095, + "reward": 0.5485491305589676, + "reward_std": 0.08921325299888849, + "rewards/accuracy_reward": 0.051339288242161274, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1365 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.8772735595703, + "epoch": 0.4080352475543275, + "grad_norm": 0.38161352276802063, + "kl": 0.2568359375, + "learning_rate": 1.4754162300543922e-05, + "loss": 0.0101, + "reward": 0.646763414144516, + "reward_std": 0.07795751583762467, + "rewards/accuracy_reward": 0.14955357648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1366 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.6830749511719, + "epoch": 0.408333955641849, + "grad_norm": 0.5357054471969604, + "kl": 0.31787109375, + "learning_rate": 1.4744983585105388e-05, + "loss": 0.0128, + "reward": 0.640066996216774, + "reward_std": 0.09164069592952728, + "rewards/accuracy_reward": 0.14508929592557251, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1367 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.4286041259766, + "epoch": 0.40863266372937046, + "grad_norm": 1.7490360736846924, + "kl": 0.77734375, + "learning_rate": 1.4735799707589773e-05, + "loss": 0.0288, + "reward": 0.5636160969734192, + "reward_std": 0.11811833828687668, + "rewards/accuracy_reward": 0.07142857578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 1368 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.5223388671875, + "epoch": 0.40893137181689193, + "grad_norm": 1.1202691793441772, + "kl": 0.59326171875, + "learning_rate": 1.4726610677988232e-05, + "loss": 0.0235, + "reward": 0.6294643133878708, + "reward_std": 0.11794267781078815, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 1369 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.5000152587891, + "epoch": 0.4092300799044134, + "grad_norm": 0.22739681601524353, + "kl": 0.2587890625, + "learning_rate": 1.4717416506297535e-05, + "loss": 0.0087, + "reward": 0.5686384066939354, + "reward_std": 0.09418095601722598, + "rewards/accuracy_reward": 0.0825892873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486049123108387, + "step": 1370 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.4095287879919349, + "grad_norm": 0.3235754370689392, + "kl": 0.241943359375, + "learning_rate": 1.470821720252003e-05, + "loss": 0.0097, + "reward": 0.5262276977300644, + "reward_std": 0.1381373442709446, + "rewards/accuracy_reward": 0.05357143119908869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4726562723517418, + "step": 1371 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.40982749607945634, + "grad_norm": 0.2524183392524719, + "kl": 0.2412109375, + "learning_rate": 1.4699012776663668e-05, + "loss": 0.0097, + "reward": 0.5228794887661934, + "reward_std": 0.04864303581416607, + "rewards/accuracy_reward": 0.03794643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330633878708, + "step": 1372 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.0580596923828, + "epoch": 0.4101262041669778, + "grad_norm": 0.2825910449028015, + "kl": 0.244873046875, + "learning_rate": 1.4689803238741955e-05, + "loss": 0.0088, + "reward": 0.5814732387661934, + "reward_std": 0.11492317821830511, + "rewards/accuracy_reward": 0.09598214784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910969734192, + "step": 1373 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.3638763427734, + "epoch": 0.4104249122544993, + "grad_norm": 0.3253958523273468, + "kl": 0.259521484375, + "learning_rate": 1.468058859877397e-05, + "loss": 0.0101, + "reward": 0.5390625298023224, + "reward_std": 0.10689083859324455, + "rewards/accuracy_reward": 0.051339289639145136, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 1374 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.419677734375, + "epoch": 0.41072362034202076, + "grad_norm": 0.20857356488704681, + "kl": 0.296875, + "learning_rate": 1.4671368866784338e-05, + "loss": 0.0107, + "reward": 0.5078125298023224, + "reward_std": 0.058721842244267464, + "rewards/accuracy_reward": 0.01785714295692742, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1375 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.3013610839844, + "epoch": 0.4110223284295422, + "grad_norm": 0.7737520337104797, + "kl": 0.478515625, + "learning_rate": 1.4662144052803223e-05, + "loss": 0.0184, + "reward": 0.613281287252903, + "reward_std": 0.06374538503587246, + "rewards/accuracy_reward": 0.1227678656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 1376 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.2545166015625, + "epoch": 0.4113210365170637, + "grad_norm": 0.28951090574264526, + "kl": 0.31201171875, + "learning_rate": 1.4652914166866312e-05, + "loss": 0.0109, + "reward": 0.6562500149011612, + "reward_std": 0.05992567213252187, + "rewards/accuracy_reward": 0.1607142947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1377 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.5937805175781, + "epoch": 0.41161974460458517, + "grad_norm": 0.23981612920761108, + "kl": 0.27685546875, + "learning_rate": 1.4643679219014827e-05, + "loss": 0.0109, + "reward": 0.6646205633878708, + "reward_std": 0.09366164170205593, + "rewards/accuracy_reward": 0.1741071529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 1378 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.7299194335938, + "epoch": 0.41191845269210664, + "grad_norm": 0.21497085690498352, + "kl": 0.2822265625, + "learning_rate": 1.463443921929548e-05, + "loss": 0.0104, + "reward": 0.6043527126312256, + "reward_std": 0.1111149387434125, + "rewards/accuracy_reward": 0.11383928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 1379 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.5558166503906, + "epoch": 0.4122171607796281, + "grad_norm": 0.149282306432724, + "kl": 0.2607421875, + "learning_rate": 1.4625194177760485e-05, + "loss": 0.0102, + "reward": 0.5585937649011612, + "reward_std": 0.0805054884403944, + "rewards/accuracy_reward": 0.06473214412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1380 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.2924346923828, + "epoch": 0.4125158688671496, + "grad_norm": 0.17583142220973969, + "kl": 0.252197265625, + "learning_rate": 1.4615944104467544e-05, + "loss": 0.0101, + "reward": 0.6183036118745804, + "reward_std": 0.12198652140796185, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 1381 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.0602874755859, + "epoch": 0.41281457695467105, + "grad_norm": 0.18274857103824615, + "kl": 0.248046875, + "learning_rate": 1.4606689009479829e-05, + "loss": 0.0078, + "reward": 0.5959821790456772, + "reward_std": 0.1039078263565898, + "rewards/accuracy_reward": 0.10491071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1382 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.9799346923828, + "epoch": 0.4131132850421925, + "grad_norm": 0.23037967085838318, + "kl": 0.266845703125, + "learning_rate": 1.4597428902865973e-05, + "loss": 0.0103, + "reward": 0.5948660969734192, + "reward_std": 0.1630854532122612, + "rewards/accuracy_reward": 0.11830357578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4765625149011612, + "step": 1383 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.1986846923828, + "epoch": 0.413411993129714, + "grad_norm": 0.2874922752380371, + "kl": 0.249267578125, + "learning_rate": 1.4588163794700068e-05, + "loss": 0.0111, + "reward": 0.5820312723517418, + "reward_std": 0.09622667916119099, + "rewards/accuracy_reward": 0.10267857206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.479352705180645, + "step": 1384 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.5111694335938, + "epoch": 0.41371070121723547, + "grad_norm": 0.20433495938777924, + "kl": 0.25048828125, + "learning_rate": 1.4578893695061644e-05, + "loss": 0.0098, + "reward": 0.5412946566939354, + "reward_std": 0.09523524902760983, + "rewards/accuracy_reward": 0.06026785867288709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810268059372902, + "step": 1385 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.1629791259766, + "epoch": 0.41400940930475694, + "grad_norm": 0.17219462990760803, + "kl": 0.29345703125, + "learning_rate": 1.456961861403566e-05, + "loss": 0.0119, + "reward": 0.5758928805589676, + "reward_std": 0.06401256751269102, + "rewards/accuracy_reward": 0.0870535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 1386 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.9397583007812, + "epoch": 0.4143081173922784, + "grad_norm": 0.1733807772397995, + "kl": 0.2978515625, + "learning_rate": 1.4560338561712495e-05, + "loss": 0.0109, + "reward": 0.5686384290456772, + "reward_std": 0.11710502672940493, + "rewards/accuracy_reward": 0.08035714598372579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812798023224, + "step": 1387 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8973541259766, + "epoch": 0.4146068254797999, + "grad_norm": 0.24990282952785492, + "kl": 0.30029296875, + "learning_rate": 1.4551053548187933e-05, + "loss": 0.0104, + "reward": 0.5758928805589676, + "reward_std": 0.13598552951589227, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1388 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.5000305175781, + "epoch": 0.41490553356732135, + "grad_norm": 0.22865010797977448, + "kl": 0.255859375, + "learning_rate": 1.4541763583563165e-05, + "loss": 0.0076, + "reward": 0.5859375298023224, + "reward_std": 0.10968576185405254, + "rewards/accuracy_reward": 0.09821428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 1389 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.8259429931641, + "epoch": 0.4152042416548428, + "grad_norm": 0.19397753477096558, + "kl": 0.31884765625, + "learning_rate": 1.4532468677944758e-05, + "loss": 0.0097, + "reward": 0.5580357387661934, + "reward_std": 0.09224207606166601, + "rewards/accuracy_reward": 0.07366071874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750298023224, + "step": 1390 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.1986846923828, + "epoch": 0.4155029497423643, + "grad_norm": 0.28769296407699585, + "kl": 0.2958984375, + "learning_rate": 1.4523168841444657e-05, + "loss": 0.0126, + "reward": 0.5703125298023224, + "reward_std": 0.12648486765101552, + "rewards/accuracy_reward": 0.08258928963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 1391 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.9241333007812, + "epoch": 0.41580165782988576, + "grad_norm": 1.5769290924072266, + "kl": 0.6044921875, + "learning_rate": 1.4513864084180176e-05, + "loss": 0.023, + "reward": 0.588169664144516, + "reward_std": 0.15309167467057705, + "rewards/accuracy_reward": 0.1049107201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589477300644, + "step": 1392 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.6585083007812, + "epoch": 0.41610036591740723, + "grad_norm": 0.2233695685863495, + "kl": 0.3017578125, + "learning_rate": 1.4504554416273977e-05, + "loss": 0.0111, + "reward": 0.603794664144516, + "reward_std": 0.0596232870593667, + "rewards/accuracy_reward": 0.11383929220028222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 1393 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.935302734375, + "epoch": 0.4163990740049287, + "grad_norm": 0.21179910004138947, + "kl": 0.263671875, + "learning_rate": 1.4495239847854071e-05, + "loss": 0.0065, + "reward": 0.5970982313156128, + "reward_std": 0.161135233938694, + "rewards/accuracy_reward": 0.10714286379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553880095482, + "step": 1394 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.8571929931641, + "epoch": 0.4166977820924502, + "grad_norm": 0.23053555190563202, + "kl": 0.31298828125, + "learning_rate": 1.4485920389053786e-05, + "loss": 0.0126, + "reward": 0.4955357238650322, + "reward_std": 0.0638352120295167, + "rewards/accuracy_reward": 0.011160715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750149011612, + "step": 1395 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.6942138671875, + "epoch": 0.41699649017997165, + "grad_norm": 0.1656234711408615, + "kl": 0.252197265625, + "learning_rate": 1.4476596050011787e-05, + "loss": 0.0094, + "reward": 0.540178582072258, + "reward_std": 0.09839408146217465, + "rewards/accuracy_reward": 0.04687500302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1396 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.7277221679688, + "epoch": 0.4172951982674931, + "grad_norm": 0.3120059370994568, + "kl": 0.263427734375, + "learning_rate": 1.4467266840872041e-05, + "loss": 0.0112, + "reward": 0.4949777126312256, + "reward_std": 0.09005796071141958, + "rewards/accuracy_reward": 0.011160714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169887661934, + "step": 1397 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.7098388671875, + "epoch": 0.4175939063550146, + "grad_norm": 0.2667173743247986, + "kl": 0.273681640625, + "learning_rate": 1.4457932771783808e-05, + "loss": 0.0085, + "reward": 0.5273437798023224, + "reward_std": 0.12005666457116604, + "rewards/accuracy_reward": 0.042410717345774174, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 1398 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.1719055175781, + "epoch": 0.41789261444253606, + "grad_norm": 0.8840191960334778, + "kl": 0.41162109375, + "learning_rate": 1.4448593852901644e-05, + "loss": 0.0139, + "reward": 0.5686383992433548, + "reward_std": 0.1102263405919075, + "rewards/accuracy_reward": 0.08035714854486287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812649011612, + "step": 1399 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.4442443847656, + "epoch": 0.4181913225300575, + "grad_norm": 0.3159501850605011, + "kl": 0.308349609375, + "learning_rate": 1.443925009438538e-05, + "loss": 0.0104, + "reward": 0.5507812798023224, + "reward_std": 0.10004298388957977, + "rewards/accuracy_reward": 0.058035716181620955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 1400 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.6339721679688, + "epoch": 0.41849003061757895, + "grad_norm": 0.12777301669120789, + "kl": 0.23193359375, + "learning_rate": 1.4429901506400106e-05, + "loss": 0.0081, + "reward": 0.5150669813156128, + "reward_std": 0.040951183531433344, + "rewards/accuracy_reward": 0.0200892873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1401 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.4888610839844, + "epoch": 0.4187887387051004, + "grad_norm": 0.16759474575519562, + "kl": 0.213623046875, + "learning_rate": 1.4420548099116167e-05, + "loss": 0.0086, + "reward": 0.5853794813156128, + "reward_std": 0.11715064803138375, + "rewards/accuracy_reward": 0.08928571874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1402 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.2857360839844, + "epoch": 0.4190874467926219, + "grad_norm": 0.1351417750120163, + "kl": 0.242919921875, + "learning_rate": 1.441118988270916e-05, + "loss": 0.0099, + "reward": 0.5820312798023224, + "reward_std": 0.06962359743192792, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1403 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.6808319091797, + "epoch": 0.41938615488014336, + "grad_norm": 0.17266012728214264, + "kl": 0.271240234375, + "learning_rate": 1.4401826867359903e-05, + "loss": 0.0083, + "reward": 0.5424107313156128, + "reward_std": 0.11090771295130253, + "rewards/accuracy_reward": 0.04910714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1404 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.2969207763672, + "epoch": 0.41968486296766483, + "grad_norm": 0.13660529255867004, + "kl": 0.212890625, + "learning_rate": 1.4392459063254438e-05, + "loss": 0.009, + "reward": 0.5959821790456772, + "reward_std": 0.11779660545289516, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1405 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.4643402099609, + "epoch": 0.4199835710551863, + "grad_norm": 0.178821861743927, + "kl": 0.25, + "learning_rate": 1.438308648058402e-05, + "loss": 0.0099, + "reward": 0.6021205484867096, + "reward_std": 0.11086591379716992, + "rewards/accuracy_reward": 0.10937500186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1406 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.9464569091797, + "epoch": 0.42028227914270777, + "grad_norm": 0.17645670473575592, + "kl": 0.24169921875, + "learning_rate": 1.4373709129545101e-05, + "loss": 0.0091, + "reward": 0.5881696939468384, + "reward_std": 0.09985014237463474, + "rewards/accuracy_reward": 0.09151786100119352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1407 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.1317291259766, + "epoch": 0.42058098723022924, + "grad_norm": 0.2370661497116089, + "kl": 0.2958984375, + "learning_rate": 1.4364327020339319e-05, + "loss": 0.0122, + "reward": 0.6104910969734192, + "reward_std": 0.11597028747200966, + "rewards/accuracy_reward": 0.12053571944124997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1408 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.7612152099609, + "epoch": 0.4208796953177507, + "grad_norm": 0.7763593792915344, + "kl": 0.31640625, + "learning_rate": 1.4354940163173486e-05, + "loss": 0.012, + "reward": 0.5898437798023224, + "reward_std": 0.10162721248343587, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491629496216774, + "step": 1409 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.5402221679688, + "epoch": 0.4211784034052722, + "grad_norm": 0.20207619667053223, + "kl": 0.26611328125, + "learning_rate": 1.4345548568259586e-05, + "loss": 0.0093, + "reward": 0.565848246216774, + "reward_std": 0.10884425416588783, + "rewards/accuracy_reward": 0.07366071944124997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1410 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.0535888671875, + "epoch": 0.42147711149279365, + "grad_norm": 0.3876742422580719, + "kl": 0.345703125, + "learning_rate": 1.4336152245814755e-05, + "loss": 0.0105, + "reward": 0.6378348469734192, + "reward_std": 0.08837560983374715, + "rewards/accuracy_reward": 0.1428571529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1411 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.3772735595703, + "epoch": 0.4217758195803151, + "grad_norm": 0.18367169797420502, + "kl": 0.287353515625, + "learning_rate": 1.4326751206061268e-05, + "loss": 0.0102, + "reward": 0.6032366454601288, + "reward_std": 0.10612136917188764, + "rewards/accuracy_reward": 0.11383929220028222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 1412 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.5960235595703, + "epoch": 0.4220745276678366, + "grad_norm": 0.22040602564811707, + "kl": 0.218017578125, + "learning_rate": 1.4317345459226536e-05, + "loss": 0.0061, + "reward": 0.5680803805589676, + "reward_std": 0.11628764122724533, + "rewards/accuracy_reward": 0.08035714575089514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487723246216774, + "step": 1413 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.7902221679688, + "epoch": 0.42237323575535807, + "grad_norm": 0.2532702684402466, + "kl": 0.231689453125, + "learning_rate": 1.4307935015543093e-05, + "loss": 0.0091, + "reward": 0.557477705180645, + "reward_std": 0.13204228319227695, + "rewards/accuracy_reward": 0.0669642873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 1414 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.0513763427734, + "epoch": 0.42267194384287954, + "grad_norm": 0.3687335252761841, + "kl": 0.283447265625, + "learning_rate": 1.4298519885248574e-05, + "loss": 0.0113, + "reward": 0.5256696715950966, + "reward_std": 0.0920994933694601, + "rewards/accuracy_reward": 0.03348214481957257, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1415 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.7076263427734, + "epoch": 0.422970651930401, + "grad_norm": 0.17838983237743378, + "kl": 0.33349609375, + "learning_rate": 1.4289100078585718e-05, + "loss": 0.0127, + "reward": 0.6289062798023224, + "reward_std": 0.06282990169711411, + "rewards/accuracy_reward": 0.13839285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 1416 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.6495971679688, + "epoch": 0.4232693600179225, + "grad_norm": 0.24806837737560272, + "kl": 0.286376953125, + "learning_rate": 1.4279675605802355e-05, + "loss": 0.0097, + "reward": 0.7299107313156128, + "reward_std": 0.13449129136279225, + "rewards/accuracy_reward": 0.2366071455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1417 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.2545166015625, + "epoch": 0.42356806810544395, + "grad_norm": 0.5467337369918823, + "kl": 0.45849609375, + "learning_rate": 1.4270246477151386e-05, + "loss": 0.02, + "reward": 0.5385044813156128, + "reward_std": 0.09842424816451967, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1418 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.4710235595703, + "epoch": 0.4238667761929654, + "grad_norm": 0.2263677567243576, + "kl": 0.30615234375, + "learning_rate": 1.4260812702890778e-05, + "loss": 0.0103, + "reward": 0.6339285969734192, + "reward_std": 0.17408580519258976, + "rewards/accuracy_reward": 0.1383928656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1419 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.1138763427734, + "epoch": 0.4241654842804869, + "grad_norm": 0.3743097484111786, + "kl": 0.310546875, + "learning_rate": 1.4251374293283555e-05, + "loss": 0.01, + "reward": 0.5926339477300644, + "reward_std": 0.11406497843563557, + "rewards/accuracy_reward": 0.09598214458674192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1420 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.216552734375, + "epoch": 0.42446419236800836, + "grad_norm": 0.30440884828567505, + "kl": 0.314697265625, + "learning_rate": 1.4241931258597781e-05, + "loss": 0.0124, + "reward": 0.5797991454601288, + "reward_std": 0.09918831940740347, + "rewards/accuracy_reward": 0.08482143213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1421 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.1741638183594, + "epoch": 0.42476290045552983, + "grad_norm": 0.1656067967414856, + "kl": 0.205810546875, + "learning_rate": 1.423248360910655e-05, + "loss": 0.0091, + "reward": 0.5926339626312256, + "reward_std": 0.12264021672308445, + "rewards/accuracy_reward": 0.0959821492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1422 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.6808624267578, + "epoch": 0.4250616085430513, + "grad_norm": 0.4470791518688202, + "kl": 0.27099609375, + "learning_rate": 1.4223031355087983e-05, + "loss": 0.0093, + "reward": 0.561941996216774, + "reward_std": 0.07600245624780655, + "rewards/accuracy_reward": 0.06919643259607255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1423 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.0580749511719, + "epoch": 0.4253603166305728, + "grad_norm": 0.16887180507183075, + "kl": 0.22802734375, + "learning_rate": 1.4213574506825201e-05, + "loss": 0.0104, + "reward": 0.5926339477300644, + "reward_std": 0.074593267403543, + "rewards/accuracy_reward": 0.09598214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1424 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.2879791259766, + "epoch": 0.42565902471809425, + "grad_norm": 0.31006988883018494, + "kl": 0.293701171875, + "learning_rate": 1.4204113074606332e-05, + "loss": 0.0118, + "reward": 0.6188616305589676, + "reward_std": 0.111617062240839, + "rewards/accuracy_reward": 0.12723215110599995, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1425 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.7701416015625, + "epoch": 0.4259577328056157, + "grad_norm": 0.4268342852592468, + "kl": 0.233154296875, + "learning_rate": 1.419464706872448e-05, + "loss": 0.0089, + "reward": 0.5680803805589676, + "reward_std": 0.12339233420789242, + "rewards/accuracy_reward": 0.07812500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1426 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.482177734375, + "epoch": 0.4262564408931372, + "grad_norm": 0.5778020024299622, + "kl": 0.58203125, + "learning_rate": 1.4185176499477742e-05, + "loss": 0.0233, + "reward": 0.5859375298023224, + "reward_std": 0.1299140639603138, + "rewards/accuracy_reward": 0.09151786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1427 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.3616638183594, + "epoch": 0.42655514898065866, + "grad_norm": 0.35696807503700256, + "kl": 0.6318359375, + "learning_rate": 1.4175701377169162e-05, + "loss": 0.0233, + "reward": 0.6183036118745804, + "reward_std": 0.09509985335171223, + "rewards/accuracy_reward": 0.12723214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1428 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.7009429931641, + "epoch": 0.42685385706818013, + "grad_norm": 0.5502614974975586, + "kl": 0.474609375, + "learning_rate": 1.4166221712106749e-05, + "loss": 0.0178, + "reward": 0.7114955633878708, + "reward_std": 0.1280779018998146, + "rewards/accuracy_reward": 0.2165178656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 1429 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.7991638183594, + "epoch": 0.4271525651557016, + "grad_norm": 0.19530600309371948, + "kl": 0.294189453125, + "learning_rate": 1.4156737514603443e-05, + "loss": 0.0103, + "reward": 0.5368303805589676, + "reward_std": 0.0980440666899085, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1430 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.5848693847656, + "epoch": 0.4274512732432231, + "grad_norm": 0.19645078480243683, + "kl": 0.30419921875, + "learning_rate": 1.4147248794977127e-05, + "loss": 0.0104, + "reward": 0.5652902126312256, + "reward_std": 0.07644941750913858, + "rewards/accuracy_reward": 0.06919643213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1431 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.5960388183594, + "epoch": 0.42774998133074454, + "grad_norm": 0.6845777630805969, + "kl": 0.294677734375, + "learning_rate": 1.4137755563550597e-05, + "loss": 0.0124, + "reward": 0.5719866380095482, + "reward_std": 0.09975809883326292, + "rewards/accuracy_reward": 0.07812500605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1432 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.2678985595703, + "epoch": 0.428048689418266, + "grad_norm": 0.27610647678375244, + "kl": 0.2265625, + "learning_rate": 1.4128257830651554e-05, + "loss": 0.0072, + "reward": 0.5970982164144516, + "reward_std": 0.06248657708056271, + "rewards/accuracy_reward": 0.1026785783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 1433 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.1897735595703, + "epoch": 0.4283473975057875, + "grad_norm": 0.21540401875972748, + "kl": 0.272216796875, + "learning_rate": 1.411875560661261e-05, + "loss": 0.0109, + "reward": 0.5630580633878708, + "reward_std": 0.10569551587104797, + "rewards/accuracy_reward": 0.07366071944124997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489397332072258, + "step": 1434 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.4598541259766, + "epoch": 0.42864610559330896, + "grad_norm": 0.337761253118515, + "kl": 0.27783203125, + "learning_rate": 1.4109248901771242e-05, + "loss": 0.0125, + "reward": 0.5931919887661934, + "reward_std": 0.11324948817491531, + "rewards/accuracy_reward": 0.10714286053553224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 1435 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.7344055175781, + "epoch": 0.42894481368083043, + "grad_norm": 0.2680559456348419, + "kl": 0.2900390625, + "learning_rate": 1.4099737726469823e-05, + "loss": 0.0122, + "reward": 0.5597098618745804, + "reward_std": 0.09776632720604539, + "rewards/accuracy_reward": 0.06919643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 1436 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.5402221679688, + "epoch": 0.4292435217683519, + "grad_norm": 0.21946942806243896, + "kl": 0.33447265625, + "learning_rate": 1.409022209105557e-05, + "loss": 0.0136, + "reward": 0.5753348395228386, + "reward_std": 0.11375760845839977, + "rewards/accuracy_reward": 0.09598214970901608, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4793526977300644, + "step": 1437 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8058471679688, + "epoch": 0.42954222985587337, + "grad_norm": 0.4861312806606293, + "kl": 0.40625, + "learning_rate": 1.408070200588057e-05, + "loss": 0.0152, + "reward": 0.5859375149011612, + "reward_std": 0.1962537206709385, + "rewards/accuracy_reward": 0.11160714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4743303805589676, + "step": 1438 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.4330749511719, + "epoch": 0.42984093794339484, + "grad_norm": 0.2821057438850403, + "kl": 0.380859375, + "learning_rate": 1.407117748130174e-05, + "loss": 0.0137, + "reward": 0.6456473618745804, + "reward_std": 0.09800587594509125, + "rewards/accuracy_reward": 0.16517857927829027, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687723517418, + "step": 1439 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.1674346923828, + "epoch": 0.4301396460309163, + "grad_norm": 0.3328908383846283, + "kl": 0.39404296875, + "learning_rate": 1.4061648527680825e-05, + "loss": 0.0198, + "reward": 0.545758955180645, + "reward_std": 0.12623492162674665, + "rewards/accuracy_reward": 0.06696428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.478794664144516, + "step": 1440 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.9174499511719, + "epoch": 0.4304383541184378, + "grad_norm": 0.5827884078025818, + "kl": 0.3359375, + "learning_rate": 1.4052115155384401e-05, + "loss": 0.0104, + "reward": 0.529575914144516, + "reward_std": 0.10158562287688255, + "rewards/accuracy_reward": 0.053571432596072555, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4760044887661934, + "step": 1441 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.4576416015625, + "epoch": 0.43073706220595925, + "grad_norm": 0.2599911093711853, + "kl": 0.31787109375, + "learning_rate": 1.4042577374783834e-05, + "loss": 0.012, + "reward": 0.5591518208384514, + "reward_std": 0.10023756697773933, + "rewards/accuracy_reward": 0.07589285913854837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589477300644, + "step": 1442 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.5134429931641, + "epoch": 0.43103577029348067, + "grad_norm": 0.2682012915611267, + "kl": 0.2880859375, + "learning_rate": 1.40330351962553e-05, + "loss": 0.0115, + "reward": 0.516741082072258, + "reward_std": 0.05987477907910943, + "rewards/accuracy_reward": 0.024553571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1443 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.1250457763672, + "epoch": 0.43133447838100214, + "grad_norm": 0.35157644748687744, + "kl": 0.33154296875, + "learning_rate": 1.402348863017975e-05, + "loss": 0.0131, + "reward": 0.5228794813156128, + "reward_std": 0.06092731771059334, + "rewards/accuracy_reward": 0.03125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1444 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8125457763672, + "epoch": 0.4316331864685236, + "grad_norm": 0.3096959590911865, + "kl": 0.33447265625, + "learning_rate": 1.401393768694292e-05, + "loss": 0.0137, + "reward": 0.5468750298023224, + "reward_std": 0.11028173193335533, + "rewards/accuracy_reward": 0.055803572526201606, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 1445 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.5067291259766, + "epoch": 0.4319318945560451, + "grad_norm": 0.19999642670154572, + "kl": 0.259521484375, + "learning_rate": 1.4004382376935293e-05, + "loss": 0.0105, + "reward": 0.6411830633878708, + "reward_std": 0.07147403224371374, + "rewards/accuracy_reward": 0.14732143469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1446 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.6585235595703, + "epoch": 0.43223060264356655, + "grad_norm": 0.3142814636230469, + "kl": 0.3203125, + "learning_rate": 1.3994822710552108e-05, + "loss": 0.0129, + "reward": 0.6378348469734192, + "reward_std": 0.14163931575603783, + "rewards/accuracy_reward": 0.14508929289877415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 1447 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.2388763427734, + "epoch": 0.432529310731088, + "grad_norm": 0.20149506628513336, + "kl": 0.248046875, + "learning_rate": 1.3985258698193351e-05, + "loss": 0.0114, + "reward": 0.5976562798023224, + "reward_std": 0.1115835141390562, + "rewards/accuracy_reward": 0.10491071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 1448 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.0357513427734, + "epoch": 0.4328280188186095, + "grad_norm": 0.14209698140621185, + "kl": 0.231201171875, + "learning_rate": 1.397569035026373e-05, + "loss": 0.0094, + "reward": 0.6551339477300644, + "reward_std": 0.1267199080903083, + "rewards/accuracy_reward": 0.160714291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 1449 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.4955749511719, + "epoch": 0.43312672690613097, + "grad_norm": 0.4690750539302826, + "kl": 0.347900390625, + "learning_rate": 1.3966117677172663e-05, + "loss": 0.0154, + "reward": 0.6210937798023224, + "reward_std": 0.07533728424459696, + "rewards/accuracy_reward": 0.12946429220028222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1450 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.8415679931641, + "epoch": 0.43342543499365244, + "grad_norm": 0.27301138639450073, + "kl": 0.27197265625, + "learning_rate": 1.3956540689334286e-05, + "loss": 0.0109, + "reward": 0.5797991305589676, + "reward_std": 0.06840413343161345, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 1451 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.3594207763672, + "epoch": 0.4337241430811739, + "grad_norm": 0.2163301259279251, + "kl": 0.26220703125, + "learning_rate": 1.3946959397167423e-05, + "loss": 0.008, + "reward": 0.7008928954601288, + "reward_std": 0.1336129792034626, + "rewards/accuracy_reward": 0.2098214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714402794838, + "step": 1452 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.185302734375, + "epoch": 0.4340228511686954, + "grad_norm": 0.23277547955513, + "kl": 0.2646484375, + "learning_rate": 1.393737381109558e-05, + "loss": 0.0067, + "reward": 0.5357143133878708, + "reward_std": 0.1073725325986743, + "rewards/accuracy_reward": 0.04687500116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 1453 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.4241485595703, + "epoch": 0.43432155925621685, + "grad_norm": 0.1922038495540619, + "kl": 0.31982421875, + "learning_rate": 1.392778394154693e-05, + "loss": 0.0124, + "reward": 0.5485491380095482, + "reward_std": 0.08859445620328188, + "rewards/accuracy_reward": 0.058035718742758036, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 1454 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.1763763427734, + "epoch": 0.4346202673437383, + "grad_norm": 0.26382535696029663, + "kl": 0.3662109375, + "learning_rate": 1.3918189798954322e-05, + "loss": 0.0162, + "reward": 0.541294664144516, + "reward_std": 0.12677767500281334, + "rewards/accuracy_reward": 0.05803571501746774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589477300644, + "step": 1455 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.2009124755859, + "epoch": 0.4349189754312598, + "grad_norm": 0.18840385973453522, + "kl": 0.328369140625, + "learning_rate": 1.3908591393755234e-05, + "loss": 0.0097, + "reward": 0.5669643133878708, + "reward_std": 0.11108816228806973, + "rewards/accuracy_reward": 0.07366071688011289, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493303582072258, + "step": 1456 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8348693847656, + "epoch": 0.43521768351878126, + "grad_norm": 0.17610879242420197, + "kl": 0.287109375, + "learning_rate": 1.3898988736391792e-05, + "loss": 0.0092, + "reward": 0.5747768133878708, + "reward_std": 0.0858859505970031, + "rewards/accuracy_reward": 0.08482143376022577, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 1457 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.3951263427734, + "epoch": 0.43551639160630273, + "grad_norm": 0.22839364409446716, + "kl": 0.37060546875, + "learning_rate": 1.3889381837310746e-05, + "loss": 0.0157, + "reward": 0.6612723469734192, + "reward_std": 0.11188186332583427, + "rewards/accuracy_reward": 0.1741071492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487165205180645, + "step": 1458 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.0558319091797, + "epoch": 0.4358150996938242, + "grad_norm": 0.283755362033844, + "kl": 0.36767578125, + "learning_rate": 1.3879770706963464e-05, + "loss": 0.0143, + "reward": 0.6501116305589676, + "reward_std": 0.10163872316479683, + "rewards/accuracy_reward": 0.15848214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1459 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.0982513427734, + "epoch": 0.4361138077813457, + "grad_norm": 0.2682253420352936, + "kl": 0.3623046875, + "learning_rate": 1.387015535580591e-05, + "loss": 0.0145, + "reward": 0.5585937798023224, + "reward_std": 0.10689483024179935, + "rewards/accuracy_reward": 0.06696428963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1460 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.6674346923828, + "epoch": 0.43641251586886715, + "grad_norm": 1.067014455795288, + "kl": 0.50830078125, + "learning_rate": 1.3860535794298644e-05, + "loss": 0.0206, + "reward": 0.588169664144516, + "reward_std": 0.11119626555591822, + "rewards/accuracy_reward": 0.09598214877769351, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 1461 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.9062805175781, + "epoch": 0.4367112239563886, + "grad_norm": 0.21992383897304535, + "kl": 0.3740234375, + "learning_rate": 1.385091203290681e-05, + "loss": 0.0106, + "reward": 0.560825914144516, + "reward_std": 0.09634908707812428, + "rewards/accuracy_reward": 0.07142857555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 1462 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.9888763427734, + "epoch": 0.4370099320439101, + "grad_norm": 0.17582836747169495, + "kl": 0.248291015625, + "learning_rate": 1.384128408210011e-05, + "loss": 0.0068, + "reward": 0.5619419813156128, + "reward_std": 0.12543739937245846, + "rewards/accuracy_reward": 0.06473214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1463 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.8638763427734, + "epoch": 0.43730864013143156, + "grad_norm": 0.5936867594718933, + "kl": 0.257568359375, + "learning_rate": 1.3831651952352818e-05, + "loss": 0.0108, + "reward": 0.534040205180645, + "reward_std": 0.07968292571604252, + "rewards/accuracy_reward": 0.04464286030270159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 1464 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.4955902099609, + "epoch": 0.43760734821895303, + "grad_norm": 0.15366804599761963, + "kl": 0.23583984375, + "learning_rate": 1.3822015654143742e-05, + "loss": 0.0099, + "reward": 0.5619419887661934, + "reward_std": 0.060282707680016756, + "rewards/accuracy_reward": 0.06696428847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 1465 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.1161193847656, + "epoch": 0.4379060563064745, + "grad_norm": 0.2665017247200012, + "kl": 0.23583984375, + "learning_rate": 1.3812375197956233e-05, + "loss": 0.0089, + "reward": 0.5318080484867096, + "reward_std": 0.09396623400971293, + "rewards/accuracy_reward": 0.04017857206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 1466 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.5781707763672, + "epoch": 0.43820476439399597, + "grad_norm": 0.3849310576915741, + "kl": 0.2744140625, + "learning_rate": 1.3802730594278161e-05, + "loss": 0.0078, + "reward": 0.5362723395228386, + "reward_std": 0.04872149741277099, + "rewards/accuracy_reward": 0.044642860535532236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 1467 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.3772735595703, + "epoch": 0.43850347248151744, + "grad_norm": 0.30386507511138916, + "kl": 0.349609375, + "learning_rate": 1.3793081853601913e-05, + "loss": 0.014, + "reward": 0.6216518133878708, + "reward_std": 0.13019312731921673, + "rewards/accuracy_reward": 0.12946428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1468 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.5446929931641, + "epoch": 0.4388021805690389, + "grad_norm": 0.31805843114852905, + "kl": 0.33984375, + "learning_rate": 1.3783428986424366e-05, + "loss": 0.0119, + "reward": 0.5312500074505806, + "reward_std": 0.0972960451617837, + "rewards/accuracy_reward": 0.04017857392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1469 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.8326416015625, + "epoch": 0.4391008886565604, + "grad_norm": 0.556326150894165, + "kl": 0.4189453125, + "learning_rate": 1.37737720032469e-05, + "loss": 0.0158, + "reward": 0.561941996216774, + "reward_std": 0.11416072957217693, + "rewards/accuracy_reward": 0.07142857578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 1470 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.6138763427734, + "epoch": 0.43939959674408186, + "grad_norm": 0.3690832257270813, + "kl": 0.3310546875, + "learning_rate": 1.3764110914575365e-05, + "loss": 0.0138, + "reward": 0.6188616305589676, + "reward_std": 0.07751888129860163, + "rewards/accuracy_reward": 0.13392857764847577, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 1471 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.1495971679688, + "epoch": 0.4396983048316033, + "grad_norm": 0.25919342041015625, + "kl": 0.30859375, + "learning_rate": 1.3754445730920075e-05, + "loss": 0.0158, + "reward": 0.551897332072258, + "reward_std": 0.10780468210577965, + "rewards/accuracy_reward": 0.06696429033763707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 1472 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.6920013427734, + "epoch": 0.4399970129191248, + "grad_norm": 0.22350293397903442, + "kl": 0.256103515625, + "learning_rate": 1.3744776462795806e-05, + "loss": 0.0073, + "reward": 0.5876116305589676, + "reward_std": 0.0814664545468986, + "rewards/accuracy_reward": 0.09821428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 1473 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.9955596923828, + "epoch": 0.44029572100664627, + "grad_norm": 0.28183990716934204, + "kl": 0.37744140625, + "learning_rate": 1.3735103120721773e-05, + "loss": 0.0152, + "reward": 0.5753348544239998, + "reward_std": 0.07914318796247244, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 1474 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.7567291259766, + "epoch": 0.44059442909416774, + "grad_norm": 0.4397391676902771, + "kl": 0.417236328125, + "learning_rate": 1.3725425715221625e-05, + "loss": 0.0158, + "reward": 0.6356026977300644, + "reward_std": 0.07307070610113442, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1475 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.2433471679688, + "epoch": 0.4408931371816892, + "grad_norm": 0.5483337044715881, + "kl": 0.34814453125, + "learning_rate": 1.3715744256823427e-05, + "loss": 0.0129, + "reward": 0.6110491305589676, + "reward_std": 0.16536743193864822, + "rewards/accuracy_reward": 0.1160714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1476 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.3013610839844, + "epoch": 0.4411918452692107, + "grad_norm": 0.3562644124031067, + "kl": 0.27099609375, + "learning_rate": 1.3706058756059661e-05, + "loss": 0.0104, + "reward": 0.5864955633878708, + "reward_std": 0.07246512360870838, + "rewards/accuracy_reward": 0.09151786123402417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1477 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.6964874267578, + "epoch": 0.44149055335673215, + "grad_norm": 0.5073967576026917, + "kl": 0.282470703125, + "learning_rate": 1.3696369223467204e-05, + "loss": 0.0125, + "reward": 0.5641741305589676, + "reward_std": 0.10477453283965588, + "rewards/accuracy_reward": 0.07142857415601611, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1478 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.4866485595703, + "epoch": 0.4417892614442536, + "grad_norm": 0.1707530915737152, + "kl": 0.396484375, + "learning_rate": 1.3686675669587311e-05, + "loss": 0.0159, + "reward": 0.5641741305589676, + "reward_std": 0.09595987945795059, + "rewards/accuracy_reward": 0.07142857601866126, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1479 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.8661193847656, + "epoch": 0.4420879695317751, + "grad_norm": 0.3236517906188965, + "kl": 0.304443359375, + "learning_rate": 1.3676978104965623e-05, + "loss": 0.0128, + "reward": 0.6450893133878708, + "reward_std": 0.11206040903925896, + "rewards/accuracy_reward": 0.1540178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714402794838, + "step": 1480 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.7924499511719, + "epoch": 0.44238667761929656, + "grad_norm": 0.2624981999397278, + "kl": 0.318359375, + "learning_rate": 1.3667276540152143e-05, + "loss": 0.0119, + "reward": 0.5904017984867096, + "reward_std": 0.10364381037652493, + "rewards/accuracy_reward": 0.09821428963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1481 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.5826263427734, + "epoch": 0.44268538570681804, + "grad_norm": 0.3621937334537506, + "kl": 0.40283203125, + "learning_rate": 1.3657570985701217e-05, + "loss": 0.0162, + "reward": 0.6143973469734192, + "reward_std": 0.054624921875074506, + "rewards/accuracy_reward": 0.12053572130389512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1482 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.6540679931641, + "epoch": 0.4429840937943395, + "grad_norm": 0.8409522771835327, + "kl": 0.5595703125, + "learning_rate": 1.3647861452171536e-05, + "loss": 0.022, + "reward": 0.5842634290456772, + "reward_std": 0.15187692269682884, + "rewards/accuracy_reward": 0.09151786030270159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1483 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.919677734375, + "epoch": 0.443282801881861, + "grad_norm": 0.2875790297985077, + "kl": 0.355712890625, + "learning_rate": 1.3638147950126128e-05, + "loss": 0.0147, + "reward": 0.5803571790456772, + "reward_std": 0.09556989837437868, + "rewards/accuracy_reward": 0.08705357438884676, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493303582072258, + "step": 1484 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.6652221679688, + "epoch": 0.44358150996938245, + "grad_norm": 1.7519073486328125, + "kl": 0.79638671875, + "learning_rate": 1.3628430490132327e-05, + "loss": 0.0321, + "reward": 0.572544664144516, + "reward_std": 0.07779127452522516, + "rewards/accuracy_reward": 0.07812500302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 1485 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.6607513427734, + "epoch": 0.44388021805690386, + "grad_norm": 0.922296941280365, + "kl": 0.310546875, + "learning_rate": 1.3618709082761773e-05, + "loss": 0.0126, + "reward": 0.6763393133878708, + "reward_std": 0.1616017483174801, + "rewards/accuracy_reward": 0.1785714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1486 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.9174499511719, + "epoch": 0.44417892614442533, + "grad_norm": 0.9184406399726868, + "kl": 0.283203125, + "learning_rate": 1.3608983738590414e-05, + "loss": 0.008, + "reward": 0.612723246216774, + "reward_std": 0.09628275595605373, + "rewards/accuracy_reward": 0.11830358183942735, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1487 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.8326110839844, + "epoch": 0.4444776342319468, + "grad_norm": 0.5096105337142944, + "kl": 0.316162109375, + "learning_rate": 1.3599254468198462e-05, + "loss": 0.0132, + "reward": 0.5859375149011612, + "reward_std": 0.06595112779177725, + "rewards/accuracy_reward": 0.09151786100119352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1488 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.9241333007812, + "epoch": 0.4447763423194683, + "grad_norm": 0.2401159256696701, + "kl": 0.474853515625, + "learning_rate": 1.3589521282170415e-05, + "loss": 0.0144, + "reward": 0.530133955180645, + "reward_std": 0.07364929933100939, + "rewards/accuracy_reward": 0.03571428661234677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 1489 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.013427734375, + "epoch": 0.44507505040698975, + "grad_norm": 0.697991669178009, + "kl": 0.596923828125, + "learning_rate": 1.3579784191095022e-05, + "loss": 0.0285, + "reward": 0.599888414144516, + "reward_std": 0.06710325833410025, + "rewards/accuracy_reward": 0.10491071827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1490 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.8549652099609, + "epoch": 0.4453737584945112, + "grad_norm": 0.6801483035087585, + "kl": 0.5908203125, + "learning_rate": 1.3570043205565289e-05, + "loss": 0.0241, + "reward": 0.5641741156578064, + "reward_std": 0.06275301147252321, + "rewards/accuracy_reward": 0.06919643143191934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1491 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.2277221679688, + "epoch": 0.4456724665820327, + "grad_norm": 0.5442260503768921, + "kl": 0.31640625, + "learning_rate": 1.356029833617845e-05, + "loss": 0.0137, + "reward": 0.5340401828289032, + "reward_std": 0.06268590618856251, + "rewards/accuracy_reward": 0.035714288242161274, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1492 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.2701416015625, + "epoch": 0.44597117466955416, + "grad_norm": 0.3726612627506256, + "kl": 0.263916015625, + "learning_rate": 1.3550549593535965e-05, + "loss": 0.0102, + "reward": 0.584263414144516, + "reward_std": 0.10734810237772763, + "rewards/accuracy_reward": 0.0870535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1493 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.7031707763672, + "epoch": 0.44626988275707563, + "grad_norm": 2.7643446922302246, + "kl": 0.2978515625, + "learning_rate": 1.3540796988243514e-05, + "loss": 0.0118, + "reward": 0.5412946790456772, + "reward_std": 0.11143556982278824, + "rewards/accuracy_reward": 0.05133928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1494 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8169860839844, + "epoch": 0.4465685908445971, + "grad_norm": 2.0431711673736572, + "kl": 0.5322265625, + "learning_rate": 1.3531040530910977e-05, + "loss": 0.0238, + "reward": 0.5613839477300644, + "reward_std": 0.11302877217531204, + "rewards/accuracy_reward": 0.0803571455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810268059372902, + "step": 1495 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.6942291259766, + "epoch": 0.4468672989321186, + "grad_norm": 4.57348108291626, + "kl": 2.23046875, + "learning_rate": 1.3521280232152421e-05, + "loss": 0.0893, + "reward": 0.5357143059372902, + "reward_std": 0.12914585322141647, + "rewards/accuracy_reward": 0.08482143399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.450892873108387, + "step": 1496 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.5736846923828, + "epoch": 0.44716600701964004, + "grad_norm": 3.9766175746917725, + "kl": 1.923828125, + "learning_rate": 1.3511516102586093e-05, + "loss": 0.0806, + "reward": 0.4481026902794838, + "reward_std": 0.15801502764225006, + "rewards/accuracy_reward": 0.0267857164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4213169813156128, + "step": 1497 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.3683471679688, + "epoch": 0.4474647151071615, + "grad_norm": 3.4448533058166504, + "kl": 1.921875, + "learning_rate": 1.3501748152834413e-05, + "loss": 0.0765, + "reward": 0.4464285969734192, + "reward_std": 0.1479911282658577, + "rewards/accuracy_reward": 0.05133928777649999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3950893133878708, + "step": 1498 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.5781707763672, + "epoch": 0.447763423194683, + "grad_norm": 1.9344934225082397, + "kl": 1.650390625, + "learning_rate": 1.3491976393523952e-05, + "loss": 0.064, + "reward": 0.4447544813156128, + "reward_std": 0.1652222741395235, + "rewards/accuracy_reward": 0.06026785937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.384486623108387, + "step": 1499 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.9085235595703, + "epoch": 0.44806213128220446, + "grad_norm": 2.9996211528778076, + "kl": 0.7763671875, + "learning_rate": 1.3482200835285421e-05, + "loss": 0.0308, + "reward": 0.4246651977300644, + "reward_std": 0.1723794974386692, + "rewards/accuracy_reward": 0.05357143119908869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3710937649011612, + "step": 1500 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.4754791259766, + "epoch": 0.4483608393697259, + "grad_norm": 2.1097991466522217, + "kl": 0.650390625, + "learning_rate": 1.3472421488753678e-05, + "loss": 0.0244, + "reward": 0.5005580559372902, + "reward_std": 0.13741383887827396, + "rewards/accuracy_reward": 0.14732143771834671, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.353236623108387, + "step": 1501 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.4911041259766, + "epoch": 0.4486595474572474, + "grad_norm": 1.486100196838379, + "kl": 0.716796875, + "learning_rate": 1.3462638364567688e-05, + "loss": 0.0281, + "reward": 0.3398437649011612, + "reward_std": 0.12095869332551956, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3398437649011612, + "step": 1502 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.2946929931641, + "epoch": 0.44895825554476887, + "grad_norm": 1.7384463548660278, + "kl": 0.7255859375, + "learning_rate": 1.3452851473370531e-05, + "loss": 0.0254, + "reward": 0.3750000223517418, + "reward_std": 0.15257175266742706, + "rewards/accuracy_reward": 0.011160715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3638393059372902, + "step": 1503 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.9018402099609, + "epoch": 0.44925696363229034, + "grad_norm": 1.1537762880325317, + "kl": 0.953125, + "learning_rate": 1.3443060825809387e-05, + "loss": 0.0363, + "reward": 0.377232164144516, + "reward_std": 0.16783707588911057, + "rewards/accuracy_reward": 0.017857144121080637, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3593750149011612, + "step": 1504 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.7969055175781, + "epoch": 0.4495556717198118, + "grad_norm": 1.24853515625, + "kl": 1.1064453125, + "learning_rate": 1.343326643253552e-05, + "loss": 0.0448, + "reward": 0.4397321566939354, + "reward_std": 0.12964718416333199, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.368303582072258, + "step": 1505 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.7277069091797, + "epoch": 0.4498543798073333, + "grad_norm": 0.6571123003959656, + "kl": 1.1640625, + "learning_rate": 1.3423468304204275e-05, + "loss": 0.046, + "reward": 0.424665205180645, + "reward_std": 0.156600559130311, + "rewards/accuracy_reward": 0.05580357578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.368861623108387, + "step": 1506 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.7076263427734, + "epoch": 0.45015308789485475, + "grad_norm": 0.9764202833175659, + "kl": 1.044921875, + "learning_rate": 1.3413666451475048e-05, + "loss": 0.0425, + "reward": 0.4397321566939354, + "reward_std": 0.18150098621845245, + "rewards/accuracy_reward": 0.06026786030270159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3794642984867096, + "step": 1507 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.7344207763672, + "epoch": 0.4504517959823762, + "grad_norm": 0.5474401712417603, + "kl": 0.8701171875, + "learning_rate": 1.3403860885011297e-05, + "loss": 0.0353, + "reward": 0.4804687723517418, + "reward_std": 0.14388343691825867, + "rewards/accuracy_reward": 0.08258928847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3978794813156128, + "step": 1508 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.1920013427734, + "epoch": 0.4507505040698977, + "grad_norm": 0.764144241809845, + "kl": 0.7802734375, + "learning_rate": 1.3394051615480516e-05, + "loss": 0.0305, + "reward": 0.4760044813156128, + "reward_std": 0.18940521031618118, + "rewards/accuracy_reward": 0.05357143213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4224330633878708, + "step": 1509 + }, + { + "clip_ratio": 0.0, + "completion_length": 1023.0781555175781, + "epoch": 0.45104921215741917, + "grad_norm": 0.8285483717918396, + "kl": 0.6455078125, + "learning_rate": 1.3384238653554234e-05, + "loss": 0.0258, + "reward": 0.5491071790456772, + "reward_std": 0.12228528410196304, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4352678805589676, + "step": 1510 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.2969055175781, + "epoch": 0.45134792024494064, + "grad_norm": 0.7933478951454163, + "kl": 0.5771484375, + "learning_rate": 1.3374422009907984e-05, + "loss": 0.0269, + "reward": 0.4693080559372902, + "reward_std": 0.12501132674515247, + "rewards/accuracy_reward": 0.022321430267766118, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.446986623108387, + "step": 1511 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.3013763427734, + "epoch": 0.4516466283324621, + "grad_norm": 0.5048947334289551, + "kl": 0.7138671875, + "learning_rate": 1.3364601695221318e-05, + "loss": 0.0301, + "reward": 0.5128348469734192, + "reward_std": 0.15448389202356339, + "rewards/accuracy_reward": 0.0647321455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4481026977300644, + "step": 1512 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.7277221679688, + "epoch": 0.4519453364199836, + "grad_norm": 0.8140555024147034, + "kl": 0.80078125, + "learning_rate": 1.3354777720177775e-05, + "loss": 0.0394, + "reward": 0.6863839477300644, + "reward_std": 0.1842218153178692, + "rewards/accuracy_reward": 0.22767858020961285, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4587053805589676, + "step": 1513 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.2076263427734, + "epoch": 0.45224404450750505, + "grad_norm": 0.9576892852783203, + "kl": 0.775390625, + "learning_rate": 1.3344950095464872e-05, + "loss": 0.0327, + "reward": 0.6350446790456772, + "reward_std": 0.13128018379211426, + "rewards/accuracy_reward": 0.17187500465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4631696566939354, + "step": 1514 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.4732666015625, + "epoch": 0.4525427525950265, + "grad_norm": 0.4603498578071594, + "kl": 0.51708984375, + "learning_rate": 1.333511883177411e-05, + "loss": 0.0214, + "reward": 0.5424107313156128, + "reward_std": 0.12026974186301231, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4642857387661934, + "step": 1515 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.0692291259766, + "epoch": 0.452841460682548, + "grad_norm": 0.5195677280426025, + "kl": 0.48095703125, + "learning_rate": 1.3325283939800935e-05, + "loss": 0.0223, + "reward": 0.5859375223517418, + "reward_std": 0.11780317779630423, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4720982313156128, + "step": 1516 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.3995971679688, + "epoch": 0.45314016877006946, + "grad_norm": 0.3132658302783966, + "kl": 0.38427734375, + "learning_rate": 1.3315445430244744e-05, + "loss": 0.0149, + "reward": 0.5641741156578064, + "reward_std": 0.07661758363246918, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169813156128, + "step": 1517 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.3839721679688, + "epoch": 0.45343887685759093, + "grad_norm": 0.30347758531570435, + "kl": 0.3544921875, + "learning_rate": 1.3305603313808875e-05, + "loss": 0.0161, + "reward": 0.6250000149011612, + "reward_std": 0.08687434066087008, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 1518 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.8839569091797, + "epoch": 0.4537375849451124, + "grad_norm": 0.27922073006629944, + "kl": 0.42529296875, + "learning_rate": 1.3295757601200582e-05, + "loss": 0.0199, + "reward": 0.507812537252903, + "reward_std": 0.06339173158630729, + "rewards/accuracy_reward": 0.01785714295692742, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1519 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.1406707763672, + "epoch": 0.4540362930326339, + "grad_norm": 0.29898399114608765, + "kl": 0.3193359375, + "learning_rate": 1.3285908303131043e-05, + "loss": 0.0139, + "reward": 0.5446428954601288, + "reward_std": 0.09898672066628933, + "rewards/accuracy_reward": 0.053571431431919336, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1520 + }, + { + "clip_ratio": 0.0, + "completion_length": 988.5536193847656, + "epoch": 0.45433500112015535, + "grad_norm": 0.2896219491958618, + "kl": 0.34521484375, + "learning_rate": 1.327605543031532e-05, + "loss": 0.0164, + "reward": 0.5239955484867096, + "reward_std": 0.06582715176045895, + "rewards/accuracy_reward": 0.029017859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1521 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.4241638183594, + "epoch": 0.4546337092076768, + "grad_norm": 0.29626402258872986, + "kl": 0.33740234375, + "learning_rate": 1.3266198993472377e-05, + "loss": 0.0147, + "reward": 0.5446428805589676, + "reward_std": 0.046498440438881516, + "rewards/accuracy_reward": 0.046875003492459655, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1522 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.3638763427734, + "epoch": 0.4549324172951983, + "grad_norm": 0.3154611885547638, + "kl": 0.3173828125, + "learning_rate": 1.3256339003325054e-05, + "loss": 0.0126, + "reward": 0.5770089477300644, + "reward_std": 0.08076102519407868, + "rewards/accuracy_reward": 0.08035714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1523 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.3504791259766, + "epoch": 0.45523112538271976, + "grad_norm": 0.21065734326839447, + "kl": 0.2568359375, + "learning_rate": 1.324647547060005e-05, + "loss": 0.0097, + "reward": 0.5904018133878708, + "reward_std": 0.05695624020881951, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1524 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.5424499511719, + "epoch": 0.45552983347024123, + "grad_norm": 0.27689075469970703, + "kl": 0.273193359375, + "learning_rate": 1.3236608406027918e-05, + "loss": 0.0111, + "reward": 0.5680803805589676, + "reward_std": 0.0716194158885628, + "rewards/accuracy_reward": 0.06919643143191934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1525 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.9620971679688, + "epoch": 0.4558285415577627, + "grad_norm": 0.1436397284269333, + "kl": 0.212158203125, + "learning_rate": 1.3226737820343066e-05, + "loss": 0.0093, + "reward": 0.5625000298023224, + "reward_std": 0.05055215861648321, + "rewards/accuracy_reward": 0.06250000302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1526 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.5112152099609, + "epoch": 0.45612724964528417, + "grad_norm": 0.15930378437042236, + "kl": 0.2060546875, + "learning_rate": 1.321686372428372e-05, + "loss": 0.0124, + "reward": 0.6646205633878708, + "reward_std": 0.0385320654604584, + "rewards/accuracy_reward": 0.1651785783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 1527 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.5848693847656, + "epoch": 0.45642595773280564, + "grad_norm": 0.15984582901000977, + "kl": 0.19091796875, + "learning_rate": 1.3206986128591925e-05, + "loss": 0.0088, + "reward": 0.6289062798023224, + "reward_std": 0.08459895942360163, + "rewards/accuracy_reward": 0.12946428824216127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 1528 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.6786041259766, + "epoch": 0.45672466582032706, + "grad_norm": 0.13483546674251556, + "kl": 0.19677734375, + "learning_rate": 1.3197105044013544e-05, + "loss": 0.0067, + "reward": 0.525669664144516, + "reward_std": 0.06350403651595116, + "rewards/accuracy_reward": 0.026785714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1529 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.9710235595703, + "epoch": 0.45702337390784853, + "grad_norm": 0.1494402289390564, + "kl": 0.19189453125, + "learning_rate": 1.3187220481298227e-05, + "loss": 0.0092, + "reward": 0.6328125298023224, + "reward_std": 0.12118218932300806, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1530 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.4375457763672, + "epoch": 0.45732208199537, + "grad_norm": 0.3321848213672638, + "kl": 0.2119140625, + "learning_rate": 1.3177332451199405e-05, + "loss": 0.0122, + "reward": 0.5758928954601288, + "reward_std": 0.09584134258329868, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1531 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.5826416015625, + "epoch": 0.45762079008289147, + "grad_norm": 0.48037928342819214, + "kl": 0.22802734375, + "learning_rate": 1.3167440964474285e-05, + "loss": 0.0151, + "reward": 0.7343750298023224, + "reward_std": 0.1584643954411149, + "rewards/accuracy_reward": 0.2366071529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1532 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.5513916015625, + "epoch": 0.45791949817041294, + "grad_norm": 0.943138062953949, + "kl": 0.267822265625, + "learning_rate": 1.3157546031883843e-05, + "loss": 0.0105, + "reward": 0.5446428805589676, + "reward_std": 0.03552421252243221, + "rewards/accuracy_reward": 0.0468750037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1533 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.1451263427734, + "epoch": 0.4582182062579344, + "grad_norm": 0.2560576796531677, + "kl": 0.275634765625, + "learning_rate": 1.314764766419279e-05, + "loss": 0.014, + "reward": 0.5608258992433548, + "reward_std": 0.10604359209537506, + "rewards/accuracy_reward": 0.0625000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1534 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.9062957763672, + "epoch": 0.4585169143454559, + "grad_norm": 0.28210335969924927, + "kl": 0.299560546875, + "learning_rate": 1.3137745872169578e-05, + "loss": 0.0094, + "reward": 0.6316964477300644, + "reward_std": 0.0627832654863596, + "rewards/accuracy_reward": 0.13392857275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1535 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.8036346435547, + "epoch": 0.45881562243297735, + "grad_norm": 0.42984312772750854, + "kl": 0.3759765625, + "learning_rate": 1.312784066658639e-05, + "loss": 0.0181, + "reward": 0.5803571790456772, + "reward_std": 0.046098936814814806, + "rewards/accuracy_reward": 0.08258928824216127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1536 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.4196929931641, + "epoch": 0.4591143305204988, + "grad_norm": 0.49113866686820984, + "kl": 0.4375, + "learning_rate": 1.3117932058219123e-05, + "loss": 0.0277, + "reward": 0.6099330633878708, + "reward_std": 0.07995093613862991, + "rewards/accuracy_reward": 0.11160714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1537 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.0312957763672, + "epoch": 0.4594130386080203, + "grad_norm": 0.91732257604599, + "kl": 0.42724609375, + "learning_rate": 1.3108020057847363e-05, + "loss": 0.0164, + "reward": 0.6724330633878708, + "reward_std": 0.053612685296684504, + "rewards/accuracy_reward": 0.17410715389996767, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1538 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.8705749511719, + "epoch": 0.45971174669554177, + "grad_norm": 1.1763626337051392, + "kl": 0.490234375, + "learning_rate": 1.3098104676254397e-05, + "loss": 0.0199, + "reward": 0.6350446790456772, + "reward_std": 0.07813955936580896, + "rewards/accuracy_reward": 0.1383928619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1539 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.8839874267578, + "epoch": 0.46001045478306324, + "grad_norm": 0.33702483773231506, + "kl": 0.4638671875, + "learning_rate": 1.3088185924227195e-05, + "loss": 0.0133, + "reward": 0.5072544813156128, + "reward_std": 0.055005913600325584, + "rewards/accuracy_reward": 0.013392857974395156, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1540 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.3281707763672, + "epoch": 0.4603091628705847, + "grad_norm": 0.6173097491264343, + "kl": 0.48876953125, + "learning_rate": 1.3078263812556377e-05, + "loss": 0.0195, + "reward": 0.5998884290456772, + "reward_std": 0.15364855527877808, + "rewards/accuracy_reward": 0.10714286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 1541 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.4085388183594, + "epoch": 0.4606078709581062, + "grad_norm": 0.2744865119457245, + "kl": 0.35009765625, + "learning_rate": 1.3068338352036236e-05, + "loss": 0.015, + "reward": 0.6238839626312256, + "reward_std": 0.07167237857356668, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1542 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.5759429931641, + "epoch": 0.46090657904562765, + "grad_norm": 1.0144052505493164, + "kl": 0.32666015625, + "learning_rate": 1.3058409553464697e-05, + "loss": 0.0096, + "reward": 0.522879496216774, + "reward_std": 0.06334796291776001, + "rewards/accuracy_reward": 0.029017859371379018, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1543 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.6339721679688, + "epoch": 0.4612052871331491, + "grad_norm": 0.39128419756889343, + "kl": 0.48876953125, + "learning_rate": 1.3048477427643322e-05, + "loss": 0.0194, + "reward": 0.5641741305589676, + "reward_std": 0.09087759861722589, + "rewards/accuracy_reward": 0.07366071827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 1544 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.7076416015625, + "epoch": 0.4615039952206706, + "grad_norm": 0.3284718692302704, + "kl": 0.53466796875, + "learning_rate": 1.3038541985377286e-05, + "loss": 0.0257, + "reward": 0.5959821790456772, + "reward_std": 0.13276398181915283, + "rewards/accuracy_reward": 0.10714286100119352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393133878708, + "step": 1545 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.0937957763672, + "epoch": 0.46180270330819206, + "grad_norm": 0.3769945204257965, + "kl": 0.603515625, + "learning_rate": 1.302860323747538e-05, + "loss": 0.0276, + "reward": 0.5167410895228386, + "reward_std": 0.08754977770149708, + "rewards/accuracy_reward": 0.0267857164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1546 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.9152221679688, + "epoch": 0.46210141139571353, + "grad_norm": 1.3439924716949463, + "kl": 0.82568359375, + "learning_rate": 1.3018661194749986e-05, + "loss": 0.0343, + "reward": 0.5005580559372902, + "reward_std": 0.04912404669448733, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1547 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.5045166015625, + "epoch": 0.462400119483235, + "grad_norm": 0.6339476704597473, + "kl": 0.55322265625, + "learning_rate": 1.3008715868017075e-05, + "loss": 0.0219, + "reward": 0.621651828289032, + "reward_std": 0.08416787534952164, + "rewards/accuracy_reward": 0.12946429336443543, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1548 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.1005096435547, + "epoch": 0.4626988275707565, + "grad_norm": 0.3539251685142517, + "kl": 0.3857421875, + "learning_rate": 1.2998767268096183e-05, + "loss": 0.0239, + "reward": 0.5608258992433548, + "reward_std": 0.06608642195351422, + "rewards/accuracy_reward": 0.06473214412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1549 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.294677734375, + "epoch": 0.46299753565827795, + "grad_norm": 0.09269053488969803, + "kl": 0.200439453125, + "learning_rate": 1.2988815405810415e-05, + "loss": 0.0082, + "reward": 0.5368303656578064, + "reward_std": 0.013392857741564512, + "rewards/accuracy_reward": 0.0379464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1550 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8683319091797, + "epoch": 0.4632962437457994, + "grad_norm": 0.41295918822288513, + "kl": 0.20751953125, + "learning_rate": 1.2978860291986422e-05, + "loss": 0.0094, + "reward": 0.5987723469734192, + "reward_std": 0.0854479051195085, + "rewards/accuracy_reward": 0.10491071618162096, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1551 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.6495971679688, + "epoch": 0.4635949518333209, + "grad_norm": 0.15744203329086304, + "kl": 0.2158203125, + "learning_rate": 1.296890193745439e-05, + "loss": 0.0099, + "reward": 0.6121652275323868, + "reward_std": 0.10025577736087143, + "rewards/accuracy_reward": 0.11607143376022577, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1552 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.0647888183594, + "epoch": 0.46389365992084236, + "grad_norm": 0.16778677701950073, + "kl": 0.1953125, + "learning_rate": 1.295894035304803e-05, + "loss": 0.0083, + "reward": 0.5697544813156128, + "reward_std": 0.0619998883921653, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1553 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.8973846435547, + "epoch": 0.46419236800836383, + "grad_norm": 0.41466599702835083, + "kl": 0.24560546875, + "learning_rate": 1.294897554960458e-05, + "loss": 0.0045, + "reward": 0.5463169813156128, + "reward_std": 0.05568002280779183, + "rewards/accuracy_reward": 0.05133928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1554 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.7254943847656, + "epoch": 0.4644910760958853, + "grad_norm": 0.19748689234256744, + "kl": 0.21484375, + "learning_rate": 1.2939007537964758e-05, + "loss": 0.0061, + "reward": 0.5641741454601288, + "reward_std": 0.09902342408895493, + "rewards/accuracy_reward": 0.07142857485450804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 1555 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.6406707763672, + "epoch": 0.4647897841834068, + "grad_norm": 0.38183048367500305, + "kl": 0.226806640625, + "learning_rate": 1.292903632897279e-05, + "loss": 0.0087, + "reward": 0.631138414144516, + "reward_std": 0.12048804853111506, + "rewards/accuracy_reward": 0.1383928656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 1556 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.3795013427734, + "epoch": 0.46508849227092824, + "grad_norm": 0.21406753361225128, + "kl": 0.193359375, + "learning_rate": 1.2919061933476371e-05, + "loss": 0.0078, + "reward": 0.5474330633878708, + "reward_std": 0.05791808129288256, + "rewards/accuracy_reward": 0.051339288242161274, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937574505806, + "step": 1557 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.6518249511719, + "epoch": 0.4653872003584497, + "grad_norm": 0.18968446552753448, + "kl": 0.21923828125, + "learning_rate": 1.2909084362326669e-05, + "loss": 0.0055, + "reward": 0.5954241305589676, + "reward_std": 0.056054480373859406, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1558 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.4308624267578, + "epoch": 0.4656859084459712, + "grad_norm": 0.16359466314315796, + "kl": 0.23828125, + "learning_rate": 1.28991036263783e-05, + "loss": 0.0112, + "reward": 0.5602678954601288, + "reward_std": 0.09621011279523373, + "rewards/accuracy_reward": 0.06473214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1559 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.4531860351562, + "epoch": 0.46598461653349266, + "grad_norm": 0.19880594313144684, + "kl": 0.2021484375, + "learning_rate": 1.288911973648933e-05, + "loss": 0.0085, + "reward": 0.6032366305589676, + "reward_std": 0.09714228776283562, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1560 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.8415374755859, + "epoch": 0.46628332462101413, + "grad_norm": 0.19445741176605225, + "kl": 0.20263671875, + "learning_rate": 1.2879132703521249e-05, + "loss": 0.0086, + "reward": 0.5848214477300644, + "reward_std": 0.11651074606925249, + "rewards/accuracy_reward": 0.09151786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 1561 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.8281707763672, + "epoch": 0.4665820327085356, + "grad_norm": 0.3515031933784485, + "kl": 0.24365234375, + "learning_rate": 1.2869142538338974e-05, + "loss": 0.0097, + "reward": 0.5898437798023224, + "reward_std": 0.11858869343996048, + "rewards/accuracy_reward": 0.09598214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1562 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.8661346435547, + "epoch": 0.46688074079605707, + "grad_norm": 0.28802165389060974, + "kl": 0.20166015625, + "learning_rate": 1.2859149251810823e-05, + "loss": 0.0093, + "reward": 0.6143973469734192, + "reward_std": 0.09714156435802579, + "rewards/accuracy_reward": 0.12053571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1563 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.7723693847656, + "epoch": 0.46717944888357854, + "grad_norm": 0.37472787499427795, + "kl": 0.208251953125, + "learning_rate": 1.284915285480851e-05, + "loss": 0.008, + "reward": 0.5535714626312256, + "reward_std": 0.08580083958804607, + "rewards/accuracy_reward": 0.060267859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1564 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.0379943847656, + "epoch": 0.4674781569711, + "grad_norm": 0.26914694905281067, + "kl": 0.21533203125, + "learning_rate": 1.2839153358207142e-05, + "loss": 0.01, + "reward": 0.6065848618745804, + "reward_std": 0.06741274613887072, + "rewards/accuracy_reward": 0.11383929336443543, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 1565 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.0246124267578, + "epoch": 0.4677768650586215, + "grad_norm": 0.33039695024490356, + "kl": 0.21875, + "learning_rate": 1.2829150772885186e-05, + "loss": 0.0084, + "reward": 0.5892857313156128, + "reward_std": 0.106539347441867, + "rewards/accuracy_reward": 0.0959821492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1566 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.5312957763672, + "epoch": 0.46807557314614295, + "grad_norm": 0.32853928208351135, + "kl": 0.226806640625, + "learning_rate": 1.2819145109724476e-05, + "loss": 0.0089, + "reward": 0.6099330484867096, + "reward_std": 0.15215464681386948, + "rewards/accuracy_reward": 0.11830357648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1567 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.2678985595703, + "epoch": 0.4683742812336644, + "grad_norm": 0.5754266381263733, + "kl": 0.240234375, + "learning_rate": 1.280913637961019e-05, + "loss": 0.0099, + "reward": 0.6741071790456772, + "reward_std": 0.08734325505793095, + "rewards/accuracy_reward": 0.1852678656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 1568 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.5714721679688, + "epoch": 0.4686729893211859, + "grad_norm": 0.23749056458473206, + "kl": 0.24365234375, + "learning_rate": 1.2799124593430849e-05, + "loss": 0.0088, + "reward": 0.555803582072258, + "reward_std": 0.143313848413527, + "rewards/accuracy_reward": 0.06919643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4866071715950966, + "step": 1569 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8549499511719, + "epoch": 0.46897169740870737, + "grad_norm": 0.3739506006240845, + "kl": 0.265625, + "learning_rate": 1.2789109762078296e-05, + "loss": 0.0112, + "reward": 0.5306919887661934, + "reward_std": 0.07630361150950193, + "rewards/accuracy_reward": 0.0491071455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848395228386, + "step": 1570 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.9397430419922, + "epoch": 0.46927040549622884, + "grad_norm": 0.33090007305145264, + "kl": 0.268310546875, + "learning_rate": 1.2779091896447682e-05, + "loss": 0.0108, + "reward": 0.5680803880095482, + "reward_std": 0.0812741219997406, + "rewards/accuracy_reward": 0.08482143376022577, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589477300644, + "step": 1571 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.0982360839844, + "epoch": 0.46956911358375025, + "grad_norm": 0.2341744601726532, + "kl": 0.2451171875, + "learning_rate": 1.2769071007437466e-05, + "loss": 0.0102, + "reward": 0.5585937798023224, + "reward_std": 0.08833147399127483, + "rewards/accuracy_reward": 0.07142857578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487165205180645, + "step": 1572 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.3549499511719, + "epoch": 0.4698678216712717, + "grad_norm": 0.33530187606811523, + "kl": 0.28271484375, + "learning_rate": 1.2759047105949391e-05, + "loss": 0.0117, + "reward": 0.5558035969734192, + "reward_std": 0.09057424403727055, + "rewards/accuracy_reward": 0.08035714644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.475446455180645, + "step": 1573 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.4754791259766, + "epoch": 0.4701665297587932, + "grad_norm": 0.2904554605484009, + "kl": 0.295654296875, + "learning_rate": 1.2749020202888485e-05, + "loss": 0.016, + "reward": 0.6746652126312256, + "reward_std": 0.14599928446114063, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 1574 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.060302734375, + "epoch": 0.47046523784631467, + "grad_norm": 0.4592401087284088, + "kl": 0.322265625, + "learning_rate": 1.2738990309163025e-05, + "loss": 0.0132, + "reward": 0.537946455180645, + "reward_std": 0.05102218873798847, + "rewards/accuracy_reward": 0.04687500209547579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1575 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.4352874755859, + "epoch": 0.47076394593383614, + "grad_norm": 1.0320295095443726, + "kl": 0.443359375, + "learning_rate": 1.2728957435684561e-05, + "loss": 0.0178, + "reward": 0.5474330633878708, + "reward_std": 0.0789848854765296, + "rewards/accuracy_reward": 0.05580357392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1576 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.1986999511719, + "epoch": 0.4710626540213576, + "grad_norm": 0.2453877180814743, + "kl": 0.35693359375, + "learning_rate": 1.2718921593367874e-05, + "loss": 0.0116, + "reward": 0.6149553805589676, + "reward_std": 0.0913325659930706, + "rewards/accuracy_reward": 0.12500000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 1577 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.3839721679688, + "epoch": 0.4713613621088791, + "grad_norm": 0.27072903513908386, + "kl": 0.34716796875, + "learning_rate": 1.2708882793130974e-05, + "loss": 0.0136, + "reward": 0.5669643133878708, + "reward_std": 0.09037834778428078, + "rewards/accuracy_reward": 0.07366071967408061, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 1578 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.3281707763672, + "epoch": 0.47166007019640055, + "grad_norm": 0.17118553817272186, + "kl": 0.27587890625, + "learning_rate": 1.2698841045895096e-05, + "loss": 0.011, + "reward": 0.5440848469734192, + "reward_std": 0.043414748972281814, + "rewards/accuracy_reward": 0.0468750037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1579 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.9576416015625, + "epoch": 0.471958778283922, + "grad_norm": 0.2712956666946411, + "kl": 0.294921875, + "learning_rate": 1.2688796362584676e-05, + "loss": 0.0079, + "reward": 0.6021205633878708, + "reward_std": 0.08209509123116732, + "rewards/accuracy_reward": 0.10714286542497575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1580 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.2120819091797, + "epoch": 0.4722574863714435, + "grad_norm": 0.33802640438079834, + "kl": 0.24560546875, + "learning_rate": 1.2678748754127344e-05, + "loss": 0.0106, + "reward": 0.633370578289032, + "reward_std": 0.12063831044360995, + "rewards/accuracy_reward": 0.14062500977888703, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1581 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.7522583007812, + "epoch": 0.47255619445896496, + "grad_norm": 0.3428291380405426, + "kl": 0.260009765625, + "learning_rate": 1.2668698231453908e-05, + "loss": 0.0114, + "reward": 0.638950914144516, + "reward_std": 0.0792618349660188, + "rewards/accuracy_reward": 0.14285714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1582 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.7522735595703, + "epoch": 0.47285490254648643, + "grad_norm": 0.591833770275116, + "kl": 0.26025390625, + "learning_rate": 1.2658644805498361e-05, + "loss": 0.0122, + "reward": 0.599888414144516, + "reward_std": 0.08770540170371532, + "rewards/accuracy_reward": 0.10937500605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 1583 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.1205749511719, + "epoch": 0.4731536106340079, + "grad_norm": 0.301146000623703, + "kl": 0.239990234375, + "learning_rate": 1.2648588487197842e-05, + "loss": 0.0074, + "reward": 0.5479910969734192, + "reward_std": 0.07077893009409308, + "rewards/accuracy_reward": 0.053571432596072555, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1584 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.4732513427734, + "epoch": 0.4734523187215294, + "grad_norm": 0.4744061529636383, + "kl": 0.27685546875, + "learning_rate": 1.2638529287492635e-05, + "loss": 0.0107, + "reward": 0.631138414144516, + "reward_std": 0.08117074752226472, + "rewards/accuracy_reward": 0.13839286495931447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1585 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.7254943847656, + "epoch": 0.47375102680905085, + "grad_norm": 0.24062113463878632, + "kl": 0.27099609375, + "learning_rate": 1.262846721732617e-05, + "loss": 0.0118, + "reward": 0.672433078289032, + "reward_std": 0.11665921588428319, + "rewards/accuracy_reward": 0.17857143771834671, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1586 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.4643249511719, + "epoch": 0.4740497348965723, + "grad_norm": 0.4307761788368225, + "kl": 0.30029296875, + "learning_rate": 1.2618402287644989e-05, + "loss": 0.0116, + "reward": 0.601004496216774, + "reward_std": 0.06454375106841326, + "rewards/accuracy_reward": 0.10267857788130641, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1587 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.6875610351562, + "epoch": 0.4743484429840938, + "grad_norm": 0.21449698507785797, + "kl": 0.314453125, + "learning_rate": 1.2608334509398752e-05, + "loss": 0.0165, + "reward": 0.6132812798023224, + "reward_std": 0.09271273482590914, + "rewards/accuracy_reward": 0.11607143515720963, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1588 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.3527221679688, + "epoch": 0.47464715107161526, + "grad_norm": 0.24016200006008148, + "kl": 0.3056640625, + "learning_rate": 1.2598263893540207e-05, + "loss": 0.0128, + "reward": 0.6015625149011612, + "reward_std": 0.058110975893214345, + "rewards/accuracy_reward": 0.10491071874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1589 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.9286193847656, + "epoch": 0.47494585915913673, + "grad_norm": 0.31912827491760254, + "kl": 0.3076171875, + "learning_rate": 1.2588190451025209e-05, + "loss": 0.0128, + "reward": 0.5781250149011612, + "reward_std": 0.07269153371453285, + "rewards/accuracy_reward": 0.08035714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1590 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.575927734375, + "epoch": 0.4752445672466582, + "grad_norm": 0.32911306619644165, + "kl": 0.267333984375, + "learning_rate": 1.2578114192812669e-05, + "loss": 0.011, + "reward": 0.6383928805589676, + "reward_std": 0.10550615936517715, + "rewards/accuracy_reward": 0.14062501047737896, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1591 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.8482513427734, + "epoch": 0.47554327533417967, + "grad_norm": 0.1748829483985901, + "kl": 0.262939453125, + "learning_rate": 1.2568035129864569e-05, + "loss": 0.0095, + "reward": 0.6255580633878708, + "reward_std": 0.05397573299705982, + "rewards/accuracy_reward": 0.1316964328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1592 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.0223693847656, + "epoch": 0.47584198342170114, + "grad_norm": 0.14818055927753448, + "kl": 0.243408203125, + "learning_rate": 1.255795327314594e-05, + "loss": 0.0109, + "reward": 0.5546875149011612, + "reward_std": 0.06980227236635983, + "rewards/accuracy_reward": 0.05803571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1593 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8370819091797, + "epoch": 0.4761406915092226, + "grad_norm": 0.16538819670677185, + "kl": 0.203369140625, + "learning_rate": 1.2547868633624858e-05, + "loss": 0.0085, + "reward": 0.6160714626312256, + "reward_std": 0.08418691903352737, + "rewards/accuracy_reward": 0.11830357648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1594 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.9576263427734, + "epoch": 0.4764393995967441, + "grad_norm": 0.16104784607887268, + "kl": 0.206298828125, + "learning_rate": 1.2537781222272423e-05, + "loss": 0.0088, + "reward": 0.5920759290456772, + "reward_std": 0.10931099625304341, + "rewards/accuracy_reward": 0.09598214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1595 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8281555175781, + "epoch": 0.47673810768426556, + "grad_norm": 0.18440274894237518, + "kl": 0.224609375, + "learning_rate": 1.2527691050062743e-05, + "loss": 0.01, + "reward": 0.608816996216774, + "reward_std": 0.10904788505285978, + "rewards/accuracy_reward": 0.11383929336443543, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1596 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.0379791259766, + "epoch": 0.477036815771787, + "grad_norm": 0.1513051688671112, + "kl": 0.220458984375, + "learning_rate": 1.2517598127972943e-05, + "loss": 0.0113, + "reward": 0.5976562798023224, + "reward_std": 0.07719677686691284, + "rewards/accuracy_reward": 0.10267857578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1597 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.2031860351562, + "epoch": 0.4773355238593085, + "grad_norm": 0.37184780836105347, + "kl": 0.22412109375, + "learning_rate": 1.250750246698313e-05, + "loss": 0.0048, + "reward": 0.6021205633878708, + "reward_std": 0.09815179114229977, + "rewards/accuracy_reward": 0.11160714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 1598 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.0335235595703, + "epoch": 0.47763423194682997, + "grad_norm": 0.2641267478466034, + "kl": 0.19970703125, + "learning_rate": 1.2497404078076396e-05, + "loss": 0.0083, + "reward": 0.5485491305589676, + "reward_std": 0.11215133965015411, + "rewards/accuracy_reward": 0.05357143213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 1599 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.0937957763672, + "epoch": 0.47793294003435144, + "grad_norm": 0.3099367022514343, + "kl": 0.238037109375, + "learning_rate": 1.2487302972238795e-05, + "loss": 0.0104, + "reward": 0.5837053656578064, + "reward_std": 0.10689570684917271, + "rewards/accuracy_reward": 0.09151786123402417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1600 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.6495819091797, + "epoch": 0.4782316481218729, + "grad_norm": 0.17239294946193695, + "kl": 0.22216796875, + "learning_rate": 1.2477199160459345e-05, + "loss": 0.0089, + "reward": 0.6060267984867096, + "reward_std": 0.08460740488953888, + "rewards/accuracy_reward": 0.11160714598372579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1601 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.1518249511719, + "epoch": 0.4785303562093944, + "grad_norm": 0.418038934469223, + "kl": 0.3076171875, + "learning_rate": 1.246709265373e-05, + "loss": 0.013, + "reward": 0.5915178805589676, + "reward_std": 0.04234638740308583, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1602 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.4375457763672, + "epoch": 0.47882906429691585, + "grad_norm": 0.26165899634361267, + "kl": 0.324462890625, + "learning_rate": 1.2456983463045644e-05, + "loss": 0.014, + "reward": 0.5708705633878708, + "reward_std": 0.10546633531339467, + "rewards/accuracy_reward": 0.08035714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 1603 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.216552734375, + "epoch": 0.4791277723844373, + "grad_norm": 1.0611590147018433, + "kl": 0.4921875, + "learning_rate": 1.2446871599404095e-05, + "loss": 0.0219, + "reward": 0.5005580633878708, + "reward_std": 0.04811058845371008, + "rewards/accuracy_reward": 0.006696428870782256, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1604 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.7545013427734, + "epoch": 0.4794264804719588, + "grad_norm": 0.9185601472854614, + "kl": 0.5107421875, + "learning_rate": 1.2436757073806065e-05, + "loss": 0.0209, + "reward": 0.6049107313156128, + "reward_std": 0.048225947888568044, + "rewards/accuracy_reward": 0.10714286309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1605 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.2076263427734, + "epoch": 0.47972518855948026, + "grad_norm": 0.22047100961208344, + "kl": 0.421630859375, + "learning_rate": 1.2426639897255166e-05, + "loss": 0.019, + "reward": 0.6489955633878708, + "reward_std": 0.05848417431116104, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1606 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.8906555175781, + "epoch": 0.48002389664700174, + "grad_norm": 0.4473843574523926, + "kl": 0.240966796875, + "learning_rate": 1.2416520080757892e-05, + "loss": 0.0104, + "reward": 0.6378348469734192, + "reward_std": 0.12302349880337715, + "rewards/accuracy_reward": 0.1406250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1607 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.7031707763672, + "epoch": 0.4803226047345232, + "grad_norm": 0.6071315407752991, + "kl": 0.275146484375, + "learning_rate": 1.2406397635323617e-05, + "loss": 0.0127, + "reward": 0.5876116305589676, + "reward_std": 0.1006555836647749, + "rewards/accuracy_reward": 0.09151786402799189, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1608 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.8326568603516, + "epoch": 0.4806213128220447, + "grad_norm": 0.2858738899230957, + "kl": 0.205810546875, + "learning_rate": 1.239627257196457e-05, + "loss": 0.009, + "reward": 0.6043526977300644, + "reward_std": 0.13921119272708893, + "rewards/accuracy_reward": 0.10714286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1609 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.8951263427734, + "epoch": 0.48092002090956615, + "grad_norm": 1.564302921295166, + "kl": 0.258056640625, + "learning_rate": 1.2386144901695817e-05, + "loss": 0.0149, + "reward": 0.5474330633878708, + "reward_std": 0.07663950743153691, + "rewards/accuracy_reward": 0.051339286379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1610 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.5960235595703, + "epoch": 0.4812187289970876, + "grad_norm": 0.23272664844989777, + "kl": 0.267578125, + "learning_rate": 1.2376014635535285e-05, + "loss": 0.0103, + "reward": 0.7020089477300644, + "reward_std": 0.17573492601513863, + "rewards/accuracy_reward": 0.2053571529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1611 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.0737152099609, + "epoch": 0.4815174370846091, + "grad_norm": 0.45895418524742126, + "kl": 0.265869140625, + "learning_rate": 1.2365881784503704e-05, + "loss": 0.0094, + "reward": 0.579241082072258, + "reward_std": 0.052377325715497136, + "rewards/accuracy_reward": 0.08258928963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1612 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.9442443847656, + "epoch": 0.48181614517213056, + "grad_norm": 0.5209150910377502, + "kl": 0.3642578125, + "learning_rate": 1.2355746359624621e-05, + "loss": 0.0146, + "reward": 0.577566996216774, + "reward_std": 0.06966783292591572, + "rewards/accuracy_reward": 0.08035714644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1613 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.1518249511719, + "epoch": 0.48211485325965203, + "grad_norm": 1.345536231994629, + "kl": 0.738037109375, + "learning_rate": 1.2345608371924384e-05, + "loss": 0.0304, + "reward": 0.6155134290456772, + "reward_std": 0.08274577115662396, + "rewards/accuracy_reward": 0.11830357951112092, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1614 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.2969207763672, + "epoch": 0.48241356134717345, + "grad_norm": 1.2547001838684082, + "kl": 0.6337890625, + "learning_rate": 1.2335467832432136e-05, + "loss": 0.0281, + "reward": 0.5496651977300644, + "reward_std": 0.05040410649962723, + "rewards/accuracy_reward": 0.051339289639145136, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1615 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.9687805175781, + "epoch": 0.4827122694346949, + "grad_norm": 1.9336228370666504, + "kl": 0.837890625, + "learning_rate": 1.2325324752179788e-05, + "loss": 0.0356, + "reward": 0.5452009290456772, + "reward_std": 0.08811327745206654, + "rewards/accuracy_reward": 0.05133928847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1616 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.7723541259766, + "epoch": 0.4830109775222164, + "grad_norm": 0.284496545791626, + "kl": 0.34765625, + "learning_rate": 1.2315179142202012e-05, + "loss": 0.0138, + "reward": 0.642857164144516, + "reward_std": 0.08278253627941012, + "rewards/accuracy_reward": 0.1450892873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1617 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.5826263427734, + "epoch": 0.48330968560973786, + "grad_norm": 0.7607295513153076, + "kl": 0.26318359375, + "learning_rate": 1.2305031013536244e-05, + "loss": 0.011, + "reward": 0.6004464626312256, + "reward_std": 0.04094352759420872, + "rewards/accuracy_reward": 0.10267857578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1618 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.1585235595703, + "epoch": 0.48360839369725933, + "grad_norm": 0.39700183272361755, + "kl": 0.19775390625, + "learning_rate": 1.2294880377222649e-05, + "loss": 0.0069, + "reward": 0.6785714626312256, + "reward_std": 0.07496684323996305, + "rewards/accuracy_reward": 0.18080357951112092, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1619 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.7567291259766, + "epoch": 0.4839071017847808, + "grad_norm": 1.016057014465332, + "kl": 0.2783203125, + "learning_rate": 1.2284727244304126e-05, + "loss": 0.0123, + "reward": 0.6266741156578064, + "reward_std": 0.10351425083354115, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1620 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.872802734375, + "epoch": 0.4842058098723023, + "grad_norm": 0.596627950668335, + "kl": 0.260009765625, + "learning_rate": 1.227457162582629e-05, + "loss": 0.011, + "reward": 0.5786830633878708, + "reward_std": 0.12137783132493496, + "rewards/accuracy_reward": 0.0825892873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1621 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.6272583007812, + "epoch": 0.48450451795982374, + "grad_norm": 0.23411902785301208, + "kl": 0.2216796875, + "learning_rate": 1.2264413532837456e-05, + "loss": 0.0062, + "reward": 0.6417411118745804, + "reward_std": 0.09850435890257359, + "rewards/accuracy_reward": 0.14285715389996767, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1622 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.5379791259766, + "epoch": 0.4848032260473452, + "grad_norm": 0.48101168870925903, + "kl": 0.569580078125, + "learning_rate": 1.2254252976388637e-05, + "loss": 0.0153, + "reward": 0.6741071790456772, + "reward_std": 0.0911797359585762, + "rewards/accuracy_reward": 0.18080357648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1623 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.1986999511719, + "epoch": 0.4851019341348667, + "grad_norm": 0.6672786474227905, + "kl": 0.39599609375, + "learning_rate": 1.2244089967533515e-05, + "loss": 0.0171, + "reward": 0.6255580633878708, + "reward_std": 0.12811970338225365, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1624 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.0424499511719, + "epoch": 0.48540064222238816, + "grad_norm": 0.5534139275550842, + "kl": 0.3173828125, + "learning_rate": 1.2233924517328456e-05, + "loss": 0.0141, + "reward": 0.5150669813156128, + "reward_std": 0.04668492660857737, + "rewards/accuracy_reward": 0.01562500116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 1625 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.3527221679688, + "epoch": 0.4856993503099096, + "grad_norm": 0.24254927039146423, + "kl": 0.3037109375, + "learning_rate": 1.2223756636832471e-05, + "loss": 0.0118, + "reward": 0.6171875447034836, + "reward_std": 0.08176010847091675, + "rewards/accuracy_reward": 0.12053572223521769, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1626 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.1696929931641, + "epoch": 0.4859980583974311, + "grad_norm": 0.313755601644516, + "kl": 0.3515625, + "learning_rate": 1.2213586337107217e-05, + "loss": 0.0148, + "reward": 0.5959821790456772, + "reward_std": 0.12932711956091225, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1627 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.3616333007812, + "epoch": 0.48629676648495257, + "grad_norm": 0.13597317039966583, + "kl": 0.243408203125, + "learning_rate": 1.220341362921698e-05, + "loss": 0.0096, + "reward": 0.5552455484867096, + "reward_std": 0.06568865897133946, + "rewards/accuracy_reward": 0.0602678582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 1628 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.3058624267578, + "epoch": 0.48659547457247404, + "grad_norm": 0.2331625372171402, + "kl": 0.3232421875, + "learning_rate": 1.2193238524228677e-05, + "loss": 0.0139, + "reward": 0.6612723469734192, + "reward_std": 0.14689494110643864, + "rewards/accuracy_reward": 0.16517857555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1629 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.9219055175781, + "epoch": 0.4868941826599955, + "grad_norm": 0.544937014579773, + "kl": 0.2490234375, + "learning_rate": 1.2183061033211817e-05, + "loss": 0.0063, + "reward": 0.5357143133878708, + "reward_std": 0.03571428684517741, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1630 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.4621124267578, + "epoch": 0.487192890747517, + "grad_norm": 0.39822033047676086, + "kl": 0.223388671875, + "learning_rate": 1.2172881167238515e-05, + "loss": 0.0095, + "reward": 0.5915178805589676, + "reward_std": 0.11369114927947521, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1631 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.1518249511719, + "epoch": 0.48749159883503845, + "grad_norm": 0.21027711033821106, + "kl": 0.258056640625, + "learning_rate": 1.216269893738347e-05, + "loss": 0.0082, + "reward": 0.5931919813156128, + "reward_std": 0.07064144778996706, + "rewards/accuracy_reward": 0.09821428963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1632 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.5558624267578, + "epoch": 0.4877903069225599, + "grad_norm": 0.24203622341156006, + "kl": 0.321044921875, + "learning_rate": 1.2152514354723948e-05, + "loss": 0.0145, + "reward": 0.5736607238650322, + "reward_std": 0.1096903346478939, + "rewards/accuracy_reward": 0.0803571492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1633 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.0178985595703, + "epoch": 0.4880890150100814, + "grad_norm": 0.20246192812919617, + "kl": 0.307861328125, + "learning_rate": 1.2142327430339777e-05, + "loss": 0.012, + "reward": 0.5842634290456772, + "reward_std": 0.09747906774282455, + "rewards/accuracy_reward": 0.09375000349245965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 1634 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.8906555175781, + "epoch": 0.48838772309760287, + "grad_norm": 0.5159730911254883, + "kl": 0.3974609375, + "learning_rate": 1.213213817531333e-05, + "loss": 0.0159, + "reward": 0.5362723469734192, + "reward_std": 0.1168962623924017, + "rewards/accuracy_reward": 0.0424107164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1635 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.0915374755859, + "epoch": 0.48868643118512434, + "grad_norm": 0.32929539680480957, + "kl": 0.418701171875, + "learning_rate": 1.2121946600729524e-05, + "loss": 0.0167, + "reward": 0.6517857313156128, + "reward_std": 0.13735586032271385, + "rewards/accuracy_reward": 0.1584821529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493303582072258, + "step": 1636 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.6428985595703, + "epoch": 0.4889851392726458, + "grad_norm": 0.21364454925060272, + "kl": 0.4423828125, + "learning_rate": 1.2111752717675788e-05, + "loss": 0.018, + "reward": 0.5591517984867096, + "reward_std": 0.05089520616456866, + "rewards/accuracy_reward": 0.06696428847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1637 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.0536041259766, + "epoch": 0.4892838473601673, + "grad_norm": 0.2663039267063141, + "kl": 0.3505859375, + "learning_rate": 1.2101556537242069e-05, + "loss": 0.0141, + "reward": 0.5585937649011612, + "reward_std": 0.1040588547475636, + "rewards/accuracy_reward": 0.0647321455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1638 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.5468902587891, + "epoch": 0.48958255544768875, + "grad_norm": 0.433868408203125, + "kl": 0.412109375, + "learning_rate": 1.2091358070520813e-05, + "loss": 0.017, + "reward": 0.6177455484867096, + "reward_std": 0.09595722146332264, + "rewards/accuracy_reward": 0.1227678656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1639 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.5245819091797, + "epoch": 0.4898812635352102, + "grad_norm": 0.2854633331298828, + "kl": 0.357421875, + "learning_rate": 1.2081157328606951e-05, + "loss": 0.0152, + "reward": 0.6551339626312256, + "reward_std": 0.20440507680177689, + "rewards/accuracy_reward": 0.1651785783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 1640 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.6562805175781, + "epoch": 0.4901799716227317, + "grad_norm": 0.2568225562572479, + "kl": 0.35498046875, + "learning_rate": 1.2070954322597893e-05, + "loss": 0.0147, + "reward": 0.6623884290456772, + "reward_std": 0.10562615189701319, + "rewards/accuracy_reward": 0.1696428693830967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 1641 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.6451263427734, + "epoch": 0.49047867971025316, + "grad_norm": 0.3829863965511322, + "kl": 0.37548828125, + "learning_rate": 1.2060749063593503e-05, + "loss": 0.0126, + "reward": 0.5753348469734192, + "reward_std": 0.10946286469697952, + "rewards/accuracy_reward": 0.08482143259607255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 1642 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.4174499511719, + "epoch": 0.49077738779777463, + "grad_norm": 0.21457593142986298, + "kl": 0.3955078125, + "learning_rate": 1.205054156269611e-05, + "loss": 0.0146, + "reward": 0.6328125298023224, + "reward_std": 0.09416133956983685, + "rewards/accuracy_reward": 0.1406250111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875298023224, + "step": 1643 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.1763763427734, + "epoch": 0.4910760958852961, + "grad_norm": 0.30138731002807617, + "kl": 0.39892578125, + "learning_rate": 1.204033183101047e-05, + "loss": 0.0161, + "reward": 0.549107164144516, + "reward_std": 0.10948829655535519, + "rewards/accuracy_reward": 0.0558035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1644 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.2678985595703, + "epoch": 0.4913748039728176, + "grad_norm": 0.3206157982349396, + "kl": 0.5224609375, + "learning_rate": 1.203011987964377e-05, + "loss": 0.0189, + "reward": 0.5373884290456772, + "reward_std": 0.10922556929290295, + "rewards/accuracy_reward": 0.05133928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 1645 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.075927734375, + "epoch": 0.49167351206033905, + "grad_norm": 0.516727864742279, + "kl": 0.36376953125, + "learning_rate": 1.2019905719705618e-05, + "loss": 0.0134, + "reward": 0.5948661118745804, + "reward_std": 0.11689968593418598, + "rewards/accuracy_reward": 0.10937500838190317, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910969734192, + "step": 1646 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.9777221679688, + "epoch": 0.4919722201478605, + "grad_norm": 0.29510390758514404, + "kl": 0.408203125, + "learning_rate": 1.2009689362308014e-05, + "loss": 0.0173, + "reward": 0.655691996216774, + "reward_std": 0.07214309461414814, + "rewards/accuracy_reward": 0.16741072246804833, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 1647 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.2701110839844, + "epoch": 0.492270928235382, + "grad_norm": 0.3388111889362335, + "kl": 0.54345703125, + "learning_rate": 1.1999470818565355e-05, + "loss": 0.0219, + "reward": 0.6043526977300644, + "reward_std": 0.1274050883948803, + "rewards/accuracy_reward": 0.1227678656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848395228386, + "step": 1648 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.6920013427734, + "epoch": 0.49256963632290346, + "grad_norm": 0.3083387017250061, + "kl": 0.5244140625, + "learning_rate": 1.1989250099594412e-05, + "loss": 0.0206, + "reward": 0.6004464477300644, + "reward_std": 0.14903922658413649, + "rewards/accuracy_reward": 0.1160714328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750298023224, + "step": 1649 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.4196624755859, + "epoch": 0.49286834441042493, + "grad_norm": 0.36207157373428345, + "kl": 0.5478515625, + "learning_rate": 1.1979027216514329e-05, + "loss": 0.0219, + "reward": 0.5468750298023224, + "reward_std": 0.14037449471652508, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4732143059372902, + "step": 1650 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.6964721679688, + "epoch": 0.4931670524979464, + "grad_norm": 0.2963274419307709, + "kl": 0.48779296875, + "learning_rate": 1.1968802180446602e-05, + "loss": 0.0202, + "reward": 0.6088169813156128, + "reward_std": 0.13136626034975052, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486049123108387, + "step": 1651 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.7790679931641, + "epoch": 0.49346576058546787, + "grad_norm": 0.32787150144577026, + "kl": 0.6376953125, + "learning_rate": 1.1958575002515062e-05, + "loss": 0.0275, + "reward": 0.5809151977300644, + "reward_std": 0.08938771765679121, + "rewards/accuracy_reward": 0.09598215040750802, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 1652 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.2455749511719, + "epoch": 0.49376446867298934, + "grad_norm": 0.25146299600601196, + "kl": 0.5078125, + "learning_rate": 1.1948345693845884e-05, + "loss": 0.0195, + "reward": 0.5362723469734192, + "reward_std": 0.14162994362413883, + "rewards/accuracy_reward": 0.0513392873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 1653 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.9888763427734, + "epoch": 0.4940631767605108, + "grad_norm": 0.27721667289733887, + "kl": 0.4765625, + "learning_rate": 1.1938114265567552e-05, + "loss": 0.0191, + "reward": 0.5368303880095482, + "reward_std": 0.07614879216998816, + "rewards/accuracy_reward": 0.04910714505240321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 1654 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.6674499511719, + "epoch": 0.4943618848480323, + "grad_norm": 0.9352993369102478, + "kl": 0.6904296875, + "learning_rate": 1.192788072881085e-05, + "loss": 0.0294, + "reward": 0.5931919813156128, + "reward_std": 0.13912853226065636, + "rewards/accuracy_reward": 0.10937500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169887661934, + "step": 1655 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.3080749511719, + "epoch": 0.49466059293555376, + "grad_norm": 0.3319210708141327, + "kl": 0.7275390625, + "learning_rate": 1.1917645094708867e-05, + "loss": 0.0298, + "reward": 0.5200893133878708, + "reward_std": 0.11521151475608349, + "rewards/accuracy_reward": 0.033482144586741924, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4866071715950966, + "step": 1656 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.2835083007812, + "epoch": 0.4949593010230752, + "grad_norm": 0.4436088800430298, + "kl": 0.7958984375, + "learning_rate": 1.1907407374396973e-05, + "loss": 0.0326, + "reward": 0.577566996216774, + "reward_std": 0.11796054430305958, + "rewards/accuracy_reward": 0.09598214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848395228386, + "step": 1657 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.3236999511719, + "epoch": 0.49525800911059664, + "grad_norm": 0.36591434478759766, + "kl": 0.67724609375, + "learning_rate": 1.18971675790128e-05, + "loss": 0.0286, + "reward": 0.6216518133878708, + "reward_std": 0.1251368150115013, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 1658 + }, + { + "clip_ratio": 0.0, + "completion_length": 1022.2232666015625, + "epoch": 0.4955567171981181, + "grad_norm": 0.9310885071754456, + "kl": 0.9638671875, + "learning_rate": 1.1886925719696243e-05, + "loss": 0.0385, + "reward": 0.493861623108387, + "reward_std": 0.09020628780126572, + "rewards/accuracy_reward": 0.011160714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482700914144516, + "step": 1659 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.2790679931641, + "epoch": 0.4958554252856396, + "grad_norm": 0.5314598679542542, + "kl": 0.61572265625, + "learning_rate": 1.1876681807589443e-05, + "loss": 0.0264, + "reward": 0.5652902126312256, + "reward_std": 0.12477578409016132, + "rewards/accuracy_reward": 0.08258928963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482700914144516, + "step": 1660 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.4955749511719, + "epoch": 0.49615413337316105, + "grad_norm": 0.6393521428108215, + "kl": 0.7138671875, + "learning_rate": 1.1866435853836773e-05, + "loss": 0.0277, + "reward": 0.5591518133878708, + "reward_std": 0.11239983513951302, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.478794664144516, + "step": 1661 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.4509124755859, + "epoch": 0.4964528414606825, + "grad_norm": 0.40862682461738586, + "kl": 0.62158203125, + "learning_rate": 1.1856187869584821e-05, + "loss": 0.0252, + "reward": 0.5837053656578064, + "reward_std": 0.10657690837979317, + "rewards/accuracy_reward": 0.10044643562287092, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589477300644, + "step": 1662 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.3549499511719, + "epoch": 0.496751549548204, + "grad_norm": 0.49968281388282776, + "kl": 0.9599609375, + "learning_rate": 1.1845937865982393e-05, + "loss": 0.0399, + "reward": 0.5323660895228386, + "reward_std": 0.11852757073938847, + "rewards/accuracy_reward": 0.0513392873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810268059372902, + "step": 1663 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.7344207763672, + "epoch": 0.49705025763572547, + "grad_norm": 0.6371643543243408, + "kl": 0.9345703125, + "learning_rate": 1.1835685854180489e-05, + "loss": 0.0379, + "reward": 0.5870536118745804, + "reward_std": 0.1708824671804905, + "rewards/accuracy_reward": 0.10491071944124997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4821428805589676, + "step": 1664 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.8370819091797, + "epoch": 0.49734896572324694, + "grad_norm": 0.8057978749275208, + "kl": 0.896484375, + "learning_rate": 1.1825431845332293e-05, + "loss": 0.036, + "reward": 0.5636161118745804, + "reward_std": 0.16524964943528175, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589477300644, + "step": 1665 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.5692291259766, + "epoch": 0.4976476738107684, + "grad_norm": 0.4866262972354889, + "kl": 0.8310546875, + "learning_rate": 1.1815175850593159e-05, + "loss": 0.0338, + "reward": 0.5876116454601288, + "reward_std": 0.11644870042800903, + "rewards/accuracy_reward": 0.10491072060540318, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482700914144516, + "step": 1666 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.1719207763672, + "epoch": 0.4979463818982899, + "grad_norm": 1.1430670022964478, + "kl": 0.787109375, + "learning_rate": 1.1804917881120608e-05, + "loss": 0.0322, + "reward": 0.5976562947034836, + "reward_std": 0.16658764332532883, + "rewards/accuracy_reward": 0.11607143399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848395228386, + "step": 1667 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.1049346923828, + "epoch": 0.49824508998581135, + "grad_norm": 0.6520851850509644, + "kl": 0.9794921875, + "learning_rate": 1.1794657948074301e-05, + "loss": 0.0395, + "reward": 0.548549123108387, + "reward_std": 0.12429111264646053, + "rewards/accuracy_reward": 0.0647321455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169887661934, + "step": 1668 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.5558624267578, + "epoch": 0.4985437980733328, + "grad_norm": 2.121619462966919, + "kl": 1.517578125, + "learning_rate": 1.1784396062616046e-05, + "loss": 0.0621, + "reward": 0.5412946715950966, + "reward_std": 0.09494243562221527, + "rewards/accuracy_reward": 0.05580357322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910969734192, + "step": 1669 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.8973693847656, + "epoch": 0.4988425061608543, + "grad_norm": 4.888049125671387, + "kl": 2.021484375, + "learning_rate": 1.177413223590976e-05, + "loss": 0.0808, + "reward": 0.5758928805589676, + "reward_std": 0.08495990186929703, + "rewards/accuracy_reward": 0.09151786030270159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750223517418, + "step": 1670 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.185302734375, + "epoch": 0.49914121424837576, + "grad_norm": 2.5332980155944824, + "kl": 1.4697265625, + "learning_rate": 1.1763866479121486e-05, + "loss": 0.0587, + "reward": 0.514508955180645, + "reward_std": 0.08085663430392742, + "rewards/accuracy_reward": 0.024553572526201606, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 1671 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.8371124267578, + "epoch": 0.49943992233589724, + "grad_norm": 0.45329388976097107, + "kl": 1.150390625, + "learning_rate": 1.1753598803419361e-05, + "loss": 0.0448, + "reward": 0.5781250149011612, + "reward_std": 0.18185918405652046, + "rewards/accuracy_reward": 0.0982142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4799107387661934, + "step": 1672 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.4330749511719, + "epoch": 0.4997386304234187, + "grad_norm": 0.6507479548454285, + "kl": 0.70849609375, + "learning_rate": 1.1743329219973609e-05, + "loss": 0.0276, + "reward": 0.5853794813156128, + "reward_std": 0.07884604902938008, + "rewards/accuracy_reward": 0.09375000116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1673 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.0335388183594, + "epoch": 0.5000373385109402, + "grad_norm": 1.3661482334136963, + "kl": 0.5703125, + "learning_rate": 1.1733057739956531e-05, + "loss": 0.0208, + "reward": 0.5636160969734192, + "reward_std": 0.13467583060264587, + "rewards/accuracy_reward": 0.07589286123402417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 1674 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.7255096435547, + "epoch": 0.5003360465984616, + "grad_norm": 0.45218873023986816, + "kl": 0.342529296875, + "learning_rate": 1.1722784374542489e-05, + "loss": 0.0155, + "reward": 0.5982143133878708, + "reward_std": 0.1584603153169155, + "rewards/accuracy_reward": 0.10491071827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493303582072258, + "step": 1675 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.7031707763672, + "epoch": 0.5006347546859832, + "grad_norm": 0.6830487251281738, + "kl": 0.375, + "learning_rate": 1.17125091349079e-05, + "loss": 0.0158, + "reward": 0.6266741305589676, + "reward_std": 0.08848544675856829, + "rewards/accuracy_reward": 0.13392857578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1676 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.7589721679688, + "epoch": 0.5009334627735046, + "grad_norm": 0.8727529644966125, + "kl": 0.7744140625, + "learning_rate": 1.1702232032231213e-05, + "loss": 0.0338, + "reward": 0.5825893133878708, + "reward_std": 0.1394949108362198, + "rewards/accuracy_reward": 0.09598214412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 1677 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.4844207763672, + "epoch": 0.5012321708610261, + "grad_norm": 0.4373599886894226, + "kl": 0.923828125, + "learning_rate": 1.1691953077692915e-05, + "loss": 0.0381, + "reward": 0.537946455180645, + "reward_std": 0.0645745899528265, + "rewards/accuracy_reward": 0.0491071455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 1678 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.6562957763672, + "epoch": 0.5015308789485475, + "grad_norm": 3.214205026626587, + "kl": 1.0859375, + "learning_rate": 1.1681672282475495e-05, + "loss": 0.044, + "reward": 0.572544664144516, + "reward_std": 0.08676945231854916, + "rewards/accuracy_reward": 0.07812500465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1679 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.4241638183594, + "epoch": 0.501829587036069, + "grad_norm": 6.7926106452941895, + "kl": 2.0869140625, + "learning_rate": 1.1671389657763457e-05, + "loss": 0.0889, + "reward": 0.5803571790456772, + "reward_std": 0.1426965482532978, + "rewards/accuracy_reward": 0.09151786379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888392984867096, + "step": 1680 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.7098541259766, + "epoch": 0.5021282951235905, + "grad_norm": 3.997464418411255, + "kl": 1.720703125, + "learning_rate": 1.166110521474328e-05, + "loss": 0.0689, + "reward": 0.604910746216774, + "reward_std": 0.10167843289673328, + "rewards/accuracy_reward": 0.11383929196745157, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 1681 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.5982513427734, + "epoch": 0.5024270032111119, + "grad_norm": 0.8138920068740845, + "kl": 0.845703125, + "learning_rate": 1.1650818964603439e-05, + "loss": 0.0334, + "reward": 0.5491071790456772, + "reward_std": 0.07879290822893381, + "rewards/accuracy_reward": 0.055803572526201606, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 1682 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.4687805175781, + "epoch": 0.5027257112986334, + "grad_norm": 0.6057121753692627, + "kl": 0.57568359375, + "learning_rate": 1.1640530918534361e-05, + "loss": 0.0302, + "reward": 0.5970982313156128, + "reward_std": 0.13283684593625367, + "rewards/accuracy_reward": 0.1026785746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 1683 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.1116485595703, + "epoch": 0.5030244193861548, + "grad_norm": 1.7178425788879395, + "kl": 0.45849609375, + "learning_rate": 1.163024108772843e-05, + "loss": 0.024, + "reward": 0.6718750447034836, + "reward_std": 0.12491570319980383, + "rewards/accuracy_reward": 0.18080357951112092, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1684 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.6049499511719, + "epoch": 0.5033231274736764, + "grad_norm": 2.624199867248535, + "kl": 0.3798828125, + "learning_rate": 1.161994948337998e-05, + "loss": 0.0192, + "reward": 0.5429687649011612, + "reward_std": 0.09363541565835476, + "rewards/accuracy_reward": 0.05357143213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 1685 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.3995819091797, + "epoch": 0.5036218355611978, + "grad_norm": 1.449641466140747, + "kl": 0.4345703125, + "learning_rate": 1.1609656116685265e-05, + "loss": 0.0189, + "reward": 0.5613839626312256, + "reward_std": 0.08836214663460851, + "rewards/accuracy_reward": 0.06919643213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1686 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.7946929931641, + "epoch": 0.5039205436487193, + "grad_norm": 0.6753060817718506, + "kl": 0.3271484375, + "learning_rate": 1.1599360998842454e-05, + "loss": 0.0148, + "reward": 0.541294664144516, + "reward_std": 0.09506954159587622, + "rewards/accuracy_reward": 0.04687500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1687 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.6049346923828, + "epoch": 0.5042192517362407, + "grad_norm": 0.22243854403495789, + "kl": 0.40478515625, + "learning_rate": 1.1589064141051633e-05, + "loss": 0.0129, + "reward": 0.6132812798023224, + "reward_std": 0.10988576337695122, + "rewards/accuracy_reward": 0.1183035783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1688 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.3214721679688, + "epoch": 0.5045179598237622, + "grad_norm": 0.7962387800216675, + "kl": 0.5908203125, + "learning_rate": 1.1578765554514772e-05, + "loss": 0.0234, + "reward": 0.5664062798023224, + "reward_std": 0.08561086957342923, + "rewards/accuracy_reward": 0.07366071827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1689 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.5357666015625, + "epoch": 0.5048166679112837, + "grad_norm": 0.34496259689331055, + "kl": 0.583984375, + "learning_rate": 1.1568465250435725e-05, + "loss": 0.0252, + "reward": 0.5736607313156128, + "reward_std": 0.05662465398199856, + "rewards/accuracy_reward": 0.08258928847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1690 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.0022735595703, + "epoch": 0.5051153759988052, + "grad_norm": 0.7316872477531433, + "kl": 0.640625, + "learning_rate": 1.1558163240020209e-05, + "loss": 0.0295, + "reward": 0.589285746216774, + "reward_std": 0.09732652548700571, + "rewards/accuracy_reward": 0.1026785746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 1691 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.4486999511719, + "epoch": 0.5054140840863266, + "grad_norm": 0.4074510931968689, + "kl": 0.666015625, + "learning_rate": 1.1547859534475805e-05, + "loss": 0.0305, + "reward": 0.5837053880095482, + "reward_std": 0.08523060381412506, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553880095482, + "step": 1692 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.8236999511719, + "epoch": 0.5057127921738481, + "grad_norm": 0.4925979673862457, + "kl": 0.5615234375, + "learning_rate": 1.1537554145011932e-05, + "loss": 0.0248, + "reward": 0.5440848469734192, + "reward_std": 0.09182743029668927, + "rewards/accuracy_reward": 0.05133928847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1693 + }, + { + "clip_ratio": 0.0, + "completion_length": 988.7411193847656, + "epoch": 0.5060115002613695, + "grad_norm": 1.3921937942504883, + "kl": 0.921875, + "learning_rate": 1.152724708283985e-05, + "loss": 0.0389, + "reward": 0.6651785969734192, + "reward_std": 0.10250518284738064, + "rewards/accuracy_reward": 0.1763392947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 1694 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.5201416015625, + "epoch": 0.5063102083488911, + "grad_norm": 0.22705447673797607, + "kl": 0.53515625, + "learning_rate": 1.1516938359172624e-05, + "loss": 0.024, + "reward": 0.6060268133878708, + "reward_std": 0.11292771808803082, + "rewards/accuracy_reward": 0.11160715227015316, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1695 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.6473541259766, + "epoch": 0.5066089164364125, + "grad_norm": 0.2738815248012543, + "kl": 0.6650390625, + "learning_rate": 1.150662798522514e-05, + "loss": 0.0299, + "reward": 0.5937500447034836, + "reward_std": 0.11096903029829264, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 1696 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.4643402099609, + "epoch": 0.506907624523934, + "grad_norm": 0.42758992314338684, + "kl": 0.7060546875, + "learning_rate": 1.1496315972214076e-05, + "loss": 0.0289, + "reward": 0.5926339626312256, + "reward_std": 0.11386402696371078, + "rewards/accuracy_reward": 0.10267857392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1697 + }, + { + "clip_ratio": 0.0, + "completion_length": 993.060302734375, + "epoch": 0.5072063326114554, + "grad_norm": 0.795799732208252, + "kl": 0.794921875, + "learning_rate": 1.1486002331357887e-05, + "loss": 0.0331, + "reward": 0.595982164144516, + "reward_std": 0.09357129223644733, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 1698 + }, + { + "clip_ratio": 0.0, + "completion_length": 988.497802734375, + "epoch": 0.507505040698977, + "grad_norm": 0.6346880197525024, + "kl": 0.724609375, + "learning_rate": 1.1475687073876806e-05, + "loss": 0.0343, + "reward": 0.6015625298023224, + "reward_std": 0.11401769146323204, + "rewards/accuracy_reward": 0.11160714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 1699 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.2344207763672, + "epoch": 0.5078037487864984, + "grad_norm": 0.7635748386383057, + "kl": 0.9521484375, + "learning_rate": 1.146537021099282e-05, + "loss": 0.039, + "reward": 0.5435267984867096, + "reward_std": 0.12238951586186886, + "rewards/accuracy_reward": 0.0513392873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 1700 + }, + { + "clip_ratio": 0.0, + "completion_length": 993.5647735595703, + "epoch": 0.5081024568740199, + "grad_norm": 0.397711843252182, + "kl": 1.08740234375, + "learning_rate": 1.1455051753929668e-05, + "loss": 0.0483, + "reward": 0.6344866454601288, + "reward_std": 0.11987030319869518, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 1701 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.3013763427734, + "epoch": 0.5084011649615413, + "grad_norm": 1.0011160373687744, + "kl": 1.123046875, + "learning_rate": 1.1444731713912818e-05, + "loss": 0.0479, + "reward": 0.5502232387661934, + "reward_std": 0.08844944508746266, + "rewards/accuracy_reward": 0.06026785937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1702 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.1361999511719, + "epoch": 0.5086998730490628, + "grad_norm": 0.5103711485862732, + "kl": 0.931640625, + "learning_rate": 1.1434410102169462e-05, + "loss": 0.0358, + "reward": 0.6534598618745804, + "reward_std": 0.07396410219371319, + "rewards/accuracy_reward": 0.16071429220028222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1703 + }, + { + "clip_ratio": 0.0, + "completion_length": 967.1071929931641, + "epoch": 0.5089985811365842, + "grad_norm": 0.515558123588562, + "kl": 1.6220703125, + "learning_rate": 1.1424086929928502e-05, + "loss": 0.0699, + "reward": 0.5719866305589676, + "reward_std": 0.11252858163788915, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687649011612, + "step": 1704 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.1473693847656, + "epoch": 0.5092972892241058, + "grad_norm": 0.5198972225189209, + "kl": 0.9814453125, + "learning_rate": 1.1413762208420536e-05, + "loss": 0.0403, + "reward": 0.588169664144516, + "reward_std": 0.07344307564198971, + "rewards/accuracy_reward": 0.09598214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1705 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.8460235595703, + "epoch": 0.5095959973116272, + "grad_norm": 0.38083237409591675, + "kl": 0.74609375, + "learning_rate": 1.1403435948877855e-05, + "loss": 0.0362, + "reward": 0.5485491305589676, + "reward_std": 0.12412650743499398, + "rewards/accuracy_reward": 0.0558035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 1706 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.4174652099609, + "epoch": 0.5098947053991487, + "grad_norm": 0.3164600133895874, + "kl": 0.759765625, + "learning_rate": 1.139310816253441e-05, + "loss": 0.0306, + "reward": 0.5485491305589676, + "reward_std": 0.06589056784287095, + "rewards/accuracy_reward": 0.0558035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1707 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.8460235595703, + "epoch": 0.5101934134866701, + "grad_norm": 0.3265995383262634, + "kl": 0.53369140625, + "learning_rate": 1.1382778860625826e-05, + "loss": 0.0272, + "reward": 0.5876116454601288, + "reward_std": 0.1110578179359436, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1708 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.7076263427734, + "epoch": 0.5104921215741917, + "grad_norm": 0.6935426592826843, + "kl": 0.70751953125, + "learning_rate": 1.1372448054389364e-05, + "loss": 0.0272, + "reward": 0.6735491305589676, + "reward_std": 0.15019158646464348, + "rewards/accuracy_reward": 0.18303572572767735, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 1709 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.8995971679688, + "epoch": 0.5107908296617131, + "grad_norm": 0.37991589307785034, + "kl": 0.552734375, + "learning_rate": 1.1362115755063936e-05, + "loss": 0.0256, + "reward": 0.6651785969734192, + "reward_std": 0.09491350990720093, + "rewards/accuracy_reward": 0.1696428619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1710 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.9598693847656, + "epoch": 0.5110895377492346, + "grad_norm": 1.3429983854293823, + "kl": 0.9326171875, + "learning_rate": 1.1351781973890068e-05, + "loss": 0.0365, + "reward": 0.6372768133878708, + "reward_std": 0.15072827599942684, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875298023224, + "step": 1711 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.4888763427734, + "epoch": 0.511388245836756, + "grad_norm": 0.4006635248661041, + "kl": 0.52685546875, + "learning_rate": 1.1341446722109901e-05, + "loss": 0.0199, + "reward": 0.588169664144516, + "reward_std": 0.08439227612689137, + "rewards/accuracy_reward": 0.09375000605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 1712 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.1250305175781, + "epoch": 0.5116869539242775, + "grad_norm": 0.3039073646068573, + "kl": 0.513671875, + "learning_rate": 1.1331110010967177e-05, + "loss": 0.0232, + "reward": 0.5652902126312256, + "reward_std": 0.05696803121827543, + "rewards/accuracy_reward": 0.06919643003493547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1713 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.3750457763672, + "epoch": 0.511985662011799, + "grad_norm": 0.4899868369102478, + "kl": 0.359375, + "learning_rate": 1.1320771851707225e-05, + "loss": 0.0133, + "reward": 0.5820312649011612, + "reward_std": 0.07333930325694382, + "rewards/accuracy_reward": 0.08705357555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1714 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.9821929931641, + "epoch": 0.5122843700993205, + "grad_norm": 0.2491191029548645, + "kl": 0.248291015625, + "learning_rate": 1.1310432255576944e-05, + "loss": 0.007, + "reward": 0.6529018133878708, + "reward_std": 0.17304034531116486, + "rewards/accuracy_reward": 0.1562500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1715 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.1987152099609, + "epoch": 0.5125830781868419, + "grad_norm": 1.117187738418579, + "kl": 0.3017578125, + "learning_rate": 1.1300091233824806e-05, + "loss": 0.017, + "reward": 0.5273437798023224, + "reward_std": 0.059838708024472, + "rewards/accuracy_reward": 0.0334821455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1716 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.9844055175781, + "epoch": 0.5128817862743634, + "grad_norm": 1.2658030986785889, + "kl": 0.36962890625, + "learning_rate": 1.128974879770083e-05, + "loss": 0.0119, + "reward": 0.6300223469734192, + "reward_std": 0.0876702656969428, + "rewards/accuracy_reward": 0.14062500488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 1717 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.8995971679688, + "epoch": 0.5131804943618848, + "grad_norm": 0.4037001132965088, + "kl": 0.3583984375, + "learning_rate": 1.1279404958456572e-05, + "loss": 0.0116, + "reward": 0.5781250298023224, + "reward_std": 0.10597921907901764, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1718 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.2879943847656, + "epoch": 0.5134792024494064, + "grad_norm": 1.0373485088348389, + "kl": 0.7978515625, + "learning_rate": 1.1269059727345111e-05, + "loss": 0.0315, + "reward": 0.6607143133878708, + "reward_std": 0.10751155950129032, + "rewards/accuracy_reward": 0.1674107201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1719 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.4486999511719, + "epoch": 0.5137779105369278, + "grad_norm": 0.6874276399612427, + "kl": 0.755859375, + "learning_rate": 1.1258713115621051e-05, + "loss": 0.0295, + "reward": 0.5401786044239998, + "reward_std": 0.10998388705775142, + "rewards/accuracy_reward": 0.04687500186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1720 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.8281707763672, + "epoch": 0.5140766186244493, + "grad_norm": 0.6956325173377991, + "kl": 0.9384765625, + "learning_rate": 1.1248365134540489e-05, + "loss": 0.0393, + "reward": 0.5340401977300644, + "reward_std": 0.09529023547656834, + "rewards/accuracy_reward": 0.044642860535532236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 1721 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.3214721679688, + "epoch": 0.5143753267119707, + "grad_norm": 0.466769278049469, + "kl": 0.71728515625, + "learning_rate": 1.1238015795361011e-05, + "loss": 0.0255, + "reward": 0.544642873108387, + "reward_std": 0.09179880190640688, + "rewards/accuracy_reward": 0.053571431431919336, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1722 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.5714721679688, + "epoch": 0.5146740347994921, + "grad_norm": 0.9776527285575867, + "kl": 0.98388671875, + "learning_rate": 1.1227665109341686e-05, + "loss": 0.0315, + "reward": 0.5870535895228386, + "reward_std": 0.07572437357157469, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4866071566939354, + "step": 1723 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.5178833007812, + "epoch": 0.5149727428870137, + "grad_norm": 1.2232933044433594, + "kl": 1.72265625, + "learning_rate": 1.1217313087743048e-05, + "loss": 0.0592, + "reward": 0.5708705633878708, + "reward_std": 0.09161391854286194, + "rewards/accuracy_reward": 0.09151786309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.479352705180645, + "step": 1724 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.9442291259766, + "epoch": 0.5152714509745351, + "grad_norm": 2.88553524017334, + "kl": 1.92578125, + "learning_rate": 1.1206959741827079e-05, + "loss": 0.0816, + "reward": 0.5619419813156128, + "reward_std": 0.11059664003551006, + "rewards/accuracy_reward": 0.08035714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848469734192, + "step": 1725 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.7879791259766, + "epoch": 0.5155701590620566, + "grad_norm": 1.2154606580734253, + "kl": 1.33984375, + "learning_rate": 1.1196605082857204e-05, + "loss": 0.0562, + "reward": 0.5585937798023224, + "reward_std": 0.1207724753767252, + "rewards/accuracy_reward": 0.06919642887078226, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 1726 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.1094207763672, + "epoch": 0.515868867149578, + "grad_norm": 0.785162627696991, + "kl": 1.1533203125, + "learning_rate": 1.1186249122098282e-05, + "loss": 0.039, + "reward": 0.5195312723517418, + "reward_std": 0.05279730586335063, + "rewards/accuracy_reward": 0.0379464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848395228386, + "step": 1727 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.6942291259766, + "epoch": 0.5161675752370996, + "grad_norm": 0.695033073425293, + "kl": 0.7265625, + "learning_rate": 1.117589187081658e-05, + "loss": 0.0329, + "reward": 0.6679687798023224, + "reward_std": 0.12771194241940975, + "rewards/accuracy_reward": 0.1741071492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1728 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.6428985595703, + "epoch": 0.516466283324621, + "grad_norm": 0.4374649226665497, + "kl": 0.455078125, + "learning_rate": 1.1165533340279771e-05, + "loss": 0.019, + "reward": 0.5591517984867096, + "reward_std": 0.06978954188525677, + "rewards/accuracy_reward": 0.06473214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 1729 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.6540679931641, + "epoch": 0.5167649914121425, + "grad_norm": 0.8800238370895386, + "kl": 0.45751953125, + "learning_rate": 1.115517354175692e-05, + "loss": 0.0191, + "reward": 0.6992187798023224, + "reward_std": 0.06995145604014397, + "rewards/accuracy_reward": 0.2053571492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1730 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.7522735595703, + "epoch": 0.5170636994996639, + "grad_norm": 0.2019539475440979, + "kl": 0.40087890625, + "learning_rate": 1.1144812486518478e-05, + "loss": 0.0153, + "reward": 0.598772332072258, + "reward_std": 0.10098286019638181, + "rewards/accuracy_reward": 0.10491072107106447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1731 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.8214721679688, + "epoch": 0.5173624075871854, + "grad_norm": 0.30168241262435913, + "kl": 0.38818359375, + "learning_rate": 1.1134450185836254e-05, + "loss": 0.0144, + "reward": 0.593191996216774, + "reward_std": 0.12047781329602003, + "rewards/accuracy_reward": 0.09821429010480642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1732 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.9621124267578, + "epoch": 0.5176611156747069, + "grad_norm": 0.44964373111724854, + "kl": 0.32861328125, + "learning_rate": 1.1124086650983415e-05, + "loss": 0.0132, + "reward": 0.5926339626312256, + "reward_std": 0.0869055949151516, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1733 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.6495971679688, + "epoch": 0.5179598237622284, + "grad_norm": 0.5879350304603577, + "kl": 0.265625, + "learning_rate": 1.1113721893234472e-05, + "loss": 0.0146, + "reward": 0.6529018133878708, + "reward_std": 0.05401732702739537, + "rewards/accuracy_reward": 0.1562500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1734 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.2277374267578, + "epoch": 0.5182585318497498, + "grad_norm": 0.3128017485141754, + "kl": 0.390625, + "learning_rate": 1.1103355923865266e-05, + "loss": 0.0163, + "reward": 0.581473246216774, + "reward_std": 0.04587844503112137, + "rewards/accuracy_reward": 0.08705357578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1735 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.4085235595703, + "epoch": 0.5185572399372713, + "grad_norm": 0.3167850077152252, + "kl": 0.4912109375, + "learning_rate": 1.1092988754152956e-05, + "loss": 0.019, + "reward": 0.6367187798023224, + "reward_std": 0.17689238488674164, + "rewards/accuracy_reward": 0.14508929662406445, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1736 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.9844207763672, + "epoch": 0.5188559480247927, + "grad_norm": 0.7631819248199463, + "kl": 0.478515625, + "learning_rate": 1.1082620395376006e-05, + "loss": 0.019, + "reward": 0.5390625298023224, + "reward_std": 0.07937385747209191, + "rewards/accuracy_reward": 0.0468750037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1737 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.3884429931641, + "epoch": 0.5191546561123143, + "grad_norm": 0.5349926352500916, + "kl": 0.5810546875, + "learning_rate": 1.1072250858814173e-05, + "loss": 0.0231, + "reward": 0.5323660969734192, + "reward_std": 0.10938720218837261, + "rewards/accuracy_reward": 0.037946430733427405, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1738 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.4085388183594, + "epoch": 0.5194533641998357, + "grad_norm": 0.9242334365844727, + "kl": 1.201171875, + "learning_rate": 1.1061880155748497e-05, + "loss": 0.0541, + "reward": 0.5786830633878708, + "reward_std": 0.0691300043836236, + "rewards/accuracy_reward": 0.09151786309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651902794838, + "step": 1739 + }, + { + "clip_ratio": 0.0, + "completion_length": 993.6897888183594, + "epoch": 0.5197520722873572, + "grad_norm": 1.065272331237793, + "kl": 0.81591796875, + "learning_rate": 1.1051508297461286e-05, + "loss": 0.0228, + "reward": 0.5998884215950966, + "reward_std": 0.09270904306322336, + "rewards/accuracy_reward": 0.10937500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 1740 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.6518402099609, + "epoch": 0.5200507803748786, + "grad_norm": 0.7584066390991211, + "kl": 0.5791015625, + "learning_rate": 1.104113529523611e-05, + "loss": 0.0254, + "reward": 0.6054687798023224, + "reward_std": 0.07794129988178611, + "rewards/accuracy_reward": 0.11383929033763707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 1741 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.8638763427734, + "epoch": 0.5203494884624001, + "grad_norm": 1.5826950073242188, + "kl": 0.6474609375, + "learning_rate": 1.1030761160357773e-05, + "loss": 0.0316, + "reward": 0.5703125298023224, + "reward_std": 0.13345038518309593, + "rewards/accuracy_reward": 0.08482143329456449, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 1742 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.6830749511719, + "epoch": 0.5206481965499216, + "grad_norm": 0.4319549798965454, + "kl": 0.552001953125, + "learning_rate": 1.1020385904112318e-05, + "loss": 0.022, + "reward": 0.5267857313156128, + "reward_std": 0.07233788073062897, + "rewards/accuracy_reward": 0.0379464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 1743 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.5067291259766, + "epoch": 0.5209469046374431, + "grad_norm": 0.4642598032951355, + "kl": 0.83935546875, + "learning_rate": 1.101000953778701e-05, + "loss": 0.033, + "reward": 0.5552455633878708, + "reward_std": 0.09997626626864076, + "rewards/accuracy_reward": 0.06919643213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 1744 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.185302734375, + "epoch": 0.5212456127249645, + "grad_norm": 0.508354902267456, + "kl": 0.9990234375, + "learning_rate": 1.0999632072670314e-05, + "loss": 0.0388, + "reward": 0.5541294813156128, + "reward_std": 0.11456367000937462, + "rewards/accuracy_reward": 0.06919643003493547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 1745 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.8683471679688, + "epoch": 0.521544320812486, + "grad_norm": 0.5406972765922546, + "kl": 1.6171875, + "learning_rate": 1.0989253520051898e-05, + "loss": 0.0677, + "reward": 0.6361607387661934, + "reward_std": 0.11081078741699457, + "rewards/accuracy_reward": 0.16294643771834671, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4732143059372902, + "step": 1746 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.9420013427734, + "epoch": 0.5218430289000074, + "grad_norm": 1.4337260723114014, + "kl": 1.662109375, + "learning_rate": 1.097887389122261e-05, + "loss": 0.0709, + "reward": 0.5502232313156128, + "reward_std": 0.085924182087183, + "rewards/accuracy_reward": 0.0736607201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4765625223517418, + "step": 1747 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.7165679931641, + "epoch": 0.522141736987529, + "grad_norm": 2.3228495121002197, + "kl": 2.55078125, + "learning_rate": 1.0968493197474469e-05, + "loss": 0.1027, + "reward": 0.4648437723517418, + "reward_std": 0.08961001597344875, + "rewards/accuracy_reward": 0.0022321429569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4626116305589676, + "step": 1748 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.8817443847656, + "epoch": 0.5224404450750504, + "grad_norm": 1.5953247547149658, + "kl": 1.978515625, + "learning_rate": 1.095811145010065e-05, + "loss": 0.094, + "reward": 0.5530133992433548, + "reward_std": 0.12435757927596569, + "rewards/accuracy_reward": 0.082589291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.470424123108387, + "step": 1749 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.5067291259766, + "epoch": 0.5227391531625719, + "grad_norm": 1.0949406623840332, + "kl": 2.46875, + "learning_rate": 1.094772866039548e-05, + "loss": 0.0982, + "reward": 0.4916294813156128, + "reward_std": 0.13675987347960472, + "rewards/accuracy_reward": 0.03794643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4536830484867096, + "step": 1750 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.560302734375, + "epoch": 0.5230378612500933, + "grad_norm": 2.7015552520751953, + "kl": 1.607421875, + "learning_rate": 1.0937344839654416e-05, + "loss": 0.0642, + "reward": 0.4843750149011612, + "reward_std": 0.0996945770457387, + "rewards/accuracy_reward": 0.015625000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4687500223517418, + "step": 1751 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.5111999511719, + "epoch": 0.5233365693376149, + "grad_norm": 1.4783365726470947, + "kl": 1.3720703125, + "learning_rate": 1.0926959999174032e-05, + "loss": 0.0475, + "reward": 0.601004496216774, + "reward_std": 0.14928723126649857, + "rewards/accuracy_reward": 0.12276786426082253, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4782366305589676, + "step": 1752 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.5848693847656, + "epoch": 0.5236352774251363, + "grad_norm": 0.7711188197135925, + "kl": 0.95703125, + "learning_rate": 1.0916574150252024e-05, + "loss": 0.0486, + "reward": 0.612723246216774, + "reward_std": 0.15982291661202908, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589477300644, + "step": 1753 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.0491333007812, + "epoch": 0.5239339855126578, + "grad_norm": 0.8932147026062012, + "kl": 0.77734375, + "learning_rate": 1.0906187304187175e-05, + "loss": 0.0367, + "reward": 0.570312537252903, + "reward_std": 0.10029816906899214, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 1754 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.7857513427734, + "epoch": 0.5242326936001792, + "grad_norm": 0.8007723689079285, + "kl": 0.466796875, + "learning_rate": 1.0895799472279351e-05, + "loss": 0.0211, + "reward": 0.5904018133878708, + "reward_std": 0.06987339747138321, + "rewards/accuracy_reward": 0.09598214970901608, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 1755 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.075927734375, + "epoch": 0.5245314016877007, + "grad_norm": 1.2058749198913574, + "kl": 0.372802734375, + "learning_rate": 1.0885410665829503e-05, + "loss": 0.0152, + "reward": 0.5937500298023224, + "reward_std": 0.08664587885141373, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1756 + }, + { + "clip_ratio": 0.0, + "completion_length": 993.3616485595703, + "epoch": 0.5248301097752222, + "grad_norm": 0.3912625014781952, + "kl": 0.32421875, + "learning_rate": 1.087502089613963e-05, + "loss": 0.0137, + "reward": 0.6601562798023224, + "reward_std": 0.12451834976673126, + "rewards/accuracy_reward": 0.16294643841683865, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1757 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.2991485595703, + "epoch": 0.5251288178627437, + "grad_norm": 0.7383750081062317, + "kl": 0.3876953125, + "learning_rate": 1.0864630174512783e-05, + "loss": 0.0167, + "reward": 0.6093750149011612, + "reward_std": 0.08145249914377928, + "rewards/accuracy_reward": 0.11160714412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1758 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.2545013427734, + "epoch": 0.5254275259502651, + "grad_norm": 0.29108574986457825, + "kl": 0.26123046875, + "learning_rate": 1.0854238512253045e-05, + "loss": 0.0126, + "reward": 0.6238839477300644, + "reward_std": 0.13876312598586082, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1759 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.4464874267578, + "epoch": 0.5257262340377866, + "grad_norm": 0.30329543352127075, + "kl": 0.30859375, + "learning_rate": 1.0843845920665534e-05, + "loss": 0.009, + "reward": 0.6679687798023224, + "reward_std": 0.13026218116283417, + "rewards/accuracy_reward": 0.1718750111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1760 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.6741485595703, + "epoch": 0.526024942125308, + "grad_norm": 0.22957250475883484, + "kl": 0.22607421875, + "learning_rate": 1.0833452411056366e-05, + "loss": 0.0089, + "reward": 0.5742187649011612, + "reward_std": 0.08882723236456513, + "rewards/accuracy_reward": 0.07812500605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1761 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.1451416015625, + "epoch": 0.5263236502128296, + "grad_norm": 0.30869260430336, + "kl": 0.310302734375, + "learning_rate": 1.0823057994732661e-05, + "loss": 0.0105, + "reward": 0.6556919813156128, + "reward_std": 0.08397351112216711, + "rewards/accuracy_reward": 0.16071429662406445, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 1762 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.0446929931641, + "epoch": 0.526622358300351, + "grad_norm": 0.18453162908554077, + "kl": 0.233642578125, + "learning_rate": 1.0812662683002528e-05, + "loss": 0.01, + "reward": 0.6021205484867096, + "reward_std": 0.08917112648487091, + "rewards/accuracy_reward": 0.10714286309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 1763 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.6228179931641, + "epoch": 0.5269210663878725, + "grad_norm": 0.19168178737163544, + "kl": 0.228515625, + "learning_rate": 1.0802266487175044e-05, + "loss": 0.009, + "reward": 0.5574776977300644, + "reward_std": 0.07554671471007168, + "rewards/accuracy_reward": 0.060267860535532236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1764 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.8147888183594, + "epoch": 0.5272197744753939, + "grad_norm": 0.4351561665534973, + "kl": 0.262451171875, + "learning_rate": 1.0791869418560254e-05, + "loss": 0.0105, + "reward": 0.565848246216774, + "reward_std": 0.08799769449979067, + "rewards/accuracy_reward": 0.07366071571595967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1765 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.185302734375, + "epoch": 0.5275184825629153, + "grad_norm": 0.34768882393836975, + "kl": 0.238525390625, + "learning_rate": 1.0781471488469146e-05, + "loss": 0.0114, + "reward": 0.577008955180645, + "reward_std": 0.10247433371841908, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 1766 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.341552734375, + "epoch": 0.5278171906504369, + "grad_norm": 0.5033169984817505, + "kl": 0.286376953125, + "learning_rate": 1.0771072708213652e-05, + "loss": 0.0096, + "reward": 0.5870535895228386, + "reward_std": 0.08019038289785385, + "rewards/accuracy_reward": 0.0959821492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1767 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.3281707763672, + "epoch": 0.5281158987379583, + "grad_norm": 0.17641501128673553, + "kl": 0.244384765625, + "learning_rate": 1.0760673089106626e-05, + "loss": 0.0098, + "reward": 0.5334821492433548, + "reward_std": 0.07069925009272993, + "rewards/accuracy_reward": 0.03794643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1768 + }, + { + "clip_ratio": 0.0, + "completion_length": 1020.7433471679688, + "epoch": 0.5284146068254798, + "grad_norm": 0.3218882977962494, + "kl": 0.218017578125, + "learning_rate": 1.075027264246183e-05, + "loss": 0.0082, + "reward": 0.6004464775323868, + "reward_std": 0.09436605125665665, + "rewards/accuracy_reward": 0.10491071944124997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1769 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.8103179931641, + "epoch": 0.5287133149130012, + "grad_norm": 0.13595154881477356, + "kl": 0.24755859375, + "learning_rate": 1.0739871379593935e-05, + "loss": 0.0126, + "reward": 0.6054687798023224, + "reward_std": 0.10194601211696863, + "rewards/accuracy_reward": 0.10714286495931447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1770 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.7098541259766, + "epoch": 0.5290120230005227, + "grad_norm": 0.3443881869316101, + "kl": 0.24462890625, + "learning_rate": 1.0729469311818496e-05, + "loss": 0.0105, + "reward": 0.6506696790456772, + "reward_std": 0.07046688464470208, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1771 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.1495971679688, + "epoch": 0.5293107310880442, + "grad_norm": 0.5593899488449097, + "kl": 0.351806640625, + "learning_rate": 1.0719066450451943e-05, + "loss": 0.0104, + "reward": 0.6835937947034836, + "reward_std": 0.10986971482634544, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1772 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.0670166015625, + "epoch": 0.5296094391755657, + "grad_norm": 0.21820393204689026, + "kl": 0.40673828125, + "learning_rate": 1.0708662806811563e-05, + "loss": 0.0167, + "reward": 0.6138393133878708, + "reward_std": 0.0715361856855452, + "rewards/accuracy_reward": 0.11830357555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1773 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.0759429931641, + "epoch": 0.5299081472630871, + "grad_norm": 0.25806277990341187, + "kl": 0.28369140625, + "learning_rate": 1.0698258392215508e-05, + "loss": 0.0069, + "reward": 0.5513392984867096, + "reward_std": 0.05495638074353337, + "rewards/accuracy_reward": 0.05357142956927419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1774 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.4130096435547, + "epoch": 0.5302068553506086, + "grad_norm": 0.22938120365142822, + "kl": 0.205322265625, + "learning_rate": 1.068785321798276e-05, + "loss": 0.0081, + "reward": 0.5937500298023224, + "reward_std": 0.07123850006610155, + "rewards/accuracy_reward": 0.09375000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1775 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.0803985595703, + "epoch": 0.53050556343813, + "grad_norm": 0.15095870196819305, + "kl": 0.21337890625, + "learning_rate": 1.0677447295433122e-05, + "loss": 0.0094, + "reward": 0.5976562649011612, + "reward_std": 0.08095496636815369, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 1776 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.7455749511719, + "epoch": 0.5308042715256516, + "grad_norm": 0.1697116196155548, + "kl": 0.24267578125, + "learning_rate": 1.0667040635887231e-05, + "loss": 0.0103, + "reward": 0.6489955633878708, + "reward_std": 0.12680703774094582, + "rewards/accuracy_reward": 0.1517857238650322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1777 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.4129943847656, + "epoch": 0.531102979613173, + "grad_norm": 0.29716724157333374, + "kl": 0.330322265625, + "learning_rate": 1.0656633250666501e-05, + "loss": 0.0129, + "reward": 0.616629496216774, + "reward_std": 0.16107550263404846, + "rewards/accuracy_reward": 0.12276786752045155, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1778 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.8504943847656, + "epoch": 0.5314016877006945, + "grad_norm": 0.5011783838272095, + "kl": 0.356689453125, + "learning_rate": 1.0646225151093154e-05, + "loss": 0.0112, + "reward": 0.6344866156578064, + "reward_std": 0.0825378280133009, + "rewards/accuracy_reward": 0.14062500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1779 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.1071929931641, + "epoch": 0.5317003957882159, + "grad_norm": 0.17961527407169342, + "kl": 0.357421875, + "learning_rate": 1.0635816348490176e-05, + "loss": 0.0128, + "reward": 0.5145089626312256, + "reward_std": 0.06132920947857201, + "rewards/accuracy_reward": 0.0200892873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1780 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.1406707763672, + "epoch": 0.5319991038757375, + "grad_norm": 0.22196440398693085, + "kl": 0.37646484375, + "learning_rate": 1.062540685418133e-05, + "loss": 0.0179, + "reward": 0.604910746216774, + "reward_std": 0.11240472830832005, + "rewards/accuracy_reward": 0.10937500186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1781 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.5580902099609, + "epoch": 0.5322978119632589, + "grad_norm": 0.3064962327480316, + "kl": 0.3271484375, + "learning_rate": 1.0614996679491123e-05, + "loss": 0.0148, + "reward": 0.6300223469734192, + "reward_std": 0.074855612590909, + "rewards/accuracy_reward": 0.13392857741564512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1782 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.2031707763672, + "epoch": 0.5325965200507804, + "grad_norm": 0.17942877113819122, + "kl": 0.27685546875, + "learning_rate": 1.0604585835744802e-05, + "loss": 0.0089, + "reward": 0.6210937798023224, + "reward_std": 0.12644444149918854, + "rewards/accuracy_reward": 0.1250000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1783 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.6429138183594, + "epoch": 0.5328952281383018, + "grad_norm": 0.7565726041793823, + "kl": 0.47021484375, + "learning_rate": 1.0594174334268352e-05, + "loss": 0.0178, + "reward": 0.5859375149011612, + "reward_std": 0.10440582688897848, + "rewards/accuracy_reward": 0.09598214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1784 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.3348541259766, + "epoch": 0.5331939362258233, + "grad_norm": 0.1763266623020172, + "kl": 0.308349609375, + "learning_rate": 1.058376218638846e-05, + "loss": 0.0119, + "reward": 0.6925223618745804, + "reward_std": 0.09341863729059696, + "rewards/accuracy_reward": 0.196428582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1785 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.7545013427734, + "epoch": 0.5334926443133448, + "grad_norm": 0.27993905544281006, + "kl": 0.46826171875, + "learning_rate": 1.0573349403432524e-05, + "loss": 0.0174, + "reward": 0.5920759290456772, + "reward_std": 0.09109870437532663, + "rewards/accuracy_reward": 0.10044643329456449, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1786 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.3192291259766, + "epoch": 0.5337913524008663, + "grad_norm": 0.19920144975185394, + "kl": 0.3642578125, + "learning_rate": 1.0562935996728629e-05, + "loss": 0.0095, + "reward": 0.6372768133878708, + "reward_std": 0.07225801050662994, + "rewards/accuracy_reward": 0.14285714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1787 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.3080749511719, + "epoch": 0.5340900604883877, + "grad_norm": 0.7259738445281982, + "kl": 0.4541015625, + "learning_rate": 1.0552521977605546e-05, + "loss": 0.0187, + "reward": 0.627232164144516, + "reward_std": 0.08890492375940084, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1788 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.7366638183594, + "epoch": 0.5343887685759092, + "grad_norm": 0.6533480882644653, + "kl": 0.51513671875, + "learning_rate": 1.0542107357392704e-05, + "loss": 0.0214, + "reward": 0.628348246216774, + "reward_std": 0.0959056206047535, + "rewards/accuracy_reward": 0.13392857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1789 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.8348541259766, + "epoch": 0.5346874766634306, + "grad_norm": 0.5748757123947144, + "kl": 0.316650390625, + "learning_rate": 1.0531692147420187e-05, + "loss": 0.0123, + "reward": 0.6160714477300644, + "reward_std": 0.07241878844797611, + "rewards/accuracy_reward": 0.1183035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1790 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.6250457763672, + "epoch": 0.5349861847509522, + "grad_norm": 0.36072203516960144, + "kl": 0.329345703125, + "learning_rate": 1.0521276359018728e-05, + "loss": 0.0149, + "reward": 0.5435267984867096, + "reward_std": 0.05096272588707507, + "rewards/accuracy_reward": 0.04687500209547579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1791 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.7879791259766, + "epoch": 0.5352848928384736, + "grad_norm": 0.24038079380989075, + "kl": 0.32666015625, + "learning_rate": 1.0510860003519681e-05, + "loss": 0.0097, + "reward": 0.5424107313156128, + "reward_std": 0.10791720636188984, + "rewards/accuracy_reward": 0.046875002793967724, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1792 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.9687957763672, + "epoch": 0.5355836009259951, + "grad_norm": 0.3686796724796295, + "kl": 0.3427734375, + "learning_rate": 1.0500443092255017e-05, + "loss": 0.0103, + "reward": 0.6875000298023224, + "reward_std": 0.14182815700769424, + "rewards/accuracy_reward": 0.19419644260779023, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1793 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.7991638183594, + "epoch": 0.5358823090135165, + "grad_norm": 0.14972293376922607, + "kl": 0.24267578125, + "learning_rate": 1.049002563655732e-05, + "loss": 0.0084, + "reward": 0.5502232313156128, + "reward_std": 0.07238334510475397, + "rewards/accuracy_reward": 0.053571430034935474, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1794 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.2701416015625, + "epoch": 0.536181017101038, + "grad_norm": 1.2292002439498901, + "kl": 0.280517578125, + "learning_rate": 1.0479607647759755e-05, + "loss": 0.0087, + "reward": 0.710379496216774, + "reward_std": 0.13756313547492027, + "rewards/accuracy_reward": 0.21875000861473382, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491629496216774, + "step": 1795 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.1339569091797, + "epoch": 0.5364797251885595, + "grad_norm": 0.5090327262878418, + "kl": 0.29443359375, + "learning_rate": 1.0469189137196081e-05, + "loss": 0.0094, + "reward": 0.6054687649011612, + "reward_std": 0.13374659605324268, + "rewards/accuracy_reward": 0.11160714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 1796 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.0893249511719, + "epoch": 0.536778433276081, + "grad_norm": 0.4044405519962311, + "kl": 0.265625, + "learning_rate": 1.0458770116200605e-05, + "loss": 0.0108, + "reward": 0.6227678805589676, + "reward_std": 0.04869657987728715, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1797 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.7611999511719, + "epoch": 0.5370771413636024, + "grad_norm": 0.23033900558948517, + "kl": 0.36474609375, + "learning_rate": 1.044835059610821e-05, + "loss": 0.0145, + "reward": 0.5680803805589676, + "reward_std": 0.08423417364247143, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1798 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.9844207763672, + "epoch": 0.5373758494511239, + "grad_norm": 0.5838013887405396, + "kl": 0.41650390625, + "learning_rate": 1.043793058825431e-05, + "loss": 0.0126, + "reward": 0.573660746216774, + "reward_std": 0.10861829482018948, + "rewards/accuracy_reward": 0.08035714644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 1799 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.5491638183594, + "epoch": 0.5376745575386453, + "grad_norm": 0.5061129331588745, + "kl": 0.407958984375, + "learning_rate": 1.0427510103974853e-05, + "loss": 0.0144, + "reward": 0.5418527126312256, + "reward_std": 0.09268520586192608, + "rewards/accuracy_reward": 0.04464285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1800 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.6518402099609, + "epoch": 0.5379732656261669, + "grad_norm": 0.8912555575370789, + "kl": 0.4111328125, + "learning_rate": 1.0417089154606299e-05, + "loss": 0.018, + "reward": 0.589285746216774, + "reward_std": 0.057009545154869556, + "rewards/accuracy_reward": 0.09151786053553224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1801 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.1674499511719, + "epoch": 0.5382719737136883, + "grad_norm": 0.3210875391960144, + "kl": 0.4404296875, + "learning_rate": 1.0406667751485628e-05, + "loss": 0.0149, + "reward": 0.631138414144516, + "reward_std": 0.09655703091993928, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1802 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.9420166015625, + "epoch": 0.5385706818012098, + "grad_norm": 0.2454737424850464, + "kl": 0.37353515625, + "learning_rate": 1.03962459059503e-05, + "loss": 0.017, + "reward": 0.5853794813156128, + "reward_std": 0.07422399893403053, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1803 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.2455902099609, + "epoch": 0.5388693898887312, + "grad_norm": 0.7255860567092896, + "kl": 0.4580078125, + "learning_rate": 1.0385823629338262e-05, + "loss": 0.0207, + "reward": 0.5379464626312256, + "reward_std": 0.08060770714655519, + "rewards/accuracy_reward": 0.042410715483129025, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1804 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.2545166015625, + "epoch": 0.5391680979762528, + "grad_norm": 0.20405496656894684, + "kl": 0.3759765625, + "learning_rate": 1.0375400932987932e-05, + "loss": 0.0087, + "reward": 0.5212053656578064, + "reward_std": 0.07488959655165672, + "rewards/accuracy_reward": 0.024553572293370962, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1805 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.5223693847656, + "epoch": 0.5394668060637742, + "grad_norm": 0.20951195061206818, + "kl": 0.353515625, + "learning_rate": 1.0364977828238176e-05, + "loss": 0.0128, + "reward": 0.5747767984867096, + "reward_std": 0.09775221440941095, + "rewards/accuracy_reward": 0.08035714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1806 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.2366485595703, + "epoch": 0.5397655141512957, + "grad_norm": 0.2046414613723755, + "kl": 0.296875, + "learning_rate": 1.0354554326428319e-05, + "loss": 0.0085, + "reward": 0.5898437798023224, + "reward_std": 0.08755934843793511, + "rewards/accuracy_reward": 0.0937500074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1807 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.9040679931641, + "epoch": 0.5400642222388171, + "grad_norm": 3.240557909011841, + "kl": 0.42724609375, + "learning_rate": 1.0344130438898101e-05, + "loss": 0.0168, + "reward": 0.553571455180645, + "reward_std": 0.10575323738157749, + "rewards/accuracy_reward": 0.07142857392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4821428805589676, + "step": 1808 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.0268402099609, + "epoch": 0.5403629303263385, + "grad_norm": 0.29672637581825256, + "kl": 0.30029296875, + "learning_rate": 1.0333706176987697e-05, + "loss": 0.0121, + "reward": 0.5641741305589676, + "reward_std": 0.09099002834409475, + "rewards/accuracy_reward": 0.06919643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1809 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.7165679931641, + "epoch": 0.5406616384138601, + "grad_norm": 0.25462785363197327, + "kl": 0.2470703125, + "learning_rate": 1.0323281552037678e-05, + "loss": 0.0096, + "reward": 0.5312500149011612, + "reward_std": 0.08210907061584294, + "rewards/accuracy_reward": 0.03571428777649999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1810 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.8192291259766, + "epoch": 0.5409603465013815, + "grad_norm": 0.832489013671875, + "kl": 0.33447265625, + "learning_rate": 1.0312856575389016e-05, + "loss": 0.0131, + "reward": 0.6060268133878708, + "reward_std": 0.08489973843097687, + "rewards/accuracy_reward": 0.1138392889406532, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875298023224, + "step": 1811 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.2701416015625, + "epoch": 0.541259054588903, + "grad_norm": 0.31855231523513794, + "kl": 0.37646484375, + "learning_rate": 1.0302431258383062e-05, + "loss": 0.0167, + "reward": 0.568638414144516, + "reward_std": 0.12541467137634754, + "rewards/accuracy_reward": 0.07589286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1812 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.1138916015625, + "epoch": 0.5415577626764244, + "grad_norm": 0.33124521374702454, + "kl": 0.41259765625, + "learning_rate": 1.0292005612361542e-05, + "loss": 0.0147, + "reward": 0.5887276977300644, + "reward_std": 0.11575499176979065, + "rewards/accuracy_reward": 0.09598214528523386, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 1813 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.9553833007812, + "epoch": 0.5418564707639459, + "grad_norm": 0.2632022202014923, + "kl": 0.5322265625, + "learning_rate": 1.0281579648666533e-05, + "loss": 0.0218, + "reward": 0.5641741305589676, + "reward_std": 0.08191783027723432, + "rewards/accuracy_reward": 0.0691964328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1814 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.4286041259766, + "epoch": 0.5421551788514674, + "grad_norm": 0.516929030418396, + "kl": 0.51416015625, + "learning_rate": 1.0271153378640464e-05, + "loss": 0.0216, + "reward": 0.5747768133878708, + "reward_std": 0.11607218720018864, + "rewards/accuracy_reward": 0.08258929033763707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1815 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.4955749511719, + "epoch": 0.5424538869389889, + "grad_norm": 0.410439670085907, + "kl": 0.420654296875, + "learning_rate": 1.02607268136261e-05, + "loss": 0.0183, + "reward": 0.564732164144516, + "reward_std": 0.12687495164573193, + "rewards/accuracy_reward": 0.06696428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1816 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.0335235595703, + "epoch": 0.5427525950265103, + "grad_norm": 1.0190026760101318, + "kl": 0.724609375, + "learning_rate": 1.025029996496651e-05, + "loss": 0.0253, + "reward": 0.5703125074505806, + "reward_std": 0.059969872469082475, + "rewards/accuracy_reward": 0.07812500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 1817 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.9844207763672, + "epoch": 0.5430513031140318, + "grad_norm": 1.5679301023483276, + "kl": 0.595703125, + "learning_rate": 1.0239872844005094e-05, + "loss": 0.0207, + "reward": 0.6411830633878708, + "reward_std": 0.0882290955632925, + "rewards/accuracy_reward": 0.1450892931316048, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937574505806, + "step": 1818 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.5870971679688, + "epoch": 0.5433500112015532, + "grad_norm": 0.2510552704334259, + "kl": 0.50341796875, + "learning_rate": 1.0229445462085531e-05, + "loss": 0.0208, + "reward": 0.631138414144516, + "reward_std": 0.11074019409716129, + "rewards/accuracy_reward": 0.13616072200238705, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 1819 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.8259429931641, + "epoch": 0.5436487192890748, + "grad_norm": 1.0042781829833984, + "kl": 0.4287109375, + "learning_rate": 1.0219017830551797e-05, + "loss": 0.0083, + "reward": 0.576450914144516, + "reward_std": 0.1015287758782506, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1820 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.5536193847656, + "epoch": 0.5439474273765962, + "grad_norm": 0.4421047270298004, + "kl": 0.37939453125, + "learning_rate": 1.0208589960748127e-05, + "loss": 0.0143, + "reward": 0.6651785969734192, + "reward_std": 0.06606150418519974, + "rewards/accuracy_reward": 0.17187500977888703, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1821 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.1540679931641, + "epoch": 0.5442461354641177, + "grad_norm": 1.1630094051361084, + "kl": 0.3701171875, + "learning_rate": 1.0198161864019024e-05, + "loss": 0.0112, + "reward": 0.5708705633878708, + "reward_std": 0.07831584569066763, + "rewards/accuracy_reward": 0.07812500465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1822 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.1138763427734, + "epoch": 0.5445448435516391, + "grad_norm": 0.4763760566711426, + "kl": 0.39599609375, + "learning_rate": 1.0187733551709236e-05, + "loss": 0.0159, + "reward": 0.5675223469734192, + "reward_std": 0.10204876214265823, + "rewards/accuracy_reward": 0.07366071664728224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1823 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.1317596435547, + "epoch": 0.5448435516391607, + "grad_norm": 1.3590929508209229, + "kl": 0.53857421875, + "learning_rate": 1.0177305035163745e-05, + "loss": 0.0201, + "reward": 0.5904018133878708, + "reward_std": 0.11869375593960285, + "rewards/accuracy_reward": 0.10491072107106447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 1824 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.4308471679688, + "epoch": 0.5451422597266821, + "grad_norm": 0.5718656778335571, + "kl": 0.75390625, + "learning_rate": 1.016687632572775e-05, + "loss": 0.0337, + "reward": 0.6601562649011612, + "reward_std": 0.10719725117087364, + "rewards/accuracy_reward": 0.16964286309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 1825 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.325927734375, + "epoch": 0.5454409678142036, + "grad_norm": 0.9593601226806641, + "kl": 0.970703125, + "learning_rate": 1.0156447434746669e-05, + "loss": 0.0399, + "reward": 0.5284598469734192, + "reward_std": 0.10768094193190336, + "rewards/accuracy_reward": 0.040178573690354824, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 1826 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.3281860351562, + "epoch": 0.545739675901725, + "grad_norm": 1.895792007446289, + "kl": 1.107421875, + "learning_rate": 1.0146018373566114e-05, + "loss": 0.0405, + "reward": 0.5904018059372902, + "reward_std": 0.060986649710685015, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1827 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.8013916015625, + "epoch": 0.5460383839892465, + "grad_norm": 1.434391975402832, + "kl": 1.23046875, + "learning_rate": 1.0135589153531879e-05, + "loss": 0.0517, + "reward": 0.6093750298023224, + "reward_std": 0.07033247360959649, + "rewards/accuracy_reward": 0.12053571874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393133878708, + "step": 1828 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.9866485595703, + "epoch": 0.546337092076768, + "grad_norm": 0.2785997688770294, + "kl": 0.67578125, + "learning_rate": 1.0125159785989933e-05, + "loss": 0.0288, + "reward": 0.5440848469734192, + "reward_std": 0.07956065470352769, + "rewards/accuracy_reward": 0.05133928847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1829 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.6719207763672, + "epoch": 0.5466358001642895, + "grad_norm": 0.8221375942230225, + "kl": 0.71484375, + "learning_rate": 1.0114730282286408e-05, + "loss": 0.0232, + "reward": 0.6473214626312256, + "reward_std": 0.10123198479413986, + "rewards/accuracy_reward": 0.15401786845177412, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1830 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.0580902099609, + "epoch": 0.5469345082518109, + "grad_norm": 0.43279385566711426, + "kl": 0.45166015625, + "learning_rate": 1.0104300653767582e-05, + "loss": 0.0154, + "reward": 0.6406250298023224, + "reward_std": 0.1501653464511037, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1831 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.435302734375, + "epoch": 0.5472332163393324, + "grad_norm": 0.8806430697441101, + "kl": 0.4755859375, + "learning_rate": 1.0093870911779866e-05, + "loss": 0.0196, + "reward": 0.5898437798023224, + "reward_std": 0.12458022683858871, + "rewards/accuracy_reward": 0.09821429033763707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1832 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.9196624755859, + "epoch": 0.5475319244268538, + "grad_norm": 1.353391170501709, + "kl": 0.50341796875, + "learning_rate": 1.0083441067669797e-05, + "loss": 0.0179, + "reward": 0.5747768208384514, + "reward_std": 0.11651598941534758, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1833 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.919677734375, + "epoch": 0.5478306325143754, + "grad_norm": 1.1743707656860352, + "kl": 0.47802734375, + "learning_rate": 1.0073011132784026e-05, + "loss": 0.0202, + "reward": 0.5876116305589676, + "reward_std": 0.1362366247922182, + "rewards/accuracy_reward": 0.09598214644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 1834 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.5692291259766, + "epoch": 0.5481293406018968, + "grad_norm": 0.33490222692489624, + "kl": 0.50439453125, + "learning_rate": 1.00625811184693e-05, + "loss": 0.0186, + "reward": 0.550223246216774, + "reward_std": 0.11695364117622375, + "rewards/accuracy_reward": 0.05580357275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1835 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.6942443847656, + "epoch": 0.5484280486894183, + "grad_norm": 0.9677874445915222, + "kl": 0.75537109375, + "learning_rate": 1.0052151036072446e-05, + "loss": 0.0336, + "reward": 0.599888414144516, + "reward_std": 0.07448113313876092, + "rewards/accuracy_reward": 0.10491071734577417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1836 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.7455749511719, + "epoch": 0.5487267567769397, + "grad_norm": 0.7104159593582153, + "kl": 0.78271484375, + "learning_rate": 1.004172089694038e-05, + "loss": 0.0334, + "reward": 0.6021205633878708, + "reward_std": 0.08439008938148618, + "rewards/accuracy_reward": 0.10937500488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1837 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.7009429931641, + "epoch": 0.5490254648644612, + "grad_norm": 0.49201449751853943, + "kl": 0.833984375, + "learning_rate": 1.0031290712420065e-05, + "loss": 0.0312, + "reward": 0.5312500223517418, + "reward_std": 0.08405719976872206, + "rewards/accuracy_reward": 0.04241071501746774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 1838 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.0134429931641, + "epoch": 0.5493241729519827, + "grad_norm": 0.5836870074272156, + "kl": 0.640625, + "learning_rate": 1.0020860493858524e-05, + "loss": 0.0186, + "reward": 0.615513414144516, + "reward_std": 0.13359053060412407, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 1839 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.7098693847656, + "epoch": 0.5496228810395042, + "grad_norm": 0.4869648814201355, + "kl": 0.72265625, + "learning_rate": 1.0010430252602808e-05, + "loss": 0.0277, + "reward": 0.6155134290456772, + "reward_std": 0.12474491260945797, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 1840 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.1585235595703, + "epoch": 0.5499215891270256, + "grad_norm": 0.7610355019569397, + "kl": 1.40625, + "learning_rate": 1e-05, + "loss": 0.06, + "reward": 0.5837053805589676, + "reward_std": 0.11017232201993465, + "rewards/accuracy_reward": 0.10044643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589477300644, + "step": 1841 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.0513916015625, + "epoch": 0.5502202972145471, + "grad_norm": 0.3906748294830322, + "kl": 1.275390625, + "learning_rate": 9.989569747397194e-06, + "loss": 0.0564, + "reward": 0.5647321790456772, + "reward_std": 0.11044486053287983, + "rewards/accuracy_reward": 0.08035714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750298023224, + "step": 1842 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.9576416015625, + "epoch": 0.5505190053020685, + "grad_norm": 0.7600957155227661, + "kl": 0.85546875, + "learning_rate": 9.979139506141477e-06, + "loss": 0.0342, + "reward": 0.6032366380095482, + "reward_std": 0.08747678273357451, + "rewards/accuracy_reward": 0.11607143399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 1843 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.1674499511719, + "epoch": 0.5508177133895901, + "grad_norm": 0.4685240387916565, + "kl": 1.59765625, + "learning_rate": 9.968709287579937e-06, + "loss": 0.0711, + "reward": 0.576450914144516, + "reward_std": 0.13848367147147655, + "rewards/accuracy_reward": 0.10044643236324191, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4760044813156128, + "step": 1844 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.2879943847656, + "epoch": 0.5511164214771115, + "grad_norm": 0.722981333732605, + "kl": 1.162109375, + "learning_rate": 9.958279103059624e-06, + "loss": 0.0477, + "reward": 0.564732164144516, + "reward_std": 0.10740260221064091, + "rewards/accuracy_reward": 0.07812500488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 1845 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.7210235595703, + "epoch": 0.551415129564633, + "grad_norm": 0.7947911024093628, + "kl": 1.744140625, + "learning_rate": 9.947848963927556e-06, + "loss": 0.0747, + "reward": 0.6188616454601288, + "reward_std": 0.12089626677334309, + "rewards/accuracy_reward": 0.14062500861473382, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4782366380095482, + "step": 1846 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.8594055175781, + "epoch": 0.5517138376521544, + "grad_norm": 0.5569537281990051, + "kl": 1.3359375, + "learning_rate": 9.937418881530704e-06, + "loss": 0.0488, + "reward": 0.6238839626312256, + "reward_std": 0.09949576482176781, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.483258955180645, + "step": 1847 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.0134429931641, + "epoch": 0.552012545739676, + "grad_norm": 0.6917606592178345, + "kl": 0.77734375, + "learning_rate": 9.926988867215976e-06, + "loss": 0.0291, + "reward": 0.589285746216774, + "reward_std": 0.12055604066699743, + "rewards/accuracy_reward": 0.10044642887078226, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 1848 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.0670013427734, + "epoch": 0.5523112538271974, + "grad_norm": 1.9167808294296265, + "kl": 1.1103515625, + "learning_rate": 9.916558932330206e-06, + "loss": 0.0434, + "reward": 0.602120578289032, + "reward_std": 0.14169002324342728, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.479352705180645, + "step": 1849 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.4062957763672, + "epoch": 0.5526099619147189, + "grad_norm": 1.0606739521026611, + "kl": 1.4072265625, + "learning_rate": 9.906129088220137e-06, + "loss": 0.0564, + "reward": 0.5479910969734192, + "reward_std": 0.13065369240939617, + "rewards/accuracy_reward": 0.07589286123402417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4720982387661934, + "step": 1850 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.919677734375, + "epoch": 0.5529086700022403, + "grad_norm": 1.425682544708252, + "kl": 1.4521484375, + "learning_rate": 9.895699346232422e-06, + "loss": 0.0528, + "reward": 0.5541294813156128, + "reward_std": 0.0916549600660801, + "rewards/accuracy_reward": 0.07812500116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4760044813156128, + "step": 1851 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.3036193847656, + "epoch": 0.5532073780897617, + "grad_norm": 2.852597713470459, + "kl": 2.6640625, + "learning_rate": 9.885269717713595e-06, + "loss": 0.1056, + "reward": 0.576450914144516, + "reward_std": 0.1255715023726225, + "rewards/accuracy_reward": 0.1026785783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4737723469734192, + "step": 1852 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.9732666015625, + "epoch": 0.5535060861772833, + "grad_norm": 2.5439975261688232, + "kl": 2.400390625, + "learning_rate": 9.874840214010069e-06, + "loss": 0.096, + "reward": 0.5909598618745804, + "reward_std": 0.14665881730616093, + "rewards/accuracy_reward": 0.1160714328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.474888414144516, + "step": 1853 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.247802734375, + "epoch": 0.5538047942648047, + "grad_norm": 3.220322847366333, + "kl": 2.65625, + "learning_rate": 9.864410846468123e-06, + "loss": 0.1113, + "reward": 0.6378348618745804, + "reward_std": 0.15192414447665215, + "rewards/accuracy_reward": 0.16294643562287092, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4748884066939354, + "step": 1854 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.9330749511719, + "epoch": 0.5541035023523262, + "grad_norm": 2.1653213500976562, + "kl": 2.267578125, + "learning_rate": 9.85398162643389e-06, + "loss": 0.0881, + "reward": 0.5920759290456772, + "reward_std": 0.13746205065399408, + "rewards/accuracy_reward": 0.11830357578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4737723395228386, + "step": 1855 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.1451263427734, + "epoch": 0.5544022104398476, + "grad_norm": 1.3833640813827515, + "kl": 1.2861328125, + "learning_rate": 9.843552565253333e-06, + "loss": 0.0419, + "reward": 0.4882812723517418, + "reward_std": 0.06358728371560574, + "rewards/accuracy_reward": 0.004464285913854837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169813156128, + "step": 1856 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.1786193847656, + "epoch": 0.5547009185273691, + "grad_norm": 0.6670596599578857, + "kl": 1.126953125, + "learning_rate": 9.833123674272252e-06, + "loss": 0.0443, + "reward": 0.5552455484867096, + "reward_std": 0.14060906879603863, + "rewards/accuracy_reward": 0.06919643329456449, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 1857 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.7812957763672, + "epoch": 0.5549996266148906, + "grad_norm": 0.7983633875846863, + "kl": 1.033203125, + "learning_rate": 9.822694964836259e-06, + "loss": 0.0346, + "reward": 0.619419664144516, + "reward_std": 0.09814239386469126, + "rewards/accuracy_reward": 0.13392857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 1858 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.6942291259766, + "epoch": 0.5552983347024121, + "grad_norm": 0.4847555160522461, + "kl": 1.1337890625, + "learning_rate": 9.812266448290767e-06, + "loss": 0.0464, + "reward": 0.554129496216774, + "reward_std": 0.08892242424190044, + "rewards/accuracy_reward": 0.06696428707800806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487165205180645, + "step": 1859 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.6562957763672, + "epoch": 0.5555970427899335, + "grad_norm": 0.6816853284835815, + "kl": 1.09765625, + "learning_rate": 9.80183813598098e-06, + "loss": 0.0477, + "reward": 0.5552455633878708, + "reward_std": 0.09902964532375336, + "rewards/accuracy_reward": 0.06919643143191934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 1860 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.7076263427734, + "epoch": 0.555895750877455, + "grad_norm": 0.6268030405044556, + "kl": 0.6162109375, + "learning_rate": 9.791410039251874e-06, + "loss": 0.0228, + "reward": 0.6283482313156128, + "reward_std": 0.1267897360958159, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1861 + }, + { + "clip_ratio": 0.0, + "completion_length": 993.3281555175781, + "epoch": 0.5561944589649764, + "grad_norm": 0.6589494943618774, + "kl": 1.095703125, + "learning_rate": 9.780982169448205e-06, + "loss": 0.0472, + "reward": 0.6395089626312256, + "reward_std": 0.09993134113028646, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875298023224, + "step": 1862 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.5201263427734, + "epoch": 0.556493167052498, + "grad_norm": 0.8035640716552734, + "kl": 0.5751953125, + "learning_rate": 9.77055453791447e-06, + "loss": 0.0254, + "reward": 0.5407366305589676, + "reward_std": 0.0950203649699688, + "rewards/accuracy_reward": 0.046875001629814506, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1863 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.0893402099609, + "epoch": 0.5567918751400194, + "grad_norm": 0.6647756695747375, + "kl": 0.598388671875, + "learning_rate": 9.760127155994907e-06, + "loss": 0.0145, + "reward": 0.5931919887661934, + "reward_std": 0.08197006210684776, + "rewards/accuracy_reward": 0.09821429057046771, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 1864 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.1295013427734, + "epoch": 0.5570905832275409, + "grad_norm": 0.8954685926437378, + "kl": 0.7265625, + "learning_rate": 9.749700035033492e-06, + "loss": 0.028, + "reward": 0.6021205633878708, + "reward_std": 0.03330313181504607, + "rewards/accuracy_reward": 0.10937500488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1865 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.341552734375, + "epoch": 0.5573892913150623, + "grad_norm": 1.2304906845092773, + "kl": 0.6806640625, + "learning_rate": 9.739273186373906e-06, + "loss": 0.0298, + "reward": 0.6238839626312256, + "reward_std": 0.06634592171758413, + "rewards/accuracy_reward": 0.12723214644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1866 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.7054138183594, + "epoch": 0.5576879994025838, + "grad_norm": 0.30317020416259766, + "kl": 0.611328125, + "learning_rate": 9.728846621359538e-06, + "loss": 0.0241, + "reward": 0.6601562798023224, + "reward_std": 0.11228051083162427, + "rewards/accuracy_reward": 0.16517857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1867 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.3103179931641, + "epoch": 0.5579867074901053, + "grad_norm": 0.4156428575515747, + "kl": 0.785400390625, + "learning_rate": 9.718420351333469e-06, + "loss": 0.0237, + "reward": 0.5485491305589676, + "reward_std": 0.0726815378293395, + "rewards/accuracy_reward": 0.05580357322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1868 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.0580749511719, + "epoch": 0.5582854155776268, + "grad_norm": 0.38157835602760315, + "kl": 0.517333984375, + "learning_rate": 9.707994387638461e-06, + "loss": 0.0199, + "reward": 0.590401828289032, + "reward_std": 0.0862534511834383, + "rewards/accuracy_reward": 0.09598214831203222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 1869 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.1607666015625, + "epoch": 0.5585841236651482, + "grad_norm": 0.23616370558738708, + "kl": 0.261474609375, + "learning_rate": 9.697568741616942e-06, + "loss": 0.0117, + "reward": 0.5770089477300644, + "reward_std": 0.03125000209547579, + "rewards/accuracy_reward": 0.07812500349245965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1870 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.0826263427734, + "epoch": 0.5588828317526697, + "grad_norm": 2.020028829574585, + "kl": 0.54052734375, + "learning_rate": 9.687143424610986e-06, + "loss": 0.0241, + "reward": 0.6065848469734192, + "reward_std": 0.14900080859661102, + "rewards/accuracy_reward": 0.11160714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1871 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.1920013427734, + "epoch": 0.5591815398401911, + "grad_norm": 1.1324833631515503, + "kl": 0.305419921875, + "learning_rate": 9.676718447962325e-06, + "loss": 0.0136, + "reward": 0.5647321492433548, + "reward_std": 0.08030681288801134, + "rewards/accuracy_reward": 0.06919643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1872 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.825927734375, + "epoch": 0.5594802479277127, + "grad_norm": 1.0039772987365723, + "kl": 0.4169921875, + "learning_rate": 9.666293823012306e-06, + "loss": 0.0155, + "reward": 0.6484375149011612, + "reward_std": 0.12530706822872162, + "rewards/accuracy_reward": 0.15401786798611283, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1873 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.0558471679688, + "epoch": 0.5597789560152341, + "grad_norm": 0.6570011973381042, + "kl": 0.439697265625, + "learning_rate": 9.6558695611019e-06, + "loss": 0.0195, + "reward": 0.6629464626312256, + "reward_std": 0.14039033837616444, + "rewards/accuracy_reward": 0.16964286845177412, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1874 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.3928985595703, + "epoch": 0.5600776641027556, + "grad_norm": 0.7661553025245667, + "kl": 0.60546875, + "learning_rate": 9.645445673571685e-06, + "loss": 0.0277, + "reward": 0.577008955180645, + "reward_std": 0.09756680717691779, + "rewards/accuracy_reward": 0.0825892873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.5870971679688, + "epoch": 0.560376372190277, + "grad_norm": 1.3629052639007568, + "kl": 0.6669921875, + "learning_rate": 9.635022171761826e-06, + "loss": 0.0267, + "reward": 0.6004464477300644, + "reward_std": 0.06608309270814061, + "rewards/accuracy_reward": 0.10267857275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1876 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.4844360351562, + "epoch": 0.5606750802777986, + "grad_norm": 1.2126963138580322, + "kl": 0.74560546875, + "learning_rate": 9.624599067012073e-06, + "loss": 0.0311, + "reward": 0.6462053805589676, + "reward_std": 0.08352856733836234, + "rewards/accuracy_reward": 0.149553582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1877 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.4107666015625, + "epoch": 0.56097378836532, + "grad_norm": 0.4593494236469269, + "kl": 0.451416015625, + "learning_rate": 9.61417637066174e-06, + "loss": 0.0162, + "reward": 0.5691964477300644, + "reward_std": 0.04445278365164995, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1878 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.450927734375, + "epoch": 0.5612724964528415, + "grad_norm": 1.205290675163269, + "kl": 0.47705078125, + "learning_rate": 9.603754094049702e-06, + "loss": 0.0223, + "reward": 0.5758928805589676, + "reward_std": 0.0850803591310978, + "rewards/accuracy_reward": 0.07812500605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1879 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.5424652099609, + "epoch": 0.5615712045403629, + "grad_norm": 0.3801566958427429, + "kl": 0.337646484375, + "learning_rate": 9.593332248514374e-06, + "loss": 0.0173, + "reward": 0.6250000298023224, + "reward_std": 0.12660833192057908, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1880 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.747802734375, + "epoch": 0.5618699126278844, + "grad_norm": 0.27400511503219604, + "kl": 0.236328125, + "learning_rate": 9.582910845393703e-06, + "loss": 0.0079, + "reward": 0.6049107313156128, + "reward_std": 0.07658668933436275, + "rewards/accuracy_reward": 0.10714286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1881 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.3549499511719, + "epoch": 0.5621686207154059, + "grad_norm": 0.29914337396621704, + "kl": 0.190185546875, + "learning_rate": 9.57248989602515e-06, + "loss": 0.0014, + "reward": 0.6255580633878708, + "reward_std": 0.12307333946228027, + "rewards/accuracy_reward": 0.1272321455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1882 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.529052734375, + "epoch": 0.5624673288029274, + "grad_norm": 0.4274207651615143, + "kl": 0.25830078125, + "learning_rate": 9.562069411745692e-06, + "loss": 0.0052, + "reward": 0.612165205180645, + "reward_std": 0.08936994336545467, + "rewards/accuracy_reward": 0.1205357164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1883 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.0089721679688, + "epoch": 0.5627660368904488, + "grad_norm": 0.6492719650268555, + "kl": 0.2041015625, + "learning_rate": 9.551649403891792e-06, + "loss": 0.0086, + "reward": 0.6216518133878708, + "reward_std": 0.05828662519343197, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1884 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.0089721679688, + "epoch": 0.5630647449779703, + "grad_norm": 1.2211929559707642, + "kl": 0.244140625, + "learning_rate": 9.541229883799397e-06, + "loss": 0.0048, + "reward": 0.5468750447034836, + "reward_std": 0.11977116577327251, + "rewards/accuracy_reward": 0.05357143119908869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1885 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.6585235595703, + "epoch": 0.5633634530654917, + "grad_norm": 0.7344136238098145, + "kl": 0.21630859375, + "learning_rate": 9.530810862803922e-06, + "loss": 0.0091, + "reward": 0.6294643133878708, + "reward_std": 0.08472267724573612, + "rewards/accuracy_reward": 0.13392858020961285, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1886 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.7545166015625, + "epoch": 0.5636621611530133, + "grad_norm": 0.5084440112113953, + "kl": 0.2744140625, + "learning_rate": 9.520392352240246e-06, + "loss": 0.0029, + "reward": 0.5864955633878708, + "reward_std": 0.1136244498193264, + "rewards/accuracy_reward": 0.09151786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1887 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.1473693847656, + "epoch": 0.5639608692405347, + "grad_norm": 0.2785111367702484, + "kl": 0.27783203125, + "learning_rate": 9.509974363442684e-06, + "loss": -0.0007, + "reward": 0.5507812649011612, + "reward_std": 0.11774492636322975, + "rewards/accuracy_reward": 0.05580357392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1888 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.3237152099609, + "epoch": 0.5642595773280562, + "grad_norm": 0.3539488911628723, + "kl": 0.34375, + "learning_rate": 9.499556907744985e-06, + "loss": 0.015, + "reward": 0.624441996216774, + "reward_std": 0.04731598449870944, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1889 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.5848541259766, + "epoch": 0.5645582854155776, + "grad_norm": 0.5450630784034729, + "kl": 0.404296875, + "learning_rate": 9.489139996480324e-06, + "loss": 0.0164, + "reward": 0.7092634290456772, + "reward_std": 0.11450026789680123, + "rewards/accuracy_reward": 0.2165178656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 1890 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.1138763427734, + "epoch": 0.5648569935030991, + "grad_norm": 0.5103784799575806, + "kl": 0.38916015625, + "learning_rate": 9.478723640981276e-06, + "loss": 0.0158, + "reward": 0.572544664144516, + "reward_std": 0.11201830208301544, + "rewards/accuracy_reward": 0.08035714831203222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1891 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.0469360351562, + "epoch": 0.5651557015906206, + "grad_norm": 0.5682791471481323, + "kl": 0.5380859375, + "learning_rate": 9.468307852579815e-06, + "loss": 0.0209, + "reward": 0.6702009290456772, + "reward_std": 0.06706511927768588, + "rewards/accuracy_reward": 0.176339291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 1892 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.1763763427734, + "epoch": 0.5654544096781421, + "grad_norm": 0.8140679001808167, + "kl": 0.5966796875, + "learning_rate": 9.4578926426073e-06, + "loss": 0.0215, + "reward": 0.6601562947034836, + "reward_std": 0.07435656059533358, + "rewards/accuracy_reward": 0.1651785783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1893 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.6607666015625, + "epoch": 0.5657531177656635, + "grad_norm": 1.0170034170150757, + "kl": 0.5947265625, + "learning_rate": 9.447478022394457e-06, + "loss": 0.0273, + "reward": 0.6422991454601288, + "reward_std": 0.09641105588525534, + "rewards/accuracy_reward": 0.1495535783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 1894 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.4553985595703, + "epoch": 0.5660518258531849, + "grad_norm": 0.41490545868873596, + "kl": 0.69921875, + "learning_rate": 9.437064003271373e-06, + "loss": 0.0233, + "reward": 0.5786830633878708, + "reward_std": 0.15407798998057842, + "rewards/accuracy_reward": 0.08705357694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1895 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.5804138183594, + "epoch": 0.5663505339407064, + "grad_norm": 2.523482084274292, + "kl": 0.75390625, + "learning_rate": 9.426650596567479e-06, + "loss": 0.0225, + "reward": 0.647321455180645, + "reward_std": 0.07513363193720579, + "rewards/accuracy_reward": 0.15848214644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 1896 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.3928985595703, + "epoch": 0.5666492420282279, + "grad_norm": 0.3494671583175659, + "kl": 0.44482421875, + "learning_rate": 9.416237813611542e-06, + "loss": 0.0133, + "reward": 0.5066964402794838, + "reward_std": 0.06824837997555733, + "rewards/accuracy_reward": 0.015625000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714402794838, + "step": 1897 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.4620819091797, + "epoch": 0.5669479501157494, + "grad_norm": 0.7911489605903625, + "kl": 0.68017578125, + "learning_rate": 9.405825665731651e-06, + "loss": 0.0238, + "reward": 0.6015625298023224, + "reward_std": 0.12495629116892815, + "rewards/accuracy_reward": 0.11607143515720963, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910969734192, + "step": 1898 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.0402221679688, + "epoch": 0.5672466582032708, + "grad_norm": 0.4374513030052185, + "kl": 0.689453125, + "learning_rate": 9.3954141642552e-06, + "loss": 0.0125, + "reward": 0.5167411044239998, + "reward_std": 0.1198814120143652, + "rewards/accuracy_reward": 0.03125000139698386, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 1899 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.1406860351562, + "epoch": 0.5675453662907923, + "grad_norm": 0.34470272064208984, + "kl": 0.279296875, + "learning_rate": 9.38500332050888e-06, + "loss": 0.0094, + "reward": 0.541294664144516, + "reward_std": 0.11160377226769924, + "rewards/accuracy_reward": 0.04687500209547579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 1900 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.7210235595703, + "epoch": 0.5678440743783137, + "grad_norm": 1.2210451364517212, + "kl": 0.47998046875, + "learning_rate": 9.374593145818673e-06, + "loss": 0.0131, + "reward": 0.581473246216774, + "reward_std": 0.12006509397178888, + "rewards/accuracy_reward": 0.09598214644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 1901 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.263427734375, + "epoch": 0.5681427824658353, + "grad_norm": 0.76024329662323, + "kl": 0.48828125, + "learning_rate": 9.364183651509826e-06, + "loss": 0.024, + "reward": 0.5837053954601288, + "reward_std": 0.14724087342619896, + "rewards/accuracy_reward": 0.09375000465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1902 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.4330749511719, + "epoch": 0.5684414905533567, + "grad_norm": 0.22192487120628357, + "kl": 0.6435546875, + "learning_rate": 9.353774848906849e-06, + "loss": 0.0198, + "reward": 0.5613839626312256, + "reward_std": 0.0830288715660572, + "rewards/accuracy_reward": 0.07142857438884676, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 1903 + }, + { + "clip_ratio": 0.0, + "completion_length": 994.3839721679688, + "epoch": 0.5687401986408782, + "grad_norm": 0.3532494902610779, + "kl": 0.9482421875, + "learning_rate": 9.343366749333502e-06, + "loss": 0.0315, + "reward": 0.6121651977300644, + "reward_std": 0.10079008061438799, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487165205180645, + "step": 1904 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.5089721679688, + "epoch": 0.5690389067283996, + "grad_norm": 1.2553430795669556, + "kl": 0.70654296875, + "learning_rate": 9.332959364112772e-06, + "loss": 0.03, + "reward": 0.6104910969734192, + "reward_std": 0.13419714756309986, + "rewards/accuracy_reward": 0.1138392947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1905 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.2411193847656, + "epoch": 0.5693376148159212, + "grad_norm": 0.41217443346977234, + "kl": 0.62646484375, + "learning_rate": 9.32255270456688e-06, + "loss": 0.026, + "reward": 0.6026785895228386, + "reward_std": 0.0980789428576827, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1906 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.7545166015625, + "epoch": 0.5696363229034426, + "grad_norm": 0.6736293435096741, + "kl": 1.00390625, + "learning_rate": 9.312146782017244e-06, + "loss": 0.0364, + "reward": 0.6026786118745804, + "reward_std": 0.1251160241663456, + "rewards/accuracy_reward": 0.11160714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 1907 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.3683471679688, + "epoch": 0.5699350309909641, + "grad_norm": 0.4671131372451782, + "kl": 0.6689453125, + "learning_rate": 9.301741607784495e-06, + "loss": 0.0217, + "reward": 0.6294643133878708, + "reward_std": 0.1257907748222351, + "rewards/accuracy_reward": 0.13616072200238705, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1908 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.9799346923828, + "epoch": 0.5702337390784855, + "grad_norm": 0.9143176078796387, + "kl": 0.841796875, + "learning_rate": 9.29133719318844e-06, + "loss": 0.0392, + "reward": 0.5697544813156128, + "reward_std": 0.15909136831760406, + "rewards/accuracy_reward": 0.07812500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 1909 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.6629943847656, + "epoch": 0.570532447166007, + "grad_norm": 0.5856113433837891, + "kl": 1.111328125, + "learning_rate": 9.28093354954806e-06, + "loss": 0.0448, + "reward": 0.6143973469734192, + "reward_std": 0.15039185620844364, + "rewards/accuracy_reward": 0.1272321455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 1910 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.732177734375, + "epoch": 0.5708311552535285, + "grad_norm": 0.5327513813972473, + "kl": 1.0498046875, + "learning_rate": 9.270530688181506e-06, + "loss": 0.043, + "reward": 0.608816996216774, + "reward_std": 0.0921252304688096, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848469734192, + "step": 1911 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.5357666015625, + "epoch": 0.57112986334105, + "grad_norm": 2.6334939002990723, + "kl": 1.40625, + "learning_rate": 9.260128620406066e-06, + "loss": 0.0509, + "reward": 0.5725446715950966, + "reward_std": 0.13659517467021942, + "rewards/accuracy_reward": 0.0937500074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.478794664144516, + "step": 1912 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.3192443847656, + "epoch": 0.5714285714285714, + "grad_norm": 1.0320028066635132, + "kl": 0.7763671875, + "learning_rate": 9.249727357538171e-06, + "loss": 0.0355, + "reward": 0.5781250298023224, + "reward_std": 0.16044574417173862, + "rewards/accuracy_reward": 0.08928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 1913 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.3973541259766, + "epoch": 0.5717272795160929, + "grad_norm": 0.8051930069923401, + "kl": 0.37353515625, + "learning_rate": 9.239326910893378e-06, + "loss": 0.0138, + "reward": 0.6690848469734192, + "reward_std": 0.12933863885700703, + "rewards/accuracy_reward": 0.1741071529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1914 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.3013763427734, + "epoch": 0.5720259876036143, + "grad_norm": 0.5296238660812378, + "kl": 0.48876953125, + "learning_rate": 9.22892729178635e-06, + "loss": 0.0208, + "reward": 0.6473214477300644, + "reward_std": 0.04204250103794038, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1915 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.2902221679688, + "epoch": 0.5723246956911359, + "grad_norm": 0.5437720417976379, + "kl": 0.69140625, + "learning_rate": 9.218528511530857e-06, + "loss": 0.0282, + "reward": 0.6930803954601288, + "reward_std": 0.12944872863590717, + "rewards/accuracy_reward": 0.2053571492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 1916 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.888427734375, + "epoch": 0.5726234037786573, + "grad_norm": 0.7492548823356628, + "kl": 0.81884765625, + "learning_rate": 9.208130581439749e-06, + "loss": 0.0364, + "reward": 0.6261160969734192, + "reward_std": 0.09536515548825264, + "rewards/accuracy_reward": 0.1361607238650322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1917 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.6518249511719, + "epoch": 0.5729221118661788, + "grad_norm": 0.35469573736190796, + "kl": 0.44580078125, + "learning_rate": 9.197733512824958e-06, + "loss": 0.016, + "reward": 0.5457589402794838, + "reward_std": 0.09453185647726059, + "rewards/accuracy_reward": 0.053571431431919336, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 1918 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.1361999511719, + "epoch": 0.5732208199537002, + "grad_norm": 0.5449403524398804, + "kl": 0.67578125, + "learning_rate": 9.187337316997475e-06, + "loss": 0.029, + "reward": 0.5814732313156128, + "reward_std": 0.09788876585662365, + "rewards/accuracy_reward": 0.08928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 1919 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.8683471679688, + "epoch": 0.5735195280412217, + "grad_norm": 0.7410491108894348, + "kl": 0.669921875, + "learning_rate": 9.176942005267342e-06, + "loss": 0.0284, + "reward": 0.616629496216774, + "reward_std": 0.1302061825990677, + "rewards/accuracy_reward": 0.12946428847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 1920 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.6272735595703, + "epoch": 0.5738182361287432, + "grad_norm": 0.7999907732009888, + "kl": 0.83203125, + "learning_rate": 9.166547588943636e-06, + "loss": 0.0353, + "reward": 0.550223246216774, + "reward_std": 0.1123586893081665, + "rewards/accuracy_reward": 0.06696428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589477300644, + "step": 1921 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.4375305175781, + "epoch": 0.5741169442162647, + "grad_norm": 0.43413859605789185, + "kl": 0.69140625, + "learning_rate": 9.15615407933447e-06, + "loss": 0.0247, + "reward": 0.5926339477300644, + "reward_std": 0.06485252734273672, + "rewards/accuracy_reward": 0.10267857578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 1922 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.3951416015625, + "epoch": 0.5744156523037861, + "grad_norm": 0.6436145305633545, + "kl": 0.9326171875, + "learning_rate": 9.145761487746958e-06, + "loss": 0.0392, + "reward": 0.6936384290456772, + "reward_std": 0.11205736733973026, + "rewards/accuracy_reward": 0.20758929569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486049123108387, + "step": 1923 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.3839721679688, + "epoch": 0.5747143603913076, + "grad_norm": 1.0321674346923828, + "kl": 1.1484375, + "learning_rate": 9.135369825487222e-06, + "loss": 0.0454, + "reward": 0.6177455633878708, + "reward_std": 0.10758802015334368, + "rewards/accuracy_reward": 0.13169643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 1924 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.8080902099609, + "epoch": 0.575013068478829, + "grad_norm": 1.48521888256073, + "kl": 1.5224609375, + "learning_rate": 9.124979103860374e-06, + "loss": 0.066, + "reward": 0.58761166036129, + "reward_std": 0.12647302821278572, + "rewards/accuracy_reward": 0.10937500651925802, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4782366305589676, + "step": 1925 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.9486999511719, + "epoch": 0.5753117765663506, + "grad_norm": 1.3288354873657227, + "kl": 1.1826171875, + "learning_rate": 9.1145893341705e-06, + "loss": 0.0555, + "reward": 0.5719866454601288, + "reward_std": 0.18557320162653923, + "rewards/accuracy_reward": 0.08928571920841932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4827009066939354, + "step": 1926 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.5357513427734, + "epoch": 0.575610484653872, + "grad_norm": 1.1308348178863525, + "kl": 1.568359375, + "learning_rate": 9.104200527720652e-06, + "loss": 0.0694, + "reward": 0.545200914144516, + "reward_std": 0.16482634656131268, + "rewards/accuracy_reward": 0.06696428963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.478236623108387, + "step": 1927 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.8393249511719, + "epoch": 0.5759091927413935, + "grad_norm": 1.3013314008712769, + "kl": 0.90380859375, + "learning_rate": 9.093812695812828e-06, + "loss": 0.0419, + "reward": 0.509486623108387, + "reward_std": 0.09554746188223362, + "rewards/accuracy_reward": 0.02455357275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 1928 + }, + { + "clip_ratio": 0.0, + "completion_length": 994.7991638183594, + "epoch": 0.5762079008289149, + "grad_norm": 0.5156699419021606, + "kl": 0.798828125, + "learning_rate": 9.08342584974798e-06, + "loss": 0.0391, + "reward": 0.5563616305589676, + "reward_std": 0.09217548370361328, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 1929 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.0915679931641, + "epoch": 0.5765066089164365, + "grad_norm": 0.9787622094154358, + "kl": 1.1474609375, + "learning_rate": 9.07304000082597e-06, + "loss": 0.0454, + "reward": 0.5106026977300644, + "reward_std": 0.12265736423432827, + "rewards/accuracy_reward": 0.03125000209547579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.479352705180645, + "step": 1930 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.3103179931641, + "epoch": 0.5768053170039579, + "grad_norm": 1.2262998819351196, + "kl": 1.1689453125, + "learning_rate": 9.062655160345587e-06, + "loss": 0.0458, + "reward": 0.5234375149011612, + "reward_std": 0.12438693456351757, + "rewards/accuracy_reward": 0.042410717345774174, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810268059372902, + "step": 1931 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.8326416015625, + "epoch": 0.5771040250914794, + "grad_norm": 0.5863243341445923, + "kl": 1.576171875, + "learning_rate": 9.052271339604523e-06, + "loss": 0.0572, + "reward": 0.638392873108387, + "reward_std": 0.10692266281694174, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4799107387661934, + "step": 1932 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.1451416015625, + "epoch": 0.5774027331790008, + "grad_norm": 1.702649712562561, + "kl": 2.03515625, + "learning_rate": 9.041888549899352e-06, + "loss": 0.096, + "reward": 0.606026828289032, + "reward_std": 0.1306488774716854, + "rewards/accuracy_reward": 0.12500000419095159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810268133878708, + "step": 1933 + }, + { + "clip_ratio": 0.0, + "completion_length": 967.7299499511719, + "epoch": 0.5777014412665223, + "grad_norm": 1.5583155155181885, + "kl": 2.140625, + "learning_rate": 9.031506802525535e-06, + "loss": 0.0943, + "reward": 0.5323660895228386, + "reward_std": 0.14520784001797438, + "rewards/accuracy_reward": 0.05580357392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4765625223517418, + "step": 1934 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.5446624755859, + "epoch": 0.5780001493540438, + "grad_norm": 0.741608738899231, + "kl": 2.00390625, + "learning_rate": 9.021126108777391e-06, + "loss": 0.0874, + "reward": 0.5876116380095482, + "reward_std": 0.12865851260721684, + "rewards/accuracy_reward": 0.10937500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4782366380095482, + "step": 1935 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.4509429931641, + "epoch": 0.5782988574415653, + "grad_norm": 0.9162938594818115, + "kl": 1.4609375, + "learning_rate": 9.010746479948105e-06, + "loss": 0.05, + "reward": 0.5987723544239998, + "reward_std": 0.1195142176002264, + "rewards/accuracy_reward": 0.11830357951112092, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687723517418, + "step": 1936 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.3661193847656, + "epoch": 0.5785975655290867, + "grad_norm": 0.704346239566803, + "kl": 2.080078125, + "learning_rate": 9.000367927329691e-06, + "loss": 0.0696, + "reward": 0.4804687723517418, + "reward_std": 0.10765038803219795, + "rewards/accuracy_reward": 0.011160715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4693080633878708, + "step": 1937 + }, + { + "clip_ratio": 0.0, + "completion_length": 935.5201416015625, + "epoch": 0.5788962736166081, + "grad_norm": 0.7267599105834961, + "kl": 2.2138671875, + "learning_rate": 8.989990462212994e-06, + "loss": 0.0909, + "reward": 0.5686384066939354, + "reward_std": 0.11942362226545811, + "rewards/accuracy_reward": 0.09375000349245965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4748884215950966, + "step": 1938 + }, + { + "clip_ratio": 0.0, + "completion_length": 931.5178985595703, + "epoch": 0.5791949817041296, + "grad_norm": 0.6135819554328918, + "kl": 2.412109375, + "learning_rate": 8.979614095887685e-06, + "loss": 0.1025, + "reward": 0.5664062723517418, + "reward_std": 0.1810113526880741, + "rewards/accuracy_reward": 0.09821429336443543, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4681919887661934, + "step": 1939 + }, + { + "clip_ratio": 0.0, + "completion_length": 930.6183471679688, + "epoch": 0.5794936897916511, + "grad_norm": 1.1785398721694946, + "kl": 1.94921875, + "learning_rate": 8.969238839642232e-06, + "loss": 0.0502, + "reward": 0.5613839626312256, + "reward_std": 0.16962501592934132, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4676339477300644, + "step": 1940 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.7143249511719, + "epoch": 0.5797923978791726, + "grad_norm": 1.4109786748886108, + "kl": 2.185546875, + "learning_rate": 8.958864704763896e-06, + "loss": 0.0805, + "reward": 0.5401786118745804, + "reward_std": 0.1944921799004078, + "rewards/accuracy_reward": 0.07812500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4620535895228386, + "step": 1941 + }, + { + "clip_ratio": 0.0, + "completion_length": 937.0201416015625, + "epoch": 0.580091105966694, + "grad_norm": 2.2717528343200684, + "kl": 1.998046875, + "learning_rate": 8.948491702538716e-06, + "loss": 0.0936, + "reward": 0.6205357611179352, + "reward_std": 0.15086417272686958, + "rewards/accuracy_reward": 0.145089291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.475446455180645, + "step": 1942 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.9062805175781, + "epoch": 0.5803898140542155, + "grad_norm": 1.8063970804214478, + "kl": 1.9765625, + "learning_rate": 8.938119844251507e-06, + "loss": 0.0824, + "reward": 0.608816996216774, + "reward_std": 0.09829339478164911, + "rewards/accuracy_reward": 0.12723215017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848469734192, + "step": 1943 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.6339721679688, + "epoch": 0.5806885221417369, + "grad_norm": 1.640350103378296, + "kl": 1.658203125, + "learning_rate": 8.927749141185833e-06, + "loss": 0.0594, + "reward": 0.5686384215950966, + "reward_std": 0.1578914187848568, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812798023224, + "step": 1944 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.4241485595703, + "epoch": 0.5809872302292585, + "grad_norm": 2.152050495147705, + "kl": 1.966796875, + "learning_rate": 8.917379604624e-06, + "loss": 0.0963, + "reward": 0.5742187798023224, + "reward_std": 0.12416724115610123, + "rewards/accuracy_reward": 0.08705357694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487165205180645, + "step": 1945 + }, + { + "clip_ratio": 0.0, + "completion_length": 905.2031707763672, + "epoch": 0.5812859383167799, + "grad_norm": 2.1162638664245605, + "kl": 1.97265625, + "learning_rate": 8.907011245847049e-06, + "loss": 0.1027, + "reward": 0.6339286044239998, + "reward_std": 0.1657712273299694, + "rewards/accuracy_reward": 0.14732143748551607, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 1946 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.9598541259766, + "epoch": 0.5815846464043014, + "grad_norm": 0.9747107625007629, + "kl": 1.6328125, + "learning_rate": 8.896644076134739e-06, + "loss": 0.0785, + "reward": 0.6328125298023224, + "reward_std": 0.0876779118552804, + "rewards/accuracy_reward": 0.14285715157166123, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 1947 + }, + { + "clip_ratio": 0.0, + "completion_length": 914.1741485595703, + "epoch": 0.5818833544918228, + "grad_norm": 1.455542802810669, + "kl": 0.871826171875, + "learning_rate": 8.886278106765533e-06, + "loss": 0.0424, + "reward": 0.632254496216774, + "reward_std": 0.06362019432708621, + "rewards/accuracy_reward": 0.13616071874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1948 + }, + { + "clip_ratio": 0.0, + "completion_length": 925.6004791259766, + "epoch": 0.5821820625793444, + "grad_norm": 0.24586892127990723, + "kl": 0.4912109375, + "learning_rate": 8.87591334901659e-06, + "loss": 0.0296, + "reward": 0.631138414144516, + "reward_std": 0.12855239026248455, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 1949 + }, + { + "clip_ratio": 0.0, + "completion_length": 914.2277374267578, + "epoch": 0.5824807706668658, + "grad_norm": 0.7866555452346802, + "kl": 0.396240234375, + "learning_rate": 8.865549814163752e-06, + "loss": 0.0207, + "reward": 0.676339328289032, + "reward_std": 0.05867402255535126, + "rewards/accuracy_reward": 0.17857143771834671, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1950 + }, + { + "clip_ratio": 0.0, + "completion_length": 925.8616485595703, + "epoch": 0.5827794787543873, + "grad_norm": 0.4137403964996338, + "kl": 0.431640625, + "learning_rate": 8.855187513481527e-06, + "loss": 0.022, + "reward": 0.7332589477300644, + "reward_std": 0.16385059989988804, + "rewards/accuracy_reward": 0.2366071529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 1951 + }, + { + "clip_ratio": 0.0, + "completion_length": 930.8594055175781, + "epoch": 0.5830781868419087, + "grad_norm": 0.316651314496994, + "kl": 0.36328125, + "learning_rate": 8.844826458243083e-06, + "loss": 0.0142, + "reward": 0.6417411118745804, + "reward_std": 0.10693824477493763, + "rewards/accuracy_reward": 0.1428571529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1952 + }, + { + "clip_ratio": 0.0, + "completion_length": 927.5000457763672, + "epoch": 0.5833768949294302, + "grad_norm": 0.40889403223991394, + "kl": 0.453125, + "learning_rate": 8.834466659720234e-06, + "loss": 0.0233, + "reward": 0.607700914144516, + "reward_std": 0.07896154560148716, + "rewards/accuracy_reward": 0.10937500419095159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1953 + }, + { + "clip_ratio": 0.0, + "completion_length": 925.2254791259766, + "epoch": 0.5836756030169516, + "grad_norm": 0.2377958446741104, + "kl": 0.16552734375, + "learning_rate": 8.824108129183427e-06, + "loss": 0.014, + "reward": 0.714285746216774, + "reward_std": 0.08594719879329205, + "rewards/accuracy_reward": 0.21428572433069348, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 1954 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.4263763427734, + "epoch": 0.5839743111044732, + "grad_norm": 0.3280980587005615, + "kl": 0.2822265625, + "learning_rate": 8.813750877901723e-06, + "loss": 0.0145, + "reward": 0.5926339477300644, + "reward_std": 0.12596739828586578, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1955 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.6295166015625, + "epoch": 0.5842730191919946, + "grad_norm": 0.7636029720306396, + "kl": 0.259033203125, + "learning_rate": 8.803394917142797e-06, + "loss": 0.0103, + "reward": 0.5719866305589676, + "reward_std": 0.06429193168878555, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1956 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.4687957763672, + "epoch": 0.5845717272795161, + "grad_norm": 0.9460614323616028, + "kl": 0.39306640625, + "learning_rate": 8.793040258172926e-06, + "loss": 0.026, + "reward": 0.7617187947034836, + "reward_std": 0.16539637930691242, + "rewards/accuracy_reward": 0.2656250149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1957 + }, + { + "clip_ratio": 0.0, + "completion_length": 925.529052734375, + "epoch": 0.5848704353670375, + "grad_norm": 0.553244411945343, + "kl": 0.37109375, + "learning_rate": 8.782686912256957e-06, + "loss": 0.016, + "reward": 0.6233259290456772, + "reward_std": 0.13321028463542461, + "rewards/accuracy_reward": 0.12723214970901608, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1958 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.8839721679688, + "epoch": 0.5851691434545591, + "grad_norm": 0.2785080373287201, + "kl": 0.22705078125, + "learning_rate": 8.772334890658317e-06, + "loss": 0.0102, + "reward": 0.6043526977300644, + "reward_std": 0.06975204404443502, + "rewards/accuracy_reward": 0.10491072060540318, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4994419664144516, + "step": 1959 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.0045166015625, + "epoch": 0.5854678515420805, + "grad_norm": 0.3351287245750427, + "kl": 0.3642578125, + "learning_rate": 8.761984204638994e-06, + "loss": 0.0164, + "reward": 0.5652901977300644, + "reward_std": 0.05715416371822357, + "rewards/accuracy_reward": 0.06696428847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1960 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.2924499511719, + "epoch": 0.585766559629602, + "grad_norm": 0.4344964921474457, + "kl": 0.263916015625, + "learning_rate": 8.751634865459518e-06, + "loss": 0.0106, + "reward": 0.5993303954601288, + "reward_std": 0.09417224302887917, + "rewards/accuracy_reward": 0.10044643003493547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1961 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.1473846435547, + "epoch": 0.5860652677171234, + "grad_norm": 0.30676785111427307, + "kl": 0.416748046875, + "learning_rate": 8.741286884378954e-06, + "loss": 0.013, + "reward": 0.6462053954601288, + "reward_std": 0.0636117160320282, + "rewards/accuracy_reward": 0.14955357648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.496651791036129, + "step": 1962 + }, + { + "clip_ratio": 0.0, + "completion_length": 967.8058471679688, + "epoch": 0.5863639758046449, + "grad_norm": 0.459846168756485, + "kl": 1.08984375, + "learning_rate": 8.73094027265489e-06, + "loss": 0.0481, + "reward": 0.6244419887661934, + "reward_std": 0.05347771616652608, + "rewards/accuracy_reward": 0.13392857764847577, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 1963 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.3795166015625, + "epoch": 0.5866626838921664, + "grad_norm": 1.1932950019836426, + "kl": 0.775390625, + "learning_rate": 8.720595041543433e-06, + "loss": 0.038, + "reward": 0.6149553805589676, + "reward_std": 0.09037165204063058, + "rewards/accuracy_reward": 0.1227678656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 1964 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.9732513427734, + "epoch": 0.5869613919796879, + "grad_norm": 0.6855806112289429, + "kl": 0.8017578125, + "learning_rate": 8.710251202299174e-06, + "loss": 0.0299, + "reward": 0.5625000298023224, + "reward_std": 0.11107923835515976, + "rewards/accuracy_reward": 0.06919643236324191, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 1965 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.1518402099609, + "epoch": 0.5872601000672093, + "grad_norm": 1.2764898538589478, + "kl": 1.501953125, + "learning_rate": 8.699908766175195e-06, + "loss": 0.0619, + "reward": 0.6277902126312256, + "reward_std": 0.09847011230885983, + "rewards/accuracy_reward": 0.13616072200238705, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 1966 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.2411041259766, + "epoch": 0.5875588081547308, + "grad_norm": 0.42679640650749207, + "kl": 1.005859375, + "learning_rate": 8.68956774442306e-06, + "loss": 0.0488, + "reward": 0.6858259290456772, + "reward_std": 0.16348016820847988, + "rewards/accuracy_reward": 0.1941964291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1967 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.4687957763672, + "epoch": 0.5878575162422522, + "grad_norm": 1.0118252038955688, + "kl": 1.4658203125, + "learning_rate": 8.679228148292782e-06, + "loss": 0.0611, + "reward": 0.493861623108387, + "reward_std": 0.0458538681268692, + "rewards/accuracy_reward": 0.004464285913854837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 1968 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.1272888183594, + "epoch": 0.5881562243297738, + "grad_norm": 5.286367416381836, + "kl": 3.013671875, + "learning_rate": 8.66888998903283e-06, + "loss": 0.1108, + "reward": 0.5652901977300644, + "reward_std": 0.13725543208420277, + "rewards/accuracy_reward": 0.08705357578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.478236623108387, + "step": 1969 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.919677734375, + "epoch": 0.5884549324172952, + "grad_norm": 2.6014232635498047, + "kl": 2.99609375, + "learning_rate": 8.658553277890102e-06, + "loss": 0.0958, + "reward": 0.5736607387661934, + "reward_std": 0.1096813976764679, + "rewards/accuracy_reward": 0.09598214784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4776785895228386, + "step": 1970 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.4911193847656, + "epoch": 0.5887536405048167, + "grad_norm": 3.559483051300049, + "kl": 3.271484375, + "learning_rate": 8.648218026109937e-06, + "loss": 0.111, + "reward": 0.5597098395228386, + "reward_std": 0.1212918683886528, + "rewards/accuracy_reward": 0.08928571688011289, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4704241305589676, + "step": 1971 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.6696929931641, + "epoch": 0.5890523485923381, + "grad_norm": 3.055520534515381, + "kl": 3.02734375, + "learning_rate": 8.637884244936069e-06, + "loss": 0.1049, + "reward": 0.7265625298023224, + "reward_std": 0.14475055038928986, + "rewards/accuracy_reward": 0.2566964365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4698660969734192, + "step": 1972 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.8772735595703, + "epoch": 0.5893510566798597, + "grad_norm": 1.6872588396072388, + "kl": 2.490234375, + "learning_rate": 8.627551945610641e-06, + "loss": 0.0827, + "reward": 0.5479910969734192, + "reward_std": 0.14302659034729004, + "rewards/accuracy_reward": 0.08258929080329835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4654018133878708, + "step": 1973 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.0357513427734, + "epoch": 0.5896497647673811, + "grad_norm": 3.251986503601074, + "kl": 2.111328125, + "learning_rate": 8.617221139374181e-06, + "loss": 0.0941, + "reward": 0.5457589477300644, + "reward_std": 0.1587413903325796, + "rewards/accuracy_reward": 0.07366071944124997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4720982313156128, + "step": 1974 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.1786193847656, + "epoch": 0.5899484728549026, + "grad_norm": 4.073571681976318, + "kl": 1.7890625, + "learning_rate": 8.606891837465596e-06, + "loss": 0.0657, + "reward": 0.6383928880095482, + "reward_std": 0.15724141709506512, + "rewards/accuracy_reward": 0.16294643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.475446455180645, + "step": 1975 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.7678985595703, + "epoch": 0.590247180942424, + "grad_norm": 1.2155531644821167, + "kl": 2.4921875, + "learning_rate": 8.596564051122152e-06, + "loss": 0.0902, + "reward": 0.6210937649011612, + "reward_std": 0.15883464273065329, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.467075914144516, + "step": 1976 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.0268402099609, + "epoch": 0.5905458890299455, + "grad_norm": 1.0026315450668335, + "kl": 3.0625, + "learning_rate": 8.586237791579466e-06, + "loss": 0.121, + "reward": 0.584263414144516, + "reward_std": 0.14794188179075718, + "rewards/accuracy_reward": 0.11830357741564512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4659598469734192, + "step": 1977 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.4888763427734, + "epoch": 0.590844597117467, + "grad_norm": 3.332176923751831, + "kl": 3.37890625, + "learning_rate": 8.575913070071503e-06, + "loss": 0.1255, + "reward": 0.6350446715950966, + "reward_std": 0.1550820767879486, + "rewards/accuracy_reward": 0.16517857927829027, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4698660969734192, + "step": 1978 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.7321929931641, + "epoch": 0.5911433052049885, + "grad_norm": 3.2074472904205322, + "kl": 3.35546875, + "learning_rate": 8.565589897830543e-06, + "loss": 0.1298, + "reward": 0.5463169887661934, + "reward_std": 0.17417271248996258, + "rewards/accuracy_reward": 0.07589286309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4704241305589676, + "step": 1979 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.6451416015625, + "epoch": 0.5914420132925099, + "grad_norm": 3.098095178604126, + "kl": 3.009765625, + "learning_rate": 8.555268286087187e-06, + "loss": 0.1145, + "reward": 0.5463169887661934, + "reward_std": 0.10408947058022022, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.474888414144516, + "step": 1980 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.0223693847656, + "epoch": 0.5917407213800313, + "grad_norm": 0.7180175185203552, + "kl": 1.98046875, + "learning_rate": 8.544948246070335e-06, + "loss": 0.0769, + "reward": 0.5820312649011612, + "reward_std": 0.09496423043310642, + "rewards/accuracy_reward": 0.10044643143191934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848395228386, + "step": 1981 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.0469207763672, + "epoch": 0.5920394294675528, + "grad_norm": 1.3956860303878784, + "kl": 2.080078125, + "learning_rate": 8.534629789007183e-06, + "loss": 0.0865, + "reward": 0.5585937649011612, + "reward_std": 0.1279850285500288, + "rewards/accuracy_reward": 0.08035714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4782366305589676, + "step": 1982 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.5803985595703, + "epoch": 0.5923381375550743, + "grad_norm": 0.4239858388900757, + "kl": 1.673828125, + "learning_rate": 8.524312926123199e-06, + "loss": 0.0552, + "reward": 0.5474330708384514, + "reward_std": 0.09456372819840908, + "rewards/accuracy_reward": 0.0625000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 1983 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.7344055175781, + "epoch": 0.5926368456425958, + "grad_norm": 0.4689911901950836, + "kl": 1.14453125, + "learning_rate": 8.513997668642117e-06, + "loss": 0.0485, + "reward": 0.5273437798023224, + "reward_std": 0.08971974719315767, + "rewards/accuracy_reward": 0.03571428847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 1984 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.9241638183594, + "epoch": 0.5929355537301172, + "grad_norm": 0.5434146523475647, + "kl": 0.994140625, + "learning_rate": 8.503684027785929e-06, + "loss": 0.0295, + "reward": 0.7421875298023224, + "reward_std": 0.16802735812962055, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875298023224, + "step": 1985 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.1674499511719, + "epoch": 0.5932342618176387, + "grad_norm": 1.4135565757751465, + "kl": 1.287109375, + "learning_rate": 8.493372014774863e-06, + "loss": 0.0471, + "reward": 0.569754496216774, + "reward_std": 0.08614031225442886, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 1986 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.8437957763672, + "epoch": 0.5935329699051601, + "grad_norm": 0.29450953006744385, + "kl": 0.74365234375, + "learning_rate": 8.48306164082738e-06, + "loss": 0.0307, + "reward": 0.5870535969734192, + "reward_std": 0.0926420371979475, + "rewards/accuracy_reward": 0.09151786239817739, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1987 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.0402221679688, + "epoch": 0.5938316779926817, + "grad_norm": 0.8516994714736938, + "kl": 0.716796875, + "learning_rate": 8.472752917160155e-06, + "loss": 0.0228, + "reward": 0.6517857313156128, + "reward_std": 0.12151830643415451, + "rewards/accuracy_reward": 0.1562500111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357238650322, + "step": 1988 + }, + { + "clip_ratio": 0.0, + "completion_length": 958.4085235595703, + "epoch": 0.5941303860802031, + "grad_norm": 0.5449825525283813, + "kl": 0.87109375, + "learning_rate": 8.462445854988071e-06, + "loss": 0.0291, + "reward": 0.6690848469734192, + "reward_std": 0.07159795821644366, + "rewards/accuracy_reward": 0.1741071492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 1989 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.9308319091797, + "epoch": 0.5944290941677246, + "grad_norm": 0.6262717247009277, + "kl": 0.72119140625, + "learning_rate": 8.452140465524201e-06, + "loss": 0.0351, + "reward": 0.5731027126312256, + "reward_std": 0.05639704060740769, + "rewards/accuracy_reward": 0.07589286030270159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1990 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.8683471679688, + "epoch": 0.594727802255246, + "grad_norm": 1.0323184728622437, + "kl": 0.671875, + "learning_rate": 8.441836759979796e-06, + "loss": 0.0281, + "reward": 0.5189732387661934, + "reward_std": 0.07061850395984948, + "rewards/accuracy_reward": 0.024553573224693537, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 1991 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.4888763427734, + "epoch": 0.5950265103427675, + "grad_norm": 0.5010284781455994, + "kl": 0.322021484375, + "learning_rate": 8.43153474956428e-06, + "loss": 0.0097, + "reward": 0.6478794813156128, + "reward_std": 0.11289777606725693, + "rewards/accuracy_reward": 0.1495535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 1992 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.7924499511719, + "epoch": 0.595325218430289, + "grad_norm": 1.0301748514175415, + "kl": 0.66162109375, + "learning_rate": 8.421234445485232e-06, + "loss": 0.0343, + "reward": 0.5245535969734192, + "reward_std": 0.042757630813866854, + "rewards/accuracy_reward": 0.029017859371379018, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1993 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.1004791259766, + "epoch": 0.5956239265178105, + "grad_norm": 0.29368922114372253, + "kl": 0.5537109375, + "learning_rate": 8.410935858948372e-06, + "loss": 0.0227, + "reward": 0.5083705633878708, + "reward_std": 0.04833107767626643, + "rewards/accuracy_reward": 0.011160715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 1994 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.7478179931641, + "epoch": 0.5959226346053319, + "grad_norm": 0.731917142868042, + "kl": 0.390869140625, + "learning_rate": 8.400639001157549e-06, + "loss": 0.0185, + "reward": 0.5781250298023224, + "reward_std": 0.07034909841604531, + "rewards/accuracy_reward": 0.08035714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 1995 + }, + { + "clip_ratio": 0.0, + "completion_length": 994.2969207763672, + "epoch": 0.5962213426928534, + "grad_norm": 0.3574620187282562, + "kl": 0.453125, + "learning_rate": 8.39034388331474e-06, + "loss": 0.0247, + "reward": 0.6121651977300644, + "reward_std": 0.0466959907207638, + "rewards/accuracy_reward": 0.11607143143191934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 1996 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.8170013427734, + "epoch": 0.5965200507803748, + "grad_norm": 0.22094067931175232, + "kl": 0.288818359375, + "learning_rate": 8.380050516620026e-06, + "loss": 0.0166, + "reward": 0.5970982313156128, + "reward_std": 0.074997846968472, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 1997 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.638427734375, + "epoch": 0.5968187588678964, + "grad_norm": 0.6158170700073242, + "kl": 0.65185546875, + "learning_rate": 8.369758912271573e-06, + "loss": 0.0222, + "reward": 0.6450892984867096, + "reward_std": 0.03225403372198343, + "rewards/accuracy_reward": 0.1495535783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 1998 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.4911193847656, + "epoch": 0.5971174669554178, + "grad_norm": 0.8566539287567139, + "kl": 0.739990234375, + "learning_rate": 8.359469081465645e-06, + "loss": 0.0211, + "reward": 0.5195312574505806, + "reward_std": 0.056494983145967126, + "rewards/accuracy_reward": 0.024553571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 1999 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.3415679931641, + "epoch": 0.5974161750429393, + "grad_norm": 0.6751669049263, + "kl": 0.76904296875, + "learning_rate": 8.349181035396568e-06, + "loss": 0.0263, + "reward": 0.6116071790456772, + "reward_std": 0.07892540143802762, + "rewards/accuracy_reward": 0.11607143469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2000 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.2545013427734, + "epoch": 0.5977148831304607, + "grad_norm": 0.31243035197257996, + "kl": 0.7763671875, + "learning_rate": 8.338894785256726e-06, + "loss": 0.0363, + "reward": 0.5546875149011612, + "reward_std": 0.10610523680225015, + "rewards/accuracy_reward": 0.06026786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2001 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.9397735595703, + "epoch": 0.5980135912179823, + "grad_norm": 0.47344520688056946, + "kl": 0.456298828125, + "learning_rate": 8.32861034223655e-06, + "loss": 0.0259, + "reward": 0.7003348469734192, + "reward_std": 0.13513533864170313, + "rewards/accuracy_reward": 0.2031250074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2002 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.0156707763672, + "epoch": 0.5983122993055037, + "grad_norm": 0.2684749960899353, + "kl": 0.51171875, + "learning_rate": 8.31832771752451e-06, + "loss": 0.0171, + "reward": 0.709263414144516, + "reward_std": 0.08359804376959801, + "rewards/accuracy_reward": 0.21205358020961285, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2003 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.3259429931641, + "epoch": 0.5986110073930252, + "grad_norm": 0.6187272071838379, + "kl": 0.88232421875, + "learning_rate": 8.308046922307091e-06, + "loss": 0.0399, + "reward": 0.5223214626312256, + "reward_std": 0.06589773343876004, + "rewards/accuracy_reward": 0.029017858672887087, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 2004 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.6964874267578, + "epoch": 0.5989097154805466, + "grad_norm": 0.7631335854530334, + "kl": 0.73779296875, + "learning_rate": 8.29776796776879e-06, + "loss": 0.0276, + "reward": 0.7053571790456772, + "reward_std": 0.09594650310464203, + "rewards/accuracy_reward": 0.2098214365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2005 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.9241485595703, + "epoch": 0.5992084235680681, + "grad_norm": 0.3180990517139435, + "kl": 0.853515625, + "learning_rate": 8.287490865092106e-06, + "loss": 0.0333, + "reward": 0.6484375298023224, + "reward_std": 0.09084689524024725, + "rewards/accuracy_reward": 0.15625000488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2006 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.6250305175781, + "epoch": 0.5995071316555896, + "grad_norm": 0.45014864206314087, + "kl": 0.669921875, + "learning_rate": 8.277215625457516e-06, + "loss": 0.0285, + "reward": 0.565848246216774, + "reward_std": 0.12269381247460842, + "rewards/accuracy_reward": 0.06919643189758062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 2007 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.3348693847656, + "epoch": 0.5998058397431111, + "grad_norm": 0.42655399441719055, + "kl": 0.97802734375, + "learning_rate": 8.266942260043474e-06, + "loss": 0.0322, + "reward": 0.5697544813156128, + "reward_std": 0.13059851247817278, + "rewards/accuracy_reward": 0.07812500419095159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2008 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.8147888183594, + "epoch": 0.6001045478306325, + "grad_norm": 1.4168803691864014, + "kl": 1.00927734375, + "learning_rate": 8.256670780026393e-06, + "loss": 0.0395, + "reward": 0.6116071790456772, + "reward_std": 0.10984068550169468, + "rewards/accuracy_reward": 0.11830358114093542, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2009 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.8549499511719, + "epoch": 0.600403255918154, + "grad_norm": 0.4808467626571655, + "kl": 0.56298828125, + "learning_rate": 8.246401196580642e-06, + "loss": 0.0213, + "reward": 0.5948660969734192, + "reward_std": 0.14126519113779068, + "rewards/accuracy_reward": 0.10044643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2010 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.4621124267578, + "epoch": 0.6007019640056754, + "grad_norm": 0.9959577322006226, + "kl": 0.61669921875, + "learning_rate": 8.236133520878517e-06, + "loss": 0.0227, + "reward": 0.5591517984867096, + "reward_std": 0.08021623129025102, + "rewards/accuracy_reward": 0.06473214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2011 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.6384429931641, + "epoch": 0.601000672093197, + "grad_norm": 0.19624210894107819, + "kl": 0.36328125, + "learning_rate": 8.225867764090243e-06, + "loss": 0.0137, + "reward": 0.675223246216774, + "reward_std": 0.10550705343484879, + "rewards/accuracy_reward": 0.1785714402794838, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 2012 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.1897888183594, + "epoch": 0.6012993801807184, + "grad_norm": 0.6613380312919617, + "kl": 0.41455078125, + "learning_rate": 8.215603937383959e-06, + "loss": 0.0174, + "reward": 0.6149553805589676, + "reward_std": 0.11471748538315296, + "rewards/accuracy_reward": 0.12053572107106447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2013 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.6049499511719, + "epoch": 0.6015980882682399, + "grad_norm": 0.2803587019443512, + "kl": 0.506103515625, + "learning_rate": 8.205342051925702e-06, + "loss": 0.0178, + "reward": 0.5848214626312256, + "reward_std": 0.10510122682899237, + "rewards/accuracy_reward": 0.08928571734577417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357238650322, + "step": 2014 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.2321929931641, + "epoch": 0.6018967963557613, + "grad_norm": 0.4579240381717682, + "kl": 0.714599609375, + "learning_rate": 8.195082118879397e-06, + "loss": 0.0309, + "reward": 0.6277902126312256, + "reward_std": 0.1564341001212597, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 2015 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.9487152099609, + "epoch": 0.6021955044432828, + "grad_norm": 0.5683494210243225, + "kl": 0.271240234375, + "learning_rate": 8.184824149406843e-06, + "loss": 0.0104, + "reward": 0.6093750298023224, + "reward_std": 0.13165776059031487, + "rewards/accuracy_reward": 0.11160714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 2016 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.1920013427734, + "epoch": 0.6024942125308043, + "grad_norm": 0.8894829750061035, + "kl": 0.3828125, + "learning_rate": 8.174568154667712e-06, + "loss": 0.0159, + "reward": 0.570870578289032, + "reward_std": 0.10417655296623707, + "rewards/accuracy_reward": 0.07589285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2017 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.9442443847656, + "epoch": 0.6027929206183258, + "grad_norm": 0.7313665747642517, + "kl": 0.83935546875, + "learning_rate": 8.164314145819514e-06, + "loss": 0.0294, + "reward": 0.5703125298023224, + "reward_std": 0.10876883799210191, + "rewards/accuracy_reward": 0.08035714644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2018 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.8058624267578, + "epoch": 0.6030916287058472, + "grad_norm": 0.4808817505836487, + "kl": 0.494873046875, + "learning_rate": 8.15406213401761e-06, + "loss": 0.0095, + "reward": 0.5881696566939354, + "reward_std": 0.060064272256568074, + "rewards/accuracy_reward": 0.09375000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2019 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.5000457763672, + "epoch": 0.6033903367933687, + "grad_norm": 0.47100406885147095, + "kl": 0.73681640625, + "learning_rate": 8.143812130415182e-06, + "loss": 0.0324, + "reward": 0.6847098469734192, + "reward_std": 0.07565355603583157, + "rewards/accuracy_reward": 0.1897321566939354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 2020 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.9152221679688, + "epoch": 0.6036890448808901, + "grad_norm": 0.8119372725486755, + "kl": 0.9033203125, + "learning_rate": 8.133564146163232e-06, + "loss": 0.0404, + "reward": 0.670200914144516, + "reward_std": 0.12902269512414932, + "rewards/accuracy_reward": 0.1785714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2021 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.8661193847656, + "epoch": 0.6039877529684117, + "grad_norm": 2.1679539680480957, + "kl": 1.13525390625, + "learning_rate": 8.12331819241056e-06, + "loss": 0.0458, + "reward": 0.5781250298023224, + "reward_std": 0.10548447631299496, + "rewards/accuracy_reward": 0.08482143329456449, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2022 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.9375305175781, + "epoch": 0.6042864610559331, + "grad_norm": 3.5500142574310303, + "kl": 1.95703125, + "learning_rate": 8.11307428030376e-06, + "loss": 0.0734, + "reward": 0.6199777126312256, + "reward_std": 0.11912416480481625, + "rewards/accuracy_reward": 0.13392857694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491380095482, + "step": 2023 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.6518249511719, + "epoch": 0.6045851691434545, + "grad_norm": 3.2974462509155273, + "kl": 2.572265625, + "learning_rate": 8.102832420987205e-06, + "loss": 0.0995, + "reward": 0.5273437723517418, + "reward_std": 0.10575946420431137, + "rewards/accuracy_reward": 0.0424107164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330633878708, + "step": 2024 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.7187957763672, + "epoch": 0.604883877230976, + "grad_norm": 1.47163987159729, + "kl": 1.3818359375, + "learning_rate": 8.092592625603033e-06, + "loss": 0.0422, + "reward": 0.6422991454601288, + "reward_std": 0.18108482658863068, + "rewards/accuracy_reward": 0.1562500111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 2025 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.4866485595703, + "epoch": 0.6051825853184974, + "grad_norm": 0.7446434497833252, + "kl": 0.990234375, + "learning_rate": 8.082354905291136e-06, + "loss": 0.0342, + "reward": 0.5641741305589676, + "reward_std": 0.09265469503588974, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2026 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.497802734375, + "epoch": 0.605481293406019, + "grad_norm": 0.7001441717147827, + "kl": 0.8056640625, + "learning_rate": 8.072119271189155e-06, + "loss": 0.0272, + "reward": 0.6372767984867096, + "reward_std": 0.18520412221550941, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875298023224, + "step": 2027 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.4353179931641, + "epoch": 0.6057800014935404, + "grad_norm": 1.1627142429351807, + "kl": 0.79345703125, + "learning_rate": 8.061885734432455e-06, + "loss": 0.0294, + "reward": 0.5848214626312256, + "reward_std": 0.08963924297131598, + "rewards/accuracy_reward": 0.09598214784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 2028 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.0759429931641, + "epoch": 0.6060787095810619, + "grad_norm": 1.061020851135254, + "kl": 0.7646484375, + "learning_rate": 8.05165430615412e-06, + "loss": 0.0291, + "reward": 0.563058078289032, + "reward_std": 0.11289658024907112, + "rewards/accuracy_reward": 0.07142857694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 2029 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.5513763427734, + "epoch": 0.6063774176685833, + "grad_norm": 0.9592245221138, + "kl": 1.169921875, + "learning_rate": 8.041424997484938e-06, + "loss": 0.0486, + "reward": 0.6099330633878708, + "reward_std": 0.14462242368608713, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330633878708, + "step": 2030 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.8058319091797, + "epoch": 0.6066761257561049, + "grad_norm": 0.5603060126304626, + "kl": 1.4736328125, + "learning_rate": 8.031197819553398e-06, + "loss": 0.0648, + "reward": 0.651785746216774, + "reward_std": 0.16934622265398502, + "rewards/accuracy_reward": 0.16741072130389512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750149011612, + "step": 2031 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.7835235595703, + "epoch": 0.6069748338436263, + "grad_norm": 1.8189579248428345, + "kl": 2.0546875, + "learning_rate": 8.020972783485671e-06, + "loss": 0.0905, + "reward": 0.5474330633878708, + "reward_std": 0.11367291957139969, + "rewards/accuracy_reward": 0.06473214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4827009066939354, + "step": 2032 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.9330902099609, + "epoch": 0.6072735419311478, + "grad_norm": 0.7769159078598022, + "kl": 1.982421875, + "learning_rate": 8.01074990040559e-06, + "loss": 0.076, + "reward": 0.5658482387661934, + "reward_std": 0.0973914721980691, + "rewards/accuracy_reward": 0.08258928777649999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.483258955180645, + "step": 2033 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.4866485595703, + "epoch": 0.6075722500186692, + "grad_norm": 0.8841477036476135, + "kl": 1.849609375, + "learning_rate": 8.000529181434649e-06, + "loss": 0.0701, + "reward": 0.5279018133878708, + "reward_std": 0.12399688735604286, + "rewards/accuracy_reward": 0.04464285867288709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589477300644, + "step": 2034 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.216552734375, + "epoch": 0.6078709581061907, + "grad_norm": 0.9838257431983948, + "kl": 1.6015625, + "learning_rate": 7.990310637691988e-06, + "loss": 0.0714, + "reward": 0.6462053954601288, + "reward_std": 0.12451355718076229, + "rewards/accuracy_reward": 0.1629464328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.483258955180645, + "step": 2035 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.2946929931641, + "epoch": 0.6081696661937122, + "grad_norm": 6.703609466552734, + "kl": 1.66796875, + "learning_rate": 7.980094280294383e-06, + "loss": 0.071, + "reward": 0.5172991380095482, + "reward_std": 0.1037688571959734, + "rewards/accuracy_reward": 0.03794643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.479352705180645, + "step": 2036 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.6094207763672, + "epoch": 0.6084683742812337, + "grad_norm": 0.6773443222045898, + "kl": 1.75, + "learning_rate": 7.96988012035623e-06, + "loss": 0.0671, + "reward": 0.5708705559372902, + "reward_std": 0.13106071762740612, + "rewards/accuracy_reward": 0.08928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848469734192, + "step": 2037 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.7254943847656, + "epoch": 0.6087670823687551, + "grad_norm": 1.6082650423049927, + "kl": 1.3544921875, + "learning_rate": 7.959668168989531e-06, + "loss": 0.0525, + "reward": 0.6283482313156128, + "reward_std": 0.14399192295968533, + "rewards/accuracy_reward": 0.145089291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589477300644, + "step": 2038 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.6049652099609, + "epoch": 0.6090657904562766, + "grad_norm": 0.6439588069915771, + "kl": 1.22607421875, + "learning_rate": 7.949458437303892e-06, + "loss": 0.0417, + "reward": 0.6612723469734192, + "reward_std": 0.12471855245530605, + "rewards/accuracy_reward": 0.176339291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 2039 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.8817291259766, + "epoch": 0.609364498543798, + "grad_norm": 0.9008980989456177, + "kl": 1.416015625, + "learning_rate": 7.939250936406499e-06, + "loss": 0.055, + "reward": 0.6261160895228386, + "reward_std": 0.09780317638069391, + "rewards/accuracy_reward": 0.14062500931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 2040 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.1250457763672, + "epoch": 0.6096632066313196, + "grad_norm": 0.8407750725746155, + "kl": 2.224609375, + "learning_rate": 7.92904567740211e-06, + "loss": 0.0931, + "reward": 0.511160746216774, + "reward_std": 0.14164345152676105, + "rewards/accuracy_reward": 0.03571428777649999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.475446455180645, + "step": 2041 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.9129943847656, + "epoch": 0.609961914718841, + "grad_norm": 1.4419273138046265, + "kl": 2.3828125, + "learning_rate": 7.918842671393048e-06, + "loss": 0.1013, + "reward": 0.577566996216774, + "reward_std": 0.12396768108010292, + "rewards/accuracy_reward": 0.09821429196745157, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.479352705180645, + "step": 2042 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.3504791259766, + "epoch": 0.6102606228063625, + "grad_norm": 2.398226737976074, + "kl": 2.5859375, + "learning_rate": 7.908641929479187e-06, + "loss": 0.0868, + "reward": 0.5876116380095482, + "reward_std": 0.15555733256042004, + "rewards/accuracy_reward": 0.10937500605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4782366305589676, + "step": 2043 + }, + { + "clip_ratio": 0.0, + "completion_length": 994.0022735595703, + "epoch": 0.6105593308938839, + "grad_norm": 2.235558032989502, + "kl": 2.58984375, + "learning_rate": 7.898443462757933e-06, + "loss": 0.1034, + "reward": 0.5859375298023224, + "reward_std": 0.12020892463624477, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.478794664144516, + "step": 2044 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.6250457763672, + "epoch": 0.6108580389814054, + "grad_norm": 1.6454672813415527, + "kl": 1.947265625, + "learning_rate": 7.888247282324212e-06, + "loss": 0.0728, + "reward": 0.541294664144516, + "reward_std": 0.16857368685305119, + "rewards/accuracy_reward": 0.06250000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4787946715950966, + "step": 2045 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.7879943847656, + "epoch": 0.6111567470689269, + "grad_norm": 0.9603869318962097, + "kl": 1.87890625, + "learning_rate": 7.878053399270475e-06, + "loss": 0.0801, + "reward": 0.5407366305589676, + "reward_std": 0.14524002373218536, + "rewards/accuracy_reward": 0.06250000093132257, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.478236623108387, + "step": 2046 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.1540679931641, + "epoch": 0.6114554551564484, + "grad_norm": 2.505882978439331, + "kl": 2.009765625, + "learning_rate": 7.86786182468667e-06, + "loss": 0.086, + "reward": 0.5825893059372902, + "reward_std": 0.10330124758183956, + "rewards/accuracy_reward": 0.1071428656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4754464477300644, + "step": 2047 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.3013763427734, + "epoch": 0.6117541632439698, + "grad_norm": 2.2987630367279053, + "kl": 1.69140625, + "learning_rate": 7.857672569660226e-06, + "loss": 0.0764, + "reward": 0.5680803805589676, + "reward_std": 0.11610349453985691, + "rewards/accuracy_reward": 0.08928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.478794664144516, + "step": 2048 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.8817443847656, + "epoch": 0.6120528713314913, + "grad_norm": 0.6690412163734436, + "kl": 1.955078125, + "learning_rate": 7.847485645276053e-06, + "loss": 0.0714, + "reward": 0.560825914144516, + "reward_std": 0.12183425389230251, + "rewards/accuracy_reward": 0.08258928847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4782366305589676, + "step": 2049 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.247802734375, + "epoch": 0.6123515794190127, + "grad_norm": 2.5628061294555664, + "kl": 2.0, + "learning_rate": 7.837301062616531e-06, + "loss": 0.0875, + "reward": 0.614397332072258, + "reward_std": 0.11850786302238703, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482700914144516, + "step": 2050 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.3058624267578, + "epoch": 0.6126502875065343, + "grad_norm": 1.0391439199447632, + "kl": 1.951171875, + "learning_rate": 7.827118832761487e-06, + "loss": 0.0872, + "reward": 0.624441996216774, + "reward_std": 0.17189620435237885, + "rewards/accuracy_reward": 0.14062500931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169887661934, + "step": 2051 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.6228179931641, + "epoch": 0.6129489955940557, + "grad_norm": 0.8938345909118652, + "kl": 1.8994140625, + "learning_rate": 7.816938966788185e-06, + "loss": 0.0798, + "reward": 0.659040205180645, + "reward_std": 0.13664689008146524, + "rewards/accuracy_reward": 0.1741071529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 2052 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.0558319091797, + "epoch": 0.6132477036815772, + "grad_norm": 0.9370392560958862, + "kl": 1.2001953125, + "learning_rate": 7.806761475771325e-06, + "loss": 0.0534, + "reward": 0.6322545111179352, + "reward_std": 0.09906433336436749, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2053 + }, + { + "clip_ratio": 0.0, + "completion_length": 979.9308471679688, + "epoch": 0.6135464117690986, + "grad_norm": 1.1178346872329712, + "kl": 1.52734375, + "learning_rate": 7.796586370783019e-06, + "loss": 0.0721, + "reward": 0.6171875298023224, + "reward_std": 0.07998538296669722, + "rewards/accuracy_reward": 0.12723215017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553880095482, + "step": 2054 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.7165679931641, + "epoch": 0.6138451198566202, + "grad_norm": 0.9652811884880066, + "kl": 1.4130859375, + "learning_rate": 7.786413662892785e-06, + "loss": 0.0609, + "reward": 0.5256696492433548, + "reward_std": 0.07619444956071675, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2055 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.9844207763672, + "epoch": 0.6141438279441416, + "grad_norm": 0.4431708753108978, + "kl": 0.83056640625, + "learning_rate": 7.776243363167529e-06, + "loss": 0.0367, + "reward": 0.577566996216774, + "reward_std": 0.0986925158649683, + "rewards/accuracy_reward": 0.08258929033763707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2056 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.6830749511719, + "epoch": 0.6144425360316631, + "grad_norm": 0.4977600574493408, + "kl": 0.56982421875, + "learning_rate": 7.766075482671544e-06, + "loss": 0.029, + "reward": 0.576450914144516, + "reward_std": 0.09198926016688347, + "rewards/accuracy_reward": 0.0803571455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 2057 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.7656555175781, + "epoch": 0.6147412441191845, + "grad_norm": 0.897567629814148, + "kl": 0.6748046875, + "learning_rate": 7.755910032466485e-06, + "loss": 0.0253, + "reward": 0.6294643133878708, + "reward_std": 0.08057615021243691, + "rewards/accuracy_reward": 0.13392857951112092, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2058 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.8772735595703, + "epoch": 0.615039952206706, + "grad_norm": 1.3844127655029297, + "kl": 0.841796875, + "learning_rate": 7.745747023611367e-06, + "loss": 0.049, + "reward": 0.6679687798023224, + "reward_std": 0.15116459969431162, + "rewards/accuracy_reward": 0.17410715040750802, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2059 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.9018249511719, + "epoch": 0.6153386602942275, + "grad_norm": 0.5445359349250793, + "kl": 0.44189453125, + "learning_rate": 7.735586467162544e-06, + "loss": 0.0177, + "reward": 0.537388414144516, + "reward_std": 0.05909485602751374, + "rewards/accuracy_reward": 0.04017857275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2060 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.6406707763672, + "epoch": 0.615637368381749, + "grad_norm": 0.1896967887878418, + "kl": 0.263427734375, + "learning_rate": 7.725428374173712e-06, + "loss": 0.0113, + "reward": 0.529575914144516, + "reward_std": 0.03911088826134801, + "rewards/accuracy_reward": 0.031250000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 2061 + }, + { + "clip_ratio": 0.0, + "completion_length": 988.5692291259766, + "epoch": 0.6159360764692704, + "grad_norm": 0.14744772017002106, + "kl": 0.189208984375, + "learning_rate": 7.715272755695876e-06, + "loss": 0.0079, + "reward": 0.6501116454601288, + "reward_std": 0.04350606631487608, + "rewards/accuracy_reward": 0.15178571944124997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 2062 + }, + { + "clip_ratio": 0.0, + "completion_length": 971.3817443847656, + "epoch": 0.6162347845567919, + "grad_norm": 0.4436878263950348, + "kl": 0.271240234375, + "learning_rate": 7.705119622777351e-06, + "loss": 0.0154, + "reward": 0.6417411118745804, + "reward_std": 0.08205306995660067, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 2063 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.9263763427734, + "epoch": 0.6165334926443133, + "grad_norm": 1.165212869644165, + "kl": 0.296142578125, + "learning_rate": 7.694968986463758e-06, + "loss": 0.0129, + "reward": 0.5390625298023224, + "reward_std": 0.09727109270170331, + "rewards/accuracy_reward": 0.042410717345774174, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 2064 + }, + { + "clip_ratio": 0.0, + "completion_length": 994.0178833007812, + "epoch": 0.6168322007318349, + "grad_norm": 1.2037378549575806, + "kl": 0.3310546875, + "learning_rate": 7.68482085779799e-06, + "loss": 0.0131, + "reward": 0.6662946790456772, + "reward_std": 0.09598501306027174, + "rewards/accuracy_reward": 0.1696428693830967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 2065 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.9219360351562, + "epoch": 0.6171309088193563, + "grad_norm": 0.4145765006542206, + "kl": 0.225341796875, + "learning_rate": 7.674675247820215e-06, + "loss": 0.009, + "reward": 0.510044664144516, + "reward_std": 0.043448752257972956, + "rewards/accuracy_reward": 0.011160715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 2066 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.5692443847656, + "epoch": 0.6174296169068777, + "grad_norm": 0.3074344992637634, + "kl": 0.212890625, + "learning_rate": 7.664532167567864e-06, + "loss": 0.0115, + "reward": 0.5987723618745804, + "reward_std": 0.10754550062119961, + "rewards/accuracy_reward": 0.10044643399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 2067 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.3036193847656, + "epoch": 0.6177283249943992, + "grad_norm": 0.38206639885902405, + "kl": 0.22119140625, + "learning_rate": 7.654391628075616e-06, + "loss": 0.0105, + "reward": 0.631138414144516, + "reward_std": 0.06454206863418221, + "rewards/accuracy_reward": 0.1339285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2068 + }, + { + "clip_ratio": 0.0, + "completion_length": 967.3013916015625, + "epoch": 0.6180270330819206, + "grad_norm": 0.21003785729408264, + "kl": 0.285400390625, + "learning_rate": 7.644253640375382e-06, + "loss": 0.0144, + "reward": 0.6434151977300644, + "reward_std": 0.12258049473166466, + "rewards/accuracy_reward": 0.14508929289877415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 2069 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.2522888183594, + "epoch": 0.6183257411694422, + "grad_norm": 0.34064939618110657, + "kl": 0.479248046875, + "learning_rate": 7.634118215496298e-06, + "loss": 0.0198, + "reward": 0.645647332072258, + "reward_std": 0.0600777855142951, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 2070 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.5804138183594, + "epoch": 0.6186244492569636, + "grad_norm": 0.2533400356769562, + "kl": 0.3203125, + "learning_rate": 7.623985364464715e-06, + "loss": 0.013, + "reward": 0.6450893133878708, + "reward_std": 0.026785715715959668, + "rewards/accuracy_reward": 0.1473214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 2071 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.2210235595703, + "epoch": 0.6189231573444851, + "grad_norm": 0.6483811140060425, + "kl": 0.361572265625, + "learning_rate": 7.613855098304182e-06, + "loss": 0.0143, + "reward": 0.5569196492433548, + "reward_std": 0.061524902703240514, + "rewards/accuracy_reward": 0.0580357164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 2072 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.9308471679688, + "epoch": 0.6192218654320065, + "grad_norm": 0.22186313569545746, + "kl": 0.290771484375, + "learning_rate": 7.6037274280354345e-06, + "loss": 0.0097, + "reward": 0.5485491305589676, + "reward_std": 0.07236112281680107, + "rewards/accuracy_reward": 0.051339288940653205, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2073 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.6719207763672, + "epoch": 0.619520573519528, + "grad_norm": 0.4941845238208771, + "kl": 0.56298828125, + "learning_rate": 7.593602364676382e-06, + "loss": 0.0214, + "reward": 0.5708705633878708, + "reward_std": 0.09620491042733192, + "rewards/accuracy_reward": 0.07589286402799189, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 2074 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.6562957763672, + "epoch": 0.6198192816070495, + "grad_norm": 0.27482202649116516, + "kl": 0.484619140625, + "learning_rate": 7.583479919242108e-06, + "loss": 0.0207, + "reward": 0.5485491305589676, + "reward_std": 0.09428776358254254, + "rewards/accuracy_reward": 0.0513392873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2075 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.404052734375, + "epoch": 0.620117989694571, + "grad_norm": 0.3323429226875305, + "kl": 0.5888671875, + "learning_rate": 7.573360102744838e-06, + "loss": 0.023, + "reward": 0.6914062649011612, + "reward_std": 0.09917688113637269, + "rewards/accuracy_reward": 0.1941964402794838, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2076 + }, + { + "clip_ratio": 0.0, + "completion_length": 994.1272735595703, + "epoch": 0.6204166977820924, + "grad_norm": 0.2695518434047699, + "kl": 0.343017578125, + "learning_rate": 7.563242926193937e-06, + "loss": 0.0167, + "reward": 0.631138414144516, + "reward_std": 0.09011871553957462, + "rewards/accuracy_reward": 0.13392857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2077 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.0870971679688, + "epoch": 0.6207154058696139, + "grad_norm": 1.5313605070114136, + "kl": 0.58251953125, + "learning_rate": 7.553128400595906e-06, + "loss": 0.0243, + "reward": 0.5803571492433548, + "reward_std": 0.07778281648643315, + "rewards/accuracy_reward": 0.08258928591385484, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 2078 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.3594055175781, + "epoch": 0.6210141139571353, + "grad_norm": 0.4999546706676483, + "kl": 0.61474609375, + "learning_rate": 7.5430165369543566e-06, + "loss": 0.0255, + "reward": 0.5323660969734192, + "reward_std": 0.12883657962083817, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2079 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.216552734375, + "epoch": 0.6213128220446569, + "grad_norm": 1.0005441904067993, + "kl": 0.6044921875, + "learning_rate": 7.532907346270004e-06, + "loss": 0.0303, + "reward": 0.5954241156578064, + "reward_std": 0.1331852525472641, + "rewards/accuracy_reward": 0.1026785746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 2080 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.669677734375, + "epoch": 0.6216115301321783, + "grad_norm": 0.9342233538627625, + "kl": 0.7734375, + "learning_rate": 7.522800839540656e-06, + "loss": 0.0319, + "reward": 0.5195312798023224, + "reward_std": 0.09326805360615253, + "rewards/accuracy_reward": 0.03125000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2081 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.5268249511719, + "epoch": 0.6219102382196998, + "grad_norm": 1.9621220827102661, + "kl": 1.0810546875, + "learning_rate": 7.512697027761204e-06, + "loss": 0.0451, + "reward": 0.6863839477300644, + "reward_std": 0.11674288660287857, + "rewards/accuracy_reward": 0.1986607201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 2082 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.0268402099609, + "epoch": 0.6222089463072212, + "grad_norm": 1.131304144859314, + "kl": 0.90673828125, + "learning_rate": 7.5025959219236055e-06, + "loss": 0.0339, + "reward": 0.5708705484867096, + "reward_std": 0.13914230093359947, + "rewards/accuracy_reward": 0.0803571492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2083 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.6585235595703, + "epoch": 0.6225076543947428, + "grad_norm": 0.5696030855178833, + "kl": 1.53125, + "learning_rate": 7.49249753301687e-06, + "loss": 0.0546, + "reward": 0.631696455180645, + "reward_std": 0.11209673713892698, + "rewards/accuracy_reward": 0.1495535783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4821428805589676, + "step": 2084 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.1897735595703, + "epoch": 0.6228063624822642, + "grad_norm": 0.8875250816345215, + "kl": 1.4921875, + "learning_rate": 7.482401872027058e-06, + "loss": 0.0609, + "reward": 0.628348246216774, + "reward_std": 0.1229532640427351, + "rewards/accuracy_reward": 0.1428571455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910969734192, + "step": 2085 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.7835235595703, + "epoch": 0.6231050705697857, + "grad_norm": 0.6828718185424805, + "kl": 1.193359375, + "learning_rate": 7.4723089499372595e-06, + "loss": 0.0526, + "reward": 0.5563616380095482, + "reward_std": 0.09709831327199936, + "rewards/accuracy_reward": 0.07142857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 2086 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.1964721679688, + "epoch": 0.6234037786573071, + "grad_norm": 0.5193591117858887, + "kl": 1.470703125, + "learning_rate": 7.462218777727581e-06, + "loss": 0.0547, + "reward": 0.592075914144516, + "reward_std": 0.14380438067018986, + "rewards/accuracy_reward": 0.11160715040750802, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687723517418, + "step": 2087 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.0402374267578, + "epoch": 0.6237024867448286, + "grad_norm": 0.6761676669120789, + "kl": 1.46484375, + "learning_rate": 7.452131366375142e-06, + "loss": 0.0669, + "reward": 0.5502232387661934, + "reward_std": 0.11491194553673267, + "rewards/accuracy_reward": 0.06919643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810268059372902, + "step": 2088 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.8192443847656, + "epoch": 0.6240011948323501, + "grad_norm": 0.811362624168396, + "kl": 1.4951171875, + "learning_rate": 7.442046726854061e-06, + "loss": 0.0669, + "reward": 0.556919664144516, + "reward_std": 0.10916595719754696, + "rewards/accuracy_reward": 0.07812500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.478794664144516, + "step": 2089 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.6629943847656, + "epoch": 0.6242999029198716, + "grad_norm": 1.5432021617889404, + "kl": 1.361328125, + "learning_rate": 7.4319648701354355e-06, + "loss": 0.062, + "reward": 0.5820312649011612, + "reward_std": 0.11599225178360939, + "rewards/accuracy_reward": 0.09598214644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486049123108387, + "step": 2090 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.5781707763672, + "epoch": 0.624598611007393, + "grad_norm": 1.0778391361236572, + "kl": 1.666015625, + "learning_rate": 7.421885807187332e-06, + "loss": 0.0703, + "reward": 0.5496651902794838, + "reward_std": 0.11283225938677788, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.478236623108387, + "step": 2091 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.9598541259766, + "epoch": 0.6248973190949145, + "grad_norm": 0.6139773726463318, + "kl": 1.76171875, + "learning_rate": 7.411809548974792e-06, + "loss": 0.0669, + "reward": 0.5937500298023224, + "reward_std": 0.14057935401797295, + "rewards/accuracy_reward": 0.1138392873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4799107313156128, + "step": 2092 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.0781707763672, + "epoch": 0.6251960271824359, + "grad_norm": 1.3870023488998413, + "kl": 1.94921875, + "learning_rate": 7.4017361064597925e-06, + "loss": 0.0748, + "reward": 0.6406250149011612, + "reward_std": 0.17403830774128437, + "rewards/accuracy_reward": 0.1651785783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4754464477300644, + "step": 2093 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.6205749511719, + "epoch": 0.6254947352699575, + "grad_norm": 0.7372204661369324, + "kl": 1.958984375, + "learning_rate": 7.391665490601252e-06, + "loss": 0.0808, + "reward": 0.6160714626312256, + "reward_std": 0.09922522306442261, + "rewards/accuracy_reward": 0.13392857951112092, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4821428805589676, + "step": 2094 + }, + { + "clip_ratio": 0.0, + "completion_length": 971.3817443847656, + "epoch": 0.6257934433574789, + "grad_norm": 1.2305902242660522, + "kl": 2.263671875, + "learning_rate": 7.381597712355011e-06, + "loss": 0.1015, + "reward": 0.5496652126312256, + "reward_std": 0.13222614116966724, + "rewards/accuracy_reward": 0.06919643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687723517418, + "step": 2095 + }, + { + "clip_ratio": 0.0, + "completion_length": 973.7433471679688, + "epoch": 0.6260921514450004, + "grad_norm": 1.2451568841934204, + "kl": 2.076171875, + "learning_rate": 7.371532782673832e-06, + "loss": 0.0933, + "reward": 0.5558035969734192, + "reward_std": 0.1374222543090582, + "rewards/accuracy_reward": 0.07589286286383867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4799107313156128, + "step": 2096 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.6808471679688, + "epoch": 0.6263908595325218, + "grad_norm": 0.9594593048095703, + "kl": 2.091796875, + "learning_rate": 7.3614707125073645e-06, + "loss": 0.0807, + "reward": 0.7315848469734192, + "reward_std": 0.1597341001033783, + "rewards/accuracy_reward": 0.2522321529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4793526977300644, + "step": 2097 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.8839721679688, + "epoch": 0.6266895676200434, + "grad_norm": 0.6172942519187927, + "kl": 2.01171875, + "learning_rate": 7.351411512802158e-06, + "loss": 0.0828, + "reward": 0.5351562798023224, + "reward_std": 0.1370671410113573, + "rewards/accuracy_reward": 0.0558035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4793526977300644, + "step": 2098 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.7746124267578, + "epoch": 0.6269882757075648, + "grad_norm": 1.065045714378357, + "kl": 1.939453125, + "learning_rate": 7.341355194501638e-06, + "loss": 0.0917, + "reward": 0.5340401902794838, + "reward_std": 0.08268214017152786, + "rewards/accuracy_reward": 0.05133928777649999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482700914144516, + "step": 2099 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.8638916015625, + "epoch": 0.6272869837950863, + "grad_norm": 2.0034422874450684, + "kl": 1.849609375, + "learning_rate": 7.331301768546091e-06, + "loss": 0.0798, + "reward": 0.674107164144516, + "reward_std": 0.15869303233921528, + "rewards/accuracy_reward": 0.1897321566939354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750223517418, + "step": 2100 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.0201416015625, + "epoch": 0.6275856918826077, + "grad_norm": 0.6482963562011719, + "kl": 1.52734375, + "learning_rate": 7.3212512458726605e-06, + "loss": 0.0659, + "reward": 0.5797991305589676, + "reward_std": 0.1352705042809248, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 2101 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.4844207763672, + "epoch": 0.6278843999701292, + "grad_norm": 0.8381676077842712, + "kl": 1.51953125, + "learning_rate": 7.311203637415325e-06, + "loss": 0.0678, + "reward": 0.6277901977300644, + "reward_std": 0.10390076227486134, + "rewards/accuracy_reward": 0.1383928656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2102 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.8080902099609, + "epoch": 0.6281831080576507, + "grad_norm": 0.3910122215747833, + "kl": 0.9423828125, + "learning_rate": 7.301158954104905e-06, + "loss": 0.0421, + "reward": 0.6344866305589676, + "reward_std": 0.11358737852424383, + "rewards/accuracy_reward": 0.14062500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2103 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.1808624267578, + "epoch": 0.6284818161451722, + "grad_norm": 0.46680957078933716, + "kl": 0.3662109375, + "learning_rate": 7.291117206869027e-06, + "loss": 0.0061, + "reward": 0.5820312798023224, + "reward_std": 0.10504274070262909, + "rewards/accuracy_reward": 0.08482143189758062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2104 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.9732513427734, + "epoch": 0.6287805242326936, + "grad_norm": 0.5122930407524109, + "kl": 0.423583984375, + "learning_rate": 7.281078406632127e-06, + "loss": 0.0223, + "reward": 0.628348246216774, + "reward_std": 0.08997954754158854, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 2105 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.3884429931641, + "epoch": 0.6290792323202151, + "grad_norm": 0.1991850733757019, + "kl": 0.34375, + "learning_rate": 7.27104256431544e-06, + "loss": 0.0129, + "reward": 0.6450893133878708, + "reward_std": 0.09921126067638397, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 2106 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.8281555175781, + "epoch": 0.6293779404077365, + "grad_norm": 0.38539037108421326, + "kl": 0.335205078125, + "learning_rate": 7.261009690836977e-06, + "loss": 0.0157, + "reward": 0.7220982313156128, + "reward_std": 0.13000646233558655, + "rewards/accuracy_reward": 0.223214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 2107 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.5826263427734, + "epoch": 0.6296766484952581, + "grad_norm": 0.8312045931816101, + "kl": 0.31494140625, + "learning_rate": 7.2509797971115195e-06, + "loss": 0.013, + "reward": 0.6311384290456772, + "reward_std": 0.09290281590074301, + "rewards/accuracy_reward": 0.1339285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2108 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.5268249511719, + "epoch": 0.6299753565827795, + "grad_norm": 0.3069949448108673, + "kl": 0.312255859375, + "learning_rate": 7.240952894050608e-06, + "loss": 0.0162, + "reward": 0.6473214477300644, + "reward_std": 0.08484486304223537, + "rewards/accuracy_reward": 0.14955357648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 2109 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.4308624267578, + "epoch": 0.6302740646703009, + "grad_norm": 0.35475239157676697, + "kl": 0.17822265625, + "learning_rate": 7.230928992562534e-06, + "loss": 0.0078, + "reward": 0.5357143133878708, + "reward_std": 0.06199972238391638, + "rewards/accuracy_reward": 0.03571428777649999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5, + "step": 2110 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.6496124267578, + "epoch": 0.6305727727578224, + "grad_norm": 0.7703284621238708, + "kl": 0.22509765625, + "learning_rate": 7.220908103552319e-06, + "loss": 0.0129, + "reward": 0.6300223618745804, + "reward_std": 0.1340099722146988, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 2111 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.8705749511719, + "epoch": 0.6308714808453438, + "grad_norm": 0.2718397378921509, + "kl": 0.267578125, + "learning_rate": 7.210890237921704e-06, + "loss": 0.0104, + "reward": 0.5680803805589676, + "reward_std": 0.04891707026399672, + "rewards/accuracy_reward": 0.06919643143191934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 2112 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.122802734375, + "epoch": 0.6311701889328654, + "grad_norm": 0.5201249718666077, + "kl": 0.531005859375, + "learning_rate": 7.20087540656915e-06, + "loss": 0.0247, + "reward": 0.5904017984867096, + "reward_std": 0.05140371061861515, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 2113 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.8973541259766, + "epoch": 0.6314688970203868, + "grad_norm": 0.39981138706207275, + "kl": 0.34130859375, + "learning_rate": 7.1908636203898094e-06, + "loss": 0.0163, + "reward": 0.627232164144516, + "reward_std": 0.10739377443678677, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 2114 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.3973693847656, + "epoch": 0.6317676051079083, + "grad_norm": 0.3499888479709625, + "kl": 0.273193359375, + "learning_rate": 7.180854890275527e-06, + "loss": 0.0103, + "reward": 0.6774553805589676, + "reward_std": 0.1537309568375349, + "rewards/accuracy_reward": 0.17857143143191934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 2115 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.9888916015625, + "epoch": 0.6320663131954297, + "grad_norm": 0.4166637361049652, + "kl": 0.4443359375, + "learning_rate": 7.1708492271148144e-06, + "loss": 0.0202, + "reward": 0.6489955633878708, + "reward_std": 0.09333761339075863, + "rewards/accuracy_reward": 0.1517857238650322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2116 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.1071624755859, + "epoch": 0.6323650212829512, + "grad_norm": 0.2572956383228302, + "kl": 0.33642578125, + "learning_rate": 7.160846641792858e-06, + "loss": 0.0137, + "reward": 0.599888414144516, + "reward_std": 0.07957001822069287, + "rewards/accuracy_reward": 0.10267857508733869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2117 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.2723693847656, + "epoch": 0.6326637293704727, + "grad_norm": 0.25122907757759094, + "kl": 0.31201171875, + "learning_rate": 7.150847145191489e-06, + "loss": 0.013, + "reward": 0.5195312649011612, + "reward_std": 0.046046038158237934, + "rewards/accuracy_reward": 0.022321430267766118, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2118 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.3572082519531, + "epoch": 0.6329624374579942, + "grad_norm": 0.22618910670280457, + "kl": 0.276611328125, + "learning_rate": 7.140850748189177e-06, + "loss": 0.0114, + "reward": 0.565848246216774, + "reward_std": 0.10773994959890842, + "rewards/accuracy_reward": 0.06919643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 2119 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.2701263427734, + "epoch": 0.6332611455455156, + "grad_norm": 1.4197640419006348, + "kl": 0.43310546875, + "learning_rate": 7.130857461661027e-06, + "loss": 0.0164, + "reward": 0.5295759215950966, + "reward_std": 0.054419394582509995, + "rewards/accuracy_reward": 0.0379464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 2120 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.4375457763672, + "epoch": 0.6335598536330371, + "grad_norm": 0.6805276274681091, + "kl": 0.41015625, + "learning_rate": 7.1208672964787505e-06, + "loss": 0.0178, + "reward": 0.6082589477300644, + "reward_std": 0.14920718874782324, + "rewards/accuracy_reward": 0.11607143399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2121 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.6317443847656, + "epoch": 0.6338585617205585, + "grad_norm": 0.41297805309295654, + "kl": 0.72216796875, + "learning_rate": 7.110880263510672e-06, + "loss": 0.0346, + "reward": 0.5703125298023224, + "reward_std": 0.08823821996338665, + "rewards/accuracy_reward": 0.07589286100119352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2122 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.6652221679688, + "epoch": 0.6341572698080801, + "grad_norm": 0.5298332571983337, + "kl": 0.478515625, + "learning_rate": 7.1008963736217e-06, + "loss": 0.0199, + "reward": 0.6612723618745804, + "reward_std": 0.13920381478965282, + "rewards/accuracy_reward": 0.16741072502918541, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2123 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.4754791259766, + "epoch": 0.6344559778956015, + "grad_norm": 0.3406008183956146, + "kl": 0.513916015625, + "learning_rate": 7.090915637673333e-06, + "loss": 0.0179, + "reward": 0.6132812798023224, + "reward_std": 0.04182449961081147, + "rewards/accuracy_reward": 0.11830357555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2124 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.6585388183594, + "epoch": 0.634754685983123, + "grad_norm": 0.5695227980613708, + "kl": 0.8447265625, + "learning_rate": 7.080938066523631e-06, + "loss": 0.0369, + "reward": 0.577008955180645, + "reward_std": 0.058816577307879925, + "rewards/accuracy_reward": 0.08482143003493547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2125 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.4888763427734, + "epoch": 0.6350533940706444, + "grad_norm": 0.8431451320648193, + "kl": 0.8330078125, + "learning_rate": 7.0709636710272115e-06, + "loss": 0.0344, + "reward": 0.6054687649011612, + "reward_std": 0.13727711886167526, + "rewards/accuracy_reward": 0.11160714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2126 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.1719207763672, + "epoch": 0.635352102158166, + "grad_norm": 1.1832058429718018, + "kl": 1.244140625, + "learning_rate": 7.060992462035243e-06, + "loss": 0.0506, + "reward": 0.5602678954601288, + "reward_std": 0.06675258139148355, + "rewards/accuracy_reward": 0.06919643399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2127 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.5290679931641, + "epoch": 0.6356508102456874, + "grad_norm": 0.5806981921195984, + "kl": 0.8515625, + "learning_rate": 7.051024450395424e-06, + "loss": 0.0356, + "reward": 0.616071455180645, + "reward_std": 0.06105575431138277, + "rewards/accuracy_reward": 0.12276786682195961, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2128 + }, + { + "clip_ratio": 0.0, + "completion_length": 994.0536041259766, + "epoch": 0.6359495183332089, + "grad_norm": 0.9895308017730713, + "kl": 0.62939453125, + "learning_rate": 7.041059646951971e-06, + "loss": 0.0288, + "reward": 0.6160714626312256, + "reward_std": 0.10077936318702996, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2129 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.9955749511719, + "epoch": 0.6362482264207303, + "grad_norm": 0.649876594543457, + "kl": 0.9765625, + "learning_rate": 7.031098062545614e-06, + "loss": 0.037, + "reward": 0.5474330633878708, + "reward_std": 0.12845918163657188, + "rewards/accuracy_reward": 0.05803571664728224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2130 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.4397735595703, + "epoch": 0.6365469345082518, + "grad_norm": 0.7114452719688416, + "kl": 0.8857421875, + "learning_rate": 7.021139708013582e-06, + "loss": 0.0368, + "reward": 0.577566996216774, + "reward_std": 0.08424340607598424, + "rewards/accuracy_reward": 0.08705357671715319, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 2131 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.9129943847656, + "epoch": 0.6368456425957733, + "grad_norm": 0.8164703249931335, + "kl": 0.83349609375, + "learning_rate": 7.0111845941895885e-06, + "loss": 0.0304, + "reward": 0.6662946939468384, + "reward_std": 0.11573021113872528, + "rewards/accuracy_reward": 0.17633929220028222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2132 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.3884429931641, + "epoch": 0.6371443506832948, + "grad_norm": 1.0227220058441162, + "kl": 0.966796875, + "learning_rate": 7.001232731903818e-06, + "loss": 0.039, + "reward": 0.6383928954601288, + "reward_std": 0.09003833122551441, + "rewards/accuracy_reward": 0.14508929289877415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 2133 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.0714874267578, + "epoch": 0.6374430587708162, + "grad_norm": 0.5778571367263794, + "kl": 0.9599609375, + "learning_rate": 6.991284131982927e-06, + "loss": 0.0417, + "reward": 0.6049107387661934, + "reward_std": 0.060520872473716736, + "rewards/accuracy_reward": 0.11383929220028222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2134 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.2500305175781, + "epoch": 0.6377417668583377, + "grad_norm": 0.36697208881378174, + "kl": 0.55322265625, + "learning_rate": 6.981338805250015e-06, + "loss": 0.0185, + "reward": 0.6026786118745804, + "reward_std": 0.1355750486254692, + "rewards/accuracy_reward": 0.10937500605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 2135 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.5647735595703, + "epoch": 0.6380404749458591, + "grad_norm": 0.40585726499557495, + "kl": 0.8486328125, + "learning_rate": 6.971396762524622e-06, + "loss": 0.0359, + "reward": 0.6462053880095482, + "reward_std": 0.1111032348126173, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2136 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.9955902099609, + "epoch": 0.6383391830333807, + "grad_norm": 0.3111594617366791, + "kl": 0.79296875, + "learning_rate": 6.9614580146227155e-06, + "loss": 0.026, + "reward": 0.6132812798023224, + "reward_std": 0.10990881291218102, + "rewards/accuracy_reward": 0.12500000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812649011612, + "step": 2137 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.7522888183594, + "epoch": 0.6386378911209021, + "grad_norm": 0.8238638639450073, + "kl": 1.1298828125, + "learning_rate": 6.951522572356682e-06, + "loss": 0.0529, + "reward": 0.6434152126312256, + "reward_std": 0.16860534437000751, + "rewards/accuracy_reward": 0.15848215110599995, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 2138 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.6875457763672, + "epoch": 0.6389365992084236, + "grad_norm": 0.5733610391616821, + "kl": 1.107421875, + "learning_rate": 6.9415904465353045e-06, + "loss": 0.0394, + "reward": 0.5463169887661934, + "reward_std": 0.09040075726807117, + "rewards/accuracy_reward": 0.0602678619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486049123108387, + "step": 2139 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.5804138183594, + "epoch": 0.639235307295945, + "grad_norm": 0.728426456451416, + "kl": 1.6328125, + "learning_rate": 6.931661647963766e-06, + "loss": 0.0614, + "reward": 0.5937500223517418, + "reward_std": 0.1261790469288826, + "rewards/accuracy_reward": 0.11160715133883059, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4821428805589676, + "step": 2140 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.0736846923828, + "epoch": 0.6395340153834665, + "grad_norm": 1.4300826787948608, + "kl": 1.462890625, + "learning_rate": 6.921736187443624e-06, + "loss": 0.0646, + "reward": 0.5976562798023224, + "reward_std": 0.07269168365746737, + "rewards/accuracy_reward": 0.11160714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 2141 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.4063110351562, + "epoch": 0.639832723470988, + "grad_norm": 1.025917410850525, + "kl": 2.091796875, + "learning_rate": 6.911814075772809e-06, + "loss": 0.0831, + "reward": 0.6702009290456772, + "reward_std": 0.1358182206749916, + "rewards/accuracy_reward": 0.19419644447043538, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4760044887661934, + "step": 2142 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.5335388183594, + "epoch": 0.6401314315585095, + "grad_norm": 0.8780614733695984, + "kl": 1.662109375, + "learning_rate": 6.901895323745604e-06, + "loss": 0.0566, + "reward": 0.553571455180645, + "reward_std": 0.15273914113640785, + "rewards/accuracy_reward": 0.07142857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4821428805589676, + "step": 2143 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.9330749511719, + "epoch": 0.6404301396460309, + "grad_norm": 1.3437047004699707, + "kl": 1.408203125, + "learning_rate": 6.89197994215264e-06, + "loss": 0.055, + "reward": 0.5915178954601288, + "reward_std": 0.11283598467707634, + "rewards/accuracy_reward": 0.11160714644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4799107313156128, + "step": 2144 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.2031555175781, + "epoch": 0.6407288477335524, + "grad_norm": 1.1529022455215454, + "kl": 1.1787109375, + "learning_rate": 6.882067941780881e-06, + "loss": 0.043, + "reward": 0.5172991305589676, + "reward_std": 0.11196042224764824, + "rewards/accuracy_reward": 0.03348214388824999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169887661934, + "step": 2145 + }, + { + "clip_ratio": 0.0, + "completion_length": 993.0178985595703, + "epoch": 0.6410275558210738, + "grad_norm": 0.31695693731307983, + "kl": 1.0712890625, + "learning_rate": 6.87215933341361e-06, + "loss": 0.0354, + "reward": 0.542410746216774, + "reward_std": 0.08597779180854559, + "rewards/accuracy_reward": 0.05580357578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 2146 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.6629943847656, + "epoch": 0.6413262639085954, + "grad_norm": 0.4368631839752197, + "kl": 1.130859375, + "learning_rate": 6.862254127830426e-06, + "loss": 0.0507, + "reward": 0.663504496216774, + "reward_std": 0.1501939594745636, + "rewards/accuracy_reward": 0.1785714402794838, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330484867096, + "step": 2147 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.0870971679688, + "epoch": 0.6416249719961168, + "grad_norm": 0.4947449564933777, + "kl": 1.345703125, + "learning_rate": 6.852352335807213e-06, + "loss": 0.0517, + "reward": 0.5652901977300644, + "reward_std": 0.10884686186909676, + "rewards/accuracy_reward": 0.08482143399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687649011612, + "step": 2148 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.9397583007812, + "epoch": 0.6419236800836383, + "grad_norm": 0.33358725905418396, + "kl": 0.875, + "learning_rate": 6.84245396811616e-06, + "loss": 0.033, + "reward": 0.645647332072258, + "reward_std": 0.15817617811262608, + "rewards/accuracy_reward": 0.1562500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2149 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.513427734375, + "epoch": 0.6422223881711597, + "grad_norm": 0.4517800807952881, + "kl": 1.0859375, + "learning_rate": 6.832559035525716e-06, + "loss": 0.0388, + "reward": 0.5965401977300644, + "reward_std": 0.09739200631156564, + "rewards/accuracy_reward": 0.10714286286383867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2150 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.2567291259766, + "epoch": 0.6425210962586813, + "grad_norm": 1.0554146766662598, + "kl": 1.37890625, + "learning_rate": 6.822667548800599e-06, + "loss": 0.05, + "reward": 0.502790205180645, + "reward_std": 0.09152681473642588, + "rewards/accuracy_reward": 0.022321429569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4804687723517418, + "step": 2151 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.2991485595703, + "epoch": 0.6428198043462027, + "grad_norm": 1.1017884016036987, + "kl": 1.37890625, + "learning_rate": 6.812779518701778e-06, + "loss": 0.0487, + "reward": 0.5831473395228386, + "reward_std": 0.09328066557645798, + "rewards/accuracy_reward": 0.09598214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 2152 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.8683471679688, + "epoch": 0.6431185124337241, + "grad_norm": 0.5278904438018799, + "kl": 1.1650390625, + "learning_rate": 6.802894955986459e-06, + "loss": 0.0451, + "reward": 0.5468750223517418, + "reward_std": 0.06113169435411692, + "rewards/accuracy_reward": 0.05803571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 2153 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.3951263427734, + "epoch": 0.6434172205212456, + "grad_norm": 0.38453367352485657, + "kl": 1.0234375, + "learning_rate": 6.793013871408076e-06, + "loss": 0.0333, + "reward": 0.5758928805589676, + "reward_std": 0.1181839257478714, + "rewards/accuracy_reward": 0.0870535725262016, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 2154 + }, + { + "clip_ratio": 0.0, + "completion_length": 988.1161041259766, + "epoch": 0.643715928608767, + "grad_norm": 1.004463791847229, + "kl": 0.66259765625, + "learning_rate": 6.783136275716283e-06, + "loss": 0.0305, + "reward": 0.6289062798023224, + "reward_std": 0.165894391015172, + "rewards/accuracy_reward": 0.13392857694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2155 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.8995819091797, + "epoch": 0.6440146366962886, + "grad_norm": 0.4910001754760742, + "kl": 1.001953125, + "learning_rate": 6.773262179656936e-06, + "loss": 0.0418, + "reward": 0.5982143133878708, + "reward_std": 0.11123557109385729, + "rewards/accuracy_reward": 0.10937500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888392984867096, + "step": 2156 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.1049499511719, + "epoch": 0.64431334478381, + "grad_norm": 0.37806516885757446, + "kl": 0.92578125, + "learning_rate": 6.763391593972084e-06, + "loss": 0.0362, + "reward": 0.5675223395228386, + "reward_std": 0.11556066107004881, + "rewards/accuracy_reward": 0.07812500465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2157 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.9732513427734, + "epoch": 0.6446120528713315, + "grad_norm": 0.37917736172676086, + "kl": 0.8505859375, + "learning_rate": 6.7535245293999556e-06, + "loss": 0.0355, + "reward": 0.553013414144516, + "reward_std": 0.07319658854976296, + "rewards/accuracy_reward": 0.06250000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2158 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.8795013427734, + "epoch": 0.6449107609588529, + "grad_norm": 0.7409151196479797, + "kl": 0.619140625, + "learning_rate": 6.74366099667495e-06, + "loss": 0.0231, + "reward": 0.6311384290456772, + "reward_std": 0.0858603110536933, + "rewards/accuracy_reward": 0.13616071874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2159 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.6161193847656, + "epoch": 0.6452094690463744, + "grad_norm": 0.383797287940979, + "kl": 0.651123046875, + "learning_rate": 6.733801006527625e-06, + "loss": 0.029, + "reward": 0.5708705633878708, + "reward_std": 0.14656018931418657, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2160 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.7054138183594, + "epoch": 0.6455081771338959, + "grad_norm": 0.3935738205909729, + "kl": 0.63037109375, + "learning_rate": 6.723944569684684e-06, + "loss": 0.028, + "reward": 0.5820312798023224, + "reward_std": 0.0618733037263155, + "rewards/accuracy_reward": 0.08928571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 2161 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.0558471679688, + "epoch": 0.6458068852214174, + "grad_norm": 0.3667910397052765, + "kl": 0.74853515625, + "learning_rate": 6.71409169686896e-06, + "loss": 0.0297, + "reward": 0.612723246216774, + "reward_std": 0.10073816170915961, + "rewards/accuracy_reward": 0.12053571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875298023224, + "step": 2162 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.5179138183594, + "epoch": 0.6461055933089388, + "grad_norm": 0.9862575531005859, + "kl": 0.77734375, + "learning_rate": 6.704242398799419e-06, + "loss": 0.0339, + "reward": 0.5742187947034836, + "reward_std": 0.1340541336685419, + "rewards/accuracy_reward": 0.08705357811413705, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 2163 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.5647735595703, + "epoch": 0.6464043013964603, + "grad_norm": 0.3412686884403229, + "kl": 0.7900390625, + "learning_rate": 6.6943966861911295e-06, + "loss": 0.0357, + "reward": 0.5619419813156128, + "reward_std": 0.10508348233997822, + "rewards/accuracy_reward": 0.07366071944124997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2164 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.5692443847656, + "epoch": 0.6467030094839817, + "grad_norm": 0.4404727816581726, + "kl": 0.9453125, + "learning_rate": 6.684554569755258e-06, + "loss": 0.0386, + "reward": 0.5368303805589676, + "reward_std": 0.1026340713724494, + "rewards/accuracy_reward": 0.0491071455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 2165 + }, + { + "clip_ratio": 0.0, + "completion_length": 994.7031555175781, + "epoch": 0.6470017175715033, + "grad_norm": 0.3061926066875458, + "kl": 0.48681640625, + "learning_rate": 6.674716060199069e-06, + "loss": 0.0181, + "reward": 0.5775669813156128, + "reward_std": 0.09744521090760827, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 2166 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.357177734375, + "epoch": 0.6473004256590247, + "grad_norm": 0.6774777770042419, + "kl": 0.8916015625, + "learning_rate": 6.664881168225894e-06, + "loss": 0.0381, + "reward": 0.663504496216774, + "reward_std": 0.08523313514888287, + "rewards/accuracy_reward": 0.17187500931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2167 + }, + { + "clip_ratio": 0.0, + "completion_length": 1000.3013916015625, + "epoch": 0.6475991337465462, + "grad_norm": 0.6415561437606812, + "kl": 1.412109375, + "learning_rate": 6.655049904535131e-06, + "loss": 0.0594, + "reward": 0.591517873108387, + "reward_std": 0.13319113664329052, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750223517418, + "step": 2168 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.825927734375, + "epoch": 0.6478978418340676, + "grad_norm": 0.8761913180351257, + "kl": 1.06640625, + "learning_rate": 6.645222279822229e-06, + "loss": 0.0429, + "reward": 0.5831473544239998, + "reward_std": 0.12037032935768366, + "rewards/accuracy_reward": 0.09375000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2169 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.5558471679688, + "epoch": 0.6481965499215891, + "grad_norm": 1.0824707746505737, + "kl": 0.900390625, + "learning_rate": 6.635398304778685e-06, + "loss": 0.0353, + "reward": 0.564732164144516, + "reward_std": 0.09490375150926411, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714402794838, + "step": 2170 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.8036193847656, + "epoch": 0.6484952580091106, + "grad_norm": 0.4050203859806061, + "kl": 0.7216796875, + "learning_rate": 6.625577990092019e-06, + "loss": 0.0315, + "reward": 0.5948660969734192, + "reward_std": 0.055031922878697515, + "rewards/accuracy_reward": 0.10044643469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2171 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.0670013427734, + "epoch": 0.6487939660966321, + "grad_norm": 1.1505259275436401, + "kl": 0.8427734375, + "learning_rate": 6.615761346445769e-06, + "loss": 0.0377, + "reward": 0.5943080633878708, + "reward_std": 0.08827809989452362, + "rewards/accuracy_reward": 0.10491071501746774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2172 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.2254791259766, + "epoch": 0.6490926741841535, + "grad_norm": 0.5167048573493958, + "kl": 0.71337890625, + "learning_rate": 6.605948384519485e-06, + "loss": 0.0273, + "reward": 0.5697544813156128, + "reward_std": 0.10100223775953054, + "rewards/accuracy_reward": 0.07812500582076609, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2173 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.8348693847656, + "epoch": 0.649391382271675, + "grad_norm": 1.1547198295593262, + "kl": 0.7236328125, + "learning_rate": 6.5961391149887065e-06, + "loss": 0.0318, + "reward": 0.5485491305589676, + "reward_std": 0.10336998105049133, + "rewards/accuracy_reward": 0.05803571594879031, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 2174 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.9464721679688, + "epoch": 0.6496900903591964, + "grad_norm": 0.5959609150886536, + "kl": 0.61181640625, + "learning_rate": 6.586333548524957e-06, + "loss": 0.0195, + "reward": 0.6289062798023224, + "reward_std": 0.1502219494432211, + "rewards/accuracy_reward": 0.13392857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2175 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.4509429931641, + "epoch": 0.649988798446718, + "grad_norm": 0.8579246997833252, + "kl": 1.07763671875, + "learning_rate": 6.576531695795727e-06, + "loss": 0.049, + "reward": 0.5485491305589676, + "reward_std": 0.10734725114889443, + "rewards/accuracy_reward": 0.06026785937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2176 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.8370971679688, + "epoch": 0.6502875065342394, + "grad_norm": 0.623225212097168, + "kl": 0.85205078125, + "learning_rate": 6.56673356746448e-06, + "loss": 0.0281, + "reward": 0.5055803880095482, + "reward_std": 0.06666272692382336, + "rewards/accuracy_reward": 0.01562500116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2177 + }, + { + "clip_ratio": 0.0, + "completion_length": 994.5223693847656, + "epoch": 0.6505862146217609, + "grad_norm": 0.39104679226875305, + "kl": 0.9892578125, + "learning_rate": 6.556939174190615e-06, + "loss": 0.0414, + "reward": 0.6121651977300644, + "reward_std": 0.14093644730746746, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2178 + }, + { + "clip_ratio": 0.0, + "completion_length": 1003.8303985595703, + "epoch": 0.6508849227092823, + "grad_norm": 0.5357213616371155, + "kl": 0.72119140625, + "learning_rate": 6.54714852662947e-06, + "loss": 0.0271, + "reward": 0.5898437798023224, + "reward_std": 0.08447528630495071, + "rewards/accuracy_reward": 0.0959821455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2179 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.6964721679688, + "epoch": 0.6511836307968039, + "grad_norm": 1.6977477073669434, + "kl": 1.009765625, + "learning_rate": 6.537361635432316e-06, + "loss": 0.0374, + "reward": 0.5993303805589676, + "reward_std": 0.09920112416148186, + "rewards/accuracy_reward": 0.1071428656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2180 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.0022735595703, + "epoch": 0.6514823388843253, + "grad_norm": 0.8421157002449036, + "kl": 0.9599609375, + "learning_rate": 6.527578511246325e-06, + "loss": 0.0379, + "reward": 0.559151828289032, + "reward_std": 0.10459611937403679, + "rewards/accuracy_reward": 0.07142857392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 2181 + }, + { + "clip_ratio": 0.0, + "completion_length": 988.9643249511719, + "epoch": 0.6517810469718468, + "grad_norm": 0.3860340416431427, + "kl": 0.6630859375, + "learning_rate": 6.517799164714581e-06, + "loss": 0.0272, + "reward": 0.6969866454601288, + "reward_std": 0.14571506343781948, + "rewards/accuracy_reward": 0.20312500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2182 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.4442443847656, + "epoch": 0.6520797550593682, + "grad_norm": 0.4233379364013672, + "kl": 0.8310546875, + "learning_rate": 6.508023606476052e-06, + "loss": 0.0416, + "reward": 0.6015625447034836, + "reward_std": 0.11999266780912876, + "rewards/accuracy_reward": 0.10937500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2183 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.9866485595703, + "epoch": 0.6523784631468897, + "grad_norm": 1.1031183004379272, + "kl": 0.7900390625, + "learning_rate": 6.498251847165589e-06, + "loss": 0.0301, + "reward": 0.5630580708384514, + "reward_std": 0.08944011759012938, + "rewards/accuracy_reward": 0.07142857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2184 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.529052734375, + "epoch": 0.6526771712344112, + "grad_norm": 0.5715473890304565, + "kl": 0.81689453125, + "learning_rate": 6.4884838974139096e-06, + "loss": 0.0347, + "reward": 0.5993303805589676, + "reward_std": 0.10960838012397289, + "rewards/accuracy_reward": 0.10491072339937091, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2185 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.5781707763672, + "epoch": 0.6529758793219327, + "grad_norm": 0.6863930821418762, + "kl": 0.904296875, + "learning_rate": 6.478719767847581e-06, + "loss": 0.0407, + "reward": 0.6121652126312256, + "reward_std": 0.10681519471108913, + "rewards/accuracy_reward": 0.12053571850992739, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2186 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.0178985595703, + "epoch": 0.6532745874094541, + "grad_norm": 0.45875459909439087, + "kl": 0.6240234375, + "learning_rate": 6.468959469089025e-06, + "loss": 0.0293, + "reward": 0.632254496216774, + "reward_std": 0.06478786934167147, + "rewards/accuracy_reward": 0.1361607222352177, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 2187 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.1920013427734, + "epoch": 0.6535732954969756, + "grad_norm": 0.20766444504261017, + "kl": 0.513916015625, + "learning_rate": 6.4592030117564885e-06, + "loss": 0.0254, + "reward": 0.5747767984867096, + "reward_std": 0.07995840907096863, + "rewards/accuracy_reward": 0.07812500465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 2188 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.7455902099609, + "epoch": 0.653872003584497, + "grad_norm": 0.4797004163265228, + "kl": 0.671875, + "learning_rate": 6.44945040646404e-06, + "loss": 0.0271, + "reward": 0.5792410969734192, + "reward_std": 0.094409991055727, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2189 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.5290679931641, + "epoch": 0.6541707116720186, + "grad_norm": 0.24659061431884766, + "kl": 0.61474609375, + "learning_rate": 6.4397016638215535e-06, + "loss": 0.024, + "reward": 0.5362723469734192, + "reward_std": 0.08325026836246252, + "rewards/accuracy_reward": 0.04241071501746774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616156578064, + "step": 2190 + }, + { + "clip_ratio": 0.0, + "completion_length": 1021.0000457763672, + "epoch": 0.65446941975954, + "grad_norm": 0.3591518700122833, + "kl": 0.681640625, + "learning_rate": 6.429956794434714e-06, + "loss": 0.0274, + "reward": 0.534598246216774, + "reward_std": 0.08357477188110352, + "rewards/accuracy_reward": 0.04017857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2191 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.5558471679688, + "epoch": 0.6547681278470615, + "grad_norm": 0.2463628053665161, + "kl": 0.336181640625, + "learning_rate": 6.420215808904979e-06, + "loss": 0.0131, + "reward": 0.6121651977300644, + "reward_std": 0.09299615141935647, + "rewards/accuracy_reward": 0.1160714328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 2192 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.6518249511719, + "epoch": 0.6550668359345829, + "grad_norm": 1.6853657960891724, + "kl": 0.712890625, + "learning_rate": 6.410478717829587e-06, + "loss": 0.0288, + "reward": 0.6121651977300644, + "reward_std": 0.14167637843638659, + "rewards/accuracy_reward": 0.12276786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2193 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.2634429931641, + "epoch": 0.6553655440221045, + "grad_norm": 0.6525989770889282, + "kl": 0.364501953125, + "learning_rate": 6.40074553180154e-06, + "loss": 0.0143, + "reward": 0.7382812798023224, + "reward_std": 0.1339323464781046, + "rewards/accuracy_reward": 0.2433035783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 2194 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.2299499511719, + "epoch": 0.6556642521096259, + "grad_norm": 1.0359281301498413, + "kl": 0.42529296875, + "learning_rate": 6.39101626140959e-06, + "loss": 0.0202, + "reward": 0.5753348469734192, + "reward_std": 0.10253606364130974, + "rewards/accuracy_reward": 0.08258929033763707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 2195 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.5335388183594, + "epoch": 0.6559629601971473, + "grad_norm": 0.5343538522720337, + "kl": 0.62255859375, + "learning_rate": 6.381290917238229e-06, + "loss": 0.0259, + "reward": 0.541294664144516, + "reward_std": 0.08042792649939656, + "rewards/accuracy_reward": 0.053571430733427405, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 2196 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.4576416015625, + "epoch": 0.6562616682846688, + "grad_norm": 0.7102624177932739, + "kl": 0.71484375, + "learning_rate": 6.371569509867676e-06, + "loss": 0.03, + "reward": 0.5647321790456772, + "reward_std": 0.11160638369619846, + "rewards/accuracy_reward": 0.07812500488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 2197 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.5491485595703, + "epoch": 0.6565603763721902, + "grad_norm": 1.1389648914337158, + "kl": 0.8134765625, + "learning_rate": 6.361852049873875e-06, + "loss": 0.0362, + "reward": 0.5485491305589676, + "reward_std": 0.10298341047018766, + "rewards/accuracy_reward": 0.06250000302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 2198 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.5960235595703, + "epoch": 0.6568590844597118, + "grad_norm": 1.4177355766296387, + "kl": 0.8359375, + "learning_rate": 6.352138547828466e-06, + "loss": 0.0328, + "reward": 0.5613839626312256, + "reward_std": 0.13799288868904114, + "rewards/accuracy_reward": 0.06919643189758062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2199 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.5670013427734, + "epoch": 0.6571577925472332, + "grad_norm": 1.473509669303894, + "kl": 1.58984375, + "learning_rate": 6.342429014298786e-06, + "loss": 0.0644, + "reward": 0.556361623108387, + "reward_std": 0.12220897898077965, + "rewards/accuracy_reward": 0.066964291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2200 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.7835083007812, + "epoch": 0.6574565006347547, + "grad_norm": 1.4488283395767212, + "kl": 1.6181640625, + "learning_rate": 6.3327234598478605e-06, + "loss": 0.0662, + "reward": 0.5658482313156128, + "reward_std": 0.04971216106787324, + "rewards/accuracy_reward": 0.07589285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2201 + }, + { + "clip_ratio": 0.0, + "completion_length": 992.8929138183594, + "epoch": 0.6577552087222761, + "grad_norm": 0.5934643745422363, + "kl": 1.2138671875, + "learning_rate": 6.323021895034378e-06, + "loss": 0.0467, + "reward": 0.5770089477300644, + "reward_std": 0.10944618145003915, + "rewards/accuracy_reward": 0.08928571920841932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 2202 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.5223693847656, + "epoch": 0.6580539168097976, + "grad_norm": 1.2924200296401978, + "kl": 0.88134765625, + "learning_rate": 6.313324330412692e-06, + "loss": 0.0365, + "reward": 0.609375037252903, + "reward_std": 0.14808541350066662, + "rewards/accuracy_reward": 0.11607143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2203 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.060302734375, + "epoch": 0.658352624897319, + "grad_norm": 0.9182121157646179, + "kl": 1.1455078125, + "learning_rate": 6.303630776532799e-06, + "loss": 0.0421, + "reward": 0.6210937574505806, + "reward_std": 0.11590467859059572, + "rewards/accuracy_reward": 0.1294642947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 2204 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.1116333007812, + "epoch": 0.6586513329848406, + "grad_norm": 0.3982387185096741, + "kl": 0.82421875, + "learning_rate": 6.29394124394034e-06, + "loss": 0.033, + "reward": 0.5440848469734192, + "reward_std": 0.0920057212933898, + "rewards/accuracy_reward": 0.05357143213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 2205 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.6406707763672, + "epoch": 0.658950041072362, + "grad_norm": 0.9898501038551331, + "kl": 1.091796875, + "learning_rate": 6.284255743176576e-06, + "loss": 0.0402, + "reward": 0.558035746216774, + "reward_std": 0.13316749222576618, + "rewards/accuracy_reward": 0.07589286169968545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482142873108387, + "step": 2206 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.1942443847656, + "epoch": 0.6592487491598835, + "grad_norm": 0.5590789318084717, + "kl": 0.6943359375, + "learning_rate": 6.274574284778379e-06, + "loss": 0.0286, + "reward": 0.5223214328289032, + "reward_std": 0.11709038354456425, + "rewards/accuracy_reward": 0.03348214412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 2207 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.0201416015625, + "epoch": 0.6595474572474049, + "grad_norm": 0.34505918622016907, + "kl": 0.587890625, + "learning_rate": 6.26489687927823e-06, + "loss": 0.0245, + "reward": 0.5563616305589676, + "reward_std": 0.11595427058637142, + "rewards/accuracy_reward": 0.0625000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2208 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.2790679931641, + "epoch": 0.6598461653349265, + "grad_norm": 0.3524356782436371, + "kl": 0.54931640625, + "learning_rate": 6.2552235372041985e-06, + "loss": 0.0209, + "reward": 0.5987723469734192, + "reward_std": 0.07926274463534355, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491629496216774, + "step": 2209 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.6049499511719, + "epoch": 0.6601448734224479, + "grad_norm": 0.518404483795166, + "kl": 0.60498046875, + "learning_rate": 6.245554269079929e-06, + "loss": 0.0227, + "reward": 0.5970982313156128, + "reward_std": 0.08823249815031886, + "rewards/accuracy_reward": 0.10491072130389512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2210 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.0759429931641, + "epoch": 0.6604435815099694, + "grad_norm": 0.6499364972114563, + "kl": 0.57421875, + "learning_rate": 6.235889085424638e-06, + "loss": 0.0252, + "reward": 0.5658482313156128, + "reward_std": 0.0845095943659544, + "rewards/accuracy_reward": 0.07142857275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2211 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.8303985595703, + "epoch": 0.6607422895974908, + "grad_norm": 1.2426725625991821, + "kl": 0.61376953125, + "learning_rate": 6.226227996753102e-06, + "loss": 0.0269, + "reward": 0.6774553805589676, + "reward_std": 0.08310611546039581, + "rewards/accuracy_reward": 0.1852678693830967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2212 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.388427734375, + "epoch": 0.6610409976850123, + "grad_norm": 1.5519074201583862, + "kl": 0.87890625, + "learning_rate": 6.2165710135756365e-06, + "loss": 0.0289, + "reward": 0.7271205633878708, + "reward_std": 0.12737772520631552, + "rewards/accuracy_reward": 0.2410714402794838, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 2213 + }, + { + "clip_ratio": 0.0, + "completion_length": 1006.2500305175781, + "epoch": 0.6613397057725338, + "grad_norm": 1.0144455432891846, + "kl": 0.96142578125, + "learning_rate": 6.206918146398091e-06, + "loss": 0.0356, + "reward": 0.5619419813156128, + "reward_std": 0.07802959764376283, + "rewards/accuracy_reward": 0.06919642887078226, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2214 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.6406707763672, + "epoch": 0.6616384138600553, + "grad_norm": 0.6780259013175964, + "kl": 1.07421875, + "learning_rate": 6.1972694057218404e-06, + "loss": 0.043, + "reward": 0.6785714477300644, + "reward_std": 0.12754846177995205, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2215 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.6964721679688, + "epoch": 0.6619371219475767, + "grad_norm": 1.2668324708938599, + "kl": 0.9375, + "learning_rate": 6.18762480204377e-06, + "loss": 0.0352, + "reward": 0.6439732313156128, + "reward_std": 0.08412761357612908, + "rewards/accuracy_reward": 0.15401785937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2216 + }, + { + "clip_ratio": 0.0, + "completion_length": 998.9777221679688, + "epoch": 0.6622358300350982, + "grad_norm": 0.6055968999862671, + "kl": 0.84033203125, + "learning_rate": 6.177984345856262e-06, + "loss": 0.034, + "reward": 0.5362723469734192, + "reward_std": 0.11023041419684887, + "rewards/accuracy_reward": 0.04687500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2217 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.0268249511719, + "epoch": 0.6625345381226196, + "grad_norm": 0.6557443141937256, + "kl": 0.765625, + "learning_rate": 6.168348047647185e-06, + "loss": 0.0322, + "reward": 0.5976562649011612, + "reward_std": 0.09617492370307446, + "rewards/accuracy_reward": 0.10491072200238705, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2218 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.1339721679688, + "epoch": 0.6628332462101412, + "grad_norm": 0.3502541780471802, + "kl": 0.47705078125, + "learning_rate": 6.158715917899892e-06, + "loss": 0.0145, + "reward": 0.5809152126312256, + "reward_std": 0.07535755261778831, + "rewards/accuracy_reward": 0.08482143213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 2219 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.6920166015625, + "epoch": 0.6631319542976626, + "grad_norm": 0.2229478359222412, + "kl": 0.470703125, + "learning_rate": 6.149087967093195e-06, + "loss": 0.0188, + "reward": 0.5680803805589676, + "reward_std": 0.10590216936543584, + "rewards/accuracy_reward": 0.0758928619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2220 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.1473541259766, + "epoch": 0.6634306623851841, + "grad_norm": 1.1075505018234253, + "kl": 0.61572265625, + "learning_rate": 6.13946420570136e-06, + "loss": 0.0256, + "reward": 0.6735491454601288, + "reward_std": 0.11273685283958912, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486049123108387, + "step": 2221 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.2589721679688, + "epoch": 0.6637293704727055, + "grad_norm": 0.2778659462928772, + "kl": 0.4482421875, + "learning_rate": 6.1298446441940916e-06, + "loss": 0.0167, + "reward": 0.6032366156578064, + "reward_std": 0.12568804435431957, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 2222 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.5178985595703, + "epoch": 0.664028078560227, + "grad_norm": 1.7298190593719482, + "kl": 0.53076171875, + "learning_rate": 6.120229293036539e-06, + "loss": 0.0225, + "reward": 0.643973246216774, + "reward_std": 0.10197559976950288, + "rewards/accuracy_reward": 0.15625000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 2223 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.5022735595703, + "epoch": 0.6643267866477485, + "grad_norm": 0.973955512046814, + "kl": 0.4853515625, + "learning_rate": 6.110618162689257e-06, + "loss": 0.0191, + "reward": 0.6238839477300644, + "reward_std": 0.07476191152818501, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2224 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.2902374267578, + "epoch": 0.66462549473527, + "grad_norm": 0.523257315158844, + "kl": 0.5517578125, + "learning_rate": 6.10101126360821e-06, + "loss": 0.021, + "reward": 0.5887277126312256, + "reward_std": 0.07641125936061144, + "rewards/accuracy_reward": 0.09598214644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 2225 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.5736999511719, + "epoch": 0.6649242028227914, + "grad_norm": 0.3817518353462219, + "kl": 0.865234375, + "learning_rate": 6.091408606244769e-06, + "loss": 0.0293, + "reward": 0.5301339626312256, + "reward_std": 0.14227662608027458, + "rewards/accuracy_reward": 0.04464285867288709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 2226 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.3705749511719, + "epoch": 0.6652229109103129, + "grad_norm": 0.886033833026886, + "kl": 1.1533203125, + "learning_rate": 6.081810201045681e-06, + "loss": 0.0481, + "reward": 0.6316964626312256, + "reward_std": 0.07532603712752461, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2227 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.2924652099609, + "epoch": 0.6655216189978344, + "grad_norm": 0.6420987248420715, + "kl": 1.01171875, + "learning_rate": 6.072216058453071e-06, + "loss": 0.0421, + "reward": 0.549665205180645, + "reward_std": 0.0995666729286313, + "rewards/accuracy_reward": 0.0625000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 2228 + }, + { + "clip_ratio": 0.0, + "completion_length": 1018.9732666015625, + "epoch": 0.6658203270853559, + "grad_norm": 1.4975491762161255, + "kl": 1.5, + "learning_rate": 6.0626261889044236e-06, + "loss": 0.0601, + "reward": 0.5770089477300644, + "reward_std": 0.08508024644106627, + "rewards/accuracy_reward": 0.09151786309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 2229 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.3437957763672, + "epoch": 0.6661190351728773, + "grad_norm": 0.5161663293838501, + "kl": 0.8349609375, + "learning_rate": 6.053040602832581e-06, + "loss": 0.0309, + "reward": 0.516741082072258, + "reward_std": 0.0982140121050179, + "rewards/accuracy_reward": 0.026785715715959668, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2230 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.4241485595703, + "epoch": 0.6664177432603988, + "grad_norm": 0.37445011734962463, + "kl": 1.03515625, + "learning_rate": 6.043459310665716e-06, + "loss": 0.0444, + "reward": 0.6261160969734192, + "reward_std": 0.17215615138411522, + "rewards/accuracy_reward": 0.13616071827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2231 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.0982666015625, + "epoch": 0.6667164513479202, + "grad_norm": 0.5996907353401184, + "kl": 0.85302734375, + "learning_rate": 6.033882322827338e-06, + "loss": 0.0349, + "reward": 0.650669664144516, + "reward_std": 0.06104830093681812, + "rewards/accuracy_reward": 0.1584821455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2232 + }, + { + "clip_ratio": 0.0, + "completion_length": 1016.9821929931641, + "epoch": 0.6670151594354418, + "grad_norm": 0.6072633862495422, + "kl": 0.9375, + "learning_rate": 6.024309649736276e-06, + "loss": 0.0352, + "reward": 0.5195312723517418, + "reward_std": 0.09307891316711903, + "rewards/accuracy_reward": 0.03125000046566129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2233 + }, + { + "clip_ratio": 0.0, + "completion_length": 1015.1987152099609, + "epoch": 0.6673138675229632, + "grad_norm": 0.641674816608429, + "kl": 1.279296875, + "learning_rate": 6.0147413018066515e-06, + "loss": 0.0493, + "reward": 0.5089285895228386, + "reward_std": 0.07974492199718952, + "rewards/accuracy_reward": 0.022321430267766118, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 2234 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.0312805175781, + "epoch": 0.6676125756104847, + "grad_norm": 0.4755811393260956, + "kl": 1.1318359375, + "learning_rate": 6.005177289447895e-06, + "loss": 0.0481, + "reward": 0.6160714626312256, + "reward_std": 0.09260386694222689, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4866071715950966, + "step": 2235 + }, + { + "clip_ratio": 0.0, + "completion_length": 1017.7478179931641, + "epoch": 0.6679112836980061, + "grad_norm": 0.6794756054878235, + "kl": 1.099609375, + "learning_rate": 5.9956176230647115e-06, + "loss": 0.0453, + "reward": 0.5396205484867096, + "reward_std": 0.09629574324935675, + "rewards/accuracy_reward": 0.0580357164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848395228386, + "step": 2236 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.8549652099609, + "epoch": 0.6682099917855276, + "grad_norm": 0.5873441696166992, + "kl": 0.76416015625, + "learning_rate": 5.986062313057084e-06, + "loss": 0.0315, + "reward": 0.5535714626312256, + "reward_std": 0.16826334595680237, + "rewards/accuracy_reward": 0.07142857555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4821428805589676, + "step": 2237 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.8728179931641, + "epoch": 0.6685086998730491, + "grad_norm": 0.8793880343437195, + "kl": 1.388671875, + "learning_rate": 5.97651136982025e-06, + "loss": 0.0564, + "reward": 0.6032366380095482, + "reward_std": 0.07648633047938347, + "rewards/accuracy_reward": 0.12053572107106447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482700914144516, + "step": 2238 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.4107513427734, + "epoch": 0.6688074079605706, + "grad_norm": 0.6835016012191772, + "kl": 1.060546875, + "learning_rate": 5.966964803744701e-06, + "loss": 0.0374, + "reward": 0.5719866305589676, + "reward_std": 0.14014616049826145, + "rewards/accuracy_reward": 0.08705357578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330633878708, + "step": 2239 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.3460235595703, + "epoch": 0.669106116048092, + "grad_norm": 0.5353832840919495, + "kl": 1.12109375, + "learning_rate": 5.957422625216168e-06, + "loss": 0.0467, + "reward": 0.588169664144516, + "reward_std": 0.08727864967659116, + "rewards/accuracy_reward": 0.10044643515720963, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 2240 + }, + { + "clip_ratio": 0.0, + "completion_length": 1013.6473846435547, + "epoch": 0.6694048241356134, + "grad_norm": 0.9681724905967712, + "kl": 1.095703125, + "learning_rate": 5.947884844615603e-06, + "loss": 0.0451, + "reward": 0.5094866380095482, + "reward_std": 0.08537047542631626, + "rewards/accuracy_reward": 0.020089286845177412, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2241 + }, + { + "clip_ratio": 0.0, + "completion_length": 1001.9129943847656, + "epoch": 0.669703532223135, + "grad_norm": 0.5871195793151855, + "kl": 1.0908203125, + "learning_rate": 5.938351472319177e-06, + "loss": 0.0471, + "reward": 0.6238839626312256, + "reward_std": 0.12483706139028072, + "rewards/accuracy_reward": 0.13616072293370962, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 2242 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.1361999511719, + "epoch": 0.6700022403106564, + "grad_norm": 1.8571641445159912, + "kl": 1.2275390625, + "learning_rate": 5.928822518698263e-06, + "loss": 0.0473, + "reward": 0.7539062798023224, + "reward_std": 0.10763327591121197, + "rewards/accuracy_reward": 0.2656250111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812798023224, + "step": 2243 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.6719055175781, + "epoch": 0.6703009483981779, + "grad_norm": 1.3211793899536133, + "kl": 0.904296875, + "learning_rate": 5.919297994119433e-06, + "loss": 0.0382, + "reward": 0.5731026977300644, + "reward_std": 0.095084382686764, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2244 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.5022735595703, + "epoch": 0.6705996564856993, + "grad_norm": 0.650820791721344, + "kl": 0.716796875, + "learning_rate": 5.909777908944433e-06, + "loss": 0.0274, + "reward": 0.640066996216774, + "reward_std": 0.08875192329287529, + "rewards/accuracy_reward": 0.14955357951112092, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2245 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.2812957763672, + "epoch": 0.6708983645732208, + "grad_norm": 0.6110336780548096, + "kl": 0.65625, + "learning_rate": 5.9002622735301815e-06, + "loss": 0.0267, + "reward": 0.6484375298023224, + "reward_std": 0.07157322252169251, + "rewards/accuracy_reward": 0.15625000488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2246 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.3973693847656, + "epoch": 0.6711970726607422, + "grad_norm": 0.5824168920516968, + "kl": 0.47412109375, + "learning_rate": 5.89075109822876e-06, + "loss": 0.02, + "reward": 0.6132812649011612, + "reward_std": 0.04994848766364157, + "rewards/accuracy_reward": 0.12053571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2247 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.9777374267578, + "epoch": 0.6714957807482638, + "grad_norm": 1.1376447677612305, + "kl": 0.7900390625, + "learning_rate": 5.881244393387395e-06, + "loss": 0.0332, + "reward": 0.5195312798023224, + "reward_std": 0.07437247037887573, + "rewards/accuracy_reward": 0.031250001629814506, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812798023224, + "step": 2248 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.5491333007812, + "epoch": 0.6717944888357852, + "grad_norm": 0.5216787457466125, + "kl": 0.58349609375, + "learning_rate": 5.871742169348447e-06, + "loss": 0.0285, + "reward": 0.545200914144516, + "reward_std": 0.06913393456488848, + "rewards/accuracy_reward": 0.05133928847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2249 + }, + { + "clip_ratio": 0.0, + "completion_length": 1005.1272735595703, + "epoch": 0.6720931969233067, + "grad_norm": 0.2912411391735077, + "kl": 0.6650390625, + "learning_rate": 5.862244436449405e-06, + "loss": 0.0299, + "reward": 0.550223246216774, + "reward_std": 0.08908412419259548, + "rewards/accuracy_reward": 0.06026785890571773, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2250 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.9553985595703, + "epoch": 0.6723919050108281, + "grad_norm": 0.7545526623725891, + "kl": 0.6240234375, + "learning_rate": 5.852751205022875e-06, + "loss": 0.0229, + "reward": 0.617187537252903, + "reward_std": 0.08675350062549114, + "rewards/accuracy_reward": 0.12723214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2251 + }, + { + "clip_ratio": 0.0, + "completion_length": 1010.1585235595703, + "epoch": 0.6726906130983497, + "grad_norm": 0.5838342905044556, + "kl": 0.64306640625, + "learning_rate": 5.84326248539656e-06, + "loss": 0.0248, + "reward": 0.5708705633878708, + "reward_std": 0.11750354059040546, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2252 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.8080749511719, + "epoch": 0.6729893211858711, + "grad_norm": 0.43511906266212463, + "kl": 0.712890625, + "learning_rate": 5.833778287893257e-06, + "loss": 0.0271, + "reward": 0.6305803805589676, + "reward_std": 0.1410332815721631, + "rewards/accuracy_reward": 0.13839286682195961, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2253 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.3170166015625, + "epoch": 0.6732880292733926, + "grad_norm": 0.9605370163917542, + "kl": 0.8330078125, + "learning_rate": 5.82429862283084e-06, + "loss": 0.0363, + "reward": 0.5301339477300644, + "reward_std": 0.10063901171088219, + "rewards/accuracy_reward": 0.037946430733427405, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2254 + }, + { + "clip_ratio": 0.0, + "completion_length": 1012.4174499511719, + "epoch": 0.673586737360914, + "grad_norm": 0.7285669445991516, + "kl": 0.984375, + "learning_rate": 5.81482350052226e-06, + "loss": 0.0424, + "reward": 0.5719866454601288, + "reward_std": 0.1108868457376957, + "rewards/accuracy_reward": 0.07812500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2255 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.6071929931641, + "epoch": 0.6738854454484355, + "grad_norm": 0.8449117541313171, + "kl": 1.0166015625, + "learning_rate": 5.805352931275522e-06, + "loss": 0.0396, + "reward": 0.5982143133878708, + "reward_std": 0.06659391708672047, + "rewards/accuracy_reward": 0.1049107201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 2256 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.8259429931641, + "epoch": 0.674184153535957, + "grad_norm": 0.6059203147888184, + "kl": 1.109375, + "learning_rate": 5.795886925393672e-06, + "loss": 0.0475, + "reward": 0.5686384215950966, + "reward_std": 0.075085308868438, + "rewards/accuracy_reward": 0.0758928619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2257 + }, + { + "clip_ratio": 0.0, + "completion_length": 1011.8951416015625, + "epoch": 0.6744828616234785, + "grad_norm": 0.3263081908226013, + "kl": 0.678466796875, + "learning_rate": 5.786425493174801e-06, + "loss": 0.0263, + "reward": 0.5719866305589676, + "reward_std": 0.05849523702636361, + "rewards/accuracy_reward": 0.0803571455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2258 + }, + { + "clip_ratio": 0.0, + "completion_length": 1007.9978179931641, + "epoch": 0.6747815697109999, + "grad_norm": 0.5792844295501709, + "kl": 0.7431640625, + "learning_rate": 5.7769686449120225e-06, + "loss": 0.0354, + "reward": 0.6060268133878708, + "reward_std": 0.12003615498542786, + "rewards/accuracy_reward": 0.11830358020961285, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 2259 + }, + { + "clip_ratio": 0.0, + "completion_length": 1004.5938110351562, + "epoch": 0.6750802777985214, + "grad_norm": 0.41545629501342773, + "kl": 0.72265625, + "learning_rate": 5.767516390893451e-06, + "loss": 0.0251, + "reward": 0.5703125149011612, + "reward_std": 0.04705207422375679, + "rewards/accuracy_reward": 0.07589285913854837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2260 + }, + { + "clip_ratio": 0.0, + "completion_length": 999.841552734375, + "epoch": 0.6753789858860428, + "grad_norm": 0.48785367608070374, + "kl": 0.404296875, + "learning_rate": 5.758068741402223e-06, + "loss": 0.0222, + "reward": 0.588169664144516, + "reward_std": 0.08830190310254693, + "rewards/accuracy_reward": 0.09598214644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2261 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.325927734375, + "epoch": 0.6756776939735644, + "grad_norm": 0.3996833264827728, + "kl": 0.49951171875, + "learning_rate": 5.748625706716448e-06, + "loss": 0.0204, + "reward": 0.5468750149011612, + "reward_std": 0.12377753853797913, + "rewards/accuracy_reward": 0.055803573690354824, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2262 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.7254943847656, + "epoch": 0.6759764020610858, + "grad_norm": 0.839296817779541, + "kl": 0.478515625, + "learning_rate": 5.739187297109223e-06, + "loss": 0.0277, + "reward": 0.629464328289032, + "reward_std": 0.09471411351114511, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 2263 + }, + { + "clip_ratio": 0.0, + "completion_length": 1008.6496124267578, + "epoch": 0.6762751101486073, + "grad_norm": 0.9316655993461609, + "kl": 0.53369140625, + "learning_rate": 5.729753522848618e-06, + "loss": 0.0222, + "reward": 0.553013414144516, + "reward_std": 0.10825521685183048, + "rewards/accuracy_reward": 0.06250000325962901, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 2264 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.8013916015625, + "epoch": 0.6765738182361287, + "grad_norm": 1.1620794534683228, + "kl": 0.451904296875, + "learning_rate": 5.720324394197649e-06, + "loss": 0.0219, + "reward": 0.6032366305589676, + "reward_std": 0.1515002530068159, + "rewards/accuracy_reward": 0.11160715040750802, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 2265 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.8460235595703, + "epoch": 0.6768725263236502, + "grad_norm": 0.5392255783081055, + "kl": 0.7099609375, + "learning_rate": 5.710899921414284e-06, + "loss": 0.0341, + "reward": 0.5625000298023224, + "reward_std": 0.0652616317383945, + "rewards/accuracy_reward": 0.0669642873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2266 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.4353179931641, + "epoch": 0.6771712344111717, + "grad_norm": 0.5845035314559937, + "kl": 0.95947265625, + "learning_rate": 5.701480114751432e-06, + "loss": 0.0394, + "reward": 0.6478794813156128, + "reward_std": 0.12705429270863533, + "rewards/accuracy_reward": 0.15625000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 2267 + }, + { + "clip_ratio": 0.0, + "completion_length": 979.5245819091797, + "epoch": 0.6774699424986932, + "grad_norm": 0.6388135552406311, + "kl": 1.1201171875, + "learning_rate": 5.692064984456911e-06, + "loss": 0.0537, + "reward": 0.6082589626312256, + "reward_std": 0.12243225611746311, + "rewards/accuracy_reward": 0.1160714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2268 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.7656707763672, + "epoch": 0.6777686505862146, + "grad_norm": 1.2839926481246948, + "kl": 1.767578125, + "learning_rate": 5.6826545407734636e-06, + "loss": 0.0727, + "reward": 0.6261161044239998, + "reward_std": 0.12953107338398695, + "rewards/accuracy_reward": 0.1406250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 2269 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.5736999511719, + "epoch": 0.6780673586737361, + "grad_norm": 0.9373607039451599, + "kl": 1.3994140625, + "learning_rate": 5.673248793938735e-06, + "loss": 0.0663, + "reward": 0.5558036118745804, + "reward_std": 0.07971106376498938, + "rewards/accuracy_reward": 0.06696428847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 2270 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.9442291259766, + "epoch": 0.6783660667612575, + "grad_norm": 1.8201558589935303, + "kl": 2.62109375, + "learning_rate": 5.663847754185246e-06, + "loss": 0.1202, + "reward": 0.5786830559372902, + "reward_std": 0.1269170381128788, + "rewards/accuracy_reward": 0.09598214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4827009215950966, + "step": 2271 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.2188110351562, + "epoch": 0.6786647748487791, + "grad_norm": 0.9710779786109924, + "kl": 2.400390625, + "learning_rate": 5.654451431740417e-06, + "loss": 0.1254, + "reward": 0.5496651977300644, + "reward_std": 0.11617325991392136, + "rewards/accuracy_reward": 0.06473214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 2272 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.4933471679688, + "epoch": 0.6789634829363005, + "grad_norm": 2.0340657234191895, + "kl": 2.4609375, + "learning_rate": 5.645059836826518e-06, + "loss": 0.1085, + "reward": 0.5608259215950966, + "reward_std": 0.07715952210128307, + "rewards/accuracy_reward": 0.07812500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4827009066939354, + "step": 2273 + }, + { + "clip_ratio": 0.0, + "completion_length": 903.9085388183594, + "epoch": 0.679262191023822, + "grad_norm": 1.0206689834594727, + "kl": 3.31640625, + "learning_rate": 5.6356729796606844e-06, + "loss": 0.1782, + "reward": 0.6266741305589676, + "reward_std": 0.14891498163342476, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4726562723517418, + "step": 2274 + }, + { + "clip_ratio": 0.0, + "completion_length": 900.4487152099609, + "epoch": 0.6795608991113434, + "grad_norm": 1.5625824928283691, + "kl": 2.97265625, + "learning_rate": 5.626290870454905e-06, + "loss": 0.1205, + "reward": 0.4888393059372902, + "reward_std": 0.10827358812093735, + "rewards/accuracy_reward": 0.01562500069849193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4732143059372902, + "step": 2275 + }, + { + "clip_ratio": 0.0, + "completion_length": 908.7009429931641, + "epoch": 0.679859607198865, + "grad_norm": 1.2848411798477173, + "kl": 2.71484375, + "learning_rate": 5.616913519415983e-06, + "loss": 0.1366, + "reward": 0.5502232387661934, + "reward_std": 0.1683738213032484, + "rewards/accuracy_reward": 0.07589286030270159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4743303805589676, + "step": 2276 + }, + { + "clip_ratio": 0.0, + "completion_length": 853.0960388183594, + "epoch": 0.6801583152863864, + "grad_norm": 0.9368349313735962, + "kl": 2.8046875, + "learning_rate": 5.607540936745564e-06, + "loss": 0.1412, + "reward": 0.6445312798023224, + "reward_std": 0.1845649890601635, + "rewards/accuracy_reward": 0.1696428693830967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.474888414144516, + "step": 2277 + }, + { + "clip_ratio": 0.0, + "completion_length": 900.6495971679688, + "epoch": 0.6804570233739079, + "grad_norm": 1.4679728746414185, + "kl": 3.0078125, + "learning_rate": 5.598173132640102e-06, + "loss": 0.1238, + "reward": 0.5546875223517418, + "reward_std": 0.1464481372386217, + "rewards/accuracy_reward": 0.08482143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4698660895228386, + "step": 2278 + }, + { + "clip_ratio": 0.0, + "completion_length": 887.9308471679688, + "epoch": 0.6807557314614293, + "grad_norm": 1.6968547105789185, + "kl": 2.88671875, + "learning_rate": 5.588810117290843e-06, + "loss": 0.16, + "reward": 0.5033482313156128, + "reward_std": 0.13329822197556496, + "rewards/accuracy_reward": 0.03125000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4720982313156128, + "step": 2279 + }, + { + "clip_ratio": 0.0, + "completion_length": 856.966552734375, + "epoch": 0.6810544395489508, + "grad_norm": 3.2029387950897217, + "kl": 3.58203125, + "learning_rate": 5.579451900883833e-06, + "loss": 0.1517, + "reward": 0.6640625447034836, + "reward_std": 0.13390755467116833, + "rewards/accuracy_reward": 0.1919642947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4720982387661934, + "step": 2280 + }, + { + "clip_ratio": 0.0, + "completion_length": 878.1964569091797, + "epoch": 0.6813531476364723, + "grad_norm": 6.218813419342041, + "kl": 4.76953125, + "learning_rate": 5.570098493599898e-06, + "loss": 0.2092, + "reward": 0.5585937798023224, + "reward_std": 0.1368691883981228, + "rewards/accuracy_reward": 0.1049107164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4536830559372902, + "step": 2281 + }, + { + "clip_ratio": 0.0, + "completion_length": 868.6585388183594, + "epoch": 0.6816518557239938, + "grad_norm": 3.097515821456909, + "kl": 3.78515625, + "learning_rate": 5.5607499056146216e-06, + "loss": 0.1692, + "reward": 0.5485491380095482, + "reward_std": 0.17117414809763432, + "rewards/accuracy_reward": 0.0803571492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4681919887661934, + "step": 2282 + }, + { + "clip_ratio": 0.0, + "completion_length": 878.4107513427734, + "epoch": 0.6819505638115152, + "grad_norm": 1.2510727643966675, + "kl": 2.62109375, + "learning_rate": 5.551406147098355e-06, + "loss": 0.1367, + "reward": 0.6294643133878708, + "reward_std": 0.13352996110916138, + "rewards/accuracy_reward": 0.14732143469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482142873108387, + "step": 2283 + }, + { + "clip_ratio": 0.0, + "completion_length": 865.6004791259766, + "epoch": 0.6822492718990366, + "grad_norm": 1.0049707889556885, + "kl": 1.5625, + "learning_rate": 5.542067228216195e-06, + "loss": 0.0522, + "reward": 0.5915178880095482, + "reward_std": 0.11386702116578817, + "rewards/accuracy_reward": 0.1049107201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 2284 + }, + { + "clip_ratio": 0.0, + "completion_length": 860.1406555175781, + "epoch": 0.6825479799865581, + "grad_norm": 0.9550243616104126, + "kl": 1.400390625, + "learning_rate": 5.532733159127963e-06, + "loss": 0.0581, + "reward": 0.5837053656578064, + "reward_std": 0.13790348544716835, + "rewards/accuracy_reward": 0.09821428777649999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 2285 + }, + { + "clip_ratio": 0.0, + "completion_length": 849.8125305175781, + "epoch": 0.6828466880740796, + "grad_norm": 3.1308789253234863, + "kl": 1.3515625, + "learning_rate": 5.523403949988217e-06, + "loss": 0.0771, + "reward": 0.5507812798023224, + "reward_std": 0.08103388268500566, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812649011612, + "step": 2286 + }, + { + "clip_ratio": 0.0, + "completion_length": 844.2656555175781, + "epoch": 0.6831453961616011, + "grad_norm": 1.414449691772461, + "kl": 1.31640625, + "learning_rate": 5.514079610946217e-06, + "loss": 0.0622, + "reward": 0.5429687649011612, + "reward_std": 0.11723512969911098, + "rewards/accuracy_reward": 0.055803574388846755, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 2287 + }, + { + "clip_ratio": 0.0, + "completion_length": 854.2053985595703, + "epoch": 0.6834441042491225, + "grad_norm": 1.204046368598938, + "kl": 0.9521484375, + "learning_rate": 5.504760152145934e-06, + "loss": 0.0342, + "reward": 0.624441996216774, + "reward_std": 0.13528104312717915, + "rewards/accuracy_reward": 0.13169643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2288 + }, + { + "clip_ratio": 0.0, + "completion_length": 882.9308319091797, + "epoch": 0.683742812336644, + "grad_norm": 1.1176677942276, + "kl": 0.873046875, + "learning_rate": 5.4954455837260265e-06, + "loss": 0.0402, + "reward": 0.658482164144516, + "reward_std": 0.14806018769741058, + "rewards/accuracy_reward": 0.16294643469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2289 + }, + { + "clip_ratio": 0.0, + "completion_length": 844.1094207763672, + "epoch": 0.6840415204241654, + "grad_norm": 0.7400912642478943, + "kl": 0.5966796875, + "learning_rate": 5.486135915819827e-06, + "loss": 0.0244, + "reward": 0.645089328289032, + "reward_std": 0.025371116818860173, + "rewards/accuracy_reward": 0.14732143515720963, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 2290 + }, + { + "clip_ratio": 0.0, + "completion_length": 837.3214721679688, + "epoch": 0.684340228511687, + "grad_norm": 0.7298743724822998, + "kl": 0.6064453125, + "learning_rate": 5.476831158555345e-06, + "loss": 0.0412, + "reward": 0.6774553954601288, + "reward_std": 0.09267000039108098, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2291 + }, + { + "clip_ratio": 0.0, + "completion_length": 856.3527069091797, + "epoch": 0.6846389365992084, + "grad_norm": 0.9168291687965393, + "kl": 0.48779296875, + "learning_rate": 5.467531322055247e-06, + "loss": 0.0184, + "reward": 0.631138414144516, + "reward_std": 0.1441698968410492, + "rewards/accuracy_reward": 0.13392857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2292 + }, + { + "clip_ratio": 0.0, + "completion_length": 879.5558471679688, + "epoch": 0.6849376446867299, + "grad_norm": 0.6878460049629211, + "kl": 0.4755859375, + "learning_rate": 5.458236416436838e-06, + "loss": 0.0223, + "reward": 0.6568080633878708, + "reward_std": 0.12029659003019333, + "rewards/accuracy_reward": 0.16071429289877415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 2293 + }, + { + "clip_ratio": 0.0, + "completion_length": 896.3348693847656, + "epoch": 0.6852363527742513, + "grad_norm": 1.4197533130645752, + "kl": 0.6181640625, + "learning_rate": 5.448946451812067e-06, + "loss": 0.0393, + "reward": 0.534040205180645, + "reward_std": 0.10227187257260084, + "rewards/accuracy_reward": 0.04017857275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2294 + }, + { + "clip_ratio": 0.0, + "completion_length": 886.4777221679688, + "epoch": 0.6855350608617728, + "grad_norm": 1.3714686632156372, + "kl": 0.420654296875, + "learning_rate": 5.43966143828751e-06, + "loss": 0.0235, + "reward": 0.5859375298023224, + "reward_std": 0.09965930553153157, + "rewards/accuracy_reward": 0.0892857201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 2295 + }, + { + "clip_ratio": 0.0, + "completion_length": 891.7121124267578, + "epoch": 0.6858337689492943, + "grad_norm": 0.900376558303833, + "kl": 0.44580078125, + "learning_rate": 5.430381385964343e-06, + "loss": 0.0387, + "reward": 0.636160746216774, + "reward_std": 0.0910401763394475, + "rewards/accuracy_reward": 0.14062500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2296 + }, + { + "clip_ratio": 0.0, + "completion_length": 880.6138916015625, + "epoch": 0.6861324770368158, + "grad_norm": 0.3552631139755249, + "kl": 0.552978515625, + "learning_rate": 5.421106304938356e-06, + "loss": 0.0302, + "reward": 0.6774553805589676, + "reward_std": 0.061344658955931664, + "rewards/accuracy_reward": 0.1808035783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 2297 + }, + { + "clip_ratio": 0.0, + "completion_length": 920.1585388183594, + "epoch": 0.6864311851243372, + "grad_norm": 0.5404791235923767, + "kl": 0.41015625, + "learning_rate": 5.411836205299934e-06, + "loss": 0.014, + "reward": 0.5641741454601288, + "reward_std": 0.084486348554492, + "rewards/accuracy_reward": 0.06696428777649999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2298 + }, + { + "clip_ratio": 0.0, + "completion_length": 929.279052734375, + "epoch": 0.6867298932118587, + "grad_norm": 0.6566925644874573, + "kl": 0.655517578125, + "learning_rate": 5.402571097134029e-06, + "loss": 0.0344, + "reward": 0.624441996216774, + "reward_std": 0.07641484308987856, + "rewards/accuracy_reward": 0.1294642968568951, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2299 + }, + { + "clip_ratio": 0.0, + "completion_length": 916.8906707763672, + "epoch": 0.6870286012993801, + "grad_norm": 0.4329005181789398, + "kl": 0.69580078125, + "learning_rate": 5.393310990520177e-06, + "loss": 0.038, + "reward": 0.5948660969734192, + "reward_std": 0.1080315844155848, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 2300 + }, + { + "clip_ratio": 0.0, + "completion_length": 928.3527374267578, + "epoch": 0.6873273093869017, + "grad_norm": 1.493180513381958, + "kl": 1.39453125, + "learning_rate": 5.384055895532458e-06, + "loss": 0.0616, + "reward": 0.534040205180645, + "reward_std": 0.05145139992237091, + "rewards/accuracy_reward": 0.04241071501746774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2301 + }, + { + "clip_ratio": 0.0, + "completion_length": 931.9062957763672, + "epoch": 0.6876260174744231, + "grad_norm": 0.7735431790351868, + "kl": 0.9248046875, + "learning_rate": 5.374805822239516e-06, + "loss": 0.0454, + "reward": 0.6065848469734192, + "reward_std": 0.089778371155262, + "rewards/accuracy_reward": 0.11383929336443543, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 2302 + }, + { + "clip_ratio": 0.0, + "completion_length": 929.9196929931641, + "epoch": 0.6879247255619446, + "grad_norm": 0.40955275297164917, + "kl": 0.689453125, + "learning_rate": 5.365560780704524e-06, + "loss": 0.0324, + "reward": 0.6054687798023224, + "reward_std": 0.10872763954102993, + "rewards/accuracy_reward": 0.10937500488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 2303 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.7745971679688, + "epoch": 0.688223433649466, + "grad_norm": 0.7186397314071655, + "kl": 0.861328125, + "learning_rate": 5.356320780985176e-06, + "loss": 0.0449, + "reward": 0.5597098469734192, + "reward_std": 0.07827440742403269, + "rewards/accuracy_reward": 0.06473214481957257, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 2304 + }, + { + "clip_ratio": 0.0, + "completion_length": 958.8080749511719, + "epoch": 0.6885221417369876, + "grad_norm": 1.3082655668258667, + "kl": 1.00146484375, + "learning_rate": 5.347085833133689e-06, + "loss": 0.0437, + "reward": 0.674107164144516, + "reward_std": 0.07973567070439458, + "rewards/accuracy_reward": 0.18080358114093542, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2305 + }, + { + "clip_ratio": 0.0, + "completion_length": 925.5379791259766, + "epoch": 0.688820849824509, + "grad_norm": 0.6414564251899719, + "kl": 1.0341796875, + "learning_rate": 5.337855947196784e-06, + "loss": 0.0559, + "reward": 0.6138393133878708, + "reward_std": 0.06562908180058002, + "rewards/accuracy_reward": 0.1227678656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2306 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.8549652099609, + "epoch": 0.6891195579120305, + "grad_norm": 0.42226359248161316, + "kl": 1.04052734375, + "learning_rate": 5.328631133215665e-06, + "loss": 0.04, + "reward": 0.5239955559372902, + "reward_std": 0.09195517748594284, + "rewards/accuracy_reward": 0.033482145285233855, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 2307 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.8348846435547, + "epoch": 0.6894182659995519, + "grad_norm": 1.2425715923309326, + "kl": 1.078125, + "learning_rate": 5.31941140122603e-06, + "loss": 0.0522, + "reward": 0.5792410969734192, + "reward_std": 0.12808703631162643, + "rewards/accuracy_reward": 0.0870535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2308 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.1518249511719, + "epoch": 0.6897169740870734, + "grad_norm": 1.0626394748687744, + "kl": 1.4482421875, + "learning_rate": 5.310196761258048e-06, + "loss": 0.0773, + "reward": 0.5585937798023224, + "reward_std": 0.08716937154531479, + "rewards/accuracy_reward": 0.06919643213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2309 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.5223541259766, + "epoch": 0.6900156821745949, + "grad_norm": 1.2980016469955444, + "kl": 1.4228515625, + "learning_rate": 5.300987223336334e-06, + "loss": 0.0589, + "reward": 0.5585937798023224, + "reward_std": 0.09620257280766964, + "rewards/accuracy_reward": 0.06919643143191934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2310 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.9866638183594, + "epoch": 0.6903143902621164, + "grad_norm": 2.290274143218994, + "kl": 1.94921875, + "learning_rate": 5.29178279747997e-06, + "loss": 0.082, + "reward": 0.5440848469734192, + "reward_std": 0.11364425159990788, + "rewards/accuracy_reward": 0.05580357578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812649011612, + "step": 2311 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.7902069091797, + "epoch": 0.6906130983496378, + "grad_norm": 1.2070415019989014, + "kl": 1.80078125, + "learning_rate": 5.282583493702471e-06, + "loss": 0.0501, + "reward": 0.5664062798023224, + "reward_std": 0.10347943753004074, + "rewards/accuracy_reward": 0.08258929150179029, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169887661934, + "step": 2312 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.0714721679688, + "epoch": 0.6909118064371593, + "grad_norm": 1.5340511798858643, + "kl": 1.845703125, + "learning_rate": 5.273389322011771e-06, + "loss": 0.072, + "reward": 0.6054687649011612, + "reward_std": 0.13203074783086777, + "rewards/accuracy_reward": 0.12053571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 2313 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.3504791259766, + "epoch": 0.6912105145246807, + "grad_norm": 1.5476839542388916, + "kl": 1.544921875, + "learning_rate": 5.2642002924102334e-06, + "loss": 0.0426, + "reward": 0.5491071566939354, + "reward_std": 0.10596776567399502, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4866071715950966, + "step": 2314 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.9420318603516, + "epoch": 0.6915092226122023, + "grad_norm": 0.7196974158287048, + "kl": 0.9814453125, + "learning_rate": 5.255016414894616e-06, + "loss": 0.0338, + "reward": 0.7220982313156128, + "reward_std": 0.15225201286375523, + "rewards/accuracy_reward": 0.2299107275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2315 + }, + { + "clip_ratio": 0.0, + "completion_length": 970.8170013427734, + "epoch": 0.6918079306997237, + "grad_norm": 0.4817245602607727, + "kl": 1.1005859375, + "learning_rate": 5.245837699456083e-06, + "loss": 0.0387, + "reward": 0.6143973469734192, + "reward_std": 0.15726806223392487, + "rewards/accuracy_reward": 0.12500000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2316 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.2545166015625, + "epoch": 0.6921066387872452, + "grad_norm": 0.5536483526229858, + "kl": 1.1298828125, + "learning_rate": 5.236664156080175e-06, + "loss": 0.0441, + "reward": 0.6244419813156128, + "reward_std": 0.18437091447412968, + "rewards/accuracy_reward": 0.13392857927829027, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 2317 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.5826416015625, + "epoch": 0.6924053468747666, + "grad_norm": 1.2759424448013306, + "kl": 0.9228515625, + "learning_rate": 5.227495794746806e-06, + "loss": 0.0252, + "reward": 0.6160714477300644, + "reward_std": 0.1524201575666666, + "rewards/accuracy_reward": 0.1272321455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 2318 + }, + { + "clip_ratio": 0.0, + "completion_length": 973.841552734375, + "epoch": 0.6927040549622882, + "grad_norm": 1.577523112297058, + "kl": 0.98681640625, + "learning_rate": 5.218332625430258e-06, + "loss": 0.0494, + "reward": 0.5457589477300644, + "reward_std": 0.1422082670032978, + "rewards/accuracy_reward": 0.05803571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 2319 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.6428833007812, + "epoch": 0.6930027630498096, + "grad_norm": 1.6629871129989624, + "kl": 0.9619140625, + "learning_rate": 5.209174658099162e-06, + "loss": 0.0225, + "reward": 0.5954241156578064, + "reward_std": 0.12529388815164566, + "rewards/accuracy_reward": 0.10714286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2320 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.6786041259766, + "epoch": 0.6933014711373311, + "grad_norm": 0.6952606439590454, + "kl": 1.1279296875, + "learning_rate": 5.200021902716483e-06, + "loss": 0.0252, + "reward": 0.5524553805589676, + "reward_std": 0.12629284337162971, + "rewards/accuracy_reward": 0.06250000302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2321 + }, + { + "clip_ratio": 0.0, + "completion_length": 973.591552734375, + "epoch": 0.6936001792248525, + "grad_norm": 0.6249592304229736, + "kl": 1.849609375, + "learning_rate": 5.190874369239526e-06, + "loss": 0.0709, + "reward": 0.5440848469734192, + "reward_std": 0.11664954759180546, + "rewards/accuracy_reward": 0.06473214412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4793526977300644, + "step": 2322 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.4263763427734, + "epoch": 0.693898887312374, + "grad_norm": 1.2787264585494995, + "kl": 1.095703125, + "learning_rate": 5.181732067619913e-06, + "loss": 0.0465, + "reward": 0.6735491305589676, + "reward_std": 0.15244632121175528, + "rewards/accuracy_reward": 0.18750000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486049123108387, + "step": 2323 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.2433471679688, + "epoch": 0.6941975953998955, + "grad_norm": 0.5299885272979736, + "kl": 1.4375, + "learning_rate": 5.172595007803567e-06, + "loss": 0.0553, + "reward": 0.5647321715950966, + "reward_std": 0.11474999785423279, + "rewards/accuracy_reward": 0.07812500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4866071715950966, + "step": 2324 + }, + { + "clip_ratio": 0.0, + "completion_length": 970.3125305175781, + "epoch": 0.694496303487417, + "grad_norm": 0.9915356040000916, + "kl": 1.94921875, + "learning_rate": 5.1634631997307165e-06, + "loss": 0.0679, + "reward": 0.5357143059372902, + "reward_std": 0.12317575328052044, + "rewards/accuracy_reward": 0.05357143236324191, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482142873108387, + "step": 2325 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.4911041259766, + "epoch": 0.6947950115749384, + "grad_norm": 0.7341747283935547, + "kl": 1.12109375, + "learning_rate": 5.1543366533358755e-06, + "loss": 0.0347, + "reward": 0.5625000447034836, + "reward_std": 0.13164393976330757, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2326 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.2478179931641, + "epoch": 0.6950937196624598, + "grad_norm": 0.8175023198127747, + "kl": 2.22265625, + "learning_rate": 5.145215378547825e-06, + "loss": 0.075, + "reward": 0.6099330633878708, + "reward_std": 0.15691225603222847, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.478236623108387, + "step": 2327 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.2321929931641, + "epoch": 0.6953924277499813, + "grad_norm": 1.2166837453842163, + "kl": 2.2734375, + "learning_rate": 5.136099385289628e-06, + "loss": 0.0791, + "reward": 0.5340401902794838, + "reward_std": 0.08494272269308567, + "rewards/accuracy_reward": 0.05133928777649999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4827009066939354, + "step": 2328 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.6763916015625, + "epoch": 0.6956911358375027, + "grad_norm": 0.5633280873298645, + "kl": 1.947265625, + "learning_rate": 5.126988683478582e-06, + "loss": 0.0776, + "reward": 0.6104910969734192, + "reward_std": 0.14045016467571259, + "rewards/accuracy_reward": 0.12723214784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589477300644, + "step": 2329 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.8995971679688, + "epoch": 0.6959898439250243, + "grad_norm": 2.9707958698272705, + "kl": 3.078125, + "learning_rate": 5.117883283026243e-06, + "loss": 0.0978, + "reward": 0.5507812723517418, + "reward_std": 0.09658108092844486, + "rewards/accuracy_reward": 0.07812500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4726562723517418, + "step": 2330 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.2143249511719, + "epoch": 0.6962885520125457, + "grad_norm": 0.6101799011230469, + "kl": 1.912109375, + "learning_rate": 5.108783193838396e-06, + "loss": 0.071, + "reward": 0.526227705180645, + "reward_std": 0.11991364322602749, + "rewards/accuracy_reward": 0.04687500186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.479352705180645, + "step": 2331 + }, + { + "clip_ratio": 0.0, + "completion_length": 938.794677734375, + "epoch": 0.6965872601000672, + "grad_norm": 0.6312741041183472, + "kl": 1.708984375, + "learning_rate": 5.099688425815039e-06, + "loss": 0.0703, + "reward": 0.6065848469734192, + "reward_std": 0.11145278811454773, + "rewards/accuracy_reward": 0.1227678656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169887661934, + "step": 2332 + }, + { + "clip_ratio": 0.0, + "completion_length": 926.0759582519531, + "epoch": 0.6968859681875886, + "grad_norm": 1.2091243267059326, + "kl": 3.55078125, + "learning_rate": 5.0905989888503924e-06, + "loss": 0.1595, + "reward": 0.521205373108387, + "reward_std": 0.1702426467090845, + "rewards/accuracy_reward": 0.0602678582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4609375223517418, + "step": 2333 + }, + { + "clip_ratio": 0.0, + "completion_length": 931.4844055175781, + "epoch": 0.6971846762751102, + "grad_norm": 2.459554672241211, + "kl": 2.005859375, + "learning_rate": 5.081514892832878e-06, + "loss": 0.1069, + "reward": 0.6004464477300644, + "reward_std": 0.1608918271958828, + "rewards/accuracy_reward": 0.12053572246804833, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4799107313156128, + "step": 2334 + }, + { + "clip_ratio": 0.0, + "completion_length": 926.7857513427734, + "epoch": 0.6974833843626316, + "grad_norm": 1.413552165031433, + "kl": 2.138671875, + "learning_rate": 5.0724361476450925e-06, + "loss": 0.055, + "reward": 0.5468750223517418, + "reward_std": 0.11181829404085875, + "rewards/accuracy_reward": 0.0736607201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4732143059372902, + "step": 2335 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.169677734375, + "epoch": 0.6977820924501531, + "grad_norm": 1.5344475507736206, + "kl": 3.0859375, + "learning_rate": 5.063362763163826e-06, + "loss": 0.1135, + "reward": 0.595982164144516, + "reward_std": 0.15565991029143333, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4642857387661934, + "step": 2336 + }, + { + "clip_ratio": 0.0, + "completion_length": 908.310302734375, + "epoch": 0.6980808005376745, + "grad_norm": 2.509596824645996, + "kl": 3.609375, + "learning_rate": 5.0542947492600336e-06, + "loss": 0.1462, + "reward": 0.533482164144516, + "reward_std": 0.17748410999774933, + "rewards/accuracy_reward": 0.06919643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4642857387661934, + "step": 2337 + }, + { + "clip_ratio": 0.0, + "completion_length": 923.6205596923828, + "epoch": 0.698379508625196, + "grad_norm": 0.9829892516136169, + "kl": 3.26953125, + "learning_rate": 5.045232115798819e-06, + "loss": 0.1413, + "reward": 0.526227705180645, + "reward_std": 0.12210731022059917, + "rewards/accuracy_reward": 0.058035716181620955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4681919887661934, + "step": 2338 + }, + { + "clip_ratio": 0.0, + "completion_length": 896.747802734375, + "epoch": 0.6986782167127175, + "grad_norm": 1.3963788747787476, + "kl": 3.8671875, + "learning_rate": 5.0361748726394435e-06, + "loss": 0.1592, + "reward": 0.5161830633878708, + "reward_std": 0.13182325661182404, + "rewards/accuracy_reward": 0.05580357392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4603794887661934, + "step": 2339 + }, + { + "clip_ratio": 0.0, + "completion_length": 894.5402069091797, + "epoch": 0.698976924800239, + "grad_norm": 1.4613755941390991, + "kl": 2.7265625, + "learning_rate": 5.027123029635301e-06, + "loss": 0.1001, + "reward": 0.5457589700818062, + "reward_std": 0.13341778330504894, + "rewards/accuracy_reward": 0.07812500488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4676339477300644, + "step": 2340 + }, + { + "clip_ratio": 0.0, + "completion_length": 907.1674652099609, + "epoch": 0.6992756328877604, + "grad_norm": 2.267685651779175, + "kl": 3.93359375, + "learning_rate": 5.018076596633907e-06, + "loss": 0.1504, + "reward": 0.5429687649011612, + "reward_std": 0.12278798781335354, + "rewards/accuracy_reward": 0.08705357578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.455915205180645, + "step": 2341 + }, + { + "clip_ratio": 0.0, + "completion_length": 899.9420013427734, + "epoch": 0.6995743409752819, + "grad_norm": 1.1475216150283813, + "kl": 3.4765625, + "learning_rate": 5.009035583476898e-06, + "loss": 0.1313, + "reward": 0.5005580484867096, + "reward_std": 0.15899059176445007, + "rewards/accuracy_reward": 0.03571428847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4648437723517418, + "step": 2342 + }, + { + "clip_ratio": 0.0, + "completion_length": 914.6629943847656, + "epoch": 0.6998730490628033, + "grad_norm": 1.6217063665390015, + "kl": 3.640625, + "learning_rate": 5.000000000000003e-06, + "loss": 0.1356, + "reward": 0.5625000298023224, + "reward_std": 0.1449382584542036, + "rewards/accuracy_reward": 0.09598214761354029, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.466517873108387, + "step": 2343 + }, + { + "clip_ratio": 0.0, + "completion_length": 886.4085083007812, + "epoch": 0.7001717571503249, + "grad_norm": 2.2864134311676025, + "kl": 3.578125, + "learning_rate": 4.990969856033055e-06, + "loss": 0.1723, + "reward": 0.5904018133878708, + "reward_std": 0.13428116962313652, + "rewards/accuracy_reward": 0.12723214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.463169664144516, + "step": 2344 + }, + { + "clip_ratio": 0.0, + "completion_length": 893.3906707763672, + "epoch": 0.7004704652378463, + "grad_norm": 1.3012127876281738, + "kl": 4.02734375, + "learning_rate": 4.981945161399969e-06, + "loss": 0.1718, + "reward": 0.5708705633878708, + "reward_std": 0.14412924647331238, + "rewards/accuracy_reward": 0.1049107201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4659598395228386, + "step": 2345 + }, + { + "clip_ratio": 0.0, + "completion_length": 890.7098541259766, + "epoch": 0.7007691733253678, + "grad_norm": 3.9713733196258545, + "kl": 2.71875, + "learning_rate": 4.9729259259187235e-06, + "loss": 0.15, + "reward": 0.5569196790456772, + "reward_std": 0.10889676213264465, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4720982387661934, + "step": 2346 + }, + { + "clip_ratio": 0.0, + "completion_length": 919.8527374267578, + "epoch": 0.7010678814128892, + "grad_norm": 1.2677091360092163, + "kl": 4.2109375, + "learning_rate": 4.963912159401363e-06, + "loss": 0.1858, + "reward": 0.517857164144516, + "reward_std": 0.18947621248662472, + "rewards/accuracy_reward": 0.058035717345774174, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4598214477300644, + "step": 2347 + }, + { + "clip_ratio": 0.0, + "completion_length": 845.9955749511719, + "epoch": 0.7013665895004108, + "grad_norm": 0.8196856379508972, + "kl": 3.27734375, + "learning_rate": 4.9549038716539865e-06, + "loss": 0.1361, + "reward": 0.5630580633878708, + "reward_std": 0.12916136905550957, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4715401977300644, + "step": 2348 + }, + { + "clip_ratio": 0.0, + "completion_length": 855.4464721679688, + "epoch": 0.7016652975879322, + "grad_norm": 0.8694122433662415, + "kl": 2.875, + "learning_rate": 4.945901072476723e-06, + "loss": 0.1259, + "reward": 0.5731026977300644, + "reward_std": 0.15238522551953793, + "rewards/accuracy_reward": 0.0959821492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4771205559372902, + "step": 2349 + }, + { + "clip_ratio": 0.0, + "completion_length": 880.1964721679688, + "epoch": 0.7019640056754537, + "grad_norm": 0.5704330801963806, + "kl": 2.005859375, + "learning_rate": 4.936903771663737e-06, + "loss": 0.1019, + "reward": 0.632254496216774, + "reward_std": 0.14952127821743488, + "rewards/accuracy_reward": 0.14955357555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4827009066939354, + "step": 2350 + }, + { + "clip_ratio": 0.0, + "completion_length": 897.6920013427734, + "epoch": 0.7022627137629751, + "grad_norm": 2.290260076522827, + "kl": 1.810546875, + "learning_rate": 4.927911979003214e-06, + "loss": 0.0764, + "reward": 0.525111623108387, + "reward_std": 0.10115913022309542, + "rewards/accuracy_reward": 0.0401785746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 2351 + }, + { + "clip_ratio": 0.0, + "completion_length": 845.0089721679688, + "epoch": 0.7025614218504966, + "grad_norm": 1.0734349489212036, + "kl": 1.966796875, + "learning_rate": 4.918925704277336e-06, + "loss": 0.0934, + "reward": 0.6406250149011612, + "reward_std": 0.13101894594728947, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750223517418, + "step": 2352 + }, + { + "clip_ratio": 0.0, + "completion_length": 832.4576263427734, + "epoch": 0.702860129938018, + "grad_norm": 0.6581523418426514, + "kl": 1.5185546875, + "learning_rate": 4.909944957262298e-06, + "loss": 0.1113, + "reward": 0.6367187798023224, + "reward_std": 0.16639637760818005, + "rewards/accuracy_reward": 0.14508929289877415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2353 + }, + { + "clip_ratio": 0.0, + "completion_length": 896.419677734375, + "epoch": 0.7031588380255396, + "grad_norm": 1.1680039167404175, + "kl": 1.47216796875, + "learning_rate": 4.900969747728263e-06, + "loss": 0.0878, + "reward": 0.5680803954601288, + "reward_std": 0.10901550017297268, + "rewards/accuracy_reward": 0.07589285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2354 + }, + { + "clip_ratio": 0.0, + "completion_length": 827.5357666015625, + "epoch": 0.703457546113061, + "grad_norm": 0.958340585231781, + "kl": 1.904296875, + "learning_rate": 4.892000085439383e-06, + "loss": 0.0901, + "reward": 0.593191996216774, + "reward_std": 0.10815301910042763, + "rewards/accuracy_reward": 0.10714285774156451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 2355 + }, + { + "clip_ratio": 0.0, + "completion_length": 875.3036193847656, + "epoch": 0.7037562542005825, + "grad_norm": 1.4911092519760132, + "kl": 1.5263671875, + "learning_rate": 4.8830359801537765e-06, + "loss": 0.069, + "reward": 0.5390625223517418, + "reward_std": 0.08318030554801226, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2356 + }, + { + "clip_ratio": 0.0, + "completion_length": 850.950927734375, + "epoch": 0.7040549622881039, + "grad_norm": 1.0958266258239746, + "kl": 2.306640625, + "learning_rate": 4.874077441623504e-06, + "loss": 0.1265, + "reward": 0.6177455633878708, + "reward_std": 0.12683501932770014, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169813156128, + "step": 2357 + }, + { + "clip_ratio": 0.0, + "completion_length": 918.0424499511719, + "epoch": 0.7043536703756255, + "grad_norm": 1.814439296722412, + "kl": 0.9931640625, + "learning_rate": 4.86512447959458e-06, + "loss": 0.0421, + "reward": 0.6188616454601288, + "reward_std": 0.07045938912779093, + "rewards/accuracy_reward": 0.1250000090803951, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2358 + }, + { + "clip_ratio": 0.0, + "completion_length": 934.7946929931641, + "epoch": 0.7046523784631469, + "grad_norm": 0.48165804147720337, + "kl": 1.392578125, + "learning_rate": 4.856177103806954e-06, + "loss": 0.0622, + "reward": 0.5507812649011612, + "reward_std": 0.1288612000644207, + "rewards/accuracy_reward": 0.060267859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2359 + }, + { + "clip_ratio": 0.0, + "completion_length": 882.4777069091797, + "epoch": 0.7049510865506684, + "grad_norm": 0.5431146025657654, + "kl": 1.46484375, + "learning_rate": 4.847235323994487e-06, + "loss": 0.0999, + "reward": 0.648995578289032, + "reward_std": 0.15847235918045044, + "rewards/accuracy_reward": 0.15848214644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2360 + }, + { + "clip_ratio": 0.0, + "completion_length": 937.1161041259766, + "epoch": 0.7052497946381898, + "grad_norm": 1.7509874105453491, + "kl": 1.2099609375, + "learning_rate": 4.8382991498849615e-06, + "loss": 0.0296, + "reward": 0.524553582072258, + "reward_std": 0.07333804527297616, + "rewards/accuracy_reward": 0.03125000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2361 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.950927734375, + "epoch": 0.7055485027257113, + "grad_norm": 1.5281645059585571, + "kl": 1.693359375, + "learning_rate": 4.829368591200064e-06, + "loss": 0.0706, + "reward": 0.5636160969734192, + "reward_std": 0.09305542334914207, + "rewards/accuracy_reward": 0.07142857508733869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2362 + }, + { + "clip_ratio": 0.0, + "completion_length": 924.200927734375, + "epoch": 0.7058472108132328, + "grad_norm": 3.3521735668182373, + "kl": 2.1875, + "learning_rate": 4.82044365765536e-06, + "loss": 0.0953, + "reward": 0.5853794813156128, + "reward_std": 0.12567281955853105, + "rewards/accuracy_reward": 0.0959821455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2363 + }, + { + "clip_ratio": 0.0, + "completion_length": 938.4174652099609, + "epoch": 0.7061459189007543, + "grad_norm": 0.5476042628288269, + "kl": 1.037109375, + "learning_rate": 4.811524358960304e-06, + "loss": 0.023, + "reward": 0.5530134215950966, + "reward_std": 0.05030921380966902, + "rewards/accuracy_reward": 0.06026785937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 2364 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.4710235595703, + "epoch": 0.7064446269882757, + "grad_norm": 1.0442898273468018, + "kl": 1.32421875, + "learning_rate": 4.802610704818226e-06, + "loss": 0.0413, + "reward": 0.568638414144516, + "reward_std": 0.10303945653140545, + "rewards/accuracy_reward": 0.07589286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2365 + }, + { + "clip_ratio": 0.0, + "completion_length": 917.9174499511719, + "epoch": 0.7067433350757972, + "grad_norm": 0.44954317808151245, + "kl": 0.91015625, + "learning_rate": 4.793702704926297e-06, + "loss": 0.0441, + "reward": 0.6556919813156128, + "reward_std": 0.07354127638973296, + "rewards/accuracy_reward": 0.16071429336443543, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2366 + }, + { + "clip_ratio": 0.0, + "completion_length": 931.4799652099609, + "epoch": 0.7070420431633186, + "grad_norm": 1.4099836349487305, + "kl": 0.962890625, + "learning_rate": 4.784800368975557e-06, + "loss": 0.0287, + "reward": 0.6143973618745804, + "reward_std": 0.061066851019859314, + "rewards/accuracy_reward": 0.12053572363220155, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2367 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.0848541259766, + "epoch": 0.7073407512508402, + "grad_norm": 0.7969189882278442, + "kl": 1.125, + "learning_rate": 4.775903706650866e-06, + "loss": 0.0446, + "reward": 0.5675223469734192, + "reward_std": 0.030227418756112456, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2368 + }, + { + "clip_ratio": 0.0, + "completion_length": 907.2366485595703, + "epoch": 0.7076394593383616, + "grad_norm": 0.5063198804855347, + "kl": 1.435546875, + "learning_rate": 4.767012727630927e-06, + "loss": 0.0811, + "reward": 0.5948660969734192, + "reward_std": 0.08691668626852334, + "rewards/accuracy_reward": 0.1026785783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2369 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.6495819091797, + "epoch": 0.707938167425883, + "grad_norm": 0.38909488916397095, + "kl": 1.26318359375, + "learning_rate": 4.758127441588257e-06, + "loss": 0.0538, + "reward": 0.5418526977300644, + "reward_std": 0.061273976461961865, + "rewards/accuracy_reward": 0.046875000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 2370 + }, + { + "clip_ratio": 0.0, + "completion_length": 932.6183624267578, + "epoch": 0.7082368755134045, + "grad_norm": 2.1572067737579346, + "kl": 1.64453125, + "learning_rate": 4.749247858189167e-06, + "loss": 0.0734, + "reward": 0.6450892984867096, + "reward_std": 0.1184859573841095, + "rewards/accuracy_reward": 0.15178571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2371 + }, + { + "clip_ratio": 0.0, + "completion_length": 913.3370971679688, + "epoch": 0.7085355836009259, + "grad_norm": 0.8586599230766296, + "kl": 1.11083984375, + "learning_rate": 4.7403739870937786e-06, + "loss": 0.0617, + "reward": 0.6406250149011612, + "reward_std": 0.12784298695623875, + "rewards/accuracy_reward": 0.145089291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357238650322, + "step": 2372 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.5491485595703, + "epoch": 0.7088342916884475, + "grad_norm": 0.6744475960731506, + "kl": 1.342529296875, + "learning_rate": 4.731505837955997e-06, + "loss": 0.0667, + "reward": 0.5452009215950966, + "reward_std": 0.05981268081814051, + "rewards/accuracy_reward": 0.05133928847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2373 + }, + { + "clip_ratio": 0.0, + "completion_length": 923.9420166015625, + "epoch": 0.7091329997759689, + "grad_norm": 2.1524100303649902, + "kl": 1.236328125, + "learning_rate": 4.722643420423493e-06, + "loss": 0.0478, + "reward": 0.6065848469734192, + "reward_std": 0.1026619877666235, + "rewards/accuracy_reward": 0.11160714505240321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2374 + }, + { + "clip_ratio": 0.0, + "completion_length": 934.1875457763672, + "epoch": 0.7094317078634904, + "grad_norm": 0.5228375196456909, + "kl": 1.2314453125, + "learning_rate": 4.71378674413771e-06, + "loss": 0.051, + "reward": 0.6311383992433548, + "reward_std": 0.11829115450382233, + "rewards/accuracy_reward": 0.13839286426082253, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 2375 + }, + { + "clip_ratio": 0.0, + "completion_length": 921.9330749511719, + "epoch": 0.7097304159510118, + "grad_norm": 1.6988927125930786, + "kl": 0.916015625, + "learning_rate": 4.704935818733848e-06, + "loss": 0.0371, + "reward": 0.5485491305589676, + "reward_std": 0.06404878408648074, + "rewards/accuracy_reward": 0.055803573690354824, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2376 + }, + { + "clip_ratio": 0.0, + "completion_length": 971.3504943847656, + "epoch": 0.7100291240385334, + "grad_norm": 0.9586864709854126, + "kl": 0.694091796875, + "learning_rate": 4.69609065384084e-06, + "loss": 0.0278, + "reward": 0.5848214477300644, + "reward_std": 0.07047041319310665, + "rewards/accuracy_reward": 0.08928571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2377 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.5670013427734, + "epoch": 0.7103278321260548, + "grad_norm": 0.704725444316864, + "kl": 1.22265625, + "learning_rate": 4.687251259081362e-06, + "loss": 0.0526, + "reward": 0.5909598618745804, + "reward_std": 0.10000273445621133, + "rewards/accuracy_reward": 0.09598214644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 2378 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.4866485595703, + "epoch": 0.7106265402135763, + "grad_norm": 0.7075048089027405, + "kl": 1.1240234375, + "learning_rate": 4.678417644071813e-06, + "loss": 0.0213, + "reward": 0.6294643133878708, + "reward_std": 0.07504073204472661, + "rewards/accuracy_reward": 0.13616072107106447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2379 + }, + { + "clip_ratio": 0.0, + "completion_length": 919.4353179931641, + "epoch": 0.7109252483010977, + "grad_norm": 0.9665032029151917, + "kl": 1.4267578125, + "learning_rate": 4.669589818422291e-06, + "loss": 0.0752, + "reward": 0.604910746216774, + "reward_std": 0.13575551472604275, + "rewards/accuracy_reward": 0.11160714970901608, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2380 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.6495971679688, + "epoch": 0.7112239563886192, + "grad_norm": 1.6207544803619385, + "kl": 0.916015625, + "learning_rate": 4.6607677917366155e-06, + "loss": 0.0378, + "reward": 0.5518973469734192, + "reward_std": 0.08468061499297619, + "rewards/accuracy_reward": 0.058035717345774174, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2381 + }, + { + "clip_ratio": 0.0, + "completion_length": 937.4241485595703, + "epoch": 0.7115226644761407, + "grad_norm": 0.8175951242446899, + "kl": 0.9013671875, + "learning_rate": 4.651951573612277e-06, + "loss": 0.0425, + "reward": 0.5837053805589676, + "reward_std": 0.07216750737279654, + "rewards/accuracy_reward": 0.08928571874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2382 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.0513763427734, + "epoch": 0.7118213725636622, + "grad_norm": 0.6318609714508057, + "kl": 0.6982421875, + "learning_rate": 4.643141173640461e-06, + "loss": 0.0243, + "reward": 0.5859375298023224, + "reward_std": 0.10680120717734098, + "rewards/accuracy_reward": 0.08928571874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 2383 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.9531707763672, + "epoch": 0.7121200806511836, + "grad_norm": 0.5294427275657654, + "kl": 0.9912109375, + "learning_rate": 4.6343366014060235e-06, + "loss": 0.0419, + "reward": 0.541852705180645, + "reward_std": 0.10532245319336653, + "rewards/accuracy_reward": 0.046875000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 2384 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.4933471679688, + "epoch": 0.7124187887387051, + "grad_norm": 0.5131205320358276, + "kl": 1.1455078125, + "learning_rate": 4.625537866487468e-06, + "loss": 0.0477, + "reward": 0.5909598618745804, + "reward_std": 0.10344007611274719, + "rewards/accuracy_reward": 0.09598214761354029, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2385 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.9799652099609, + "epoch": 0.7127174968262265, + "grad_norm": 1.3501533269882202, + "kl": 1.490234375, + "learning_rate": 4.61674497845696e-06, + "loss": 0.0621, + "reward": 0.5909598469734192, + "reward_std": 0.11029053200036287, + "rewards/accuracy_reward": 0.09821428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 2386 + }, + { + "clip_ratio": 0.0, + "completion_length": 967.7745971679688, + "epoch": 0.7130162049137481, + "grad_norm": 0.3009175658226013, + "kl": 0.9638671875, + "learning_rate": 4.607957946880305e-06, + "loss": 0.0269, + "reward": 0.5446428805589676, + "reward_std": 0.08996447781100869, + "rewards/accuracy_reward": 0.04910714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2387 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.263427734375, + "epoch": 0.7133149130012695, + "grad_norm": 0.5370942950248718, + "kl": 0.680908203125, + "learning_rate": 4.599176781316922e-06, + "loss": 0.0256, + "reward": 0.6646205633878708, + "reward_std": 0.08674143441021442, + "rewards/accuracy_reward": 0.16741072246804833, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2388 + }, + { + "clip_ratio": 0.0, + "completion_length": 935.9531555175781, + "epoch": 0.713613621088791, + "grad_norm": 1.651830792427063, + "kl": 1.705078125, + "learning_rate": 4.590401491319864e-06, + "loss": 0.0611, + "reward": 0.6512277275323868, + "reward_std": 0.09772101463750005, + "rewards/accuracy_reward": 0.1584821529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2389 + }, + { + "clip_ratio": 0.0, + "completion_length": 925.4241485595703, + "epoch": 0.7139123291763124, + "grad_norm": 1.0559720993041992, + "kl": 0.8095703125, + "learning_rate": 4.5816320864357875e-06, + "loss": 0.0521, + "reward": 0.5909598618745804, + "reward_std": 0.12569791451096535, + "rewards/accuracy_reward": 0.09375000488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2390 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.5603179931641, + "epoch": 0.714211037263834, + "grad_norm": 0.6830852627754211, + "kl": 0.71484375, + "learning_rate": 4.5728685762049415e-06, + "loss": 0.0283, + "reward": 0.6155134290456772, + "reward_std": 0.09162616822868586, + "rewards/accuracy_reward": 0.11830358067527413, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2391 + }, + { + "clip_ratio": 0.0, + "completion_length": 944.3661041259766, + "epoch": 0.7145097453513554, + "grad_norm": 0.6863913536071777, + "kl": 0.65234375, + "learning_rate": 4.564110970161168e-06, + "loss": 0.0356, + "reward": 0.6060268133878708, + "reward_std": 0.066013986710459, + "rewards/accuracy_reward": 0.10937500349245965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 2392 + }, + { + "clip_ratio": 0.0, + "completion_length": 926.2053985595703, + "epoch": 0.7148084534388769, + "grad_norm": 0.5125638842582703, + "kl": 0.697265625, + "learning_rate": 4.55535927783189e-06, + "loss": 0.0349, + "reward": 0.5809152126312256, + "reward_std": 0.10980432480573654, + "rewards/accuracy_reward": 0.08482143329456449, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937574505806, + "step": 2393 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.5714569091797, + "epoch": 0.7151071615263983, + "grad_norm": 1.2969541549682617, + "kl": 0.49560546875, + "learning_rate": 4.54661350873808e-06, + "loss": 0.0196, + "reward": 0.571428582072258, + "reward_std": 0.12050709244795144, + "rewards/accuracy_reward": 0.0758928619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2394 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.4241485595703, + "epoch": 0.7154058696139198, + "grad_norm": 4.739292144775391, + "kl": 1.015625, + "learning_rate": 4.537873672394288e-06, + "loss": 0.0351, + "reward": 0.541294664144516, + "reward_std": 0.06624249368906021, + "rewards/accuracy_reward": 0.051339289639145136, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2395 + }, + { + "clip_ratio": 0.0, + "completion_length": 915.6027221679688, + "epoch": 0.7157045777014412, + "grad_norm": 0.4174659550189972, + "kl": 0.418701171875, + "learning_rate": 4.52913977830859e-06, + "loss": 0.0204, + "reward": 0.6305803954601288, + "reward_std": 0.07974867522716522, + "rewards/accuracy_reward": 0.13169643771834671, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4988839328289032, + "step": 2396 + }, + { + "clip_ratio": 0.0, + "completion_length": 908.6205749511719, + "epoch": 0.7160032857889628, + "grad_norm": 2.3126697540283203, + "kl": 0.642578125, + "learning_rate": 4.520411835982612e-06, + "loss": 0.0529, + "reward": 0.6411830633878708, + "reward_std": 0.12692000716924667, + "rewards/accuracy_reward": 0.14508928963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 2397 + }, + { + "clip_ratio": 0.0, + "completion_length": 935.2612152099609, + "epoch": 0.7163019938764842, + "grad_norm": 0.7152077555656433, + "kl": 0.4326171875, + "learning_rate": 4.5116898549115025e-06, + "loss": 0.0136, + "reward": 0.6707589626312256, + "reward_std": 0.13085803017020226, + "rewards/accuracy_reward": 0.17410715157166123, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 2398 + }, + { + "clip_ratio": 0.0, + "completion_length": 920.1830749511719, + "epoch": 0.7166007019640057, + "grad_norm": 1.2751448154449463, + "kl": 0.55322265625, + "learning_rate": 4.502973844583914e-06, + "loss": 0.0001, + "reward": 0.580357164144516, + "reward_std": 0.1032504178583622, + "rewards/accuracy_reward": 0.08705357508733869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2399 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.8058471679688, + "epoch": 0.7168994100515271, + "grad_norm": 3.905773639678955, + "kl": 0.91796875, + "learning_rate": 4.494263814482018e-06, + "loss": 0.0354, + "reward": 0.681919664144516, + "reward_std": 0.08707257220521569, + "rewards/accuracy_reward": 0.18973215110599995, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875298023224, + "step": 2400 + }, + { + "clip_ratio": 0.0, + "completion_length": 979.9620819091797, + "epoch": 0.7171981181390487, + "grad_norm": 0.5282359719276428, + "kl": 0.61865234375, + "learning_rate": 4.485559774081475e-06, + "loss": 0.021, + "reward": 0.5920759290456772, + "reward_std": 0.07248491980135441, + "rewards/accuracy_reward": 0.09598214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 2401 + }, + { + "clip_ratio": 0.0, + "completion_length": 944.7924652099609, + "epoch": 0.7174968262265701, + "grad_norm": 1.6633405685424805, + "kl": 1.400390625, + "learning_rate": 4.47686173285142e-06, + "loss": 0.0503, + "reward": 0.538504496216774, + "reward_std": 0.09556227549910545, + "rewards/accuracy_reward": 0.04687500209547579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2402 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.8236999511719, + "epoch": 0.7177955343140916, + "grad_norm": 0.469196617603302, + "kl": 1.142578125, + "learning_rate": 4.4681697002544746e-06, + "loss": 0.038, + "reward": 0.632254496216774, + "reward_std": 0.11349723860621452, + "rewards/accuracy_reward": 0.13839286286383867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2403 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.6808319091797, + "epoch": 0.718094242401613, + "grad_norm": 0.7843976616859436, + "kl": 1.0595703125, + "learning_rate": 4.459483685746721e-06, + "loss": 0.0294, + "reward": 0.5217633992433548, + "reward_std": 0.08236956968903542, + "rewards/accuracy_reward": 0.026785715483129025, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2404 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.7321929931641, + "epoch": 0.7183929504891345, + "grad_norm": 0.7876648306846619, + "kl": 1.7998046875, + "learning_rate": 4.450803698777684e-06, + "loss": 0.0722, + "reward": 0.556919664144516, + "reward_std": 0.08045408502221107, + "rewards/accuracy_reward": 0.06473214505240321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2405 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.1138916015625, + "epoch": 0.718691658576656, + "grad_norm": 5.130773067474365, + "kl": 3.8125, + "learning_rate": 4.442129748790344e-06, + "loss": 0.146, + "reward": 0.6422991305589676, + "reward_std": 0.1080038771033287, + "rewards/accuracy_reward": 0.160714291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848395228386, + "step": 2406 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.7277221679688, + "epoch": 0.7189903666641775, + "grad_norm": 3.5503063201904297, + "kl": 1.962890625, + "learning_rate": 4.4334618452211065e-06, + "loss": 0.0777, + "reward": 0.5418527126312256, + "reward_std": 0.09487099945545197, + "rewards/accuracy_reward": 0.046875000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2407 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.2745971679688, + "epoch": 0.7192890747516989, + "grad_norm": 5.345459938049316, + "kl": 2.392578125, + "learning_rate": 4.424799997499803e-06, + "loss": 0.0955, + "reward": 0.5530134290456772, + "reward_std": 0.11028414033353329, + "rewards/accuracy_reward": 0.06026785750873387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2408 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.0156707763672, + "epoch": 0.7195877828392204, + "grad_norm": 3.883439540863037, + "kl": 2.578125, + "learning_rate": 4.416144215049677e-06, + "loss": 0.0972, + "reward": 0.5329241305589676, + "reward_std": 0.11866093426942825, + "rewards/accuracy_reward": 0.04464285750873387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2409 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.7812805175781, + "epoch": 0.7198864909267418, + "grad_norm": 5.723872661590576, + "kl": 3.3671875, + "learning_rate": 4.4074945072873655e-06, + "loss": 0.1455, + "reward": 0.5513393133878708, + "reward_std": 0.10071296524256468, + "rewards/accuracy_reward": 0.06473214598372579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 2410 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.3192443847656, + "epoch": 0.7201851990142634, + "grad_norm": 0.9828699231147766, + "kl": 1.94140625, + "learning_rate": 4.398850883622905e-06, + "loss": 0.0818, + "reward": 0.5156250298023224, + "reward_std": 0.09251939598470926, + "rewards/accuracy_reward": 0.024553572526201606, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2411 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.3192291259766, + "epoch": 0.7204839071017848, + "grad_norm": 1.8275272846221924, + "kl": 2.1884765625, + "learning_rate": 4.390213353459715e-06, + "loss": 0.089, + "reward": 0.6422991305589676, + "reward_std": 0.09330701734870672, + "rewards/accuracy_reward": 0.15401786915026605, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812649011612, + "step": 2412 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.3683624267578, + "epoch": 0.7207826151893062, + "grad_norm": 0.814882755279541, + "kl": 1.291015625, + "learning_rate": 4.381581926194575e-06, + "loss": 0.0611, + "reward": 0.5625000298023224, + "reward_std": 0.13645959459245205, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2413 + }, + { + "clip_ratio": 0.0, + "completion_length": 933.5513763427734, + "epoch": 0.7210813232768277, + "grad_norm": 1.5039067268371582, + "kl": 1.244140625, + "learning_rate": 4.372956611217638e-06, + "loss": 0.0367, + "reward": 0.5976562649011612, + "reward_std": 0.1216379776597023, + "rewards/accuracy_reward": 0.10937500302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2414 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.2902221679688, + "epoch": 0.7213800313643491, + "grad_norm": 2.077181816101074, + "kl": 1.068359375, + "learning_rate": 4.3643374179123955e-06, + "loss": 0.0495, + "reward": 0.636160746216774, + "reward_std": 0.11466504447162151, + "rewards/accuracy_reward": 0.14732143562287092, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 2415 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.5647735595703, + "epoch": 0.7216787394518707, + "grad_norm": 2.014249086380005, + "kl": 1.365234375, + "learning_rate": 4.355724355655688e-06, + "loss": 0.0545, + "reward": 0.624441996216774, + "reward_std": 0.08750997669994831, + "rewards/accuracy_reward": 0.1383928619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 2416 + }, + { + "clip_ratio": 0.0, + "completion_length": 933.5446929931641, + "epoch": 0.7219774475393921, + "grad_norm": 1.4421818256378174, + "kl": 0.60693359375, + "learning_rate": 4.347117433817687e-06, + "loss": 0.0114, + "reward": 0.5625000149011612, + "reward_std": 0.08421296183951199, + "rewards/accuracy_reward": 0.06696429033763707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357238650322, + "step": 2417 + }, + { + "clip_ratio": 0.0, + "completion_length": 923.6361999511719, + "epoch": 0.7222761556269136, + "grad_norm": 1.822360634803772, + "kl": 1.12890625, + "learning_rate": 4.3385166617618725e-06, + "loss": 0.0341, + "reward": 0.608258955180645, + "reward_std": 0.08537950925529003, + "rewards/accuracy_reward": 0.1160714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2418 + }, + { + "clip_ratio": 0.0, + "completion_length": 910.8326263427734, + "epoch": 0.722574863714435, + "grad_norm": 1.6865886449813843, + "kl": 0.73095703125, + "learning_rate": 4.329922048845044e-06, + "loss": 0.0304, + "reward": 0.647879496216774, + "reward_std": 0.0738262441009283, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616156578064, + "step": 2419 + }, + { + "clip_ratio": 0.0, + "completion_length": 915.3728332519531, + "epoch": 0.7228735718019565, + "grad_norm": 0.9978194236755371, + "kl": 1.010986328125, + "learning_rate": 4.3213336044173034e-06, + "loss": 0.0579, + "reward": 0.6171875447034836, + "reward_std": 0.12037363089621067, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875074505806, + "step": 2420 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.0982513427734, + "epoch": 0.723172279889478, + "grad_norm": 0.5213198661804199, + "kl": 0.9052734375, + "learning_rate": 4.312751337822027e-06, + "loss": 0.0284, + "reward": 0.6160714626312256, + "reward_std": 0.11194937769323587, + "rewards/accuracy_reward": 0.12276786379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2421 + }, + { + "clip_ratio": 0.0, + "completion_length": 937.4442443847656, + "epoch": 0.7234709879769995, + "grad_norm": 0.5718435049057007, + "kl": 1.2802734375, + "learning_rate": 4.304175258395887e-06, + "loss": 0.0485, + "reward": 0.531808078289032, + "reward_std": 0.08291592821478844, + "rewards/accuracy_reward": 0.040178574388846755, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2422 + }, + { + "clip_ratio": 0.0, + "completion_length": 944.5089721679688, + "epoch": 0.7237696960645209, + "grad_norm": 1.080756425857544, + "kl": 1.5986328125, + "learning_rate": 4.295605375468818e-06, + "loss": 0.0605, + "reward": 0.5502232313156128, + "reward_std": 0.07890149764716625, + "rewards/accuracy_reward": 0.060267857974395156, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2423 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.2879791259766, + "epoch": 0.7240684041520424, + "grad_norm": 0.8028028607368469, + "kl": 1.1806640625, + "learning_rate": 4.287041698364005e-06, + "loss": 0.0453, + "reward": 0.5797991305589676, + "reward_std": 0.11880149133503437, + "rewards/accuracy_reward": 0.08705357322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 2424 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.0982513427734, + "epoch": 0.7243671122395638, + "grad_norm": 0.9508116841316223, + "kl": 1.5068359375, + "learning_rate": 4.278484236397895e-06, + "loss": 0.0599, + "reward": 0.569754496216774, + "reward_std": 0.0761841181665659, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2425 + }, + { + "clip_ratio": 0.0, + "completion_length": 938.5625610351562, + "epoch": 0.7246658203270854, + "grad_norm": 0.3128349781036377, + "kl": 0.79345703125, + "learning_rate": 4.269932998880171e-06, + "loss": 0.0156, + "reward": 0.603794664144516, + "reward_std": 0.09558246936649084, + "rewards/accuracy_reward": 0.10937500302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2426 + }, + { + "clip_ratio": 0.0, + "completion_length": 928.591552734375, + "epoch": 0.7249645284146068, + "grad_norm": 0.6185355186462402, + "kl": 0.7998046875, + "learning_rate": 4.261387995113733e-06, + "loss": 0.0031, + "reward": 0.558035746216774, + "reward_std": 0.07617509830743074, + "rewards/accuracy_reward": 0.06250000116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2427 + }, + { + "clip_ratio": 0.0, + "completion_length": 934.4844207763672, + "epoch": 0.7252632365021283, + "grad_norm": 0.8361151814460754, + "kl": 1.2685546875, + "learning_rate": 4.2528492343947155e-06, + "loss": 0.0503, + "reward": 0.6300223469734192, + "reward_std": 0.12878869660198689, + "rewards/accuracy_reward": 0.13616072107106447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2428 + }, + { + "clip_ratio": 0.0, + "completion_length": 924.7210388183594, + "epoch": 0.7255619445896497, + "grad_norm": 0.4023713767528534, + "kl": 0.89697265625, + "learning_rate": 4.244316726012446e-06, + "loss": 0.0437, + "reward": 0.5719866305589676, + "reward_std": 0.06924588698893785, + "rewards/accuracy_reward": 0.07812500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2429 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.5915679931641, + "epoch": 0.7258606526771713, + "grad_norm": 0.8771995902061462, + "kl": 0.755859375, + "learning_rate": 4.2357904792494606e-06, + "loss": 0.0405, + "reward": 0.616629496216774, + "reward_std": 0.0958156893029809, + "rewards/accuracy_reward": 0.12053572316654027, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 2430 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.7701263427734, + "epoch": 0.7261593607646927, + "grad_norm": 0.5012146830558777, + "kl": 0.8896484375, + "learning_rate": 4.2272705033814854e-06, + "loss": 0.0454, + "reward": 0.5524553805589676, + "reward_std": 0.11567545123398304, + "rewards/accuracy_reward": 0.058035715483129025, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2431 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.9777221679688, + "epoch": 0.7264580688522142, + "grad_norm": 1.1017895936965942, + "kl": 1.0361328125, + "learning_rate": 4.21875680767741e-06, + "loss": 0.044, + "reward": 0.620535746216774, + "reward_std": 0.0769353280775249, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2432 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.7745971679688, + "epoch": 0.7267567769397356, + "grad_norm": 0.4317852258682251, + "kl": 0.73779296875, + "learning_rate": 4.210249401399305e-06, + "loss": 0.0248, + "reward": 0.5820312798023224, + "reward_std": 0.0701794596388936, + "rewards/accuracy_reward": 0.08705357555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 2433 + }, + { + "clip_ratio": 0.0, + "completion_length": 924.9620971679688, + "epoch": 0.7270554850272571, + "grad_norm": 1.075592279434204, + "kl": 0.842529296875, + "learning_rate": 4.201748293802398e-06, + "loss": 0.0305, + "reward": 0.5686384290456772, + "reward_std": 0.08641545381397009, + "rewards/accuracy_reward": 0.07366071850992739, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2434 + }, + { + "clip_ratio": 0.0, + "completion_length": 897.9933471679688, + "epoch": 0.7273541931147786, + "grad_norm": 0.5647106170654297, + "kl": 0.890625, + "learning_rate": 4.1932534941350545e-06, + "loss": 0.0469, + "reward": 0.5887277126312256, + "reward_std": 0.15365272760391235, + "rewards/accuracy_reward": 0.09375000488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2435 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.2745971679688, + "epoch": 0.7276529012023001, + "grad_norm": 0.4871516227722168, + "kl": 1.02099609375, + "learning_rate": 4.184765011638787e-06, + "loss": 0.0406, + "reward": 0.5479911044239998, + "reward_std": 0.07736439071595669, + "rewards/accuracy_reward": 0.053571431431919336, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2436 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.6027221679688, + "epoch": 0.7279516092898215, + "grad_norm": 1.3736978769302368, + "kl": 1.5205078125, + "learning_rate": 4.176282855548236e-06, + "loss": 0.0647, + "reward": 0.5368303880095482, + "reward_std": 0.06507209222763777, + "rewards/accuracy_reward": 0.04687500116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2437 + }, + { + "clip_ratio": 0.0, + "completion_length": 929.575927734375, + "epoch": 0.728250317377343, + "grad_norm": 0.5246115922927856, + "kl": 1.2060546875, + "learning_rate": 4.1678070350911496e-06, + "loss": 0.0571, + "reward": 0.5608259290456772, + "reward_std": 0.08690498117357492, + "rewards/accuracy_reward": 0.06696428754366934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2438 + }, + { + "clip_ratio": 0.0, + "completion_length": 920.1562957763672, + "epoch": 0.7285490254648644, + "grad_norm": 0.9574338793754578, + "kl": 1.119140625, + "learning_rate": 4.1593375594883955e-06, + "loss": 0.0354, + "reward": 0.6155134290456772, + "reward_std": 0.11670575849711895, + "rewards/accuracy_reward": 0.12276786309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 2439 + }, + { + "clip_ratio": 0.0, + "completion_length": 932.7299652099609, + "epoch": 0.728847733552386, + "grad_norm": 1.5229543447494507, + "kl": 1.666015625, + "learning_rate": 4.150874437953927e-06, + "loss": 0.0779, + "reward": 0.5809152126312256, + "reward_std": 0.11236829124391079, + "rewards/accuracy_reward": 0.08928571501746774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2440 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.3906707763672, + "epoch": 0.7291464416399074, + "grad_norm": 0.7885149717330933, + "kl": 1.0732421875, + "learning_rate": 4.142417679694794e-06, + "loss": 0.046, + "reward": 0.5976562798023224, + "reward_std": 0.07064646715298295, + "rewards/accuracy_reward": 0.10267857578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2441 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.544677734375, + "epoch": 0.7294451497274289, + "grad_norm": 1.0981320142745972, + "kl": 1.904296875, + "learning_rate": 4.133967293911124e-06, + "loss": 0.0761, + "reward": 0.6294643133878708, + "reward_std": 0.1557731293141842, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 2442 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.6451263427734, + "epoch": 0.7297438578149503, + "grad_norm": 1.5188324451446533, + "kl": 1.470703125, + "learning_rate": 4.1255232897961015e-06, + "loss": 0.0622, + "reward": 0.5915178954601288, + "reward_std": 0.09880052413791418, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2443 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.3616638183594, + "epoch": 0.7300425659024719, + "grad_norm": 0.7661213874816895, + "kl": 1.95703125, + "learning_rate": 4.117085676535979e-06, + "loss": 0.0876, + "reward": 0.5658482313156128, + "reward_std": 0.1262266654521227, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 2444 + }, + { + "clip_ratio": 0.0, + "completion_length": 932.2857666015625, + "epoch": 0.7303412739899933, + "grad_norm": 1.0166102647781372, + "kl": 1.6123046875, + "learning_rate": 4.108654463310058e-06, + "loss": 0.0726, + "reward": 0.5535714477300644, + "reward_std": 0.12863255105912685, + "rewards/accuracy_reward": 0.06250000139698386, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2445 + }, + { + "clip_ratio": 0.0, + "completion_length": 927.4464569091797, + "epoch": 0.7306399820775148, + "grad_norm": 2.0439910888671875, + "kl": 3.2421875, + "learning_rate": 4.100229659290662e-06, + "loss": 0.1486, + "reward": 0.5457589477300644, + "reward_std": 0.09980915114283562, + "rewards/accuracy_reward": 0.06473214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810268059372902, + "step": 2446 + }, + { + "clip_ratio": 0.0, + "completion_length": 979.8147735595703, + "epoch": 0.7309386901650362, + "grad_norm": 2.3577880859375, + "kl": 3.16015625, + "learning_rate": 4.091811273643157e-06, + "loss": 0.1234, + "reward": 0.5479911118745804, + "reward_std": 0.11558932811021805, + "rewards/accuracy_reward": 0.06473214598372579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589477300644, + "step": 2447 + }, + { + "clip_ratio": 0.0, + "completion_length": 914.4509429931641, + "epoch": 0.7312373982525577, + "grad_norm": 3.0842974185943604, + "kl": 3.44921875, + "learning_rate": 4.083399315525925e-06, + "loss": 0.1301, + "reward": 0.5245535895228386, + "reward_std": 0.12081265263259411, + "rewards/accuracy_reward": 0.04687500302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4776785895228386, + "step": 2448 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.9598541259766, + "epoch": 0.7315361063400792, + "grad_norm": 3.1124918460845947, + "kl": 3.51953125, + "learning_rate": 4.074993794090346e-06, + "loss": 0.1464, + "reward": 0.577566996216774, + "reward_std": 0.11155863851308823, + "rewards/accuracy_reward": 0.09821428847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.479352705180645, + "step": 2449 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.4665679931641, + "epoch": 0.7318348144276007, + "grad_norm": 2.8918423652648926, + "kl": 3.55859375, + "learning_rate": 4.066594718480805e-06, + "loss": 0.1357, + "reward": 0.5954241305589676, + "reward_std": 0.17482083663344383, + "rewards/accuracy_reward": 0.12276786309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4726562723517418, + "step": 2450 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.4866638183594, + "epoch": 0.7321335225151221, + "grad_norm": 0.9625928997993469, + "kl": 1.958984375, + "learning_rate": 4.058202097834679e-06, + "loss": 0.06, + "reward": 0.5909598544239998, + "reward_std": 0.08951194491237402, + "rewards/accuracy_reward": 0.10491071944124997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486049123108387, + "step": 2451 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.5647735595703, + "epoch": 0.7324322306026436, + "grad_norm": 1.1409050226211548, + "kl": 2.1201171875, + "learning_rate": 4.049815941282307e-06, + "loss": 0.0726, + "reward": 0.6037946715950966, + "reward_std": 0.09813809674233198, + "rewards/accuracy_reward": 0.1205357164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589477300644, + "step": 2452 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.0022735595703, + "epoch": 0.732730938690165, + "grad_norm": 1.4979455471038818, + "kl": 2.9609375, + "learning_rate": 4.041436257947015e-06, + "loss": 0.0938, + "reward": 0.6277901977300644, + "reward_std": 0.19013355299830437, + "rewards/accuracy_reward": 0.15401786752045155, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4737723395228386, + "step": 2453 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.5379943847656, + "epoch": 0.7330296467776866, + "grad_norm": 1.44451904296875, + "kl": 1.865234375, + "learning_rate": 4.033063056945067e-06, + "loss": 0.0823, + "reward": 0.5870536118745804, + "reward_std": 0.16334962658584118, + "rewards/accuracy_reward": 0.10491071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4821428805589676, + "step": 2454 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.4531555175781, + "epoch": 0.733328354865208, + "grad_norm": 1.4640569686889648, + "kl": 1.5078125, + "learning_rate": 4.0246963473856915e-06, + "loss": 0.0519, + "reward": 0.5708705633878708, + "reward_std": 0.16255970299243927, + "rewards/accuracy_reward": 0.0870535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169887661934, + "step": 2455 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.294677734375, + "epoch": 0.7336270629527294, + "grad_norm": 1.0804673433303833, + "kl": 2.6640625, + "learning_rate": 4.01633613837105e-06, + "loss": 0.0731, + "reward": 0.5714285969734192, + "reward_std": 0.17275798507034779, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4709821566939354, + "step": 2456 + }, + { + "clip_ratio": 0.0, + "completion_length": 935.4263763427734, + "epoch": 0.7339257710402509, + "grad_norm": 1.6305702924728394, + "kl": 1.451171875, + "learning_rate": 4.0079824389962255e-06, + "loss": 0.0506, + "reward": 0.6060268059372902, + "reward_std": 0.10347394645214081, + "rewards/accuracy_reward": 0.11607143469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2457 + }, + { + "clip_ratio": 0.0, + "completion_length": 928.2857666015625, + "epoch": 0.7342244791277723, + "grad_norm": 1.2008010149002075, + "kl": 2.6484375, + "learning_rate": 3.999635258349226e-06, + "loss": 0.1222, + "reward": 0.588169664144516, + "reward_std": 0.12086916901171207, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4743303805589676, + "step": 2458 + }, + { + "clip_ratio": 0.0, + "completion_length": 914.1384429931641, + "epoch": 0.7345231872152939, + "grad_norm": 1.3365281820297241, + "kl": 2.509765625, + "learning_rate": 3.991294605510969e-06, + "loss": 0.1466, + "reward": 0.5524553805589676, + "reward_std": 0.1303621083498001, + "rewards/accuracy_reward": 0.07142857275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810268133878708, + "step": 2459 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.4732666015625, + "epoch": 0.7348218953028153, + "grad_norm": 2.49928879737854, + "kl": 2.34375, + "learning_rate": 3.982960489555263e-06, + "loss": 0.1087, + "reward": 0.6099330633878708, + "reward_std": 0.13544081896543503, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330484867096, + "step": 2460 + }, + { + "clip_ratio": 0.0, + "completion_length": 937.3326263427734, + "epoch": 0.7351206033903368, + "grad_norm": 1.2989201545715332, + "kl": 1.85546875, + "learning_rate": 3.97463291954881e-06, + "loss": 0.0874, + "reward": 0.679129496216774, + "reward_std": 0.12117552570998669, + "rewards/accuracy_reward": 0.1941964365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 2461 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.2344207763672, + "epoch": 0.7354193114778582, + "grad_norm": 0.8361518979072571, + "kl": 1.654296875, + "learning_rate": 3.966311904551195e-06, + "loss": 0.0574, + "reward": 0.5502232313156128, + "reward_std": 0.11268411576747894, + "rewards/accuracy_reward": 0.06473214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 2462 + }, + { + "clip_ratio": 0.0, + "completion_length": 936.6384429931641, + "epoch": 0.7357180195653797, + "grad_norm": 0.5838207006454468, + "kl": 2.48046875, + "learning_rate": 3.957997453614859e-06, + "loss": 0.0929, + "reward": 0.7438616305589676, + "reward_std": 0.17944259382784367, + "rewards/accuracy_reward": 0.2678571492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4760044813156128, + "step": 2463 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.357177734375, + "epoch": 0.7360167276529012, + "grad_norm": 1.84611976146698, + "kl": 1.91015625, + "learning_rate": 3.949689575785114e-06, + "loss": 0.0729, + "reward": 0.5368303880095482, + "reward_std": 0.06528602447360754, + "rewards/accuracy_reward": 0.0513392873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 2464 + }, + { + "clip_ratio": 0.0, + "completion_length": 938.7299652099609, + "epoch": 0.7363154357404227, + "grad_norm": 1.5945247411727905, + "kl": 2.6953125, + "learning_rate": 3.94138828010012e-06, + "loss": 0.1147, + "reward": 0.5279018133878708, + "reward_std": 0.10466625913977623, + "rewards/accuracy_reward": 0.04910714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.478794664144516, + "step": 2465 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.9286193847656, + "epoch": 0.7366141438279441, + "grad_norm": 1.0073342323303223, + "kl": 2.85546875, + "learning_rate": 3.933093575590866e-06, + "loss": 0.0956, + "reward": 0.4933036044239998, + "reward_std": 0.09440141916275024, + "rewards/accuracy_reward": 0.01562500069849193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4776785969734192, + "step": 2466 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.8750610351562, + "epoch": 0.7369128519154656, + "grad_norm": 3.0678582191467285, + "kl": 2.115234375, + "learning_rate": 3.924805471281184e-06, + "loss": 0.0692, + "reward": 0.6785714626312256, + "reward_std": 0.07267277128994465, + "rewards/accuracy_reward": 0.196428582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4821428805589676, + "step": 2467 + }, + { + "clip_ratio": 0.0, + "completion_length": 920.8281707763672, + "epoch": 0.737211560002987, + "grad_norm": 1.4324976205825806, + "kl": 2.515625, + "learning_rate": 3.916523976187713e-06, + "loss": 0.0696, + "reward": 0.4921875149011612, + "reward_std": 0.10711454227566719, + "rewards/accuracy_reward": 0.01562500116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4765625223517418, + "step": 2468 + }, + { + "clip_ratio": 0.0, + "completion_length": 931.5736999511719, + "epoch": 0.7375102680905086, + "grad_norm": 0.8243094086647034, + "kl": 2.423828125, + "learning_rate": 3.90824909931991e-06, + "loss": 0.0843, + "reward": 0.5853794813156128, + "reward_std": 0.16979046538472176, + "rewards/accuracy_reward": 0.1026785746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482700914144516, + "step": 2469 + }, + { + "clip_ratio": 0.0, + "completion_length": 888.8616333007812, + "epoch": 0.73780897617803, + "grad_norm": 0.8172890543937683, + "kl": 2.54296875, + "learning_rate": 3.899980849680036e-06, + "loss": 0.0911, + "reward": 0.6065848469734192, + "reward_std": 0.13284173980355263, + "rewards/accuracy_reward": 0.12500000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4815848395228386, + "step": 2470 + }, + { + "clip_ratio": 0.0, + "completion_length": 916.9286041259766, + "epoch": 0.7381076842655515, + "grad_norm": 2.34430193901062, + "kl": 2.5458984375, + "learning_rate": 3.8917192362631285e-06, + "loss": 0.1006, + "reward": 0.5418527126312256, + "reward_std": 0.11794877052307129, + "rewards/accuracy_reward": 0.05803571664728224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169813156128, + "step": 2471 + }, + { + "clip_ratio": 0.0, + "completion_length": 922.8772583007812, + "epoch": 0.7384063923530729, + "grad_norm": 0.8023180365562439, + "kl": 2.42578125, + "learning_rate": 3.883464268057015e-06, + "loss": 0.0786, + "reward": 0.6746652126312256, + "reward_std": 0.09847716800868511, + "rewards/accuracy_reward": 0.18973214784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 2472 + }, + { + "clip_ratio": 0.0, + "completion_length": 921.5893249511719, + "epoch": 0.7387051004405945, + "grad_norm": 1.0689990520477295, + "kl": 2.5, + "learning_rate": 3.875215954042297e-06, + "loss": 0.0889, + "reward": 0.6037946790456772, + "reward_std": 0.10772062465548515, + "rewards/accuracy_reward": 0.12053571990691125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589402794838, + "step": 2473 + }, + { + "clip_ratio": 0.0, + "completion_length": 938.3080749511719, + "epoch": 0.7390038085281159, + "grad_norm": 1.690209150314331, + "kl": 2.560546875, + "learning_rate": 3.866974303192323e-06, + "loss": 0.1022, + "reward": 0.5904018133878708, + "reward_std": 0.05206897249445319, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810268059372902, + "step": 2474 + }, + { + "clip_ratio": 0.0, + "completion_length": 933.6451263427734, + "epoch": 0.7393025166156374, + "grad_norm": 1.3794306516647339, + "kl": 2.02734375, + "learning_rate": 3.858739324473208e-06, + "loss": 0.0659, + "reward": 0.557477705180645, + "reward_std": 0.11207528878003359, + "rewards/accuracy_reward": 0.07142857369035482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 2475 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.2879943847656, + "epoch": 0.7396012247031588, + "grad_norm": 0.8328306078910828, + "kl": 1.708984375, + "learning_rate": 3.850511026843802e-06, + "loss": 0.0707, + "reward": 0.5133928880095482, + "reward_std": 0.0736262509599328, + "rewards/accuracy_reward": 0.0267857164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4866071715950966, + "step": 2476 + }, + { + "clip_ratio": 0.0, + "completion_length": 907.9486999511719, + "epoch": 0.7398999327906803, + "grad_norm": 0.6826727390289307, + "kl": 2.30859375, + "learning_rate": 3.842289419255681e-06, + "loss": 0.0679, + "reward": 0.5647321790456772, + "reward_std": 0.17400727793574333, + "rewards/accuracy_reward": 0.08258928824216127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4821428805589676, + "step": 2477 + }, + { + "clip_ratio": 0.0, + "completion_length": 937.1228179931641, + "epoch": 0.7401986408782018, + "grad_norm": 1.9432952404022217, + "kl": 1.6220703125, + "learning_rate": 3.834074510653151e-06, + "loss": 0.0623, + "reward": 0.5106026977300644, + "reward_std": 0.10736207664012909, + "rewards/accuracy_reward": 0.02455357206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486049123108387, + "step": 2478 + }, + { + "clip_ratio": 0.0, + "completion_length": 934.3192291259766, + "epoch": 0.7404973489657233, + "grad_norm": 0.9117223024368286, + "kl": 1.484375, + "learning_rate": 3.8258663099732304e-06, + "loss": 0.0685, + "reward": 0.5982142984867096, + "reward_std": 0.12421995028853416, + "rewards/accuracy_reward": 0.10714286402799189, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2479 + }, + { + "clip_ratio": 0.0, + "completion_length": 935.4486999511719, + "epoch": 0.7407960570532447, + "grad_norm": 2.124189853668213, + "kl": 1.958984375, + "learning_rate": 3.817664826145633e-06, + "loss": 0.0742, + "reward": 0.584821455180645, + "reward_std": 0.10762601345777512, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750223517418, + "step": 2480 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.935302734375, + "epoch": 0.7410947651407662, + "grad_norm": 1.1673767566680908, + "kl": 0.92333984375, + "learning_rate": 3.809470068092772e-06, + "loss": 0.0324, + "reward": 0.6422991305589676, + "reward_std": 0.10792281478643417, + "rewards/accuracy_reward": 0.14732143562287092, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 2481 + }, + { + "clip_ratio": 0.0, + "completion_length": 906.0670013427734, + "epoch": 0.7413934732282876, + "grad_norm": 3.17400860786438, + "kl": 2.34375, + "learning_rate": 3.8012820447297384e-06, + "loss": 0.1002, + "reward": 0.6679687798023224, + "reward_std": 0.1427952367812395, + "rewards/accuracy_reward": 0.18303572200238705, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330633878708, + "step": 2482 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.7344055175781, + "epoch": 0.7416921813158092, + "grad_norm": 3.2224273681640625, + "kl": 2.169921875, + "learning_rate": 3.793100764964299e-06, + "loss": 0.0836, + "reward": 0.5496651977300644, + "reward_std": 0.06841737241484225, + "rewards/accuracy_reward": 0.06250000279396772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 2483 + }, + { + "clip_ratio": 0.0, + "completion_length": 913.5022583007812, + "epoch": 0.7419908894033306, + "grad_norm": 1.759329080581665, + "kl": 1.5625, + "learning_rate": 3.7849262376968897e-06, + "loss": 0.0492, + "reward": 0.571428582072258, + "reward_std": 0.09976426884531975, + "rewards/accuracy_reward": 0.08035714458674192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2484 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.0870971679688, + "epoch": 0.7422895974908521, + "grad_norm": 0.8366640210151672, + "kl": 2.12109375, + "learning_rate": 3.7767584718205875e-06, + "loss": 0.0766, + "reward": 0.5334821790456772, + "reward_std": 0.12761671654880047, + "rewards/accuracy_reward": 0.046875001629814506, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 2485 + }, + { + "clip_ratio": 0.0, + "completion_length": 929.0000305175781, + "epoch": 0.7425883055783735, + "grad_norm": 0.47319188714027405, + "kl": 1.5595703125, + "learning_rate": 3.768597476221125e-06, + "loss": 0.0675, + "reward": 0.5530134290456772, + "reward_std": 0.09460313059389591, + "rewards/accuracy_reward": 0.06473214761354029, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812649011612, + "step": 2486 + }, + { + "clip_ratio": 0.0, + "completion_length": 933.8795013427734, + "epoch": 0.742887013665895, + "grad_norm": 0.6250870227813721, + "kl": 1.466796875, + "learning_rate": 3.7604432597768693e-06, + "loss": 0.0437, + "reward": 0.5664062798023224, + "reward_std": 0.049982622265815735, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2487 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.2745971679688, + "epoch": 0.7431857217534165, + "grad_norm": 1.6321547031402588, + "kl": 1.0576171875, + "learning_rate": 3.7522958313587996e-06, + "loss": 0.0303, + "reward": 0.5446428805589676, + "reward_std": 0.06867929105646908, + "rewards/accuracy_reward": 0.051339289639145136, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 2488 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.0960388183594, + "epoch": 0.743484429840938, + "grad_norm": 1.1608474254608154, + "kl": 1.810546875, + "learning_rate": 3.744155199830526e-06, + "loss": 0.0707, + "reward": 0.6428571715950966, + "reward_std": 0.08564047142863274, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 2489 + }, + { + "clip_ratio": 0.0, + "completion_length": 931.8750305175781, + "epoch": 0.7437831379284594, + "grad_norm": 0.45524880290031433, + "kl": 1.615234375, + "learning_rate": 3.73602137404826e-06, + "loss": 0.0481, + "reward": 0.5407366305589676, + "reward_std": 0.0679548061452806, + "rewards/accuracy_reward": 0.04910714505240321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2490 + }, + { + "clip_ratio": 0.0, + "completion_length": 940.8460235595703, + "epoch": 0.7440818460159809, + "grad_norm": 0.7020501494407654, + "kl": 1.4453125, + "learning_rate": 3.727894362860799e-06, + "loss": 0.0537, + "reward": 0.5781250223517418, + "reward_std": 0.047928490210324526, + "rewards/accuracy_reward": 0.0870535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2491 + }, + { + "clip_ratio": 0.0, + "completion_length": 924.8304138183594, + "epoch": 0.7443805541035023, + "grad_norm": 3.256895065307617, + "kl": 2.40625, + "learning_rate": 3.7197741751095383e-06, + "loss": 0.0972, + "reward": 0.6004464626312256, + "reward_std": 0.09076566807925701, + "rewards/accuracy_reward": 0.11383928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 2492 + }, + { + "clip_ratio": 0.0, + "completion_length": 915.8549652099609, + "epoch": 0.7446792621910239, + "grad_norm": 2.126485824584961, + "kl": 1.6982421875, + "learning_rate": 3.711660819628451e-06, + "loss": 0.0344, + "reward": 0.5814732387661934, + "reward_std": 0.07449371740221977, + "rewards/accuracy_reward": 0.09151786309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2493 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.6205749511719, + "epoch": 0.7449779702785453, + "grad_norm": 1.9216333627700806, + "kl": 1.9091796875, + "learning_rate": 3.7035543052440646e-06, + "loss": 0.0799, + "reward": 0.6473214626312256, + "reward_std": 0.1222611628472805, + "rewards/accuracy_reward": 0.15848214970901608, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888392984867096, + "step": 2494 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.263427734375, + "epoch": 0.7452766783660668, + "grad_norm": 0.9625482559204102, + "kl": 1.8935546875, + "learning_rate": 3.69545464077548e-06, + "loss": 0.0789, + "reward": 0.5792410969734192, + "reward_std": 0.07480919826775789, + "rewards/accuracy_reward": 0.08928571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2495 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.9598693847656, + "epoch": 0.7455753864535882, + "grad_norm": 2.6230101585388184, + "kl": 2.0625, + "learning_rate": 3.68736183503433e-06, + "loss": 0.0898, + "reward": 0.641183078289032, + "reward_std": 0.1424654107540846, + "rewards/accuracy_reward": 0.1540178656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487165205180645, + "step": 2496 + }, + { + "clip_ratio": 0.0, + "completion_length": 940.2522735595703, + "epoch": 0.7458740945411098, + "grad_norm": 0.5065740942955017, + "kl": 2.0009765625, + "learning_rate": 3.6792758968247986e-06, + "loss": 0.0612, + "reward": 0.574776828289032, + "reward_std": 0.11497476883232594, + "rewards/accuracy_reward": 0.09151786123402417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.483258955180645, + "step": 2497 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.7522888183594, + "epoch": 0.7461728026286312, + "grad_norm": 0.47476136684417725, + "kl": 1.0615234375, + "learning_rate": 3.6711968349435988e-06, + "loss": 0.0341, + "reward": 0.5909598469734192, + "reward_std": 0.12118699448183179, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 2498 + }, + { + "clip_ratio": 0.0, + "completion_length": 928.5558471679688, + "epoch": 0.7464715107161526, + "grad_norm": 0.621989369392395, + "kl": 1.365234375, + "learning_rate": 3.6631246581799483e-06, + "loss": 0.0324, + "reward": 0.6026785969734192, + "reward_std": 0.10790305817499757, + "rewards/accuracy_reward": 0.11160714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2499 + }, + { + "clip_ratio": 0.0, + "completion_length": 931.138427734375, + "epoch": 0.7467702188036741, + "grad_norm": 1.419030785560608, + "kl": 1.65087890625, + "learning_rate": 3.6550593753155893e-06, + "loss": 0.0383, + "reward": 0.5904017984867096, + "reward_std": 0.1668499242514372, + "rewards/accuracy_reward": 0.10267857578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 2500 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.4018249511719, + "epoch": 0.7470689268911955, + "grad_norm": 1.4103344678878784, + "kl": 1.0517578125, + "learning_rate": 3.647000995124763e-06, + "loss": 0.0555, + "reward": 0.5870535969734192, + "reward_std": 0.13334079086780548, + "rewards/accuracy_reward": 0.09375000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2501 + }, + { + "clip_ratio": 0.0, + "completion_length": 893.7589874267578, + "epoch": 0.747367634978717, + "grad_norm": 1.9805729389190674, + "kl": 1.123046875, + "learning_rate": 3.6389495263741894e-06, + "loss": 0.0515, + "reward": 0.644531287252903, + "reward_std": 0.10169443488121033, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812649011612, + "step": 2502 + }, + { + "clip_ratio": 0.0, + "completion_length": 912.9152221679688, + "epoch": 0.7476663430662385, + "grad_norm": 1.7813571691513062, + "kl": 1.26171875, + "learning_rate": 3.6309049778230822e-06, + "loss": 0.086, + "reward": 0.6746651977300644, + "reward_std": 0.127302554436028, + "rewards/accuracy_reward": 0.1852678693830967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2503 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.7232513427734, + "epoch": 0.74796505115376, + "grad_norm": 0.31133949756622314, + "kl": 0.79931640625, + "learning_rate": 3.622867358223122e-06, + "loss": 0.0301, + "reward": 0.5703125298023224, + "reward_std": 0.13917432352900505, + "rewards/accuracy_reward": 0.0758928619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2504 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.9620819091797, + "epoch": 0.7482637592412814, + "grad_norm": 0.8940873742103577, + "kl": 1.3447265625, + "learning_rate": 3.6148366763184485e-06, + "loss": 0.0512, + "reward": 0.5234375298023224, + "reward_std": 0.08233908377587795, + "rewards/accuracy_reward": 0.03125000139698386, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2505 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.6049499511719, + "epoch": 0.7485624673288029, + "grad_norm": 0.9472060203552246, + "kl": 1.3681640625, + "learning_rate": 3.6068129408456564e-06, + "loss": 0.0584, + "reward": 0.5641741305589676, + "reward_std": 0.12595845572650433, + "rewards/accuracy_reward": 0.07366071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 2506 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.9933471679688, + "epoch": 0.7488611754163244, + "grad_norm": 0.6041693091392517, + "kl": 0.8837890625, + "learning_rate": 3.5987961605337894e-06, + "loss": 0.0341, + "reward": 0.5546875298023224, + "reward_std": 0.07351373066194355, + "rewards/accuracy_reward": 0.0602678582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2507 + }, + { + "clip_ratio": 0.0, + "completion_length": 919.1986999511719, + "epoch": 0.7491598835038459, + "grad_norm": 2.4849231243133545, + "kl": 1.4609375, + "learning_rate": 3.5907863441043113e-06, + "loss": 0.0766, + "reward": 0.6272321790456772, + "reward_std": 0.15541251376271248, + "rewards/accuracy_reward": 0.13839286752045155, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 2508 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.7545166015625, + "epoch": 0.7494585915913673, + "grad_norm": 0.5542618632316589, + "kl": 1.2734375, + "learning_rate": 3.582783500271122e-06, + "loss": 0.0555, + "reward": 0.5457589477300644, + "reward_std": 0.11370484391227365, + "rewards/accuracy_reward": 0.05357142956927419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2509 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.544677734375, + "epoch": 0.7497572996788888, + "grad_norm": 2.169302463531494, + "kl": 1.32958984375, + "learning_rate": 3.574787637740528e-06, + "loss": 0.0461, + "reward": 0.5574776977300644, + "reward_std": 0.07519912905991077, + "rewards/accuracy_reward": 0.06919643143191934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812649011612, + "step": 2510 + }, + { + "clip_ratio": 0.0, + "completion_length": 933.5067443847656, + "epoch": 0.7500560077664102, + "grad_norm": 1.1334092617034912, + "kl": 1.361328125, + "learning_rate": 3.5667987652112445e-06, + "loss": 0.0695, + "reward": 0.6065848469734192, + "reward_std": 0.14777579717338085, + "rewards/accuracy_reward": 0.11830357555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2511 + }, + { + "clip_ratio": 0.0, + "completion_length": 924.4420013427734, + "epoch": 0.7503547158539318, + "grad_norm": 0.6295472383499146, + "kl": 1.7138671875, + "learning_rate": 3.558816891374387e-06, + "loss": 0.105, + "reward": 0.6300223618745804, + "reward_std": 0.10554848052561283, + "rewards/accuracy_reward": 0.14062500977888703, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2512 + }, + { + "clip_ratio": 0.0, + "completion_length": 935.9710083007812, + "epoch": 0.7506534239414532, + "grad_norm": 0.8904004096984863, + "kl": 1.55078125, + "learning_rate": 3.5508420249134432e-06, + "loss": 0.0595, + "reward": 0.6143973544239998, + "reward_std": 0.12503691669553518, + "rewards/accuracy_reward": 0.1227678656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2513 + }, + { + "clip_ratio": 0.0, + "completion_length": 926.5513916015625, + "epoch": 0.7509521320289747, + "grad_norm": 1.3499257564544678, + "kl": 2.0078125, + "learning_rate": 3.5428741745042926e-06, + "loss": 0.0916, + "reward": 0.537388414144516, + "reward_std": 0.11302725411951542, + "rewards/accuracy_reward": 0.05357143026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169887661934, + "step": 2514 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.654052734375, + "epoch": 0.7512508401164961, + "grad_norm": 0.5216354131698608, + "kl": 1.0341796875, + "learning_rate": 3.5349133488151764e-06, + "loss": 0.0506, + "reward": 0.5691964477300644, + "reward_std": 0.11180362291634083, + "rewards/accuracy_reward": 0.07366071548312902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2515 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.8995819091797, + "epoch": 0.7515495482040176, + "grad_norm": 1.9187862873077393, + "kl": 1.3349609375, + "learning_rate": 3.526959556506687e-06, + "loss": 0.0597, + "reward": 0.5608258992433548, + "reward_std": 0.0661222655326128, + "rewards/accuracy_reward": 0.06696428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616156578064, + "step": 2516 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.5781555175781, + "epoch": 0.7518482562915391, + "grad_norm": 0.40121889114379883, + "kl": 1.0400390625, + "learning_rate": 3.5190128062317742e-06, + "loss": 0.0477, + "reward": 0.5641741305589676, + "reward_std": 0.09289886243641376, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2517 + }, + { + "clip_ratio": 0.0, + "completion_length": 958.5960388183594, + "epoch": 0.7521469643790606, + "grad_norm": 0.6619483232498169, + "kl": 1.916015625, + "learning_rate": 3.5110731066357264e-06, + "loss": 0.0759, + "reward": 0.6467634290456772, + "reward_std": 0.12384060397744179, + "rewards/accuracy_reward": 0.1607142947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 2518 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.0736846923828, + "epoch": 0.752445672466582, + "grad_norm": 1.4373509883880615, + "kl": 1.3544921875, + "learning_rate": 3.5031404663561507e-06, + "loss": 0.0489, + "reward": 0.572544664144516, + "reward_std": 0.0751924216747284, + "rewards/accuracy_reward": 0.08035714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2519 + }, + { + "clip_ratio": 0.0, + "completion_length": 973.2701416015625, + "epoch": 0.7527443805541035, + "grad_norm": 1.1344735622406006, + "kl": 1.5068359375, + "learning_rate": 3.4952148940229915e-06, + "loss": 0.0387, + "reward": 0.5747767984867096, + "reward_std": 0.0657216627150774, + "rewards/accuracy_reward": 0.08482143143191934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2520 + }, + { + "clip_ratio": 0.0, + "completion_length": 940.825927734375, + "epoch": 0.753043088641625, + "grad_norm": 0.6264263987541199, + "kl": 1.173828125, + "learning_rate": 3.4872963982584873e-06, + "loss": 0.0619, + "reward": 0.592075914144516, + "reward_std": 0.10940453037619591, + "rewards/accuracy_reward": 0.09821429057046771, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2521 + }, + { + "clip_ratio": 0.0, + "completion_length": 979.0848541259766, + "epoch": 0.7533417967291465, + "grad_norm": 0.6222893595695496, + "kl": 1.01025390625, + "learning_rate": 3.4793849876771867e-06, + "loss": 0.0326, + "reward": 0.6160714626312256, + "reward_std": 0.1334065180271864, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493303582072258, + "step": 2522 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.9911193847656, + "epoch": 0.7536405048166679, + "grad_norm": 2.0116655826568604, + "kl": 1.744140625, + "learning_rate": 3.471480670885935e-06, + "loss": 0.0761, + "reward": 0.6478794813156128, + "reward_std": 0.18454236164689064, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330633878708, + "step": 2523 + }, + { + "clip_ratio": 0.0, + "completion_length": 958.3192291259766, + "epoch": 0.7539392129041894, + "grad_norm": 1.7390755414962769, + "kl": 0.9951171875, + "learning_rate": 3.4635834564838467e-06, + "loss": 0.0451, + "reward": 0.560825914144516, + "reward_std": 0.08136337203904986, + "rewards/accuracy_reward": 0.06919643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2524 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.0803985595703, + "epoch": 0.7542379209917108, + "grad_norm": 0.9953936338424683, + "kl": 1.2255859375, + "learning_rate": 3.4556933530623193e-06, + "loss": 0.0454, + "reward": 0.6612723618745804, + "reward_std": 0.12355076149106026, + "rewards/accuracy_reward": 0.16964286658912897, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2525 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.2143249511719, + "epoch": 0.7545366290792324, + "grad_norm": 0.8600478172302246, + "kl": 1.134765625, + "learning_rate": 3.4478103692050168e-06, + "loss": 0.0388, + "reward": 0.5452009290456772, + "reward_std": 0.06716622924432158, + "rewards/accuracy_reward": 0.051339288242161274, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2526 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.4375610351562, + "epoch": 0.7548353371667538, + "grad_norm": 0.927910566329956, + "kl": 1.666015625, + "learning_rate": 3.439934513487845e-06, + "loss": 0.0543, + "reward": 0.7645089626312256, + "reward_std": 0.10296139307320118, + "rewards/accuracy_reward": 0.2745535857975483, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2527 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.8303985595703, + "epoch": 0.7551340452542753, + "grad_norm": 0.7413008213043213, + "kl": 1.6015625, + "learning_rate": 3.432065794478967e-06, + "loss": 0.0588, + "reward": 0.5736607313156128, + "reward_std": 0.057988185435533524, + "rewards/accuracy_reward": 0.08258928940631449, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2528 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.1272888183594, + "epoch": 0.7554327533417967, + "grad_norm": 1.730878472328186, + "kl": 1.552734375, + "learning_rate": 3.4242042207387815e-06, + "loss": 0.0599, + "reward": 0.5306919887661934, + "reward_std": 0.043680332601070404, + "rewards/accuracy_reward": 0.04017857206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2529 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.1562957763672, + "epoch": 0.7557314614293182, + "grad_norm": 1.5828449726104736, + "kl": 1.6083984375, + "learning_rate": 3.4163498008199038e-06, + "loss": 0.0439, + "reward": 0.5825893133878708, + "reward_std": 0.10113639011979103, + "rewards/accuracy_reward": 0.08928571944124997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2530 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.3348693847656, + "epoch": 0.7560301695168397, + "grad_norm": 0.7216354012489319, + "kl": 1.2412109375, + "learning_rate": 3.4085025432671746e-06, + "loss": 0.057, + "reward": 0.5708705633878708, + "reward_std": 0.07789031602442265, + "rewards/accuracy_reward": 0.07589286100119352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2531 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.9687957763672, + "epoch": 0.7563288776043612, + "grad_norm": 0.9103665351867676, + "kl": 1.5361328125, + "learning_rate": 3.400662456617646e-06, + "loss": 0.0477, + "reward": 0.5206473618745804, + "reward_std": 0.10940778627991676, + "rewards/accuracy_reward": 0.031250001629814506, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2532 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.9397735595703, + "epoch": 0.7566275856918826, + "grad_norm": 0.7875356078147888, + "kl": 1.451904296875, + "learning_rate": 3.392829549400557e-06, + "loss": 0.0729, + "reward": 0.6026786118745804, + "reward_std": 0.09866566490381956, + "rewards/accuracy_reward": 0.11160714877769351, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2533 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.1451416015625, + "epoch": 0.7569262937794041, + "grad_norm": 0.5831905603408813, + "kl": 1.19091796875, + "learning_rate": 3.385003830137349e-06, + "loss": 0.0594, + "reward": 0.5870535969734192, + "reward_std": 0.13585089705884457, + "rewards/accuracy_reward": 0.09598214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2534 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.7924499511719, + "epoch": 0.7572250018669255, + "grad_norm": 0.6846439242362976, + "kl": 1.337890625, + "learning_rate": 3.3771853073416306e-06, + "loss": 0.0483, + "reward": 0.6534598469734192, + "reward_std": 0.1418961868621409, + "rewards/accuracy_reward": 0.1629464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2535 + }, + { + "clip_ratio": 0.0, + "completion_length": 973.3973541259766, + "epoch": 0.7575237099544471, + "grad_norm": 0.7304684519767761, + "kl": 1.390625, + "learning_rate": 3.3693739895191934e-06, + "loss": 0.0485, + "reward": 0.6149553954601288, + "reward_std": 0.14723478630185127, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2536 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.9643249511719, + "epoch": 0.7578224180419685, + "grad_norm": 1.5256106853485107, + "kl": 1.419921875, + "learning_rate": 3.3615698851679866e-06, + "loss": 0.051, + "reward": 0.518415205180645, + "reward_std": 0.10913025215268135, + "rewards/accuracy_reward": 0.03125000116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 2537 + }, + { + "clip_ratio": 0.0, + "completion_length": 973.8281707763672, + "epoch": 0.75812112612949, + "grad_norm": 2.2853236198425293, + "kl": 1.427734375, + "learning_rate": 3.3537730027781057e-06, + "loss": 0.0647, + "reward": 0.6835937947034836, + "reward_std": 0.18477307632565498, + "rewards/accuracy_reward": 0.19642857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 2538 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.1138916015625, + "epoch": 0.7584198342170114, + "grad_norm": 1.0895369052886963, + "kl": 1.00390625, + "learning_rate": 3.3459833508317984e-06, + "loss": 0.0258, + "reward": 0.5976562798023224, + "reward_std": 0.1266792882233858, + "rewards/accuracy_reward": 0.10937500605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2539 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.8817443847656, + "epoch": 0.758718542304533, + "grad_norm": 0.96067214012146, + "kl": 1.310546875, + "learning_rate": 3.338200937803444e-06, + "loss": 0.0386, + "reward": 0.5915178954601288, + "reward_std": 0.09295538812875748, + "rewards/accuracy_reward": 0.10044643399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2540 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.263427734375, + "epoch": 0.7590172503920544, + "grad_norm": 0.5116685628890991, + "kl": 0.784423828125, + "learning_rate": 3.33042577215954e-06, + "loss": 0.0197, + "reward": 0.5887277126312256, + "reward_std": 0.09768202714622021, + "rewards/accuracy_reward": 0.09375000582076609, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2541 + }, + { + "clip_ratio": 0.0, + "completion_length": 940.6116485595703, + "epoch": 0.7593159584795758, + "grad_norm": 0.9523541331291199, + "kl": 0.69384765625, + "learning_rate": 3.322657862358707e-06, + "loss": 0.0379, + "reward": 0.6858259290456772, + "reward_std": 0.14320833794772625, + "rewards/accuracy_reward": 0.1897321529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 2542 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.3705902099609, + "epoch": 0.7596146665670973, + "grad_norm": 0.5612021684646606, + "kl": 1.3662109375, + "learning_rate": 3.3148972168516737e-06, + "loss": 0.042, + "reward": 0.534598246216774, + "reward_std": 0.06208943761885166, + "rewards/accuracy_reward": 0.04464285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2543 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.1205749511719, + "epoch": 0.7599133746546187, + "grad_norm": 0.5806952714920044, + "kl": 0.8466796875, + "learning_rate": 3.307143844081253e-06, + "loss": 0.0272, + "reward": 0.5172991305589676, + "reward_std": 0.07753591425716877, + "rewards/accuracy_reward": 0.024553572526201606, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2544 + }, + { + "clip_ratio": 0.0, + "completion_length": 967.1808471679688, + "epoch": 0.7602120827421402, + "grad_norm": 1.4378899335861206, + "kl": 1.96875, + "learning_rate": 3.2993977524823616e-06, + "loss": 0.0906, + "reward": 0.6344866305589676, + "reward_std": 0.1593756526708603, + "rewards/accuracy_reward": 0.14955358020961285, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 2545 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.6004943847656, + "epoch": 0.7605107908296617, + "grad_norm": 0.899006724357605, + "kl": 1.341796875, + "learning_rate": 3.2916589504819886e-06, + "loss": 0.0663, + "reward": 0.5998884290456772, + "reward_std": 0.10773398634046316, + "rewards/accuracy_reward": 0.11160714644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812649011612, + "step": 2546 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.1161193847656, + "epoch": 0.7608094989171832, + "grad_norm": 1.1622257232666016, + "kl": 1.4599609375, + "learning_rate": 3.2839274464991856e-06, + "loss": 0.0478, + "reward": 0.6501116305589676, + "reward_std": 0.08556616911664605, + "rewards/accuracy_reward": 0.15625000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2547 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.7433471679688, + "epoch": 0.7611082070047046, + "grad_norm": 0.6875313520431519, + "kl": 1.01171875, + "learning_rate": 3.276203248945078e-06, + "loss": 0.0467, + "reward": 0.624441996216774, + "reward_std": 0.07112700119614601, + "rewards/accuracy_reward": 0.13169643143191934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2548 + }, + { + "clip_ratio": 0.0, + "completion_length": 944.1518249511719, + "epoch": 0.7614069150922261, + "grad_norm": 1.0792595148086548, + "kl": 1.2451171875, + "learning_rate": 3.2684863662228307e-06, + "loss": 0.0585, + "reward": 0.631138414144516, + "reward_std": 0.14024658547714353, + "rewards/accuracy_reward": 0.1383928656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 2549 + }, + { + "clip_ratio": 0.0, + "completion_length": 944.0647583007812, + "epoch": 0.7617056231797475, + "grad_norm": 0.7971240878105164, + "kl": 0.773681640625, + "learning_rate": 3.260776806727657e-06, + "loss": 0.0308, + "reward": 0.5267857313156128, + "reward_std": 0.05805951543152332, + "rewards/accuracy_reward": 0.03125000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2550 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.841552734375, + "epoch": 0.7620043312672691, + "grad_norm": 0.38613075017929077, + "kl": 0.83154296875, + "learning_rate": 3.2530745788468052e-06, + "loss": 0.0143, + "reward": 0.6026785969734192, + "reward_std": 0.11073778197169304, + "rewards/accuracy_reward": 0.10714285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2551 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.2455749511719, + "epoch": 0.7623030393547905, + "grad_norm": 0.9856998324394226, + "kl": 1.533203125, + "learning_rate": 3.2453796909595394e-06, + "loss": 0.0792, + "reward": 0.6121652126312256, + "reward_std": 0.15273778326809406, + "rewards/accuracy_reward": 0.12500000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487165205180645, + "step": 2552 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.7366638183594, + "epoch": 0.762601747442312, + "grad_norm": 0.8024538159370422, + "kl": 1.416015625, + "learning_rate": 3.237692151437146e-06, + "loss": 0.0258, + "reward": 0.5920759066939354, + "reward_std": 0.11607392271980643, + "rewards/accuracy_reward": 0.10267857951112092, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2553 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.0112152099609, + "epoch": 0.7629004555298334, + "grad_norm": 1.062017560005188, + "kl": 0.9453125, + "learning_rate": 3.2300119686429177e-06, + "loss": 0.0175, + "reward": 0.554129496216774, + "reward_std": 0.1286082249134779, + "rewards/accuracy_reward": 0.06250000419095159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2554 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.6808471679688, + "epoch": 0.763199163617355, + "grad_norm": 0.47602346539497375, + "kl": 0.669921875, + "learning_rate": 3.2223391509321335e-06, + "loss": 0.0238, + "reward": 0.6724330484867096, + "reward_std": 0.11634750291705132, + "rewards/accuracy_reward": 0.176339291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 2555 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.3437957763672, + "epoch": 0.7634978717048764, + "grad_norm": 0.8434492945671082, + "kl": 0.9287109375, + "learning_rate": 3.2146737066520705e-06, + "loss": 0.0287, + "reward": 0.5915178954601288, + "reward_std": 0.11293264105916023, + "rewards/accuracy_reward": 0.09821429289877415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2556 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.6250457763672, + "epoch": 0.7637965797923979, + "grad_norm": 1.4155141115188599, + "kl": 1.23828125, + "learning_rate": 3.2070156441419864e-06, + "loss": 0.0512, + "reward": 0.5797991156578064, + "reward_std": 0.12926893681287766, + "rewards/accuracy_reward": 0.08928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 2557 + }, + { + "clip_ratio": 0.0, + "completion_length": 997.8147888183594, + "epoch": 0.7640952878799193, + "grad_norm": 1.1922924518585205, + "kl": 1.015625, + "learning_rate": 3.199364971733092e-06, + "loss": 0.0376, + "reward": 0.6127232313156128, + "reward_std": 0.0929373325780034, + "rewards/accuracy_reward": 0.1183035783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2558 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.5022735595703, + "epoch": 0.7643939959674408, + "grad_norm": 1.3451001644134521, + "kl": 0.653076171875, + "learning_rate": 3.1917216977485765e-06, + "loss": 0.0296, + "reward": 0.5167410969734192, + "reward_std": 0.05980470450595021, + "rewards/accuracy_reward": 0.02008928661234677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 2559 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.1875457763672, + "epoch": 0.7646927040549623, + "grad_norm": 0.5320243239402771, + "kl": 0.9619140625, + "learning_rate": 3.1840858305035727e-06, + "loss": 0.0402, + "reward": 0.6590402126312256, + "reward_std": 0.09581681271083653, + "rewards/accuracy_reward": 0.1651785783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2560 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.1250610351562, + "epoch": 0.7649914121424838, + "grad_norm": 0.5176110863685608, + "kl": 1.517578125, + "learning_rate": 3.176457378305151e-06, + "loss": 0.0467, + "reward": 0.6127232313156128, + "reward_std": 0.10163722885772586, + "rewards/accuracy_reward": 0.12276786495931447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2561 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.3884429931641, + "epoch": 0.7652901202300052, + "grad_norm": 0.9152581095695496, + "kl": 1.4599609375, + "learning_rate": 3.1688363494523267e-06, + "loss": 0.0518, + "reward": 0.5597098618745804, + "reward_std": 0.10274068266153336, + "rewards/accuracy_reward": 0.06919643213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2562 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.1049652099609, + "epoch": 0.7655888283175267, + "grad_norm": 1.9606372117996216, + "kl": 1.5888671875, + "learning_rate": 3.161222752236024e-06, + "loss": 0.0692, + "reward": 0.594866082072258, + "reward_std": 0.11959850043058395, + "rewards/accuracy_reward": 0.10491071688011289, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2563 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.5089569091797, + "epoch": 0.7658875364050481, + "grad_norm": 2.6834774017333984, + "kl": 1.146484375, + "learning_rate": 3.1536165949390953e-06, + "loss": 0.0509, + "reward": 0.577566996216774, + "reward_std": 0.10757166519761086, + "rewards/accuracy_reward": 0.08482143259607255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 2564 + }, + { + "clip_ratio": 0.0, + "completion_length": 973.3504791259766, + "epoch": 0.7661862444925697, + "grad_norm": 2.3919901847839355, + "kl": 1.310546875, + "learning_rate": 3.1460178858362955e-06, + "loss": 0.0565, + "reward": 0.5139509215950966, + "reward_std": 0.09956280142068863, + "rewards/accuracy_reward": 0.02455357206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2565 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.8170013427734, + "epoch": 0.7664849525800911, + "grad_norm": 1.2876585721969604, + "kl": 1.369140625, + "learning_rate": 3.1384266331942715e-06, + "loss": 0.046, + "reward": 0.5820312723517418, + "reward_std": 0.08669689670205116, + "rewards/accuracy_reward": 0.09598214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 2566 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.3594207763672, + "epoch": 0.7667836606676126, + "grad_norm": 0.8304136991500854, + "kl": 1.5732421875, + "learning_rate": 3.1308428452715643e-06, + "loss": 0.0446, + "reward": 0.6071428954601288, + "reward_std": 0.14281130209565163, + "rewards/accuracy_reward": 0.1160714328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2567 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.1250457763672, + "epoch": 0.767082368755134, + "grad_norm": 1.9785131216049194, + "kl": 2.0234375, + "learning_rate": 3.123266530318594e-06, + "loss": 0.0684, + "reward": 0.597098246216774, + "reward_std": 0.12188026029616594, + "rewards/accuracy_reward": 0.10937500651925802, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 2568 + }, + { + "clip_ratio": 0.0, + "completion_length": 967.2053985595703, + "epoch": 0.7673810768426556, + "grad_norm": 0.42311036586761475, + "kl": 1.2119140625, + "learning_rate": 3.115697696577644e-06, + "loss": 0.059, + "reward": 0.6489955633878708, + "reward_std": 0.16379360668361187, + "rewards/accuracy_reward": 0.1584821529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2569 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.7120971679688, + "epoch": 0.767679784930177, + "grad_norm": 1.1748348474502563, + "kl": 1.3828125, + "learning_rate": 3.1081363522828655e-06, + "loss": 0.029, + "reward": 0.5976562798023224, + "reward_std": 0.08892642520368099, + "rewards/accuracy_reward": 0.10714286309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2570 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.3393249511719, + "epoch": 0.7679784930176985, + "grad_norm": 0.6002743244171143, + "kl": 1.42236328125, + "learning_rate": 3.1005825056602634e-06, + "loss": 0.0474, + "reward": 0.576450914144516, + "reward_std": 0.13146694004535675, + "rewards/accuracy_reward": 0.08705357694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2571 + }, + { + "clip_ratio": 0.0, + "completion_length": 979.0491485595703, + "epoch": 0.7682772011052199, + "grad_norm": 0.53298020362854, + "kl": 1.4140625, + "learning_rate": 3.0930361649276774e-06, + "loss": 0.033, + "reward": 0.5803571790456772, + "reward_std": 0.10317645408213139, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 2572 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.4754943847656, + "epoch": 0.7685759091927414, + "grad_norm": 0.6237444877624512, + "kl": 1.05859375, + "learning_rate": 3.0854973382947884e-06, + "loss": 0.0348, + "reward": 0.5496651977300644, + "reward_std": 0.0812270911410451, + "rewards/accuracy_reward": 0.05580357555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2573 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.7031555175781, + "epoch": 0.7688746172802629, + "grad_norm": 0.8324204087257385, + "kl": 1.822265625, + "learning_rate": 3.0779660339631035e-06, + "loss": 0.0522, + "reward": 0.553013414144516, + "reward_std": 0.09834026172757149, + "rewards/accuracy_reward": 0.06696428963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 2574 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.3348541259766, + "epoch": 0.7691733253677844, + "grad_norm": 1.1858683824539185, + "kl": 0.9912109375, + "learning_rate": 3.070442260125939e-06, + "loss": 0.0302, + "reward": 0.660156287252903, + "reward_std": 0.09571161703206599, + "rewards/accuracy_reward": 0.1674107238650322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2575 + }, + { + "clip_ratio": 0.0, + "completion_length": 958.2254943847656, + "epoch": 0.7694720334553058, + "grad_norm": 1.0797185897827148, + "kl": 2.22265625, + "learning_rate": 3.0629260249684288e-06, + "loss": 0.0914, + "reward": 0.541294664144516, + "reward_std": 0.10352344531565905, + "rewards/accuracy_reward": 0.05580357392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 2576 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.2053985595703, + "epoch": 0.7697707415428273, + "grad_norm": 0.9822916984558105, + "kl": 0.9951171875, + "learning_rate": 3.0554173366674944e-06, + "loss": 0.033, + "reward": 0.5959821790456772, + "reward_std": 0.12659474276006222, + "rewards/accuracy_reward": 0.10267857741564512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493303582072258, + "step": 2577 + }, + { + "clip_ratio": 0.0, + "completion_length": 970.6183471679688, + "epoch": 0.7700694496303487, + "grad_norm": 0.47037944197654724, + "kl": 1.5703125, + "learning_rate": 3.0479162033918553e-06, + "loss": 0.056, + "reward": 0.5664062798023224, + "reward_std": 0.033434624783694744, + "rewards/accuracy_reward": 0.07366071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 2578 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.3482666015625, + "epoch": 0.7703681577178703, + "grad_norm": 0.3863815367221832, + "kl": 1.8515625, + "learning_rate": 3.0404226333020117e-06, + "loss": 0.0577, + "reward": 0.5574776977300644, + "reward_std": 0.07423304114490747, + "rewards/accuracy_reward": 0.06919643259607255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2579 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.4062805175781, + "epoch": 0.7706668658053917, + "grad_norm": 1.0012288093566895, + "kl": 1.189453125, + "learning_rate": 3.0329366345502287e-06, + "loss": 0.0458, + "reward": 0.5524553805589676, + "reward_std": 0.06779463891871274, + "rewards/accuracy_reward": 0.06026785960420966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2580 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.3393402099609, + "epoch": 0.7709655738929132, + "grad_norm": 1.0988041162490845, + "kl": 1.427734375, + "learning_rate": 3.025458215280542e-06, + "loss": 0.0296, + "reward": 0.6356027126312256, + "reward_std": 0.09889769274741411, + "rewards/accuracy_reward": 0.14508929383009672, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2581 + }, + { + "clip_ratio": 0.0, + "completion_length": 967.3170166015625, + "epoch": 0.7712642819804346, + "grad_norm": 0.4521418511867523, + "kl": 1.09814453125, + "learning_rate": 3.017987383628741e-06, + "loss": 0.0548, + "reward": 0.6077009215950966, + "reward_std": 0.08879757579416037, + "rewards/accuracy_reward": 0.11383928824216127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2582 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.0446929931641, + "epoch": 0.7715629900679561, + "grad_norm": 0.9274017810821533, + "kl": 1.10546875, + "learning_rate": 3.0105241477223533e-06, + "loss": 0.0418, + "reward": 0.5563616156578064, + "reward_std": 0.09403185732662678, + "rewards/accuracy_reward": 0.06250000325962901, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2583 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.2076416015625, + "epoch": 0.7718616981554776, + "grad_norm": 0.6017051339149475, + "kl": 1.3505859375, + "learning_rate": 3.0030685156806506e-06, + "loss": 0.0539, + "reward": 0.6071428954601288, + "reward_std": 0.13637983985245228, + "rewards/accuracy_reward": 0.12053571734577417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 2584 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.4687957763672, + "epoch": 0.772160406242999, + "grad_norm": 0.5382633209228516, + "kl": 1.140625, + "learning_rate": 2.995620495614633e-06, + "loss": 0.0196, + "reward": 0.5909598469734192, + "reward_std": 0.07835428323596716, + "rewards/accuracy_reward": 0.10044643236324191, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2585 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.3348693847656, + "epoch": 0.7724591143305205, + "grad_norm": 0.5761297345161438, + "kl": 1.05859375, + "learning_rate": 2.98818009562701e-06, + "loss": 0.0363, + "reward": 0.5876116454601288, + "reward_std": 0.06880486663430929, + "rewards/accuracy_reward": 0.09598214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2586 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.4866485595703, + "epoch": 0.7727578224180419, + "grad_norm": 1.0628557205200195, + "kl": 1.2470703125, + "learning_rate": 2.9807473238122097e-06, + "loss": 0.0475, + "reward": 0.5591517984867096, + "reward_std": 0.08531290129758418, + "rewards/accuracy_reward": 0.0691964328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2587 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.9553985595703, + "epoch": 0.7730565305055634, + "grad_norm": 0.7662690877914429, + "kl": 0.8740234375, + "learning_rate": 2.9733221882563647e-06, + "loss": 0.0384, + "reward": 0.6138393133878708, + "reward_std": 0.14081570878624916, + "rewards/accuracy_reward": 0.1205357164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2588 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.9420013427734, + "epoch": 0.7733552385930849, + "grad_norm": 0.8630743026733398, + "kl": 1.0966796875, + "learning_rate": 2.9659046970372875e-06, + "loss": 0.0255, + "reward": 0.6177455633878708, + "reward_std": 0.10311495512723923, + "rewards/accuracy_reward": 0.12500000977888703, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2589 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.3862152099609, + "epoch": 0.7736539466806064, + "grad_norm": 0.43274784088134766, + "kl": 1.0576171875, + "learning_rate": 2.9584948582244865e-06, + "loss": 0.0396, + "reward": 0.5781250298023224, + "reward_std": 0.13411962613463402, + "rewards/accuracy_reward": 0.0870535783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714402794838, + "step": 2590 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.4062805175781, + "epoch": 0.7739526547681278, + "grad_norm": 0.6015554070472717, + "kl": 1.2802734375, + "learning_rate": 2.951092679879136e-06, + "loss": 0.0567, + "reward": 0.5792410969734192, + "reward_std": 0.06629600655287504, + "rewards/accuracy_reward": 0.08705357555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2591 + }, + { + "clip_ratio": 0.0, + "completion_length": 935.3482666015625, + "epoch": 0.7742513628556493, + "grad_norm": 0.9314785599708557, + "kl": 1.0634765625, + "learning_rate": 2.9436981700540824e-06, + "loss": 0.0413, + "reward": 0.5820312649011612, + "reward_std": 0.1481003314256668, + "rewards/accuracy_reward": 0.08928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2592 + }, + { + "clip_ratio": 0.0, + "completion_length": 917.1518096923828, + "epoch": 0.7745500709431707, + "grad_norm": 1.0262006521224976, + "kl": 1.64453125, + "learning_rate": 2.936311336793831e-06, + "loss": 0.057, + "reward": 0.6344866380095482, + "reward_std": 0.10722707863897085, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 2593 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.9978179931641, + "epoch": 0.7748487790306923, + "grad_norm": 0.997988760471344, + "kl": 1.228515625, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.0363, + "reward": 0.6149553805589676, + "reward_std": 0.18245506845414639, + "rewards/accuracy_reward": 0.12500000116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2594 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.8951416015625, + "epoch": 0.7751474871182137, + "grad_norm": 0.7781645059585571, + "kl": 1.1435546875, + "learning_rate": 2.9215607321039606e-06, + "loss": 0.023, + "reward": 0.546316996216774, + "reward_std": 0.06246102903969586, + "rewards/accuracy_reward": 0.0558035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 2595 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.4442443847656, + "epoch": 0.7754461952057352, + "grad_norm": 1.8203015327453613, + "kl": 2.0908203125, + "learning_rate": 2.9141969767215607e-06, + "loss": 0.0878, + "reward": 0.5675223395228386, + "reward_std": 0.11512834206223488, + "rewards/accuracy_reward": 0.0825892873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 2596 + }, + { + "clip_ratio": 0.0, + "completion_length": 926.7746124267578, + "epoch": 0.7757449032932566, + "grad_norm": 2.6425087451934814, + "kl": 1.28125, + "learning_rate": 2.9068409299983634e-06, + "loss": 0.0541, + "reward": 0.5546875298023224, + "reward_std": 0.06818881258368492, + "rewards/accuracy_reward": 0.06026786030270159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2597 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.3103332519531, + "epoch": 0.7760436113807782, + "grad_norm": 0.7184537053108215, + "kl": 1.4052734375, + "learning_rate": 2.8994925999370305e-06, + "loss": 0.039, + "reward": 0.700334832072258, + "reward_std": 0.15165321715176105, + "rewards/accuracy_reward": 0.2075892873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2598 + }, + { + "clip_ratio": 0.0, + "completion_length": 967.4911193847656, + "epoch": 0.7763423194682996, + "grad_norm": 1.3054965734481812, + "kl": 1.9775390625, + "learning_rate": 2.8921519945318276e-06, + "loss": 0.0802, + "reward": 0.5368303805589676, + "reward_std": 0.119094492867589, + "rewards/accuracy_reward": 0.051339288242161274, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 2599 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.5513916015625, + "epoch": 0.7766410275558211, + "grad_norm": 0.8595686554908752, + "kl": 1.29296875, + "learning_rate": 2.884819121768607e-06, + "loss": 0.0524, + "reward": 0.5000000223517418, + "reward_std": 0.05442149378359318, + "rewards/accuracy_reward": 0.008928572060540318, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2600 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.1942443847656, + "epoch": 0.7769397356433425, + "grad_norm": 1.4757076501846313, + "kl": 1.68798828125, + "learning_rate": 2.877493989624822e-06, + "loss": 0.05, + "reward": 0.647879496216774, + "reward_std": 0.09838119335472584, + "rewards/accuracy_reward": 0.15848215040750802, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489397332072258, + "step": 2601 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.6830749511719, + "epoch": 0.777238443730864, + "grad_norm": 0.6848925948143005, + "kl": 1.36376953125, + "learning_rate": 2.8701766060694937e-06, + "loss": 0.0681, + "reward": 0.6188616305589676, + "reward_std": 0.08856676425784826, + "rewards/accuracy_reward": 0.12946428847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2602 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.2969207763672, + "epoch": 0.7775371518183855, + "grad_norm": 2.579760789871216, + "kl": 1.6015625, + "learning_rate": 2.862866979063219e-06, + "loss": 0.0566, + "reward": 0.5223214477300644, + "reward_std": 0.09755481779575348, + "rewards/accuracy_reward": 0.03571428754366934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4866071715950966, + "step": 2603 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.3638763427734, + "epoch": 0.777835859905907, + "grad_norm": 0.8081057667732239, + "kl": 1.0205078125, + "learning_rate": 2.855565116558161e-06, + "loss": 0.0316, + "reward": 0.5931919813156128, + "reward_std": 0.0761173889040947, + "rewards/accuracy_reward": 0.10044643585570157, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2604 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.7500610351562, + "epoch": 0.7781345679934284, + "grad_norm": 0.7021357417106628, + "kl": 0.89501953125, + "learning_rate": 2.848271026498023e-06, + "loss": 0.0415, + "reward": 0.7014509439468384, + "reward_std": 0.1237042834982276, + "rewards/accuracy_reward": 0.2075892947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2605 + }, + { + "clip_ratio": 0.0, + "completion_length": 911.8995971679688, + "epoch": 0.7784332760809499, + "grad_norm": 0.7990726232528687, + "kl": 1.701171875, + "learning_rate": 2.8409847168180628e-06, + "loss": 0.089, + "reward": 0.6462053805589676, + "reward_std": 0.14410669542849064, + "rewards/accuracy_reward": 0.16071429196745157, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910969734192, + "step": 2606 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.4018096923828, + "epoch": 0.7787319841684713, + "grad_norm": 1.318623423576355, + "kl": 1.0126953125, + "learning_rate": 2.833706195445075e-06, + "loss": 0.0328, + "reward": 0.5976562649011612, + "reward_std": 0.08404536359012127, + "rewards/accuracy_reward": 0.10491071618162096, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2607 + }, + { + "clip_ratio": 0.0, + "completion_length": 927.5067291259766, + "epoch": 0.7790306922559929, + "grad_norm": 0.7475293278694153, + "kl": 1.494140625, + "learning_rate": 2.826435470297372e-06, + "loss": 0.0686, + "reward": 0.6367187649011612, + "reward_std": 0.11467515490949154, + "rewards/accuracy_reward": 0.14732143376022577, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2608 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.7723846435547, + "epoch": 0.7793294003435143, + "grad_norm": 0.3547382950782776, + "kl": 1.080078125, + "learning_rate": 2.8191725492847923e-06, + "loss": 0.0299, + "reward": 0.5876116305589676, + "reward_std": 0.08039747830480337, + "rewards/accuracy_reward": 0.09598214691504836, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2609 + }, + { + "clip_ratio": 0.0, + "completion_length": 914.8817443847656, + "epoch": 0.7796281084310358, + "grad_norm": 2.354496479034424, + "kl": 2.17578125, + "learning_rate": 2.8119174403086845e-06, + "loss": 0.0899, + "reward": 0.5976562798023224, + "reward_std": 0.1633897484280169, + "rewards/accuracy_reward": 0.10937500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2610 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.294677734375, + "epoch": 0.7799268165185572, + "grad_norm": 1.5742592811584473, + "kl": 1.56640625, + "learning_rate": 2.8046701512618914e-06, + "loss": 0.0504, + "reward": 0.5697544813156128, + "reward_std": 0.1541295237839222, + "rewards/accuracy_reward": 0.07812500488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491629496216774, + "step": 2611 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.1674652099609, + "epoch": 0.7802255246060787, + "grad_norm": 0.34335896372795105, + "kl": 1.4892578125, + "learning_rate": 2.797430690028755e-06, + "loss": 0.0533, + "reward": 0.541294664144516, + "reward_std": 0.10107885533943772, + "rewards/accuracy_reward": 0.04910714388824999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2612 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.294677734375, + "epoch": 0.7805242326936002, + "grad_norm": 0.7344179749488831, + "kl": 1.57421875, + "learning_rate": 2.7901990644851042e-06, + "loss": 0.0531, + "reward": 0.5128348469734192, + "reward_std": 0.09179290197789669, + "rewards/accuracy_reward": 0.02232142980210483, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2613 + }, + { + "clip_ratio": 0.0, + "completion_length": 935.6652069091797, + "epoch": 0.7808229407811217, + "grad_norm": 1.466677188873291, + "kl": 1.27294921875, + "learning_rate": 2.7829752824982305e-06, + "loss": 0.0619, + "reward": 0.595982164144516, + "reward_std": 0.13844134472310543, + "rewards/accuracy_reward": 0.10491071827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714402794838, + "step": 2614 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.1027374267578, + "epoch": 0.7811216488686431, + "grad_norm": 1.4287880659103394, + "kl": 1.6904296875, + "learning_rate": 2.7757593519269088e-06, + "loss": 0.0541, + "reward": 0.6406250298023224, + "reward_std": 0.10394320008344948, + "rewards/accuracy_reward": 0.1495535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2615 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.9152221679688, + "epoch": 0.7814203569561646, + "grad_norm": 0.5866729021072388, + "kl": 1.6220703125, + "learning_rate": 2.768551280621358e-06, + "loss": 0.0473, + "reward": 0.5831473544239998, + "reward_std": 0.0821845424361527, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2616 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.0111999511719, + "epoch": 0.781719065043686, + "grad_norm": 1.0453860759735107, + "kl": 0.965576171875, + "learning_rate": 2.7613510764232542e-06, + "loss": 0.0492, + "reward": 0.5664062723517418, + "reward_std": 0.13775003422051668, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 2617 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.6339721679688, + "epoch": 0.7820177731312076, + "grad_norm": 0.7177817225456238, + "kl": 1.349609375, + "learning_rate": 2.7541587471657205e-06, + "loss": 0.0496, + "reward": 0.6333705633878708, + "reward_std": 0.1138130221515894, + "rewards/accuracy_reward": 0.1450892980210483, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2618 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.9486999511719, + "epoch": 0.782316481218729, + "grad_norm": 1.7515251636505127, + "kl": 1.650390625, + "learning_rate": 2.7469743006732964e-06, + "loss": 0.0566, + "reward": 0.546316996216774, + "reward_std": 0.1332417968660593, + "rewards/accuracy_reward": 0.05803571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812798023224, + "step": 2619 + }, + { + "clip_ratio": 0.0, + "completion_length": 935.8303985595703, + "epoch": 0.7826151893062505, + "grad_norm": 0.5198062658309937, + "kl": 0.946044921875, + "learning_rate": 2.7397977447619606e-06, + "loss": 0.0536, + "reward": 0.5636160969734192, + "reward_std": 0.07500340603291988, + "rewards/accuracy_reward": 0.0691964328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2620 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.6004943847656, + "epoch": 0.7829138973937719, + "grad_norm": 0.822754442691803, + "kl": 1.0341796875, + "learning_rate": 2.732629087239106e-06, + "loss": 0.0487, + "reward": 0.6462053805589676, + "reward_std": 0.10940952226519585, + "rewards/accuracy_reward": 0.15178572200238705, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2621 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.9978179931641, + "epoch": 0.7832126054812935, + "grad_norm": 1.0855075120925903, + "kl": 0.7548828125, + "learning_rate": 2.7254683359035216e-06, + "loss": 0.0308, + "reward": 0.6077009215950966, + "reward_std": 0.035695736296474934, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2622 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.3973541259766, + "epoch": 0.7835113135688149, + "grad_norm": 1.4670255184173584, + "kl": 0.76708984375, + "learning_rate": 2.7183154985454075e-06, + "loss": 0.0466, + "reward": 0.550223246216774, + "reward_std": 0.10984365455806255, + "rewards/accuracy_reward": 0.05580357578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2623 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.5647735595703, + "epoch": 0.7838100216563364, + "grad_norm": 0.5551460385322571, + "kl": 0.9150390625, + "learning_rate": 2.711170582946352e-06, + "loss": 0.0349, + "reward": 0.5641741156578064, + "reward_std": 0.09264765703119338, + "rewards/accuracy_reward": 0.07142857275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 2624 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.0826416015625, + "epoch": 0.7841087297438578, + "grad_norm": 0.5824682712554932, + "kl": 0.8427734375, + "learning_rate": 2.7040335968793174e-06, + "loss": 0.0485, + "reward": 0.5948660969734192, + "reward_std": 0.07255633966997266, + "rewards/accuracy_reward": 0.10044643376022577, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2625 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.4576263427734, + "epoch": 0.7844074378313793, + "grad_norm": 0.6090962886810303, + "kl": 0.8935546875, + "learning_rate": 2.6969045481086476e-06, + "loss": 0.0441, + "reward": 0.6021205484867096, + "reward_std": 0.07520358473993838, + "rewards/accuracy_reward": 0.10714286239817739, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 2626 + }, + { + "clip_ratio": 0.0, + "completion_length": 934.9018249511719, + "epoch": 0.7847061459189008, + "grad_norm": 1.0721534490585327, + "kl": 0.8603515625, + "learning_rate": 2.689783444390053e-06, + "loss": 0.0462, + "reward": 0.5792410969734192, + "reward_std": 0.1305356165394187, + "rewards/accuracy_reward": 0.08482143585570157, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2627 + }, + { + "clip_ratio": 0.0, + "completion_length": 932.4598693847656, + "epoch": 0.7850048540064222, + "grad_norm": 0.34738942980766296, + "kl": 0.97265625, + "learning_rate": 2.6826702934705885e-06, + "loss": 0.055, + "reward": 0.6780134290456772, + "reward_std": 0.10956043004989624, + "rewards/accuracy_reward": 0.1830357238650322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2628 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.1518249511719, + "epoch": 0.7853035620939437, + "grad_norm": 0.8590528964996338, + "kl": 1.0, + "learning_rate": 2.6755651030886733e-06, + "loss": 0.0509, + "reward": 0.5574777126312256, + "reward_std": 0.1151454234495759, + "rewards/accuracy_reward": 0.06473214761354029, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 2629 + }, + { + "clip_ratio": 0.0, + "completion_length": 971.216552734375, + "epoch": 0.7856022701814651, + "grad_norm": 0.4664997160434723, + "kl": 0.86669921875, + "learning_rate": 2.6684678809740505e-06, + "loss": 0.0431, + "reward": 0.5909598469734192, + "reward_std": 0.13500091806054115, + "rewards/accuracy_reward": 0.09598214784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2630 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.5111999511719, + "epoch": 0.7859009782689866, + "grad_norm": 1.6540733575820923, + "kl": 1.0908203125, + "learning_rate": 2.6613786348478053e-06, + "loss": 0.0535, + "reward": 0.6395089477300644, + "reward_std": 0.11988940462470055, + "rewards/accuracy_reward": 0.14508928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2631 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.5580902099609, + "epoch": 0.786199686356508, + "grad_norm": 0.4637564420700073, + "kl": 0.513916015625, + "learning_rate": 2.6542973724223475e-06, + "loss": 0.0273, + "reward": 0.5552455484867096, + "reward_std": 0.06016910611651838, + "rewards/accuracy_reward": 0.058035716880112886, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2632 + }, + { + "clip_ratio": 0.0, + "completion_length": 979.1741485595703, + "epoch": 0.7864983944440296, + "grad_norm": 1.5989456176757812, + "kl": 1.13671875, + "learning_rate": 2.647224101401389e-06, + "loss": 0.0471, + "reward": 0.549107164144516, + "reward_std": 0.11746721714735031, + "rewards/accuracy_reward": 0.053571431431919336, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2633 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.4554138183594, + "epoch": 0.786797102531551, + "grad_norm": 0.9868230819702148, + "kl": 1.953125, + "learning_rate": 2.6401588294799574e-06, + "loss": 0.0457, + "reward": 0.5987723544239998, + "reward_std": 0.09678564965724945, + "rewards/accuracy_reward": 0.10937500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2634 + }, + { + "clip_ratio": 0.0, + "completion_length": 933.4085388183594, + "epoch": 0.7870958106190725, + "grad_norm": 1.391589641571045, + "kl": 1.638671875, + "learning_rate": 2.633101564344381e-06, + "loss": 0.068, + "reward": 0.5792410969734192, + "reward_std": 0.11316564492881298, + "rewards/accuracy_reward": 0.08928572060540318, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2635 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.6495971679688, + "epoch": 0.7873945187065939, + "grad_norm": 0.5427027940750122, + "kl": 1.098388671875, + "learning_rate": 2.626052313672267e-06, + "loss": 0.0385, + "reward": 0.5200892984867096, + "reward_std": 0.06418491527438164, + "rewards/accuracy_reward": 0.029017859371379018, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714402794838, + "step": 2636 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.2121124267578, + "epoch": 0.7876932267941155, + "grad_norm": 0.4539378583431244, + "kl": 0.9677734375, + "learning_rate": 2.61901108513251e-06, + "loss": 0.0562, + "reward": 0.5965401977300644, + "reward_std": 0.10379488137550652, + "rewards/accuracy_reward": 0.1026785746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2637 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.5045166015625, + "epoch": 0.7879919348816369, + "grad_norm": 0.5813997387886047, + "kl": 0.8837890625, + "learning_rate": 2.611977886385282e-06, + "loss": 0.0263, + "reward": 0.5876116305589676, + "reward_std": 0.125744441524148, + "rewards/accuracy_reward": 0.09375000651925802, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2638 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.6741485595703, + "epoch": 0.7882906429691584, + "grad_norm": 1.6603161096572876, + "kl": 0.9169921875, + "learning_rate": 2.604952725082005e-06, + "loss": 0.0578, + "reward": 0.6858259290456772, + "reward_std": 0.11277913302183151, + "rewards/accuracy_reward": 0.1919642947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2639 + }, + { + "clip_ratio": 0.0, + "completion_length": 932.2009429931641, + "epoch": 0.7885893510566798, + "grad_norm": 0.5771667957305908, + "kl": 0.55908203125, + "learning_rate": 2.5979356088653718e-06, + "loss": 0.0319, + "reward": 0.5892857313156128, + "reward_std": 0.04798686993308365, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 2640 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.0424499511719, + "epoch": 0.7888880591442013, + "grad_norm": 0.6754101514816284, + "kl": 0.67578125, + "learning_rate": 2.5909265453693187e-06, + "loss": 0.0188, + "reward": 0.5814732313156128, + "reward_std": 0.13272813707590103, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.496651791036129, + "step": 2641 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.1116485595703, + "epoch": 0.7891867672317228, + "grad_norm": 1.1098426580429077, + "kl": 0.740234375, + "learning_rate": 2.5839255422190136e-06, + "loss": 0.0396, + "reward": 0.5915178954601288, + "reward_std": 0.07659276854246855, + "rewards/accuracy_reward": 0.09598214901052415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2642 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.8214721679688, + "epoch": 0.7894854753192443, + "grad_norm": 0.4393722712993622, + "kl": 1.044921875, + "learning_rate": 2.5769326070308676e-06, + "loss": 0.0405, + "reward": 0.607700914144516, + "reward_std": 0.1114874524064362, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2643 + }, + { + "clip_ratio": 0.0, + "completion_length": 922.0268249511719, + "epoch": 0.7897841834067657, + "grad_norm": 0.9689352512359619, + "kl": 0.92626953125, + "learning_rate": 2.5699477474125044e-06, + "loss": 0.0541, + "reward": 0.572544664144516, + "reward_std": 0.10129989311099052, + "rewards/accuracy_reward": 0.07812500349245965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2644 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.4286041259766, + "epoch": 0.7900828914942872, + "grad_norm": 0.7895132303237915, + "kl": 1.49609375, + "learning_rate": 2.562970970962768e-06, + "loss": 0.0663, + "reward": 0.636160746216774, + "reward_std": 0.0790025438182056, + "rewards/accuracy_reward": 0.14732143399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 2645 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.6495971679688, + "epoch": 0.7903815995818086, + "grad_norm": 0.6797037124633789, + "kl": 0.63720703125, + "learning_rate": 2.5560022852717115e-06, + "loss": 0.0174, + "reward": 0.6529017984867096, + "reward_std": 0.09813080029562116, + "rewards/accuracy_reward": 0.15848214784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2646 + }, + { + "clip_ratio": 0.0, + "completion_length": 994.5736999511719, + "epoch": 0.7906803076693302, + "grad_norm": 1.3603897094726562, + "kl": 1.18408203125, + "learning_rate": 2.5490416979205758e-06, + "loss": 0.0337, + "reward": 0.6462053805589676, + "reward_std": 0.08852703124284744, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2647 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.7232666015625, + "epoch": 0.7909790157568516, + "grad_norm": 0.48737889528274536, + "kl": 1.00390625, + "learning_rate": 2.542089216481799e-06, + "loss": 0.0453, + "reward": 0.5758928954601288, + "reward_std": 0.10478753410279751, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2648 + }, + { + "clip_ratio": 0.0, + "completion_length": 936.2031555175781, + "epoch": 0.7912777238443731, + "grad_norm": 1.5883127450942993, + "kl": 1.15869140625, + "learning_rate": 2.5351448485190043e-06, + "loss": 0.0448, + "reward": 0.6489955484867096, + "reward_std": 0.08155624126084149, + "rewards/accuracy_reward": 0.15401786752045155, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2649 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.1072082519531, + "epoch": 0.7915764319318945, + "grad_norm": 1.1101233959197998, + "kl": 1.6728515625, + "learning_rate": 2.5282086015869777e-06, + "loss": 0.0775, + "reward": 0.5736607313156128, + "reward_std": 0.10715580638498068, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 2650 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.1897735595703, + "epoch": 0.7918751400194161, + "grad_norm": 0.9578092098236084, + "kl": 0.8740234375, + "learning_rate": 2.5212804832316783e-06, + "loss": 0.0442, + "reward": 0.593191996216774, + "reward_std": 0.06862019095569849, + "rewards/accuracy_reward": 0.09821428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2651 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.1451416015625, + "epoch": 0.7921738481069375, + "grad_norm": 0.6732943058013916, + "kl": 1.5615234375, + "learning_rate": 2.514360500990223e-06, + "loss": 0.0706, + "reward": 0.5664062798023224, + "reward_std": 0.07380404323339462, + "rewards/accuracy_reward": 0.07366071874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2652 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.6027374267578, + "epoch": 0.792472556194459, + "grad_norm": 0.8275911211967468, + "kl": 1.3583984375, + "learning_rate": 2.5074486623908668e-06, + "loss": 0.0568, + "reward": 0.5407366156578064, + "reward_std": 0.10058142617344856, + "rewards/accuracy_reward": 0.049107144586741924, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 2653 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.3616485595703, + "epoch": 0.7927712642819804, + "grad_norm": 0.5299005508422852, + "kl": 1.052734375, + "learning_rate": 2.5005449749530174e-06, + "loss": 0.053, + "reward": 0.5647321790456772, + "reward_std": 0.07265990227460861, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2654 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.9375457763672, + "epoch": 0.7930699723695019, + "grad_norm": 0.94317626953125, + "kl": 1.189453125, + "learning_rate": 2.493649446187213e-06, + "loss": 0.0393, + "reward": 0.5574777126312256, + "reward_std": 0.11824733018875122, + "rewards/accuracy_reward": 0.06473214505240321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2655 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.5781707763672, + "epoch": 0.7933686804570234, + "grad_norm": 0.8980101346969604, + "kl": 1.3603515625, + "learning_rate": 2.4867620835951066e-06, + "loss": 0.0587, + "reward": 0.5535714477300644, + "reward_std": 0.10749701038002968, + "rewards/accuracy_reward": 0.06250000139698386, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2656 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.2187805175781, + "epoch": 0.7936673885445449, + "grad_norm": 0.7438026666641235, + "kl": 1.2275390625, + "learning_rate": 2.479882894669481e-06, + "loss": 0.0535, + "reward": 0.604910746216774, + "reward_std": 0.13400860503315926, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2657 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.1406860351562, + "epoch": 0.7939660966320663, + "grad_norm": 0.6463304162025452, + "kl": 1.11767578125, + "learning_rate": 2.473011886894211e-06, + "loss": 0.0531, + "reward": 0.6322544813156128, + "reward_std": 0.15859512612223625, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 2658 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.3348541259766, + "epoch": 0.7942648047195878, + "grad_norm": 1.5922150611877441, + "kl": 1.2939453125, + "learning_rate": 2.4661490677442834e-06, + "loss": 0.0626, + "reward": 0.5876116305589676, + "reward_std": 0.1027336586266756, + "rewards/accuracy_reward": 0.09598214877769351, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2659 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.2232513427734, + "epoch": 0.7945635128071092, + "grad_norm": 1.2352688312530518, + "kl": 1.478515625, + "learning_rate": 2.459294444685778e-06, + "loss": 0.0562, + "reward": 0.568638414144516, + "reward_std": 0.15297593409195542, + "rewards/accuracy_reward": 0.08035714644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812649011612, + "step": 2660 + }, + { + "clip_ratio": 0.0, + "completion_length": 936.4442291259766, + "epoch": 0.7948622208946308, + "grad_norm": 0.8465952277183533, + "kl": 1.63671875, + "learning_rate": 2.452448025175844e-06, + "loss": 0.0844, + "reward": 0.6238839402794838, + "reward_std": 0.1144093545153737, + "rewards/accuracy_reward": 0.13392858020961285, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2661 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.3080749511719, + "epoch": 0.7951609289821522, + "grad_norm": 0.6527948975563049, + "kl": 1.50390625, + "learning_rate": 2.4456098166627194e-06, + "loss": 0.0524, + "reward": 0.6155134290456772, + "reward_std": 0.12753679044544697, + "rewards/accuracy_reward": 0.12500000302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 2662 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.9888916015625, + "epoch": 0.7954596370696737, + "grad_norm": 2.1283977031707764, + "kl": 2.3515625, + "learning_rate": 2.4387798265857078e-06, + "loss": 0.0899, + "reward": 0.5876116454601288, + "reward_std": 0.16654831357300282, + "rewards/accuracy_reward": 0.10267857741564512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 2663 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.1853179931641, + "epoch": 0.7957583451571951, + "grad_norm": 0.47735586762428284, + "kl": 0.9287109375, + "learning_rate": 2.4319580623751614e-06, + "loss": 0.0439, + "reward": 0.6735491454601288, + "reward_std": 0.09414281439967453, + "rewards/accuracy_reward": 0.1785714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 2664 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.3013916015625, + "epoch": 0.7960570532447167, + "grad_norm": 0.5575221180915833, + "kl": 1.2265625, + "learning_rate": 2.425144531452497e-06, + "loss": 0.0338, + "reward": 0.6635045111179352, + "reward_std": 0.1235534343868494, + "rewards/accuracy_reward": 0.1741071529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2665 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.7098541259766, + "epoch": 0.7963557613322381, + "grad_norm": 1.7373520135879517, + "kl": 1.6328125, + "learning_rate": 2.4183392412301686e-06, + "loss": 0.0506, + "reward": 0.5111607313156128, + "reward_std": 0.09144561970606446, + "rewards/accuracy_reward": 0.022321430267766118, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 2666 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.2812805175781, + "epoch": 0.7966544694197596, + "grad_norm": 0.892176628112793, + "kl": 1.119140625, + "learning_rate": 2.4115421991116605e-06, + "loss": 0.0342, + "reward": 0.5842634215950966, + "reward_std": 0.11429908219724894, + "rewards/accuracy_reward": 0.09375000116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2667 + }, + { + "clip_ratio": 0.0, + "completion_length": 938.4554138183594, + "epoch": 0.796953177507281, + "grad_norm": 0.8341800570487976, + "kl": 1.1787109375, + "learning_rate": 2.4047534124914907e-06, + "loss": 0.0516, + "reward": 0.560825914144516, + "reward_std": 0.0699046899098903, + "rewards/accuracy_reward": 0.06919643189758062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2668 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.3884429931641, + "epoch": 0.7972518855948025, + "grad_norm": 0.4495987892150879, + "kl": 1.01513671875, + "learning_rate": 2.397972888755197e-06, + "loss": 0.0346, + "reward": 0.5195312649011612, + "reward_std": 0.08093486353754997, + "rewards/accuracy_reward": 0.02901785750873387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 2669 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.0134429931641, + "epoch": 0.797550593682324, + "grad_norm": 1.0055848360061646, + "kl": 1.4365234375, + "learning_rate": 2.3912006352793184e-06, + "loss": 0.069, + "reward": 0.5251116305589676, + "reward_std": 0.09030344523489475, + "rewards/accuracy_reward": 0.037946428870782256, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487165205180645, + "step": 2670 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.7880096435547, + "epoch": 0.7978493017698454, + "grad_norm": 0.6041277647018433, + "kl": 1.646484375, + "learning_rate": 2.3844366594314096e-06, + "loss": 0.0719, + "reward": 0.6395089626312256, + "reward_std": 0.10237098764628172, + "rewards/accuracy_reward": 0.15178572130389512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 2671 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.7544860839844, + "epoch": 0.7981480098573669, + "grad_norm": 0.6981601119041443, + "kl": 1.7158203125, + "learning_rate": 2.3776809685700063e-06, + "loss": 0.0766, + "reward": 0.5887277126312256, + "reward_std": 0.15274061262607574, + "rewards/accuracy_reward": 0.10267857369035482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486049123108387, + "step": 2672 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.0625305175781, + "epoch": 0.7984467179448883, + "grad_norm": 1.1439052820205688, + "kl": 1.5625, + "learning_rate": 2.3709335700446425e-06, + "loss": 0.0497, + "reward": 0.6205357611179352, + "reward_std": 0.09118006564676762, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393133878708, + "step": 2673 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.0424652099609, + "epoch": 0.7987454260324098, + "grad_norm": 1.8502320051193237, + "kl": 1.564453125, + "learning_rate": 2.3641944711958286e-06, + "loss": 0.0643, + "reward": 0.5898437798023224, + "reward_std": 0.11653586477041245, + "rewards/accuracy_reward": 0.09821428963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 2674 + }, + { + "clip_ratio": 0.0, + "completion_length": 979.3058471679688, + "epoch": 0.7990441341199312, + "grad_norm": 1.8023834228515625, + "kl": 1.357421875, + "learning_rate": 2.3574636793550376e-06, + "loss": 0.0534, + "reward": 0.5904018208384514, + "reward_std": 0.1010827124118805, + "rewards/accuracy_reward": 0.10044643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2675 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.4732666015625, + "epoch": 0.7993428422074528, + "grad_norm": 1.009239912033081, + "kl": 1.681640625, + "learning_rate": 2.350741201844714e-06, + "loss": 0.081, + "reward": 0.5502232387661934, + "reward_std": 0.09765970706939697, + "rewards/accuracy_reward": 0.060267857974395156, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2676 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.5558471679688, + "epoch": 0.7996415502949742, + "grad_norm": 0.6494220495223999, + "kl": 1.38671875, + "learning_rate": 2.3440270459782575e-06, + "loss": 0.0566, + "reward": 0.6088169813156128, + "reward_std": 0.15110270865261555, + "rewards/accuracy_reward": 0.11830357648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2677 + }, + { + "clip_ratio": 0.0, + "completion_length": 937.1339721679688, + "epoch": 0.7999402583824957, + "grad_norm": 1.6727488040924072, + "kl": 2.013671875, + "learning_rate": 2.337321219060007e-06, + "loss": 0.0764, + "reward": 0.6088169813156128, + "reward_std": 0.16090972907841206, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 2678 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.4777221679688, + "epoch": 0.8002389664700171, + "grad_norm": 0.6933326721191406, + "kl": 1.328125, + "learning_rate": 2.330623728385246e-06, + "loss": 0.0385, + "reward": 0.5625000298023224, + "reward_std": 0.09835431538522243, + "rewards/accuracy_reward": 0.07142857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2679 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.091552734375, + "epoch": 0.8005376745575387, + "grad_norm": 0.5679628252983093, + "kl": 1.16845703125, + "learning_rate": 2.3239345812401913e-06, + "loss": 0.0501, + "reward": 0.6322545111179352, + "reward_std": 0.11212488822638988, + "rewards/accuracy_reward": 0.1406250074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2680 + }, + { + "clip_ratio": 0.0, + "completion_length": 932.5580749511719, + "epoch": 0.8008363826450601, + "grad_norm": 1.4327391386032104, + "kl": 1.7109375, + "learning_rate": 2.317253784901976e-06, + "loss": 0.0705, + "reward": 0.5396205559372902, + "reward_std": 0.08493056613951921, + "rewards/accuracy_reward": 0.0558035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169887661934, + "step": 2681 + }, + { + "clip_ratio": 0.0, + "completion_length": 912.9420013427734, + "epoch": 0.8011350907325816, + "grad_norm": 0.8979551792144775, + "kl": 0.92236328125, + "learning_rate": 2.3105813466386538e-06, + "loss": 0.0297, + "reward": 0.6160714626312256, + "reward_std": 0.10067926347255707, + "rewards/accuracy_reward": 0.12500000302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2682 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.3125305175781, + "epoch": 0.801433798820103, + "grad_norm": 1.293056845664978, + "kl": 1.3779296875, + "learning_rate": 2.303917273709181e-06, + "loss": 0.0372, + "reward": 0.541294664144516, + "reward_std": 0.07810377702116966, + "rewards/accuracy_reward": 0.053571430034935474, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 2683 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.6830749511719, + "epoch": 0.8017325069076245, + "grad_norm": 1.87887442111969, + "kl": 1.923828125, + "learning_rate": 2.2972615733634164e-06, + "loss": 0.0488, + "reward": 0.5217634066939354, + "reward_std": 0.10527042206376791, + "rewards/accuracy_reward": 0.03571428847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 2684 + }, + { + "clip_ratio": 0.0, + "completion_length": 912.5625457763672, + "epoch": 0.802031214995146, + "grad_norm": 0.8225762844085693, + "kl": 1.4345703125, + "learning_rate": 2.2906142528421127e-06, + "loss": 0.0717, + "reward": 0.7427455633878708, + "reward_std": 0.1346355937421322, + "rewards/accuracy_reward": 0.2522321529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 2685 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.0870971679688, + "epoch": 0.8023299230826675, + "grad_norm": 0.7022082209587097, + "kl": 1.4228515625, + "learning_rate": 2.2839753193768988e-06, + "loss": 0.0491, + "reward": 0.5837053805589676, + "reward_std": 0.11578917689621449, + "rewards/accuracy_reward": 0.09375000488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2686 + }, + { + "clip_ratio": 0.0, + "completion_length": 938.8661041259766, + "epoch": 0.8026286311701889, + "grad_norm": 0.5285684466362, + "kl": 1.138671875, + "learning_rate": 2.277344780190286e-06, + "loss": 0.0482, + "reward": 0.5675223469734192, + "reward_std": 0.09888187516480684, + "rewards/accuracy_reward": 0.0736607201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2687 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.3125457763672, + "epoch": 0.8029273392577104, + "grad_norm": 0.618523895740509, + "kl": 1.6220703125, + "learning_rate": 2.270722642495653e-06, + "loss": 0.0786, + "reward": 0.6501116454601288, + "reward_std": 0.15865625999867916, + "rewards/accuracy_reward": 0.160714291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2688 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.5201416015625, + "epoch": 0.8032260473452318, + "grad_norm": 1.281733512878418, + "kl": 1.3330078125, + "learning_rate": 2.2641089134972317e-06, + "loss": 0.0494, + "reward": 0.5820312649011612, + "reward_std": 0.10465907864272594, + "rewards/accuracy_reward": 0.08928571874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 2689 + }, + { + "clip_ratio": 0.0, + "completion_length": 922.6496124267578, + "epoch": 0.8035247554327534, + "grad_norm": 0.7358635663986206, + "kl": 0.939453125, + "learning_rate": 2.257503600390114e-06, + "loss": 0.0336, + "reward": 0.6188616305589676, + "reward_std": 0.13572771102190018, + "rewards/accuracy_reward": 0.1272321455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 2690 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.3460388183594, + "epoch": 0.8038234635202748, + "grad_norm": 0.4851531982421875, + "kl": 1.220703125, + "learning_rate": 2.2509067103602354e-06, + "loss": 0.0454, + "reward": 0.607700914144516, + "reward_std": 0.16450663469731808, + "rewards/accuracy_reward": 0.11830357322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2691 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.1116485595703, + "epoch": 0.8041221716077963, + "grad_norm": 1.4306901693344116, + "kl": 0.72509765625, + "learning_rate": 2.244318250584361e-06, + "loss": 0.0327, + "reward": 0.6093750149011612, + "reward_std": 0.0371703642886132, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357238650322, + "step": 2692 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.4420013427734, + "epoch": 0.8044208796953177, + "grad_norm": 1.017693281173706, + "kl": 1.1826171875, + "learning_rate": 2.237738228230091e-06, + "loss": 0.0648, + "reward": 0.5602678880095482, + "reward_std": 0.09533228259533644, + "rewards/accuracy_reward": 0.066964291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2693 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.8928985595703, + "epoch": 0.8047195877828393, + "grad_norm": 0.8576295375823975, + "kl": 1.0556640625, + "learning_rate": 2.231166650455847e-06, + "loss": 0.033, + "reward": 0.584263414144516, + "reward_std": 0.08328771125525236, + "rewards/accuracy_reward": 0.09375000419095159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2694 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.4777221679688, + "epoch": 0.8050182958703607, + "grad_norm": 0.8738182187080383, + "kl": 1.408203125, + "learning_rate": 2.2246035244108588e-06, + "loss": 0.0561, + "reward": 0.6501116305589676, + "reward_std": 0.15038974583148956, + "rewards/accuracy_reward": 0.15848214644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 2695 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.8594207763672, + "epoch": 0.8053170039578822, + "grad_norm": 0.5025777220726013, + "kl": 1.728515625, + "learning_rate": 2.2180488572351667e-06, + "loss": 0.0559, + "reward": 0.5485491305589676, + "reward_std": 0.0994228646159172, + "rewards/accuracy_reward": 0.0625000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 2696 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.0692291259766, + "epoch": 0.8056157120454036, + "grad_norm": 1.0673847198486328, + "kl": 0.7421875, + "learning_rate": 2.211502656059602e-06, + "loss": 0.0202, + "reward": 0.6143973469734192, + "reward_std": 0.037296785740181804, + "rewards/accuracy_reward": 0.11830357951112092, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 2697 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.2946929931641, + "epoch": 0.8059144201329251, + "grad_norm": 0.9285044074058533, + "kl": 0.7978515625, + "learning_rate": 2.204964928005794e-06, + "loss": 0.0273, + "reward": 0.607700914144516, + "reward_std": 0.09409596212208271, + "rewards/accuracy_reward": 0.11383929313160479, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2698 + }, + { + "clip_ratio": 0.0, + "completion_length": 931.8929138183594, + "epoch": 0.8062131282204466, + "grad_norm": 0.6130215525627136, + "kl": 0.7705078125, + "learning_rate": 2.1984356801861506e-06, + "loss": 0.0435, + "reward": 0.5825893133878708, + "reward_std": 0.09728165622800589, + "rewards/accuracy_reward": 0.08928571874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493303582072258, + "step": 2699 + }, + { + "clip_ratio": 0.0, + "completion_length": 930.1049499511719, + "epoch": 0.8065118363079681, + "grad_norm": 1.5894526243209839, + "kl": 0.94970703125, + "learning_rate": 2.1919149197038494e-06, + "loss": 0.0348, + "reward": 0.6512277275323868, + "reward_std": 0.07097484450787306, + "rewards/accuracy_reward": 0.1562500111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 2700 + }, + { + "clip_ratio": 0.0, + "completion_length": 928.5134429931641, + "epoch": 0.8068105443954895, + "grad_norm": 0.6516120433807373, + "kl": 1.421875, + "learning_rate": 2.1854026536528405e-06, + "loss": 0.0762, + "reward": 0.6244419813156128, + "reward_std": 0.1294974060729146, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2701 + }, + { + "clip_ratio": 0.0, + "completion_length": 940.779052734375, + "epoch": 0.807109252483011, + "grad_norm": 0.5296715497970581, + "kl": 1.71484375, + "learning_rate": 2.1788988891178342e-06, + "loss": 0.074, + "reward": 0.6305803880095482, + "reward_std": 0.0936898896470666, + "rewards/accuracy_reward": 0.1450892873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910969734192, + "step": 2702 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.7344055175781, + "epoch": 0.8074079605705324, + "grad_norm": 1.125288486480713, + "kl": 1.13671875, + "learning_rate": 2.172403633174284e-06, + "loss": 0.0539, + "reward": 0.5736607387661934, + "reward_std": 0.08092865068465471, + "rewards/accuracy_reward": 0.082589291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2703 + }, + { + "clip_ratio": 0.0, + "completion_length": 970.2790832519531, + "epoch": 0.807706668658054, + "grad_norm": 0.6933705806732178, + "kl": 1.4375, + "learning_rate": 2.1659168928883933e-06, + "loss": 0.0687, + "reward": 0.5652902126312256, + "reward_std": 0.13280245289206505, + "rewards/accuracy_reward": 0.07812500605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 2704 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.0670166015625, + "epoch": 0.8080053767455754, + "grad_norm": 1.0197242498397827, + "kl": 1.54296875, + "learning_rate": 2.1594386753171035e-06, + "loss": 0.0533, + "reward": 0.525111623108387, + "reward_std": 0.08965882798656821, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2705 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.7611999511719, + "epoch": 0.8083040848330969, + "grad_norm": 0.39257171750068665, + "kl": 0.8818359375, + "learning_rate": 2.152968987508075e-06, + "loss": 0.0247, + "reward": 0.5323661118745804, + "reward_std": 0.0793882547877729, + "rewards/accuracy_reward": 0.03794643119908869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2706 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.1428985595703, + "epoch": 0.8086027929206183, + "grad_norm": 1.0232694149017334, + "kl": 1.224609375, + "learning_rate": 2.146507836499697e-06, + "loss": 0.0443, + "reward": 0.5797991454601288, + "reward_std": 0.12252862751483917, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812798023224, + "step": 2707 + }, + { + "clip_ratio": 0.0, + "completion_length": 929.3214721679688, + "epoch": 0.8089015010081398, + "grad_norm": 0.3693239092826843, + "kl": 0.95703125, + "learning_rate": 2.1400552293210697e-06, + "loss": 0.0516, + "reward": 0.6149553954601288, + "reward_std": 0.1297887098044157, + "rewards/accuracy_reward": 0.12053572130389512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2708 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.5246124267578, + "epoch": 0.8092002090956613, + "grad_norm": 0.5342503190040588, + "kl": 1.068359375, + "learning_rate": 2.133611172991993e-06, + "loss": 0.0564, + "reward": 0.5736607313156128, + "reward_std": 0.10686001973226666, + "rewards/accuracy_reward": 0.0825892873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2709 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.0692443847656, + "epoch": 0.8094989171831828, + "grad_norm": 0.7805652022361755, + "kl": 1.00830078125, + "learning_rate": 2.1271756745229744e-06, + "loss": 0.0522, + "reward": 0.5591518133878708, + "reward_std": 0.09652221482247114, + "rewards/accuracy_reward": 0.06696429033763707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2710 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.0915679931641, + "epoch": 0.8097976252707042, + "grad_norm": 0.4031887650489807, + "kl": 0.830078125, + "learning_rate": 2.1207487409151984e-06, + "loss": 0.0399, + "reward": 0.5496652126312256, + "reward_std": 0.10158092831261456, + "rewards/accuracy_reward": 0.05357143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 2711 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.3058471679688, + "epoch": 0.8100963333582257, + "grad_norm": 0.8763480186462402, + "kl": 0.9462890625, + "learning_rate": 2.114330379160543e-06, + "loss": 0.0472, + "reward": 0.5279017984867096, + "reward_std": 0.08185495994985104, + "rewards/accuracy_reward": 0.03348214412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2712 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.1496124267578, + "epoch": 0.8103950414457471, + "grad_norm": 0.3810403347015381, + "kl": 1.228515625, + "learning_rate": 2.1079205962415593e-06, + "loss": 0.052, + "reward": 0.5279018133878708, + "reward_std": 0.11824629455804825, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2713 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.0826416015625, + "epoch": 0.8106937495332686, + "grad_norm": 0.9601551294326782, + "kl": 1.140625, + "learning_rate": 2.1015193991314577e-06, + "loss": 0.0377, + "reward": 0.583705373108387, + "reward_std": 0.09199206531047821, + "rewards/accuracy_reward": 0.09375000605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2714 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.2143402099609, + "epoch": 0.8109924576207901, + "grad_norm": 0.9023308753967285, + "kl": 1.01171875, + "learning_rate": 2.0951267947941146e-06, + "loss": 0.0423, + "reward": 0.6679687649011612, + "reward_std": 0.1637258380651474, + "rewards/accuracy_reward": 0.17410715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2715 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.029052734375, + "epoch": 0.8112911657083115, + "grad_norm": 0.9454244375228882, + "kl": 1.0263671875, + "learning_rate": 2.088742790184062e-06, + "loss": 0.04, + "reward": 0.6333705633878708, + "reward_std": 0.055892275646328926, + "rewards/accuracy_reward": 0.1406250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 2716 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.1027221679688, + "epoch": 0.811589873795833, + "grad_norm": 2.163560152053833, + "kl": 1.8203125, + "learning_rate": 2.0823673922464625e-06, + "loss": 0.077, + "reward": 0.5429687798023224, + "reward_std": 0.08298484701663256, + "rewards/accuracy_reward": 0.0558035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 2717 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.7991485595703, + "epoch": 0.8118885818833544, + "grad_norm": 0.8415524363517761, + "kl": 1.044921875, + "learning_rate": 2.0760006079171303e-06, + "loss": 0.0503, + "reward": 0.5558035969734192, + "reward_std": 0.11625534109771252, + "rewards/accuracy_reward": 0.06473214784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2718 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.1920013427734, + "epoch": 0.812187289970876, + "grad_norm": 1.140848994255066, + "kl": 1.02978515625, + "learning_rate": 2.0696424441225037e-06, + "loss": 0.0512, + "reward": 0.632254496216774, + "reward_std": 0.14655038248747587, + "rewards/accuracy_reward": 0.14062500419095159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294738650322, + "step": 2719 + }, + { + "clip_ratio": 0.0, + "completion_length": 988.1540679931641, + "epoch": 0.8124859980583974, + "grad_norm": 0.7912307977676392, + "kl": 1.34765625, + "learning_rate": 2.063292907779636e-06, + "loss": 0.0603, + "reward": 0.5664062798023224, + "reward_std": 0.10520829679444432, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 2720 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.5580749511719, + "epoch": 0.8127847061459189, + "grad_norm": 0.6369551420211792, + "kl": 0.49853515625, + "learning_rate": 2.0569520057962044e-06, + "loss": 0.0369, + "reward": 0.6043527126312256, + "reward_std": 0.1334128025919199, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2721 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.8638916015625, + "epoch": 0.8130834142334403, + "grad_norm": 0.6962555646896362, + "kl": 0.842041015625, + "learning_rate": 2.050619745070491e-06, + "loss": 0.0371, + "reward": 0.624441996216774, + "reward_std": 0.05398947326466441, + "rewards/accuracy_reward": 0.129464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2722 + }, + { + "clip_ratio": 0.0, + "completion_length": 940.6741485595703, + "epoch": 0.8133821223209619, + "grad_norm": 1.1593927145004272, + "kl": 0.771484375, + "learning_rate": 2.044296132491369e-06, + "loss": 0.0294, + "reward": 0.7020089626312256, + "reward_std": 0.1081096138805151, + "rewards/accuracy_reward": 0.20758929708972573, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2723 + }, + { + "clip_ratio": 0.0, + "completion_length": 918.2411041259766, + "epoch": 0.8136808304084833, + "grad_norm": 0.5860064029693604, + "kl": 1.369140625, + "learning_rate": 2.037981174938315e-06, + "loss": 0.069, + "reward": 0.635044664144516, + "reward_std": 0.1343140099197626, + "rewards/accuracy_reward": 0.145089291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553880095482, + "step": 2724 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.2321929931641, + "epoch": 0.8139795384960048, + "grad_norm": 0.49678200483322144, + "kl": 0.9404296875, + "learning_rate": 2.031674879281378e-06, + "loss": 0.0342, + "reward": 0.5357143133878708, + "reward_std": 0.10325700789690018, + "rewards/accuracy_reward": 0.042410715483129025, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 2725 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.4018249511719, + "epoch": 0.8142782465835262, + "grad_norm": 1.5235626697540283, + "kl": 1.0556640625, + "learning_rate": 2.025377252381192e-06, + "loss": 0.0477, + "reward": 0.5887276977300644, + "reward_std": 0.0900955491233617, + "rewards/accuracy_reward": 0.09598214784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2726 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.8995971679688, + "epoch": 0.8145769546710477, + "grad_norm": 0.6161275506019592, + "kl": 0.8662109375, + "learning_rate": 2.019088301088962e-06, + "loss": 0.0205, + "reward": 0.643973246216774, + "reward_std": 0.08874203497543931, + "rewards/accuracy_reward": 0.1495535783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2727 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.8661041259766, + "epoch": 0.8148756627585692, + "grad_norm": 0.3187314569950104, + "kl": 0.7138671875, + "learning_rate": 2.0128080322464437e-06, + "loss": 0.0273, + "reward": 0.5809152126312256, + "reward_std": 0.09346170164644718, + "rewards/accuracy_reward": 0.08705357881262898, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2728 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.5580902099609, + "epoch": 0.8151743708460907, + "grad_norm": 1.235913634300232, + "kl": 0.83154296875, + "learning_rate": 2.0065364526859576e-06, + "loss": 0.0444, + "reward": 0.5602678805589676, + "reward_std": 0.09291042503900826, + "rewards/accuracy_reward": 0.0669642873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2729 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.3683471679688, + "epoch": 0.8154730789336121, + "grad_norm": 0.46569114923477173, + "kl": 0.64599609375, + "learning_rate": 2.00027356923037e-06, + "loss": 0.0287, + "reward": 0.6238839626312256, + "reward_std": 0.1390535570681095, + "rewards/accuracy_reward": 0.12946429569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2730 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.2187957763672, + "epoch": 0.8157717870211336, + "grad_norm": 0.5546117424964905, + "kl": 0.8984375, + "learning_rate": 1.9940193886930783e-06, + "loss": 0.0116, + "reward": 0.5167410969734192, + "reward_std": 0.08683187607675791, + "rewards/accuracy_reward": 0.02455357206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875298023224, + "step": 2731 + }, + { + "clip_ratio": 0.0, + "completion_length": 930.0826416015625, + "epoch": 0.816070495108655, + "grad_norm": 1.4710290431976318, + "kl": 0.9287109375, + "learning_rate": 1.987773917878022e-06, + "loss": 0.035, + "reward": 0.5781250149011612, + "reward_std": 0.1588497357442975, + "rewards/accuracy_reward": 0.08482143026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2732 + }, + { + "clip_ratio": 0.0, + "completion_length": 931.2924499511719, + "epoch": 0.8163692031961766, + "grad_norm": 1.7937731742858887, + "kl": 1.003173828125, + "learning_rate": 1.981537163579663e-06, + "loss": 0.0482, + "reward": 0.577566996216774, + "reward_std": 0.0879623603541404, + "rewards/accuracy_reward": 0.0870535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2733 + }, + { + "clip_ratio": 0.0, + "completion_length": 930.7656707763672, + "epoch": 0.816667911283698, + "grad_norm": 0.7686635255813599, + "kl": 0.6591796875, + "learning_rate": 1.975309132582973e-06, + "loss": 0.0264, + "reward": 0.6049107313156128, + "reward_std": 0.10659648710861802, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2734 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.1496124267578, + "epoch": 0.8169666193712195, + "grad_norm": 0.41288065910339355, + "kl": 0.947265625, + "learning_rate": 1.969089831663443e-06, + "loss": 0.0362, + "reward": 0.5998884290456772, + "reward_std": 0.14177866652607918, + "rewards/accuracy_reward": 0.10714286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 2735 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.4554138183594, + "epoch": 0.8172653274587409, + "grad_norm": 0.429909884929657, + "kl": 0.7294921875, + "learning_rate": 1.9628792675870656e-06, + "loss": 0.0326, + "reward": 0.5982143133878708, + "reward_std": 0.10896180383861065, + "rewards/accuracy_reward": 0.10267857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2736 + }, + { + "clip_ratio": 0.0, + "completion_length": 931.5290679931641, + "epoch": 0.8175640355462624, + "grad_norm": 0.5587837100028992, + "kl": 0.55126953125, + "learning_rate": 1.95667744711032e-06, + "loss": 0.02, + "reward": 0.5797991305589676, + "reward_std": 0.11150875687599182, + "rewards/accuracy_reward": 0.08482143213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2737 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.3058471679688, + "epoch": 0.8178627436337839, + "grad_norm": 0.4623643457889557, + "kl": 0.734375, + "learning_rate": 1.950484376980183e-06, + "loss": 0.0288, + "reward": 0.5747768133878708, + "reward_std": 0.10618117195554078, + "rewards/accuracy_reward": 0.08035714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2738 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.544677734375, + "epoch": 0.8181614517213054, + "grad_norm": 0.620352566242218, + "kl": 0.57470703125, + "learning_rate": 1.9443000639341046e-06, + "loss": 0.0252, + "reward": 0.629464328289032, + "reward_std": 0.08696229290217161, + "rewards/accuracy_reward": 0.13169643515720963, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 2739 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.4576568603516, + "epoch": 0.8184601598088268, + "grad_norm": 0.2921726107597351, + "kl": 0.80517578125, + "learning_rate": 1.9381245147000138e-06, + "loss": 0.0449, + "reward": 0.5797991454601288, + "reward_std": 0.07878213748335838, + "rewards/accuracy_reward": 0.08482143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2740 + }, + { + "clip_ratio": 0.0, + "completion_length": 936.1741333007812, + "epoch": 0.8187588678963483, + "grad_norm": 0.28810542821884155, + "kl": 0.492431640625, + "learning_rate": 1.931957735996304e-06, + "loss": 0.0191, + "reward": 0.5965402126312256, + "reward_std": 0.07275348645634949, + "rewards/accuracy_reward": 0.10044643469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 2741 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.154052734375, + "epoch": 0.8190575759838697, + "grad_norm": 0.3614002466201782, + "kl": 0.662109375, + "learning_rate": 1.9257997345318223e-06, + "loss": 0.0321, + "reward": 0.5898437798023224, + "reward_std": 0.11345085129141808, + "rewards/accuracy_reward": 0.09598214505240321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2742 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.6451416015625, + "epoch": 0.8193562840713913, + "grad_norm": 0.46963706612586975, + "kl": 0.86181640625, + "learning_rate": 1.919650517005872e-06, + "loss": 0.0399, + "reward": 0.5909598469734192, + "reward_std": 0.1217446019873023, + "rewards/accuracy_reward": 0.09598214854486287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 2743 + }, + { + "clip_ratio": 0.0, + "completion_length": 971.7388763427734, + "epoch": 0.8196549921589127, + "grad_norm": 0.42732641100883484, + "kl": 0.953125, + "learning_rate": 1.9135100901082025e-06, + "loss": 0.038, + "reward": 0.5345982313156128, + "reward_std": 0.044050724944099784, + "rewards/accuracy_reward": 0.04241071501746774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2744 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.6562957763672, + "epoch": 0.8199537002464342, + "grad_norm": 0.33028075098991394, + "kl": 0.952880859375, + "learning_rate": 1.9073784605189914e-06, + "loss": 0.0374, + "reward": 0.5613839626312256, + "reward_std": 0.08159541711211205, + "rewards/accuracy_reward": 0.06919642887078226, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2745 + }, + { + "clip_ratio": 0.0, + "completion_length": 936.3616485595703, + "epoch": 0.8202524083339556, + "grad_norm": 0.2882213294506073, + "kl": 0.4580078125, + "learning_rate": 1.901255634908854e-06, + "loss": 0.0156, + "reward": 0.6662946790456772, + "reward_std": 0.07259770482778549, + "rewards/accuracy_reward": 0.16964286495931447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 2746 + }, + { + "clip_ratio": 0.0, + "completion_length": 973.0000610351562, + "epoch": 0.8205511164214772, + "grad_norm": 1.6516717672348022, + "kl": 1.2978515625, + "learning_rate": 1.895141619938825e-06, + "loss": 0.0593, + "reward": 0.5379464477300644, + "reward_std": 0.08302612975239754, + "rewards/accuracy_reward": 0.05133928684517741, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 2747 + }, + { + "clip_ratio": 0.0, + "completion_length": 935.3214874267578, + "epoch": 0.8208498245089986, + "grad_norm": 1.174761414527893, + "kl": 0.8818359375, + "learning_rate": 1.8890364222603496e-06, + "loss": 0.0366, + "reward": 0.564732164144516, + "reward_std": 0.13400566577911377, + "rewards/accuracy_reward": 0.07142857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2748 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.8460235595703, + "epoch": 0.8211485325965201, + "grad_norm": 0.7981047034263611, + "kl": 0.65283203125, + "learning_rate": 1.8829400485152872e-06, + "loss": 0.0322, + "reward": 0.5507812798023224, + "reward_std": 0.0953306294977665, + "rewards/accuracy_reward": 0.05580357299186289, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2749 + }, + { + "clip_ratio": 0.0, + "completion_length": 936.8504943847656, + "epoch": 0.8214472406840415, + "grad_norm": 1.1671031713485718, + "kl": 1.037109375, + "learning_rate": 1.8768525053358976e-06, + "loss": 0.0559, + "reward": 0.5223214477300644, + "reward_std": 0.05588126461952925, + "rewards/accuracy_reward": 0.0290178582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2750 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.4486999511719, + "epoch": 0.821745948771563, + "grad_norm": 1.0035881996154785, + "kl": 1.1455078125, + "learning_rate": 1.8707737993448249e-06, + "loss": 0.0534, + "reward": 0.623325914144516, + "reward_std": 0.1417059227824211, + "rewards/accuracy_reward": 0.1316964365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2751 + }, + { + "clip_ratio": 0.0, + "completion_length": 937.1049652099609, + "epoch": 0.8220446568590845, + "grad_norm": 0.4346313774585724, + "kl": 1.330078125, + "learning_rate": 1.8647039371551124e-06, + "loss": 0.0635, + "reward": 0.5089286044239998, + "reward_std": 0.06494865659624338, + "rewards/accuracy_reward": 0.01785714295692742, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2752 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.9866333007812, + "epoch": 0.822343364946606, + "grad_norm": 0.8402987718582153, + "kl": 0.85546875, + "learning_rate": 1.8586429253701676e-06, + "loss": 0.0332, + "reward": 0.5318080633878708, + "reward_std": 0.03348214505240321, + "rewards/accuracy_reward": 0.03794643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2753 + }, + { + "clip_ratio": 0.0, + "completion_length": 914.1763763427734, + "epoch": 0.8226420730341274, + "grad_norm": 0.6881332397460938, + "kl": 0.864013671875, + "learning_rate": 1.852590770583782e-06, + "loss": 0.0355, + "reward": 0.6110491305589676, + "reward_std": 0.12045040726661682, + "rewards/accuracy_reward": 0.1160714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 2754 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.6138763427734, + "epoch": 0.8229407811216489, + "grad_norm": 1.0878305435180664, + "kl": 1.90234375, + "learning_rate": 1.8465474793801086e-06, + "loss": 0.0849, + "reward": 0.6222098469734192, + "reward_std": 0.10341963730752468, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486049123108387, + "step": 2755 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.8549499511719, + "epoch": 0.8232394892091703, + "grad_norm": 2.0831470489501953, + "kl": 1.45458984375, + "learning_rate": 1.8405130583336507e-06, + "loss": 0.0597, + "reward": 0.5669643133878708, + "reward_std": 0.15105656906962395, + "rewards/accuracy_reward": 0.07589286030270159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2756 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.5000457763672, + "epoch": 0.8235381972966918, + "grad_norm": 0.7521653771400452, + "kl": 1.5068359375, + "learning_rate": 1.8344875140092689e-06, + "loss": 0.0616, + "reward": 0.5820312798023224, + "reward_std": 0.12140961177647114, + "rewards/accuracy_reward": 0.09375000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2757 + }, + { + "clip_ratio": 0.0, + "completion_length": 927.3995971679688, + "epoch": 0.8238369053842133, + "grad_norm": 1.0506209135055542, + "kl": 1.046875, + "learning_rate": 1.8284708529621687e-06, + "loss": 0.0284, + "reward": 0.5758928805589676, + "reward_std": 0.1082201786339283, + "rewards/accuracy_reward": 0.082589291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 2758 + }, + { + "clip_ratio": 0.0, + "completion_length": 973.279052734375, + "epoch": 0.8241356134717347, + "grad_norm": 0.8937116265296936, + "kl": 1.2275390625, + "learning_rate": 1.822463081737883e-06, + "loss": 0.0514, + "reward": 0.6071428954601288, + "reward_std": 0.14775982312858105, + "rewards/accuracy_reward": 0.1138392947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2759 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.0603179931641, + "epoch": 0.8244343215592562, + "grad_norm": 0.8536131978034973, + "kl": 0.83251953125, + "learning_rate": 1.8164642068722782e-06, + "loss": 0.0237, + "reward": 0.664620578289032, + "reward_std": 0.13082798570394516, + "rewards/accuracy_reward": 0.17187500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2760 + }, + { + "clip_ratio": 0.0, + "completion_length": 973.7745971679688, + "epoch": 0.8247330296467776, + "grad_norm": 0.9058293104171753, + "kl": 0.83154296875, + "learning_rate": 1.810474234891547e-06, + "loss": 0.0218, + "reward": 0.542410746216774, + "reward_std": 0.10236183926463127, + "rewards/accuracy_reward": 0.04910714481957257, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2761 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.0714721679688, + "epoch": 0.8250317377342992, + "grad_norm": 0.6754835247993469, + "kl": 1.126953125, + "learning_rate": 1.8044931723121861e-06, + "loss": 0.0397, + "reward": 0.5390625298023224, + "reward_std": 0.05333383707329631, + "rewards/accuracy_reward": 0.04687500186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2762 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.5290679931641, + "epoch": 0.8253304458218206, + "grad_norm": 0.7320617437362671, + "kl": 1.140625, + "learning_rate": 1.798521025641009e-06, + "loss": 0.0532, + "reward": 0.5669643133878708, + "reward_std": 0.1417629960924387, + "rewards/accuracy_reward": 0.07589285867288709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2763 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.1295166015625, + "epoch": 0.8256291539093421, + "grad_norm": 0.7393817901611328, + "kl": 0.966796875, + "learning_rate": 1.7925578013751233e-06, + "loss": 0.037, + "reward": 0.612165205180645, + "reward_std": 0.09437964484095573, + "rewards/accuracy_reward": 0.12053571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2764 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.0223541259766, + "epoch": 0.8259278619968635, + "grad_norm": 0.5978937149047852, + "kl": 0.712890625, + "learning_rate": 1.7866035060019338e-06, + "loss": 0.0195, + "reward": 0.5691964477300644, + "reward_std": 0.07579410658217967, + "rewards/accuracy_reward": 0.07366071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357238650322, + "step": 2765 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.4553985595703, + "epoch": 0.826226570084385, + "grad_norm": 0.5006062388420105, + "kl": 1.0107421875, + "learning_rate": 1.7806581459991324e-06, + "loss": 0.0403, + "reward": 0.6171875447034836, + "reward_std": 0.1115698553621769, + "rewards/accuracy_reward": 0.12500000651925802, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2766 + }, + { + "clip_ratio": 0.0, + "completion_length": 979.2277374267578, + "epoch": 0.8265252781719065, + "grad_norm": 0.380677193403244, + "kl": 0.779296875, + "learning_rate": 1.774721727834684e-06, + "loss": 0.0285, + "reward": 0.5904018133878708, + "reward_std": 0.13374494877643883, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2767 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.0335235595703, + "epoch": 0.826823986259428, + "grad_norm": 1.0812262296676636, + "kl": 1.1171875, + "learning_rate": 1.7687942579668315e-06, + "loss": 0.0259, + "reward": 0.5496652126312256, + "reward_std": 0.1334460712969303, + "rewards/accuracy_reward": 0.06026785937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2768 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.0536041259766, + "epoch": 0.8271226943469494, + "grad_norm": 1.2902685403823853, + "kl": 1.0146484375, + "learning_rate": 1.7628757428440846e-06, + "loss": 0.0446, + "reward": 0.694754496216774, + "reward_std": 0.07988554611802101, + "rewards/accuracy_reward": 0.2053571529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2769 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.9196929931641, + "epoch": 0.8274214024344709, + "grad_norm": 0.4341109097003937, + "kl": 0.9267578125, + "learning_rate": 1.7569661889052015e-06, + "loss": 0.036, + "reward": 0.612723246216774, + "reward_std": 0.10071538295596838, + "rewards/accuracy_reward": 0.12053572130389512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2770 + }, + { + "clip_ratio": 0.0, + "completion_length": 979.9286041259766, + "epoch": 0.8277201105219923, + "grad_norm": 0.6232917308807373, + "kl": 1.21875, + "learning_rate": 1.7510656025792005e-06, + "loss": 0.0525, + "reward": 0.5680803880095482, + "reward_std": 0.09031710121780634, + "rewards/accuracy_reward": 0.07812500488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2771 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.7701416015625, + "epoch": 0.8280188186095139, + "grad_norm": 1.5648374557495117, + "kl": 0.96337890625, + "learning_rate": 1.7451739902853448e-06, + "loss": 0.0479, + "reward": 0.6037946715950966, + "reward_std": 0.1318178093060851, + "rewards/accuracy_reward": 0.1138392947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2772 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.1428985595703, + "epoch": 0.8283175266970353, + "grad_norm": 0.8135423064231873, + "kl": 1.1240234375, + "learning_rate": 1.739291358433124e-06, + "loss": 0.0627, + "reward": 0.6361607313156128, + "reward_std": 0.1405070386826992, + "rewards/accuracy_reward": 0.14508928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2773 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.9754943847656, + "epoch": 0.8286162347845568, + "grad_norm": 1.0774941444396973, + "kl": 1.0888671875, + "learning_rate": 1.7334177134222696e-06, + "loss": 0.0598, + "reward": 0.6171875149011612, + "reward_std": 0.11330167576670647, + "rewards/accuracy_reward": 0.12500000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2774 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.4866485595703, + "epoch": 0.8289149428720782, + "grad_norm": 0.6007012128829956, + "kl": 1.0576171875, + "learning_rate": 1.7275530616427338e-06, + "loss": 0.0439, + "reward": 0.5669642984867096, + "reward_std": 0.1654883548617363, + "rewards/accuracy_reward": 0.07589286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2775 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.0580749511719, + "epoch": 0.8292136509595998, + "grad_norm": 1.4257858991622925, + "kl": 1.689453125, + "learning_rate": 1.7216974094746764e-06, + "loss": 0.0724, + "reward": 0.6729910969734192, + "reward_std": 0.14285633340477943, + "rewards/accuracy_reward": 0.18750000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910969734192, + "step": 2776 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.5736999511719, + "epoch": 0.8295123590471212, + "grad_norm": 1.1461013555526733, + "kl": 0.9482421875, + "learning_rate": 1.7158507632884801e-06, + "loss": 0.0357, + "reward": 0.5597098469734192, + "reward_std": 0.13296634331345558, + "rewards/accuracy_reward": 0.06696428661234677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2777 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.0357666015625, + "epoch": 0.8298110671346427, + "grad_norm": 1.0100817680358887, + "kl": 1.12255859375, + "learning_rate": 1.7100131294447165e-06, + "loss": 0.0483, + "reward": 0.7103795111179352, + "reward_std": 0.12616230105049908, + "rewards/accuracy_reward": 0.2187500111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 2778 + }, + { + "clip_ratio": 0.0, + "completion_length": 988.6987152099609, + "epoch": 0.8301097752221641, + "grad_norm": 1.0001771450042725, + "kl": 1.615234375, + "learning_rate": 1.7041845142941615e-06, + "loss": 0.0682, + "reward": 0.5039062723517418, + "reward_std": 0.07571411225944757, + "rewards/accuracy_reward": 0.01562500069849193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812798023224, + "step": 2779 + }, + { + "clip_ratio": 0.0, + "completion_length": 967.3906555175781, + "epoch": 0.8304084833096856, + "grad_norm": 1.263022541999817, + "kl": 1.357421875, + "learning_rate": 1.6983649241777811e-06, + "loss": 0.0395, + "reward": 0.5474330484867096, + "reward_std": 0.09709043055772781, + "rewards/accuracy_reward": 0.055803573690354824, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2780 + }, + { + "clip_ratio": 0.0, + "completion_length": 979.5781707763672, + "epoch": 0.8307071913972071, + "grad_norm": 1.0168509483337402, + "kl": 0.923828125, + "learning_rate": 1.692554365426713e-06, + "loss": 0.0336, + "reward": 0.6088169813156128, + "reward_std": 0.060030267806723714, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 2781 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.0513916015625, + "epoch": 0.8310058994847286, + "grad_norm": 1.0789384841918945, + "kl": 0.8564453125, + "learning_rate": 1.6867528443622772e-06, + "loss": 0.0334, + "reward": 0.5502232313156128, + "reward_std": 0.048278180649504066, + "rewards/accuracy_reward": 0.05580357392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2782 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.5759429931641, + "epoch": 0.83130460757225, + "grad_norm": 0.50042325258255, + "kl": 1.33837890625, + "learning_rate": 1.6809603672959618e-06, + "loss": 0.0525, + "reward": 0.6177455633878708, + "reward_std": 0.12835717480629683, + "rewards/accuracy_reward": 0.12723215017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 2783 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.2254943847656, + "epoch": 0.8316033156597715, + "grad_norm": 1.0240778923034668, + "kl": 1.3525390625, + "learning_rate": 1.6751769405294128e-06, + "loss": 0.0601, + "reward": 0.534040205180645, + "reward_std": 0.11564603727310896, + "rewards/accuracy_reward": 0.04464285867288709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2784 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.6183471679688, + "epoch": 0.8319020237472929, + "grad_norm": 1.1243109703063965, + "kl": 1.0322265625, + "learning_rate": 1.6694025703544349e-06, + "loss": 0.0423, + "reward": 0.6160714477300644, + "reward_std": 0.09712732909247279, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2785 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.0736999511719, + "epoch": 0.8322007318348145, + "grad_norm": 0.7330344319343567, + "kl": 0.9072265625, + "learning_rate": 1.6636372630529718e-06, + "loss": 0.0498, + "reward": 0.5982143133878708, + "reward_std": 0.13802790455520153, + "rewards/accuracy_reward": 0.10267857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357238650322, + "step": 2786 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.1942291259766, + "epoch": 0.8324994399223359, + "grad_norm": 0.48745936155319214, + "kl": 1.63671875, + "learning_rate": 1.6578810248971144e-06, + "loss": 0.0494, + "reward": 0.533482164144516, + "reward_std": 0.07289670081809163, + "rewards/accuracy_reward": 0.04687500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 2787 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.857177734375, + "epoch": 0.8327981480098574, + "grad_norm": 1.9986276626586914, + "kl": 1.1748046875, + "learning_rate": 1.652133862149089e-06, + "loss": 0.0496, + "reward": 0.556361623108387, + "reward_std": 0.08293665666133165, + "rewards/accuracy_reward": 0.06696428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2788 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.8460083007812, + "epoch": 0.8330968560973788, + "grad_norm": 0.5477078557014465, + "kl": 1.1533203125, + "learning_rate": 1.6463957810612408e-06, + "loss": 0.0445, + "reward": 0.5613839626312256, + "reward_std": 0.11366862989962101, + "rewards/accuracy_reward": 0.06919643143191934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2789 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.6361999511719, + "epoch": 0.8333955641849004, + "grad_norm": 0.5805773735046387, + "kl": 1.4150390625, + "learning_rate": 1.6406667878760418e-06, + "loss": 0.0394, + "reward": 0.6523437798023224, + "reward_std": 0.13775851763784885, + "rewards/accuracy_reward": 0.1651785783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487165205180645, + "step": 2790 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.3326263427734, + "epoch": 0.8336942722724218, + "grad_norm": 1.6055172681808472, + "kl": 1.20703125, + "learning_rate": 1.6349468888260766e-06, + "loss": 0.043, + "reward": 0.603794664144516, + "reward_std": 0.10429865960031748, + "rewards/accuracy_reward": 0.11160714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875298023224, + "step": 2791 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.4933319091797, + "epoch": 0.8339929803599433, + "grad_norm": 0.7684637308120728, + "kl": 1.2333984375, + "learning_rate": 1.629236090134031e-06, + "loss": 0.0344, + "reward": 0.5524553805589676, + "reward_std": 0.0826688576489687, + "rewards/accuracy_reward": 0.06250000465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2792 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.6987152099609, + "epoch": 0.8342916884474647, + "grad_norm": 0.8640532493591309, + "kl": 1.001953125, + "learning_rate": 1.6235343980126973e-06, + "loss": 0.042, + "reward": 0.5803571790456772, + "reward_std": 0.0984832369722426, + "rewards/accuracy_reward": 0.08705357648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2793 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.2143249511719, + "epoch": 0.8345903965349862, + "grad_norm": 1.8475314378738403, + "kl": 1.578125, + "learning_rate": 1.617841818664957e-06, + "loss": 0.0586, + "reward": 0.6188616380095482, + "reward_std": 0.08141594845801592, + "rewards/accuracy_reward": 0.1316964328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 2794 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.4464721679688, + "epoch": 0.8348891046225076, + "grad_norm": 1.1174519062042236, + "kl": 1.2041015625, + "learning_rate": 1.6121583582837773e-06, + "loss": 0.0516, + "reward": 0.6316964626312256, + "reward_std": 0.1564614288508892, + "rewards/accuracy_reward": 0.14285715389996767, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 2795 + }, + { + "clip_ratio": 0.0, + "completion_length": 973.7277221679688, + "epoch": 0.8351878127100292, + "grad_norm": 0.7482967376708984, + "kl": 1.390625, + "learning_rate": 1.6064840230522094e-06, + "loss": 0.0462, + "reward": 0.5736607313156128, + "reward_std": 0.09288019128143787, + "rewards/accuracy_reward": 0.08258929150179029, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2796 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.3594207763672, + "epoch": 0.8354865207975506, + "grad_norm": 1.7287441492080688, + "kl": 1.46044921875, + "learning_rate": 1.6008188191433683e-06, + "loss": 0.0593, + "reward": 0.6143973469734192, + "reward_std": 0.1960950754582882, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2797 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.9040679931641, + "epoch": 0.8357852288850721, + "grad_norm": 0.8954575061798096, + "kl": 1.697265625, + "learning_rate": 1.5951627527204438e-06, + "loss": 0.0802, + "reward": 0.6110491305589676, + "reward_std": 0.13727223221212626, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2798 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.4888763427734, + "epoch": 0.8360839369725935, + "grad_norm": 0.39846867322921753, + "kl": 1.072265625, + "learning_rate": 1.589515829936684e-06, + "loss": 0.0579, + "reward": 0.722098246216774, + "reward_std": 0.09584704507142305, + "rewards/accuracy_reward": 0.22991072246804833, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875298023224, + "step": 2799 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.5513763427734, + "epoch": 0.836382645060115, + "grad_norm": 1.5427008867263794, + "kl": 1.3671875, + "learning_rate": 1.583878056935384e-06, + "loss": 0.062, + "reward": 0.542410746216774, + "reward_std": 0.14554302766919136, + "rewards/accuracy_reward": 0.05357143050059676, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 2800 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.3482666015625, + "epoch": 0.8366813531476365, + "grad_norm": 0.5824761986732483, + "kl": 1.486328125, + "learning_rate": 1.5782494398498882e-06, + "loss": 0.0579, + "reward": 0.76339291036129, + "reward_std": 0.09584668464958668, + "rewards/accuracy_reward": 0.2723214440047741, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2801 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.9576263427734, + "epoch": 0.8369800612351579, + "grad_norm": 0.48392850160598755, + "kl": 1.2353515625, + "learning_rate": 1.5726299848035843e-06, + "loss": 0.0325, + "reward": 0.606026828289032, + "reward_std": 0.10920473746955395, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2802 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.1629791259766, + "epoch": 0.8372787693226794, + "grad_norm": 0.7122309803962708, + "kl": 1.267578125, + "learning_rate": 1.567019697909884e-06, + "loss": 0.0548, + "reward": 0.595982164144516, + "reward_std": 0.09980747476220131, + "rewards/accuracy_reward": 0.10937500116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 2803 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.2411041259766, + "epoch": 0.8375774774102008, + "grad_norm": 0.8071985244750977, + "kl": 0.7978515625, + "learning_rate": 1.5614185852722308e-06, + "loss": 0.0272, + "reward": 0.569754496216774, + "reward_std": 0.0867315698415041, + "rewards/accuracy_reward": 0.0758928598370403, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2804 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.1004943847656, + "epoch": 0.8378761854977224, + "grad_norm": 1.1802747249603271, + "kl": 1.2373046875, + "learning_rate": 1.5558266529840893e-06, + "loss": 0.0512, + "reward": 0.620535746216774, + "reward_std": 0.09791625663638115, + "rewards/accuracy_reward": 0.13169643469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 2805 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.3995971679688, + "epoch": 0.8381748935852438, + "grad_norm": 0.9435425400733948, + "kl": 1.357421875, + "learning_rate": 1.55024390712893e-06, + "loss": 0.0378, + "reward": 0.611607164144516, + "reward_std": 0.12206948176026344, + "rewards/accuracy_reward": 0.12053571734577417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2806 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.5312805175781, + "epoch": 0.8384736016727653, + "grad_norm": 0.5859232544898987, + "kl": 0.72802734375, + "learning_rate": 1.5446703537802344e-06, + "loss": 0.0438, + "reward": 0.6378348469734192, + "reward_std": 0.09658865630626678, + "rewards/accuracy_reward": 0.1428571529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2807 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.2500457763672, + "epoch": 0.8387723097602867, + "grad_norm": 0.9160741567611694, + "kl": 1.30859375, + "learning_rate": 1.5391059990014834e-06, + "loss": 0.0355, + "reward": 0.598772332072258, + "reward_std": 0.19063975661993027, + "rewards/accuracy_reward": 0.1116071455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 2808 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.3661041259766, + "epoch": 0.8390710178478082, + "grad_norm": 1.2140363454818726, + "kl": 1.189453125, + "learning_rate": 1.533550848846148e-06, + "loss": 0.0483, + "reward": 0.6026785969734192, + "reward_std": 0.13861682824790478, + "rewards/accuracy_reward": 0.11383928754366934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393133878708, + "step": 2809 + }, + { + "clip_ratio": 0.0, + "completion_length": 967.1183471679688, + "epoch": 0.8393697259353297, + "grad_norm": 0.7130994200706482, + "kl": 0.66455078125, + "learning_rate": 1.5280049093576899e-06, + "loss": 0.0211, + "reward": 0.6283482313156128, + "reward_std": 0.08754236390814185, + "rewards/accuracy_reward": 0.13616071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2810 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.9620971679688, + "epoch": 0.8396684340228512, + "grad_norm": 0.6885687112808228, + "kl": 1.00390625, + "learning_rate": 1.5224681865695422e-06, + "loss": 0.0373, + "reward": 0.5937500447034836, + "reward_std": 0.14972075633704662, + "rewards/accuracy_reward": 0.10267857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2811 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.654052734375, + "epoch": 0.8399671421103726, + "grad_norm": 0.7167244553565979, + "kl": 0.970458984375, + "learning_rate": 1.5169406865051218e-06, + "loss": 0.0357, + "reward": 0.612723246216774, + "reward_std": 0.06772222463041544, + "rewards/accuracy_reward": 0.12276786286383867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2812 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.0223541259766, + "epoch": 0.8402658501978941, + "grad_norm": 1.526968002319336, + "kl": 1.150390625, + "learning_rate": 1.5114224151778068e-06, + "loss": 0.0503, + "reward": 0.5323660969734192, + "reward_std": 0.05679409299045801, + "rewards/accuracy_reward": 0.042410716181620955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2813 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.1295013427734, + "epoch": 0.8405645582854155, + "grad_norm": 0.8101429343223572, + "kl": 1.0947265625, + "learning_rate": 1.505913378590932e-06, + "loss": 0.0454, + "reward": 0.5379464477300644, + "reward_std": 0.06438639247789979, + "rewards/accuracy_reward": 0.0491071455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888392984867096, + "step": 2814 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.0022735595703, + "epoch": 0.8408632663729371, + "grad_norm": 0.6225631833076477, + "kl": 0.9833984375, + "learning_rate": 1.5004135827377909e-06, + "loss": 0.0404, + "reward": 0.5301339477300644, + "reward_std": 0.0697018364444375, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2815 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.1473541259766, + "epoch": 0.8411619744604585, + "grad_norm": 0.8999349474906921, + "kl": 1.3798828125, + "learning_rate": 1.4949230336016251e-06, + "loss": 0.094, + "reward": 0.699776828289032, + "reward_std": 0.1314511951059103, + "rewards/accuracy_reward": 0.2142857238650322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 2816 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.8370819091797, + "epoch": 0.84146068254798, + "grad_norm": 0.5064029693603516, + "kl": 0.7568359375, + "learning_rate": 1.489441737155609e-06, + "loss": 0.0254, + "reward": 0.674107164144516, + "reward_std": 0.07463232707232237, + "rewards/accuracy_reward": 0.18080358020961285, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493303582072258, + "step": 2817 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.3527221679688, + "epoch": 0.8417593906355014, + "grad_norm": 0.49496152997016907, + "kl": 1.138671875, + "learning_rate": 1.4839696993628594e-06, + "loss": 0.0398, + "reward": 0.5122768133878708, + "reward_std": 0.07988047343678772, + "rewards/accuracy_reward": 0.02008928661234677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2818 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.4152374267578, + "epoch": 0.842058098723023, + "grad_norm": 0.6973986029624939, + "kl": 1.0439453125, + "learning_rate": 1.4785069261764184e-06, + "loss": 0.0483, + "reward": 0.5881696790456772, + "reward_std": 0.1073724702000618, + "rewards/accuracy_reward": 0.0982142947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2819 + }, + { + "clip_ratio": 0.0, + "completion_length": 973.5558471679688, + "epoch": 0.8423568068105444, + "grad_norm": 0.5016458630561829, + "kl": 0.5068359375, + "learning_rate": 1.4730534235392435e-06, + "loss": 0.0264, + "reward": 0.5585937649011612, + "reward_std": 0.06712113181129098, + "rewards/accuracy_reward": 0.06250000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 2820 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.3683471679688, + "epoch": 0.8426555148980659, + "grad_norm": 0.3879983425140381, + "kl": 0.70166015625, + "learning_rate": 1.4676091973842122e-06, + "loss": 0.0292, + "reward": 0.5809151977300644, + "reward_std": 0.0682900664396584, + "rewards/accuracy_reward": 0.08705357694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2821 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.3973541259766, + "epoch": 0.8429542229855873, + "grad_norm": 0.5995916724205017, + "kl": 1.34228515625, + "learning_rate": 1.4621742536341133e-06, + "loss": 0.061, + "reward": 0.5837053805589676, + "reward_std": 0.15285955369472504, + "rewards/accuracy_reward": 0.09375000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2822 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.8795166015625, + "epoch": 0.8432529310731088, + "grad_norm": 0.936468780040741, + "kl": 1.3564453125, + "learning_rate": 1.456748598201626e-06, + "loss": 0.0701, + "reward": 0.7254464626312256, + "reward_std": 0.14790221117436886, + "rewards/accuracy_reward": 0.2366071566939354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 2823 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.8839874267578, + "epoch": 0.8435516391606303, + "grad_norm": 0.612314760684967, + "kl": 0.93310546875, + "learning_rate": 1.451332236989339e-06, + "loss": 0.04, + "reward": 0.611607164144516, + "reward_std": 0.10826949402689934, + "rewards/accuracy_reward": 0.12053572107106447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2824 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.3236999511719, + "epoch": 0.8438503472481518, + "grad_norm": 0.5310856699943542, + "kl": 0.833984375, + "learning_rate": 1.4459251758897153e-06, + "loss": 0.0578, + "reward": 0.6735491305589676, + "reward_std": 0.10615195240825415, + "rewards/accuracy_reward": 0.18080358020961285, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2825 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.0960235595703, + "epoch": 0.8441490553356732, + "grad_norm": 0.7400847673416138, + "kl": 1.3388671875, + "learning_rate": 1.4405274207851116e-06, + "loss": 0.0676, + "reward": 0.5669643133878708, + "reward_std": 0.08084593573585153, + "rewards/accuracy_reward": 0.0758928619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2826 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.5112152099609, + "epoch": 0.8444477634231947, + "grad_norm": 1.226048231124878, + "kl": 1.41015625, + "learning_rate": 1.4351389775477576e-06, + "loss": 0.0568, + "reward": 0.635044664144516, + "reward_std": 0.14798968099057674, + "rewards/accuracy_reward": 0.1473214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 2827 + }, + { + "clip_ratio": 0.0, + "completion_length": 936.4397735595703, + "epoch": 0.8447464715107161, + "grad_norm": 0.8079047203063965, + "kl": 0.70947265625, + "learning_rate": 1.4297598520397471e-06, + "loss": 0.0382, + "reward": 0.5630580633878708, + "reward_std": 0.09947754302993417, + "rewards/accuracy_reward": 0.06919643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2828 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.7165679931641, + "epoch": 0.8450451795982377, + "grad_norm": 1.6034060716629028, + "kl": 0.91455078125, + "learning_rate": 1.4243900501130437e-06, + "loss": 0.026, + "reward": 0.5904018133878708, + "reward_std": 0.0731534967198968, + "rewards/accuracy_reward": 0.09598214481957257, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2829 + }, + { + "clip_ratio": 0.0, + "completion_length": 927.6853179931641, + "epoch": 0.8453438876857591, + "grad_norm": 0.740375280380249, + "kl": 0.8837890625, + "learning_rate": 1.4190295776094677e-06, + "loss": 0.0563, + "reward": 0.7059152126312256, + "reward_std": 0.14758638106286526, + "rewards/accuracy_reward": 0.2142857201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2830 + }, + { + "clip_ratio": 0.0, + "completion_length": 979.0045166015625, + "epoch": 0.8456425957732806, + "grad_norm": 1.5410361289978027, + "kl": 1.572265625, + "learning_rate": 1.413678440360684e-06, + "loss": 0.0725, + "reward": 0.5318080559372902, + "reward_std": 0.08034020848572254, + "rewards/accuracy_reward": 0.04910714388824999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482700914144516, + "step": 2831 + }, + { + "clip_ratio": 0.0, + "completion_length": 908.7991485595703, + "epoch": 0.845941303860802, + "grad_norm": 0.6082596182823181, + "kl": 0.982421875, + "learning_rate": 1.4083366441882074e-06, + "loss": 0.0398, + "reward": 0.6512277126312256, + "reward_std": 0.11930735409259796, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2832 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.4955749511719, + "epoch": 0.8462400119483235, + "grad_norm": 0.880022406578064, + "kl": 1.3955078125, + "learning_rate": 1.4030041949033902e-06, + "loss": 0.0621, + "reward": 0.6640625298023224, + "reward_std": 0.06847206922248006, + "rewards/accuracy_reward": 0.17633929220028222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 2833 + }, + { + "clip_ratio": 0.0, + "completion_length": 940.7210083007812, + "epoch": 0.846538720035845, + "grad_norm": 1.0208193063735962, + "kl": 0.919921875, + "learning_rate": 1.3976810983074107e-06, + "loss": 0.0386, + "reward": 0.5379464477300644, + "reward_std": 0.1118139773607254, + "rewards/accuracy_reward": 0.046875002793967724, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2834 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.6473693847656, + "epoch": 0.8468374281233665, + "grad_norm": 2.0487756729125977, + "kl": 0.8505859375, + "learning_rate": 1.392367360191278e-06, + "loss": 0.0384, + "reward": 0.6261161118745804, + "reward_std": 0.082662058994174, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2835 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.6674499511719, + "epoch": 0.8471361362108879, + "grad_norm": 0.41278865933418274, + "kl": 0.8818359375, + "learning_rate": 1.3870629863358221e-06, + "loss": 0.0365, + "reward": 0.5234375149011612, + "reward_std": 0.06384472921490669, + "rewards/accuracy_reward": 0.031250001629814506, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2836 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.3259429931641, + "epoch": 0.8474348442984094, + "grad_norm": 0.443586140871048, + "kl": 1.140625, + "learning_rate": 1.3817679825116748e-06, + "loss": 0.0482, + "reward": 0.5937500223517418, + "reward_std": 0.08347738720476627, + "rewards/accuracy_reward": 0.10267857694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2837 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.1451416015625, + "epoch": 0.8477335523859308, + "grad_norm": 1.3717808723449707, + "kl": 1.03515625, + "learning_rate": 1.3764823544792883e-06, + "loss": 0.0489, + "reward": 0.6785714477300644, + "reward_std": 0.17297322303056717, + "rewards/accuracy_reward": 0.18750001583248377, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2838 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.8906707763672, + "epoch": 0.8480322604734524, + "grad_norm": 0.8610155582427979, + "kl": 0.8134765625, + "learning_rate": 1.3712061079889016e-06, + "loss": 0.0471, + "reward": 0.72433041036129, + "reward_std": 0.13501944486051798, + "rewards/accuracy_reward": 0.2299107238650322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2839 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.763427734375, + "epoch": 0.8483309685609738, + "grad_norm": 0.47339195013046265, + "kl": 0.82373046875, + "learning_rate": 1.3659392487805567e-06, + "loss": 0.0253, + "reward": 0.5876116305589676, + "reward_std": 0.08487676549702883, + "rewards/accuracy_reward": 0.09375000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2840 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.5670013427734, + "epoch": 0.8486296766484953, + "grad_norm": 1.2380701303482056, + "kl": 0.8564453125, + "learning_rate": 1.3606817825840834e-06, + "loss": 0.0479, + "reward": 0.6255580708384514, + "reward_std": 0.08742283191531897, + "rewards/accuracy_reward": 0.1339285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 2841 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.7344207763672, + "epoch": 0.8489283847360167, + "grad_norm": 0.33691540360450745, + "kl": 0.39990234375, + "learning_rate": 1.3554337151190833e-06, + "loss": 0.0133, + "reward": 0.598214328289032, + "reward_std": 0.07985686743631959, + "rewards/accuracy_reward": 0.10044643329456449, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678656578064, + "step": 2842 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.0536193847656, + "epoch": 0.8492270928235381, + "grad_norm": 0.6803890466690063, + "kl": 0.84521484375, + "learning_rate": 1.3501950520949436e-06, + "loss": 0.0343, + "reward": 0.6411830633878708, + "reward_std": 0.12300079804845154, + "rewards/accuracy_reward": 0.1473214402794838, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2843 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.0156555175781, + "epoch": 0.8495258009110597, + "grad_norm": 1.5547869205474854, + "kl": 1.134765625, + "learning_rate": 1.3449657992108167e-06, + "loss": 0.0627, + "reward": 0.6757812798023224, + "reward_std": 0.14244035724550486, + "rewards/accuracy_reward": 0.1875000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812649011612, + "step": 2844 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.8482666015625, + "epoch": 0.8498245089985811, + "grad_norm": 0.8295108675956726, + "kl": 1.431640625, + "learning_rate": 1.339745962155613e-06, + "loss": 0.0664, + "reward": 0.5212053656578064, + "reward_std": 0.09808778017759323, + "rewards/accuracy_reward": 0.03125000139698386, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2845 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.2053985595703, + "epoch": 0.8501232170861026, + "grad_norm": 1.8203339576721191, + "kl": 1.2392578125, + "learning_rate": 1.334535546608008e-06, + "loss": 0.0429, + "reward": 0.520647332072258, + "reward_std": 0.0922826174646616, + "rewards/accuracy_reward": 0.02901785890571773, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2846 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.8281860351562, + "epoch": 0.850421925173624, + "grad_norm": 0.4722720980644226, + "kl": 0.89013671875, + "learning_rate": 1.3293345582364225e-06, + "loss": 0.0396, + "reward": 0.6629464626312256, + "reward_std": 0.18981929123401642, + "rewards/accuracy_reward": 0.1718750111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2847 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.6429138183594, + "epoch": 0.8507206332611456, + "grad_norm": 0.9474472403526306, + "kl": 1.234375, + "learning_rate": 1.3241430026990187e-06, + "loss": 0.0613, + "reward": 0.603794664144516, + "reward_std": 0.1112457774579525, + "rewards/accuracy_reward": 0.11383929406292737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2848 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.0870819091797, + "epoch": 0.851019341348667, + "grad_norm": 0.9271945357322693, + "kl": 1.26953125, + "learning_rate": 1.3189608856437053e-06, + "loss": 0.0598, + "reward": 0.6015625298023224, + "reward_std": 0.1078113024123013, + "rewards/accuracy_reward": 0.11383929196745157, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 2849 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.5580749511719, + "epoch": 0.8513180494361885, + "grad_norm": 0.6096792221069336, + "kl": 1.060546875, + "learning_rate": 1.3137882127081126e-06, + "loss": 0.0461, + "reward": 0.6344866454601288, + "reward_std": 0.15338996797800064, + "rewards/accuracy_reward": 0.145089291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2850 + }, + { + "clip_ratio": 0.0, + "completion_length": 940.1674499511719, + "epoch": 0.8516167575237099, + "grad_norm": 0.638431966304779, + "kl": 1.1220703125, + "learning_rate": 1.3086249895196045e-06, + "loss": 0.0477, + "reward": 0.6875000447034836, + "reward_std": 0.12174424529075623, + "rewards/accuracy_reward": 0.19419643701985478, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2851 + }, + { + "clip_ratio": 0.0, + "completion_length": 958.0670166015625, + "epoch": 0.8519154656112314, + "grad_norm": 0.9387733936309814, + "kl": 0.8388671875, + "learning_rate": 1.3034712216952628e-06, + "loss": 0.0408, + "reward": 0.5892857313156128, + "reward_std": 0.08910548174753785, + "rewards/accuracy_reward": 0.0959821492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2852 + }, + { + "clip_ratio": 0.0, + "completion_length": 958.0335235595703, + "epoch": 0.8522141736987529, + "grad_norm": 0.4505062699317932, + "kl": 1.0107421875, + "learning_rate": 1.2983269148418797e-06, + "loss": 0.0439, + "reward": 0.6093750298023224, + "reward_std": 0.11940812133252621, + "rewards/accuracy_reward": 0.1183035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2853 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.6116485595703, + "epoch": 0.8525128817862744, + "grad_norm": 0.8909831643104553, + "kl": 1.26171875, + "learning_rate": 1.2931920745559566e-06, + "loss": 0.0492, + "reward": 0.5982142984867096, + "reward_std": 0.18796517327427864, + "rewards/accuracy_reward": 0.10714286379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2854 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.1451110839844, + "epoch": 0.8528115898737958, + "grad_norm": 0.6571267247200012, + "kl": 1.22705078125, + "learning_rate": 1.2880667064237006e-06, + "loss": 0.0621, + "reward": 0.549107164144516, + "reward_std": 0.07044385187327862, + "rewards/accuracy_reward": 0.058035716181620955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2855 + }, + { + "clip_ratio": 0.0, + "completion_length": 958.2812957763672, + "epoch": 0.8531102979613173, + "grad_norm": 0.4811101257801056, + "kl": 1.22265625, + "learning_rate": 1.2829508160210036e-06, + "loss": 0.0279, + "reward": 0.6216518133878708, + "reward_std": 0.09908306528814137, + "rewards/accuracy_reward": 0.13169643399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2856 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.7812957763672, + "epoch": 0.8534090060488387, + "grad_norm": 2.209869623184204, + "kl": 1.419921875, + "learning_rate": 1.2778444089134567e-06, + "loss": 0.0601, + "reward": 0.5412946790456772, + "reward_std": 0.08045438956469297, + "rewards/accuracy_reward": 0.051339288242161274, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553880095482, + "step": 2857 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.8995971679688, + "epoch": 0.8537077141363603, + "grad_norm": 1.2361562252044678, + "kl": 0.908203125, + "learning_rate": 1.272747490656332e-06, + "loss": 0.0459, + "reward": 0.558593787252903, + "reward_std": 0.08293083123862743, + "rewards/accuracy_reward": 0.066964291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 2858 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.9687957763672, + "epoch": 0.8540064222238817, + "grad_norm": 0.5697630047798157, + "kl": 1.04931640625, + "learning_rate": 1.2676600667945715e-06, + "loss": 0.0469, + "reward": 0.604352705180645, + "reward_std": 0.10883215256035328, + "rewards/accuracy_reward": 0.1160714291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2859 + }, + { + "clip_ratio": 0.0, + "completion_length": 958.2366485595703, + "epoch": 0.8543051303114032, + "grad_norm": 1.4167029857635498, + "kl": 1.01513671875, + "learning_rate": 1.2625821428627981e-06, + "loss": 0.0428, + "reward": 0.5775669738650322, + "reward_std": 0.05966331670060754, + "rewards/accuracy_reward": 0.0870535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2860 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.3393402099609, + "epoch": 0.8546038383989246, + "grad_norm": 0.47540298104286194, + "kl": 0.59228515625, + "learning_rate": 1.2575137243852965e-06, + "loss": 0.0187, + "reward": 0.565848246216774, + "reward_std": 0.10599848860874772, + "rewards/accuracy_reward": 0.0758928582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2861 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.0625457763672, + "epoch": 0.8549025464864461, + "grad_norm": 0.5743367075920105, + "kl": 0.90283203125, + "learning_rate": 1.2524548168760043e-06, + "loss": 0.0433, + "reward": 0.5814732313156128, + "reward_std": 0.044504522578790784, + "rewards/accuracy_reward": 0.0892857201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2862 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.0446929931641, + "epoch": 0.8552012545739676, + "grad_norm": 0.9687197208404541, + "kl": 1.2490234375, + "learning_rate": 1.2474054258385226e-06, + "loss": 0.0583, + "reward": 0.5664062798023224, + "reward_std": 0.126642812974751, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812798023224, + "step": 2863 + }, + { + "clip_ratio": 0.0, + "completion_length": 971.1473541259766, + "epoch": 0.8554999626614891, + "grad_norm": 1.0477663278579712, + "kl": 1.4775390625, + "learning_rate": 1.2423655567660885e-06, + "loss": 0.0692, + "reward": 0.576450914144516, + "reward_std": 0.11505760997533798, + "rewards/accuracy_reward": 0.0892857201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487165205180645, + "step": 2864 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.5178985595703, + "epoch": 0.8557986707490105, + "grad_norm": 0.4437367022037506, + "kl": 1.013671875, + "learning_rate": 1.2373352151415885e-06, + "loss": 0.0416, + "reward": 0.5708705633878708, + "reward_std": 0.04027051059529185, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2865 + }, + { + "clip_ratio": 0.0, + "completion_length": 934.6830749511719, + "epoch": 0.856097378836532, + "grad_norm": 0.896484911441803, + "kl": 1.1904296875, + "learning_rate": 1.2323144064375435e-06, + "loss": 0.0535, + "reward": 0.577566996216774, + "reward_std": 0.10910465195775032, + "rewards/accuracy_reward": 0.09151786379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 2866 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.1004791259766, + "epoch": 0.8563960869240534, + "grad_norm": 0.5862838625907898, + "kl": 1.23046875, + "learning_rate": 1.2273031361160958e-06, + "loss": 0.0569, + "reward": 0.6093750298023224, + "reward_std": 0.13886341266334057, + "rewards/accuracy_reward": 0.11830357694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2867 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.5357666015625, + "epoch": 0.856694795011575, + "grad_norm": 1.5989822149276733, + "kl": 1.177734375, + "learning_rate": 1.2223014096290199e-06, + "loss": 0.0483, + "reward": 0.6780134290456772, + "reward_std": 0.09731650166213512, + "rewards/accuracy_reward": 0.1897321492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812798023224, + "step": 2868 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.0938110351562, + "epoch": 0.8569935030990964, + "grad_norm": 1.0913950204849243, + "kl": 1.0361328125, + "learning_rate": 1.217309232417705e-06, + "loss": 0.0345, + "reward": 0.5797991305589676, + "reward_std": 0.08064004522748291, + "rewards/accuracy_reward": 0.08928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2869 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.1741485595703, + "epoch": 0.8572922111866179, + "grad_norm": 0.49596309661865234, + "kl": 1.1103515625, + "learning_rate": 1.212326609913147e-06, + "loss": 0.0477, + "reward": 0.5323661118745804, + "reward_std": 0.12209409661591053, + "rewards/accuracy_reward": 0.042410717345774174, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2870 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.747802734375, + "epoch": 0.8575909192741393, + "grad_norm": 0.849854588508606, + "kl": 1.145751953125, + "learning_rate": 1.2073535475359533e-06, + "loss": 0.0365, + "reward": 0.5703125298023224, + "reward_std": 0.1291490700095892, + "rewards/accuracy_reward": 0.08258928824216127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 2871 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.6272888183594, + "epoch": 0.8578896273616609, + "grad_norm": 1.578618049621582, + "kl": 0.89990234375, + "learning_rate": 1.2023900506963293e-06, + "loss": 0.0415, + "reward": 0.6439732611179352, + "reward_std": 0.09966596029698849, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2872 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.2076263427734, + "epoch": 0.8581883354491823, + "grad_norm": 0.9135022759437561, + "kl": 1.134765625, + "learning_rate": 1.1974361247940702e-06, + "loss": 0.048, + "reward": 0.6054687649011612, + "reward_std": 0.12118441611528397, + "rewards/accuracy_reward": 0.11607143469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2873 + }, + { + "clip_ratio": 0.0, + "completion_length": 944.4911041259766, + "epoch": 0.8584870435367038, + "grad_norm": 1.5255221128463745, + "kl": 0.61474609375, + "learning_rate": 1.1924917752185628e-06, + "loss": 0.0402, + "reward": 0.6434152126312256, + "reward_std": 0.057224934455007315, + "rewards/accuracy_reward": 0.1495535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2874 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.6741638183594, + "epoch": 0.8587857516242252, + "grad_norm": 1.594106912612915, + "kl": 0.68408203125, + "learning_rate": 1.1875570073487786e-06, + "loss": 0.03, + "reward": 0.5524553656578064, + "reward_std": 0.068230289965868, + "rewards/accuracy_reward": 0.055803575087338686, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 2875 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.6428985595703, + "epoch": 0.8590844597117467, + "grad_norm": 1.0432217121124268, + "kl": 0.8974609375, + "learning_rate": 1.1826318265532543e-06, + "loss": 0.036, + "reward": 0.5424107313156128, + "reward_std": 0.12121278466656804, + "rewards/accuracy_reward": 0.05133928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2876 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.6652221679688, + "epoch": 0.8593831677992682, + "grad_norm": 0.9571677446365356, + "kl": 0.7041015625, + "learning_rate": 1.1777162381901108e-06, + "loss": 0.0422, + "reward": 0.729910746216774, + "reward_std": 0.09003685880452394, + "rewards/accuracy_reward": 0.2343750074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2877 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.7299346923828, + "epoch": 0.8596818758867897, + "grad_norm": 0.5124216675758362, + "kl": 0.74658203125, + "learning_rate": 1.1728102476070213e-06, + "loss": 0.0407, + "reward": 0.6847098469734192, + "reward_std": 0.12144891358911991, + "rewards/accuracy_reward": 0.1919642947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 2878 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.3839721679688, + "epoch": 0.8599805839743111, + "grad_norm": 0.642859935760498, + "kl": 0.90234375, + "learning_rate": 1.1679138601412253e-06, + "loss": 0.031, + "reward": 0.5212053954601288, + "reward_std": 0.06788715533912182, + "rewards/accuracy_reward": 0.029017859371379018, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2879 + }, + { + "clip_ratio": 0.0, + "completion_length": 944.3125457763672, + "epoch": 0.8602792920618326, + "grad_norm": 0.5499101877212524, + "kl": 1.0146484375, + "learning_rate": 1.1630270811195132e-06, + "loss": 0.0399, + "reward": 0.576450914144516, + "reward_std": 0.08353567030280828, + "rewards/accuracy_reward": 0.08705357275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2880 + }, + { + "clip_ratio": 0.0, + "completion_length": 937.294677734375, + "epoch": 0.860578000149354, + "grad_norm": 0.6596195101737976, + "kl": 0.8046875, + "learning_rate": 1.1581499158582187e-06, + "loss": 0.048, + "reward": 0.5792410895228386, + "reward_std": 0.1478363862261176, + "rewards/accuracy_reward": 0.08928572200238705, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2881 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.9754943847656, + "epoch": 0.8608767082368756, + "grad_norm": 0.4583509862422943, + "kl": 0.64453125, + "learning_rate": 1.1532823696632223e-06, + "loss": 0.0292, + "reward": 0.5535714626312256, + "reward_std": 0.108983950689435, + "rewards/accuracy_reward": 0.05803571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2882 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.4531707763672, + "epoch": 0.861175416324397, + "grad_norm": 0.4839169383049011, + "kl": 0.869140625, + "learning_rate": 1.1484244478299366e-06, + "loss": 0.054, + "reward": 0.5686384215950966, + "reward_std": 0.10069496184587479, + "rewards/accuracy_reward": 0.07589286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 2883 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.8951416015625, + "epoch": 0.8614741244119185, + "grad_norm": 0.5744840502738953, + "kl": 0.896484375, + "learning_rate": 1.1435761556433035e-06, + "loss": 0.0441, + "reward": 0.6104910969734192, + "reward_std": 0.09296526107937098, + "rewards/accuracy_reward": 0.11830357578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875298023224, + "step": 2884 + }, + { + "clip_ratio": 0.0, + "completion_length": 923.4241638183594, + "epoch": 0.8617728324994399, + "grad_norm": 1.1974786520004272, + "kl": 1.0087890625, + "learning_rate": 1.1387374983777888e-06, + "loss": 0.0512, + "reward": 0.6450893059372902, + "reward_std": 0.1807972053065896, + "rewards/accuracy_reward": 0.1540178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2885 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.0714874267578, + "epoch": 0.8620715405869613, + "grad_norm": 0.8168326616287231, + "kl": 1.11767578125, + "learning_rate": 1.1339084812973823e-06, + "loss": 0.0487, + "reward": 0.5184151977300644, + "reward_std": 0.10939181596040726, + "rewards/accuracy_reward": 0.029017858440056443, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2886 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.6161041259766, + "epoch": 0.8623702486744829, + "grad_norm": 1.239516019821167, + "kl": 0.92041015625, + "learning_rate": 1.1290891096555746e-06, + "loss": 0.0469, + "reward": 0.6300223618745804, + "reward_std": 0.11250811908394098, + "rewards/accuracy_reward": 0.14285715110599995, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 2887 + }, + { + "clip_ratio": 0.0, + "completion_length": 918.2143249511719, + "epoch": 0.8626689567620043, + "grad_norm": 0.7735241651535034, + "kl": 0.74853515625, + "learning_rate": 1.124279388695373e-06, + "loss": 0.0455, + "reward": 0.6367187798023224, + "reward_std": 0.06921195611357689, + "rewards/accuracy_reward": 0.1428571529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2888 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.2678985595703, + "epoch": 0.8629676648495258, + "grad_norm": 0.7927867770195007, + "kl": 0.669677734375, + "learning_rate": 1.119479323649284e-06, + "loss": 0.0248, + "reward": 0.5758928805589676, + "reward_std": 0.09532497823238373, + "rewards/accuracy_reward": 0.08035714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2889 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.3393249511719, + "epoch": 0.8632663729370472, + "grad_norm": 0.43528780341148376, + "kl": 0.621337890625, + "learning_rate": 1.1146889197393052e-06, + "loss": 0.0291, + "reward": 0.5703125298023224, + "reward_std": 0.09444085508584976, + "rewards/accuracy_reward": 0.07589286309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2890 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.6205749511719, + "epoch": 0.8635650810245687, + "grad_norm": 0.5060498118400574, + "kl": 0.955078125, + "learning_rate": 1.1099081821769297e-06, + "loss": 0.0529, + "reward": 0.624441996216774, + "reward_std": 0.11909739975817502, + "rewards/accuracy_reward": 0.13392857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2891 + }, + { + "clip_ratio": 0.0, + "completion_length": 912.7634429931641, + "epoch": 0.8638637891120902, + "grad_norm": 0.9351537227630615, + "kl": 0.3955078125, + "learning_rate": 1.1051371161631265e-06, + "loss": 0.0233, + "reward": 0.5591517984867096, + "reward_std": 0.13054851535707712, + "rewards/accuracy_reward": 0.0647321455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2892 + }, + { + "clip_ratio": 0.0, + "completion_length": 924.4129943847656, + "epoch": 0.8641624971996117, + "grad_norm": 0.9140760898590088, + "kl": 0.57373046875, + "learning_rate": 1.100375726888352e-06, + "loss": 0.0353, + "reward": 0.5837053805589676, + "reward_std": 0.1413597147911787, + "rewards/accuracy_reward": 0.08928571944124997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2893 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.1920013427734, + "epoch": 0.8644612052871331, + "grad_norm": 0.4165382385253906, + "kl": 0.559326171875, + "learning_rate": 1.0956240195325308e-06, + "loss": 0.0193, + "reward": 0.575334832072258, + "reward_std": 0.10449779406189919, + "rewards/accuracy_reward": 0.08035714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2894 + }, + { + "clip_ratio": 0.0, + "completion_length": 940.5670166015625, + "epoch": 0.8647599133746546, + "grad_norm": 0.7374168038368225, + "kl": 0.64990234375, + "learning_rate": 1.090881999265051e-06, + "loss": 0.0333, + "reward": 0.6261160969734192, + "reward_std": 0.09713490423746407, + "rewards/accuracy_reward": 0.1339285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2895 + }, + { + "clip_ratio": 0.0, + "completion_length": 933.1830749511719, + "epoch": 0.865058621462176, + "grad_norm": 0.44387108087539673, + "kl": 0.67041015625, + "learning_rate": 1.0861496712447694e-06, + "loss": 0.0322, + "reward": 0.556361623108387, + "reward_std": 0.03615389973856509, + "rewards/accuracy_reward": 0.0625000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2896 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.4330902099609, + "epoch": 0.8653573295496976, + "grad_norm": 1.1207104921340942, + "kl": 0.578125, + "learning_rate": 1.0814270406199967e-06, + "loss": 0.0281, + "reward": 0.5904018133878708, + "reward_std": 0.07566628325730562, + "rewards/accuracy_reward": 0.09598214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2897 + }, + { + "clip_ratio": 0.0, + "completion_length": 940.0357360839844, + "epoch": 0.865656037637219, + "grad_norm": 0.6057807207107544, + "kl": 0.2265625, + "learning_rate": 1.0767141125284875e-06, + "loss": 0.0071, + "reward": 0.584263414144516, + "reward_std": 0.04045637929812074, + "rewards/accuracy_reward": 0.08705357275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 2898 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.0692443847656, + "epoch": 0.8659547457247405, + "grad_norm": 0.5321558713912964, + "kl": 0.70849609375, + "learning_rate": 1.072010892097447e-06, + "loss": 0.0239, + "reward": 0.6283482313156128, + "reward_std": 0.09820207580924034, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2899 + }, + { + "clip_ratio": 0.0, + "completion_length": 921.6250305175781, + "epoch": 0.8662534538122619, + "grad_norm": 0.487579882144928, + "kl": 0.541015625, + "learning_rate": 1.0673173844435214e-06, + "loss": 0.0312, + "reward": 0.5943080484867096, + "reward_std": 0.094577229116112, + "rewards/accuracy_reward": 0.09821428824216127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 2900 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.2790679931641, + "epoch": 0.8665521618997835, + "grad_norm": 1.3498502969741821, + "kl": 0.80029296875, + "learning_rate": 1.062633594672783e-06, + "loss": 0.0242, + "reward": 0.6350446790456772, + "reward_std": 0.11782960034906864, + "rewards/accuracy_reward": 0.14062500605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2901 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.4420013427734, + "epoch": 0.8668508699873049, + "grad_norm": 0.4178147315979004, + "kl": 0.75634765625, + "learning_rate": 1.0579595278807376e-06, + "loss": 0.0478, + "reward": 0.5714286118745804, + "reward_std": 0.10021691769361496, + "rewards/accuracy_reward": 0.07812500349245965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2902 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.0112152099609, + "epoch": 0.8671495780748264, + "grad_norm": 0.7848483920097351, + "kl": 0.77197265625, + "learning_rate": 1.0532951891523124e-06, + "loss": 0.0176, + "reward": 0.5881696790456772, + "reward_std": 0.11024283338338137, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2903 + }, + { + "clip_ratio": 0.0, + "completion_length": 935.0223693847656, + "epoch": 0.8674482861623478, + "grad_norm": 0.4248412251472473, + "kl": 0.585693359375, + "learning_rate": 1.0486405835618496e-06, + "loss": 0.0396, + "reward": 0.6367187798023224, + "reward_std": 0.057318430161103606, + "rewards/accuracy_reward": 0.1428571529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2904 + }, + { + "clip_ratio": 0.0, + "completion_length": 936.7455749511719, + "epoch": 0.8677469942498693, + "grad_norm": 0.6455655097961426, + "kl": 0.874755859375, + "learning_rate": 1.0439957161731062e-06, + "loss": 0.0387, + "reward": 0.5496651977300644, + "reward_std": 0.08115024119615555, + "rewards/accuracy_reward": 0.05803571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2905 + }, + { + "clip_ratio": 0.0, + "completion_length": 934.7969360351562, + "epoch": 0.8680457023373908, + "grad_norm": 0.5013458132743835, + "kl": 0.74951171875, + "learning_rate": 1.039360592039238e-06, + "loss": 0.0313, + "reward": 0.6568080633878708, + "reward_std": 0.07273470051586628, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2906 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.9710388183594, + "epoch": 0.8683444104249123, + "grad_norm": 0.5252956748008728, + "kl": 0.66943359375, + "learning_rate": 1.034735216202809e-06, + "loss": 0.0237, + "reward": 0.595982164144516, + "reward_std": 0.09561234316788614, + "rewards/accuracy_reward": 0.10044643469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357238650322, + "step": 2907 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.8482513427734, + "epoch": 0.8686431185124337, + "grad_norm": 0.475139856338501, + "kl": 0.492431640625, + "learning_rate": 1.0301195936957765e-06, + "loss": 0.0189, + "reward": 0.5273437649011612, + "reward_std": 0.10467138700187206, + "rewards/accuracy_reward": 0.03348214365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2908 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.3683471679688, + "epoch": 0.8689418265999552, + "grad_norm": 0.4103522300720215, + "kl": 0.3203125, + "learning_rate": 1.0255137295394813e-06, + "loss": 0.0114, + "reward": 0.6082589626312256, + "reward_std": 0.1331924246624112, + "rewards/accuracy_reward": 0.11160714644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 2909 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.8482513427734, + "epoch": 0.8692405346874766, + "grad_norm": 0.507853627204895, + "kl": 0.588134765625, + "learning_rate": 1.0209176287446542e-06, + "loss": 0.0127, + "reward": 0.6160714477300644, + "reward_std": 0.11741769313812256, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2910 + }, + { + "clip_ratio": 0.0, + "completion_length": 929.6027069091797, + "epoch": 0.8695392427749982, + "grad_norm": 0.755804717540741, + "kl": 0.8154296875, + "learning_rate": 1.0163312963114035e-06, + "loss": 0.0309, + "reward": 0.5340401977300644, + "reward_std": 0.09110193327069283, + "rewards/accuracy_reward": 0.04464285867288709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 2911 + }, + { + "clip_ratio": 0.0, + "completion_length": 938.0134429931641, + "epoch": 0.8698379508625196, + "grad_norm": 0.6036261916160583, + "kl": 0.766357421875, + "learning_rate": 1.011754737229208e-06, + "loss": 0.0294, + "reward": 0.5825893208384514, + "reward_std": 0.12025593314319849, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2912 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.8638916015625, + "epoch": 0.8701366589500411, + "grad_norm": 0.3930726647377014, + "kl": 0.637451171875, + "learning_rate": 1.0071879564769139e-06, + "loss": 0.0191, + "reward": 0.6188616156578064, + "reward_std": 0.10695468075573444, + "rewards/accuracy_reward": 0.12500000488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2913 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.0848541259766, + "epoch": 0.8704353670375625, + "grad_norm": 0.7618242502212524, + "kl": 1.2197265625, + "learning_rate": 1.0026309590227358e-06, + "loss": 0.0454, + "reward": 0.5814732387661934, + "reward_std": 0.07418711693026125, + "rewards/accuracy_reward": 0.08928571455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2914 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.0111999511719, + "epoch": 0.870734075125084, + "grad_norm": 1.0708438158035278, + "kl": 1.0078125, + "learning_rate": 9.980837498242357e-07, + "loss": 0.0578, + "reward": 0.6729910969734192, + "reward_std": 0.17194212973117828, + "rewards/accuracy_reward": 0.1808035783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2915 + }, + { + "clip_ratio": 0.0, + "completion_length": 927.1071929931641, + "epoch": 0.8710327832126055, + "grad_norm": 0.3943694233894348, + "kl": 0.611328125, + "learning_rate": 9.935463338283325e-07, + "loss": 0.025, + "reward": 0.6160714626312256, + "reward_std": 0.11924503184854984, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2916 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.4888763427734, + "epoch": 0.871331491300127, + "grad_norm": 0.8152443766593933, + "kl": 0.47509765625, + "learning_rate": 9.890187159712927e-07, + "loss": 0.0212, + "reward": 0.636160746216774, + "reward_std": 0.10954142175614834, + "rewards/accuracy_reward": 0.14062500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2917 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.8125305175781, + "epoch": 0.8716301993876484, + "grad_norm": 0.3740006387233734, + "kl": 0.703125, + "learning_rate": 9.845009011787166e-07, + "loss": 0.0284, + "reward": 0.5608259290456772, + "reward_std": 0.10512721259146929, + "rewards/accuracy_reward": 0.06696428847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2918 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.5022735595703, + "epoch": 0.8719289074751699, + "grad_norm": 0.6626452803611755, + "kl": 1.064453125, + "learning_rate": 9.79992894365549e-07, + "loss": 0.0632, + "reward": 0.5820312798023224, + "reward_std": 0.11910403706133366, + "rewards/accuracy_reward": 0.08928571874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2919 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.6942443847656, + "epoch": 0.8722276155626913, + "grad_norm": 0.4515368938446045, + "kl": 0.68798828125, + "learning_rate": 9.754947004360537e-07, + "loss": 0.0213, + "reward": 0.5625000298023224, + "reward_std": 0.1287775468081236, + "rewards/accuracy_reward": 0.07142857508733869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2920 + }, + { + "clip_ratio": 0.0, + "completion_length": 910.0156707763672, + "epoch": 0.8725263236502129, + "grad_norm": 0.47000300884246826, + "kl": 0.7685546875, + "learning_rate": 9.710063242838286e-07, + "loss": 0.044, + "reward": 0.624441996216774, + "reward_std": 0.11430634744465351, + "rewards/accuracy_reward": 0.12946428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2921 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.5580749511719, + "epoch": 0.8728250317377343, + "grad_norm": 0.25318971276283264, + "kl": 0.767822265625, + "learning_rate": 9.665277707917875e-07, + "loss": 0.0182, + "reward": 0.5937500298023224, + "reward_std": 0.07366819167509675, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2922 + }, + { + "clip_ratio": 0.0, + "completion_length": 924.4710235595703, + "epoch": 0.8731237398252558, + "grad_norm": 0.7839317917823792, + "kl": 0.6396484375, + "learning_rate": 9.620590448321554e-07, + "loss": 0.0214, + "reward": 0.5954241305589676, + "reward_std": 0.07255867240019143, + "rewards/accuracy_reward": 0.1004464365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 2923 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.0536193847656, + "epoch": 0.8734224479127772, + "grad_norm": 0.6473337411880493, + "kl": 0.943359375, + "learning_rate": 9.576001512664678e-07, + "loss": 0.0509, + "reward": 0.6043527275323868, + "reward_std": 0.16122643277049065, + "rewards/accuracy_reward": 0.11160715040750802, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2924 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.4420013427734, + "epoch": 0.8737211560002988, + "grad_norm": 1.5468261241912842, + "kl": 0.6748046875, + "learning_rate": 9.531510949455681e-07, + "loss": 0.033, + "reward": 0.5591517984867096, + "reward_std": 0.09296373394317925, + "rewards/accuracy_reward": 0.06473214458674192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2925 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.0402069091797, + "epoch": 0.8740198640878202, + "grad_norm": 2.0869572162628174, + "kl": 0.708984375, + "learning_rate": 9.487118807095885e-07, + "loss": 0.0343, + "reward": 0.5870536118745804, + "reward_std": 0.11540373973548412, + "rewards/accuracy_reward": 0.09151786309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 2926 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.0156555175781, + "epoch": 0.8743185721753417, + "grad_norm": 1.1084116697311401, + "kl": 0.8955078125, + "learning_rate": 9.442825133879608e-07, + "loss": 0.0471, + "reward": 0.6071428805589676, + "reward_std": 0.1168310884386301, + "rewards/accuracy_reward": 0.11383929150179029, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 2927 + }, + { + "clip_ratio": 0.0, + "completion_length": 932.5736999511719, + "epoch": 0.8746172802628631, + "grad_norm": 0.8663967847824097, + "kl": 0.7294921875, + "learning_rate": 9.398629977994056e-07, + "loss": 0.0323, + "reward": 0.516741082072258, + "reward_std": 0.07856288668699563, + "rewards/accuracy_reward": 0.022321429336443543, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 2928 + }, + { + "clip_ratio": 0.0, + "completion_length": 914.0312957763672, + "epoch": 0.8749159883503845, + "grad_norm": 0.3714257478713989, + "kl": 0.5087890625, + "learning_rate": 9.354533387519171e-07, + "loss": 0.0114, + "reward": 0.6138392984867096, + "reward_std": 0.14575818181037903, + "rewards/accuracy_reward": 0.1205357201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2929 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.0111999511719, + "epoch": 0.8752146964379061, + "grad_norm": 1.2035197019577026, + "kl": 0.61376953125, + "learning_rate": 9.310535410427767e-07, + "loss": 0.0269, + "reward": 0.6177455633878708, + "reward_std": 0.08912939880974591, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 2930 + }, + { + "clip_ratio": 0.0, + "completion_length": 971.7879791259766, + "epoch": 0.8755134045254275, + "grad_norm": 1.135016679763794, + "kl": 0.70849609375, + "learning_rate": 9.266636094585301e-07, + "loss": 0.0352, + "reward": 0.5295759290456772, + "reward_std": 0.09393015364184976, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 2931 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.6071929931641, + "epoch": 0.875812112612949, + "grad_norm": 0.7170656323432922, + "kl": 0.8203125, + "learning_rate": 9.222835487749937e-07, + "loss": 0.0316, + "reward": 0.7198660969734192, + "reward_std": 0.179371926933527, + "rewards/accuracy_reward": 0.2254464402794838, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2932 + }, + { + "clip_ratio": 0.0, + "completion_length": 971.7344360351562, + "epoch": 0.8761108207004704, + "grad_norm": 0.6132971048355103, + "kl": 0.8974609375, + "learning_rate": 9.179133637572457e-07, + "loss": 0.0366, + "reward": 0.5279018133878708, + "reward_std": 0.10823618364520371, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2933 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.6183624267578, + "epoch": 0.8764095287879919, + "grad_norm": 1.640430212020874, + "kl": 0.9296875, + "learning_rate": 9.135530591596165e-07, + "loss": 0.0505, + "reward": 0.550223246216774, + "reward_std": 0.12303395941853523, + "rewards/accuracy_reward": 0.060267860535532236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2934 + }, + { + "clip_ratio": 0.0, + "completion_length": 967.0000457763672, + "epoch": 0.8767082368755134, + "grad_norm": 0.3743448555469513, + "kl": 0.5107421875, + "learning_rate": 9.092026397256914e-07, + "loss": 0.0168, + "reward": 0.5680803954601288, + "reward_std": 0.0726345106959343, + "rewards/accuracy_reward": 0.07366071874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2935 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.2991485595703, + "epoch": 0.8770069449630349, + "grad_norm": 0.9938798546791077, + "kl": 0.69287109375, + "learning_rate": 9.048621101883026e-07, + "loss": 0.0342, + "reward": 0.6824777126312256, + "reward_std": 0.10662746988236904, + "rewards/accuracy_reward": 0.19196429289877415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2936 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.7277374267578, + "epoch": 0.8773056530505563, + "grad_norm": 1.1848664283752441, + "kl": 0.82861328125, + "learning_rate": 9.00531475269516e-07, + "loss": 0.0424, + "reward": 0.5864955633878708, + "reward_std": 0.09671044768765569, + "rewards/accuracy_reward": 0.09151786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2937 + }, + { + "clip_ratio": 0.0, + "completion_length": 973.1473541259766, + "epoch": 0.8776043611380778, + "grad_norm": 0.5720799565315247, + "kl": 1.09814453125, + "learning_rate": 8.962107396806407e-07, + "loss": 0.0428, + "reward": 0.5362723395228386, + "reward_std": 0.0572121343575418, + "rewards/accuracy_reward": 0.044642860535532236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 2938 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.0156707763672, + "epoch": 0.8779030692255992, + "grad_norm": 0.8619834780693054, + "kl": 1.150390625, + "learning_rate": 8.918999081222157e-07, + "loss": 0.0439, + "reward": 0.5284598395228386, + "reward_std": 0.08457925729453564, + "rewards/accuracy_reward": 0.0379464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2939 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.9174499511719, + "epoch": 0.8782017773131208, + "grad_norm": 0.4235113263130188, + "kl": 0.6923828125, + "learning_rate": 8.875989852839984e-07, + "loss": 0.0255, + "reward": 0.580357164144516, + "reward_std": 0.07367124129086733, + "rewards/accuracy_reward": 0.08928571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2940 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.5067443847656, + "epoch": 0.8785004854006422, + "grad_norm": 1.0940055847167969, + "kl": 0.9833984375, + "learning_rate": 8.833079758449748e-07, + "loss": 0.0398, + "reward": 0.7633928954601288, + "reward_std": 0.15685422718524933, + "rewards/accuracy_reward": 0.2723214440047741, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2941 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.6897735595703, + "epoch": 0.8787991934881637, + "grad_norm": 0.8398132920265198, + "kl": 1.0009765625, + "learning_rate": 8.79026884473343e-07, + "loss": 0.049, + "reward": 0.5625000298023224, + "reward_std": 0.14456496108323336, + "rewards/accuracy_reward": 0.07142857275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2942 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.8036041259766, + "epoch": 0.8790979015756851, + "grad_norm": 0.5793119072914124, + "kl": 1.3505859375, + "learning_rate": 8.747557158265074e-07, + "loss": 0.0649, + "reward": 0.5122768208384514, + "reward_std": 0.07995075173676014, + "rewards/accuracy_reward": 0.022321430034935474, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2943 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.8125457763672, + "epoch": 0.8793966096632067, + "grad_norm": 0.627263069152832, + "kl": 1.1123046875, + "learning_rate": 8.704944745510846e-07, + "loss": 0.0593, + "reward": 0.671316996216774, + "reward_std": 0.07112388359382749, + "rewards/accuracy_reward": 0.1785714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 2944 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.7366485595703, + "epoch": 0.8796953177507281, + "grad_norm": 0.5910086631774902, + "kl": 1.32421875, + "learning_rate": 8.66243165282884e-07, + "loss": 0.052, + "reward": 0.574218787252903, + "reward_std": 0.0826170863583684, + "rewards/accuracy_reward": 0.0848214365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 2945 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.7500457763672, + "epoch": 0.8799940258382496, + "grad_norm": 0.6259650588035583, + "kl": 0.9560546875, + "learning_rate": 8.620017926469149e-07, + "loss": 0.0278, + "reward": 0.6250000149011612, + "reward_std": 0.1058836430311203, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888392984867096, + "step": 2946 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.5736999511719, + "epoch": 0.880292733925771, + "grad_norm": 2.1982696056365967, + "kl": 1.40234375, + "learning_rate": 8.577703612573784e-07, + "loss": 0.0632, + "reward": 0.5474330484867096, + "reward_std": 0.11885082861408591, + "rewards/accuracy_reward": 0.060267859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 2947 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.8370971679688, + "epoch": 0.8805914420132925, + "grad_norm": 2.161365032196045, + "kl": 1.544921875, + "learning_rate": 8.535488757176513e-07, + "loss": 0.0573, + "reward": 0.5479910969734192, + "reward_std": 0.0970839187502861, + "rewards/accuracy_reward": 0.06026785867288709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 2948 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.5045013427734, + "epoch": 0.880890150100814, + "grad_norm": 1.1049015522003174, + "kl": 1.4453125, + "learning_rate": 8.493373406202987e-07, + "loss": 0.0682, + "reward": 0.658482164144516, + "reward_std": 0.16549585945904255, + "rewards/accuracy_reward": 0.1674107201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2949 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.5112152099609, + "epoch": 0.8811888581883355, + "grad_norm": 0.9713718891143799, + "kl": 0.849609375, + "learning_rate": 8.4513576054706e-07, + "loss": 0.0285, + "reward": 0.6529018133878708, + "reward_std": 0.11880033276975155, + "rewards/accuracy_reward": 0.16071429220028222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 2950 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.9442443847656, + "epoch": 0.8814875662758569, + "grad_norm": 0.757099986076355, + "kl": 0.7158203125, + "learning_rate": 8.409441400688401e-07, + "loss": 0.0346, + "reward": 0.6238839477300644, + "reward_std": 0.1371013280004263, + "rewards/accuracy_reward": 0.12946429336443543, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 2951 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.0781707763672, + "epoch": 0.8817862743633784, + "grad_norm": 0.7687888741493225, + "kl": 1.640625, + "learning_rate": 8.3676248374571e-07, + "loss": 0.0598, + "reward": 0.5128348469734192, + "reward_std": 0.09275066759437323, + "rewards/accuracy_reward": 0.02901785750873387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169887661934, + "step": 2952 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.6607666015625, + "epoch": 0.8820849824508998, + "grad_norm": 0.5404132008552551, + "kl": 1.28515625, + "learning_rate": 8.325907961269064e-07, + "loss": 0.0532, + "reward": 0.658482164144516, + "reward_std": 0.18295327201485634, + "rewards/accuracy_reward": 0.17187501210719347, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4866071715950966, + "step": 2953 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.6384429931641, + "epoch": 0.8823836905384214, + "grad_norm": 0.5201438665390015, + "kl": 1.041015625, + "learning_rate": 8.284290817508122e-07, + "loss": 0.0426, + "reward": 0.5597098469734192, + "reward_std": 0.1308770445175469, + "rewards/accuracy_reward": 0.06919643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2954 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.8750457763672, + "epoch": 0.8826823986259428, + "grad_norm": 1.9706737995147705, + "kl": 1.5732421875, + "learning_rate": 8.24277345144967e-07, + "loss": 0.0618, + "reward": 0.604910746216774, + "reward_std": 0.13550921343266964, + "rewards/accuracy_reward": 0.12053572060540318, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750223517418, + "step": 2955 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.9085235595703, + "epoch": 0.8829811067134643, + "grad_norm": 0.6109487414360046, + "kl": 1.357421875, + "learning_rate": 8.201355908260544e-07, + "loss": 0.0616, + "reward": 0.5513393133878708, + "reward_std": 0.08996245171874762, + "rewards/accuracy_reward": 0.06250000465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393133878708, + "step": 2956 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.0536041259766, + "epoch": 0.8832798148009857, + "grad_norm": 0.8123025894165039, + "kl": 1.427734375, + "learning_rate": 8.160038232998935e-07, + "loss": 0.0691, + "reward": 0.595982164144516, + "reward_std": 0.098345254547894, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 2957 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.1585235595703, + "epoch": 0.8835785228885072, + "grad_norm": 0.7199362516403198, + "kl": 1.0537109375, + "learning_rate": 8.118820470614463e-07, + "loss": 0.0464, + "reward": 0.5831473618745804, + "reward_std": 0.09838142804801464, + "rewards/accuracy_reward": 0.09151785913854837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2958 + }, + { + "clip_ratio": 0.0, + "completion_length": 991.6741333007812, + "epoch": 0.8838772309760287, + "grad_norm": 1.346609354019165, + "kl": 1.375, + "learning_rate": 8.077702665947973e-07, + "loss": 0.0595, + "reward": 0.5987723469734192, + "reward_std": 0.08545652218163013, + "rewards/accuracy_reward": 0.11383929220028222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 2959 + }, + { + "clip_ratio": 0.0, + "completion_length": 988.8750457763672, + "epoch": 0.8841759390635502, + "grad_norm": 1.3932636976242065, + "kl": 1.203125, + "learning_rate": 8.036684863731636e-07, + "loss": 0.0484, + "reward": 0.5831473469734192, + "reward_std": 0.08735682535916567, + "rewards/accuracy_reward": 0.09821428963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 2960 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.9598693847656, + "epoch": 0.8844746471510716, + "grad_norm": 1.0957832336425781, + "kl": 1.3037109375, + "learning_rate": 7.995767108588814e-07, + "loss": 0.0461, + "reward": 0.6333705484867096, + "reward_std": 0.18448911234736443, + "rewards/accuracy_reward": 0.14508929569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2961 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.2567291259766, + "epoch": 0.8847733552385931, + "grad_norm": 0.7450630068778992, + "kl": 0.83544921875, + "learning_rate": 7.954949445033966e-07, + "loss": 0.044, + "reward": 0.7075892984867096, + "reward_std": 0.12490664049983025, + "rewards/accuracy_reward": 0.21428572852164507, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 2962 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.2567443847656, + "epoch": 0.8850720633261145, + "grad_norm": 1.3284114599227905, + "kl": 1.4765625, + "learning_rate": 7.914231917472748e-07, + "loss": 0.0616, + "reward": 0.5016741305589676, + "reward_std": 0.09945654775947332, + "rewards/accuracy_reward": 0.017857143888249993, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169887661934, + "step": 2963 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.1719207763672, + "epoch": 0.8853707714136361, + "grad_norm": 1.3148396015167236, + "kl": 0.8330078125, + "learning_rate": 7.873614570201838e-07, + "loss": 0.0367, + "reward": 0.7293527126312256, + "reward_std": 0.12774566048756242, + "rewards/accuracy_reward": 0.2366071529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 2964 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.1428985595703, + "epoch": 0.8856694795011575, + "grad_norm": 1.0651359558105469, + "kl": 1.16748046875, + "learning_rate": 7.833097447408911e-07, + "loss": 0.0569, + "reward": 0.5664062798023224, + "reward_std": 0.11372638959437609, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812798023224, + "step": 2965 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.4129943847656, + "epoch": 0.885968187588679, + "grad_norm": 0.6553637981414795, + "kl": 1.623046875, + "learning_rate": 7.792680593172619e-07, + "loss": 0.0719, + "reward": 0.5658482313156128, + "reward_std": 0.12767819315195084, + "rewards/accuracy_reward": 0.08482143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4810268059372902, + "step": 2966 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.3036193847656, + "epoch": 0.8862668956762004, + "grad_norm": 0.6836866736412048, + "kl": 0.78955078125, + "learning_rate": 7.75236405146258e-07, + "loss": 0.0206, + "reward": 0.6138393133878708, + "reward_std": 0.06925374735146761, + "rewards/accuracy_reward": 0.12053571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 2967 + }, + { + "clip_ratio": 0.0, + "completion_length": 944.0290679931641, + "epoch": 0.886565603763722, + "grad_norm": 1.4284791946411133, + "kl": 1.28955078125, + "learning_rate": 7.712147866139197e-07, + "loss": 0.046, + "reward": 0.5223214477300644, + "reward_std": 0.0753919929265976, + "rewards/accuracy_reward": 0.031250000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 2968 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.1652221679688, + "epoch": 0.8868643118512434, + "grad_norm": 1.3846545219421387, + "kl": 1.34375, + "learning_rate": 7.672032080953751e-07, + "loss": 0.0584, + "reward": 0.6010044887661934, + "reward_std": 0.0711723044514656, + "rewards/accuracy_reward": 0.1138392947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487165205180645, + "step": 2969 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.8460235595703, + "epoch": 0.8871630199387649, + "grad_norm": 0.5286653637886047, + "kl": 1.3310546875, + "learning_rate": 7.632016739548309e-07, + "loss": 0.0664, + "reward": 0.6149553880095482, + "reward_std": 0.10658918041735888, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2970 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.3236999511719, + "epoch": 0.8874617280262863, + "grad_norm": 0.8985528349876404, + "kl": 1.369140625, + "learning_rate": 7.592101885455594e-07, + "loss": 0.0607, + "reward": 0.5714285969734192, + "reward_std": 0.08102626539766788, + "rewards/accuracy_reward": 0.08258928847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393133878708, + "step": 2971 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.1607666015625, + "epoch": 0.8877604361138077, + "grad_norm": 0.5357082486152649, + "kl": 1.0302734375, + "learning_rate": 7.552287562099103e-07, + "loss": 0.0463, + "reward": 0.5558035895228386, + "reward_std": 0.08852312993258238, + "rewards/accuracy_reward": 0.06696429033763707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888392984867096, + "step": 2972 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.6763763427734, + "epoch": 0.8880591442013293, + "grad_norm": 0.9360129833221436, + "kl": 1.3701171875, + "learning_rate": 7.512573812792878e-07, + "loss": 0.0551, + "reward": 0.6489955484867096, + "reward_std": 0.09068011119961739, + "rewards/accuracy_reward": 0.1607142947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 2973 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.9107666015625, + "epoch": 0.8883578522888507, + "grad_norm": 1.1713612079620361, + "kl": 1.357421875, + "learning_rate": 7.472960680741603e-07, + "loss": 0.0647, + "reward": 0.7561384290456772, + "reward_std": 0.1804459374397993, + "rewards/accuracy_reward": 0.2656250223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 2974 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.2411041259766, + "epoch": 0.8886565603763722, + "grad_norm": 1.0498902797698975, + "kl": 1.2275390625, + "learning_rate": 7.433448209040495e-07, + "loss": 0.049, + "reward": 0.573660746216774, + "reward_std": 0.16392029263079166, + "rewards/accuracy_reward": 0.0870535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 2975 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.0960235595703, + "epoch": 0.8889552684638936, + "grad_norm": 2.2029356956481934, + "kl": 1.6181640625, + "learning_rate": 7.394036440675223e-07, + "loss": 0.0852, + "reward": 0.679129496216774, + "reward_std": 0.1700092889368534, + "rewards/accuracy_reward": 0.19419644586741924, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 2976 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.8817291259766, + "epoch": 0.8892539765514151, + "grad_norm": 0.6005621552467346, + "kl": 1.681640625, + "learning_rate": 7.354725418521947e-07, + "loss": 0.0618, + "reward": 0.6300223469734192, + "reward_std": 0.11703726090490818, + "rewards/accuracy_reward": 0.14508929289877415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 2977 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.325927734375, + "epoch": 0.8895526846389366, + "grad_norm": 0.5826957821846008, + "kl": 1.0576171875, + "learning_rate": 7.315515185347222e-07, + "loss": 0.0678, + "reward": 0.6356027275323868, + "reward_std": 0.1406299527734518, + "rewards/accuracy_reward": 0.1473214365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812649011612, + "step": 2978 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.3326416015625, + "epoch": 0.8898513927264581, + "grad_norm": 1.0182300806045532, + "kl": 1.51953125, + "learning_rate": 7.276405783807894e-07, + "loss": 0.058, + "reward": 0.5747768133878708, + "reward_std": 0.1465056575834751, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.483258955180645, + "step": 2979 + }, + { + "clip_ratio": 0.0, + "completion_length": 970.2187957763672, + "epoch": 0.8901501008139795, + "grad_norm": 0.6962528228759766, + "kl": 1.07958984375, + "learning_rate": 7.237397256451195e-07, + "loss": 0.0409, + "reward": 0.5407366305589676, + "reward_std": 0.12716237269341946, + "rewards/accuracy_reward": 0.049107144586741924, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2980 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.0201263427734, + "epoch": 0.890448808901501, + "grad_norm": 2.067512035369873, + "kl": 1.1943359375, + "learning_rate": 7.198489645714579e-07, + "loss": 0.0461, + "reward": 0.5407366305589676, + "reward_std": 0.08513482939451933, + "rewards/accuracy_reward": 0.053571431431919336, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 2981 + }, + { + "clip_ratio": 0.0, + "completion_length": 993.982177734375, + "epoch": 0.8907475169890224, + "grad_norm": 0.715421199798584, + "kl": 1.03515625, + "learning_rate": 7.159682993925687e-07, + "loss": 0.0503, + "reward": 0.6422991305589676, + "reward_std": 0.13499606028199196, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2982 + }, + { + "clip_ratio": 0.0, + "completion_length": 993.5803833007812, + "epoch": 0.891046225076544, + "grad_norm": 0.398294597864151, + "kl": 1.0283203125, + "learning_rate": 7.12097734330236e-07, + "loss": 0.0392, + "reward": 0.5479910969734192, + "reward_std": 0.12597984820604324, + "rewards/accuracy_reward": 0.058035716880112886, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553656578064, + "step": 2983 + }, + { + "clip_ratio": 0.0, + "completion_length": 970.6317443847656, + "epoch": 0.8913449331640654, + "grad_norm": 0.598031222820282, + "kl": 1.056640625, + "learning_rate": 7.082372735952591e-07, + "loss": 0.0434, + "reward": 0.576450914144516, + "reward_std": 0.13753657042980194, + "rewards/accuracy_reward": 0.08482143213041127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 2984 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.1451263427734, + "epoch": 0.8916436412515869, + "grad_norm": 0.6370338797569275, + "kl": 1.1435546875, + "learning_rate": 7.043869213874355e-07, + "loss": 0.0207, + "reward": 0.5189732536673546, + "reward_std": 0.08801863063126802, + "rewards/accuracy_reward": 0.029017859371379018, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 2985 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.1339721679688, + "epoch": 0.8919423493391083, + "grad_norm": 1.1723687648773193, + "kl": 1.2919921875, + "learning_rate": 7.005466818955753e-07, + "loss": 0.0556, + "reward": 0.5580357313156128, + "reward_std": 0.1333716195076704, + "rewards/accuracy_reward": 0.0691964328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 2986 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.3839721679688, + "epoch": 0.8922410574266298, + "grad_norm": 0.931461751461029, + "kl": 0.98291015625, + "learning_rate": 6.96716559297479e-07, + "loss": 0.0402, + "reward": 0.5217634215950966, + "reward_std": 0.08811180107295513, + "rewards/accuracy_reward": 0.03125000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 2987 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.2321929931641, + "epoch": 0.8925397655141513, + "grad_norm": 0.6416184306144714, + "kl": 0.8955078125, + "learning_rate": 6.928965577599467e-07, + "loss": 0.0534, + "reward": 0.6339286118745804, + "reward_std": 0.10116825997829437, + "rewards/accuracy_reward": 0.14285715040750802, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 2988 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.6562957763672, + "epoch": 0.8928384736016728, + "grad_norm": 2.103692054748535, + "kl": 1.1865234375, + "learning_rate": 6.890866814387676e-07, + "loss": 0.0524, + "reward": 0.5691964626312256, + "reward_std": 0.07652874197810888, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750298023224, + "step": 2989 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.7143249511719, + "epoch": 0.8931371816891942, + "grad_norm": 0.5776629447937012, + "kl": 0.9345703125, + "learning_rate": 6.852869344787084e-07, + "loss": 0.0372, + "reward": 0.6149553954601288, + "reward_std": 0.13191011920571327, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 2990 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.4353179931641, + "epoch": 0.8934358897767157, + "grad_norm": 0.8424273133277893, + "kl": 1.2939453125, + "learning_rate": 6.814973210135256e-07, + "loss": 0.0572, + "reward": 0.609375037252903, + "reward_std": 0.11897914204746485, + "rewards/accuracy_reward": 0.12276786682195961, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 2991 + }, + { + "clip_ratio": 0.0, + "completion_length": 932.9531707763672, + "epoch": 0.8937345978642371, + "grad_norm": 0.7473775148391724, + "kl": 0.9296875, + "learning_rate": 6.777178451659472e-07, + "loss": 0.0483, + "reward": 0.549107164144516, + "reward_std": 0.11911411955952644, + "rewards/accuracy_reward": 0.06026785937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393133878708, + "step": 2992 + }, + { + "clip_ratio": 0.0, + "completion_length": 979.3482666015625, + "epoch": 0.8940333059517587, + "grad_norm": 1.093194842338562, + "kl": 0.673828125, + "learning_rate": 6.739485110476707e-07, + "loss": 0.0308, + "reward": 0.5379464626312256, + "reward_std": 0.06908553559333086, + "rewards/accuracy_reward": 0.04464285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 2993 + }, + { + "clip_ratio": 0.0, + "completion_length": 988.5156860351562, + "epoch": 0.8943320140392801, + "grad_norm": 0.477359414100647, + "kl": 1.123046875, + "learning_rate": 6.701893227593614e-07, + "loss": 0.0367, + "reward": 0.5429687723517418, + "reward_std": 0.07659096363931894, + "rewards/accuracy_reward": 0.05133928661234677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 2994 + }, + { + "clip_ratio": 0.0, + "completion_length": 970.904052734375, + "epoch": 0.8946307221268016, + "grad_norm": 0.8433018326759338, + "kl": 0.8974609375, + "learning_rate": 6.664402843906515e-07, + "loss": 0.0415, + "reward": 0.5474330559372902, + "reward_std": 0.1176627166569233, + "rewards/accuracy_reward": 0.0558035746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 2995 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.9844207763672, + "epoch": 0.894929430214323, + "grad_norm": 0.9945914149284363, + "kl": 0.83154296875, + "learning_rate": 6.627014000201237e-07, + "loss": 0.0297, + "reward": 0.6545759290456772, + "reward_std": 0.12717946711927652, + "rewards/accuracy_reward": 0.16071429336443543, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 2996 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.4933624267578, + "epoch": 0.8952281383018446, + "grad_norm": 0.5308420658111572, + "kl": 0.54736328125, + "learning_rate": 6.58972673715319e-07, + "loss": 0.0257, + "reward": 0.5351562798023224, + "reward_std": 0.07963654212653637, + "rewards/accuracy_reward": 0.0401785746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 2997 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.3728179931641, + "epoch": 0.895526846389366, + "grad_norm": 0.6190487742424011, + "kl": 0.7275390625, + "learning_rate": 6.552541095327281e-07, + "loss": 0.0223, + "reward": 0.5373884290456772, + "reward_std": 0.07965423166751862, + "rewards/accuracy_reward": 0.04241071757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 2998 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.5312805175781, + "epoch": 0.8958255544768875, + "grad_norm": 0.4398040771484375, + "kl": 0.87353515625, + "learning_rate": 6.515457115177804e-07, + "loss": 0.0607, + "reward": 0.5591518208384514, + "reward_std": 0.11837146803736687, + "rewards/accuracy_reward": 0.06919643399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 2999 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.7969207763672, + "epoch": 0.8961242625644089, + "grad_norm": 1.1439217329025269, + "kl": 1.158203125, + "learning_rate": 6.478474837048532e-07, + "loss": 0.0485, + "reward": 0.5987723544239998, + "reward_std": 0.08756178058683872, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 3000 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.7433471679688, + "epoch": 0.8964229706519304, + "grad_norm": 0.9653938412666321, + "kl": 0.85888671875, + "learning_rate": 6.441594301172527e-07, + "loss": 0.0232, + "reward": 0.5898437798023224, + "reward_std": 0.10417013708502054, + "rewards/accuracy_reward": 0.10267857578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 3001 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.6473846435547, + "epoch": 0.8967216787394519, + "grad_norm": 0.924572229385376, + "kl": 1.4453125, + "learning_rate": 6.404815547672216e-07, + "loss": 0.0597, + "reward": 0.567522332072258, + "reward_std": 0.10519436094909906, + "rewards/accuracy_reward": 0.082589291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330484867096, + "step": 3002 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.0804138183594, + "epoch": 0.8970203868269734, + "grad_norm": 0.557506263256073, + "kl": 1.1796875, + "learning_rate": 6.368138616559283e-07, + "loss": 0.0541, + "reward": 0.5630580633878708, + "reward_std": 0.1368188764899969, + "rewards/accuracy_reward": 0.0758928582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 3003 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.404052734375, + "epoch": 0.8973190949144948, + "grad_norm": 0.9617760181427002, + "kl": 1.1943359375, + "learning_rate": 6.331563547734621e-07, + "loss": 0.0367, + "reward": 0.5636160895228386, + "reward_std": 0.05805116845294833, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 3004 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.263427734375, + "epoch": 0.8976178030020163, + "grad_norm": 0.7072583436965942, + "kl": 1.2939453125, + "learning_rate": 6.295090380988323e-07, + "loss": 0.0632, + "reward": 0.5954241305589676, + "reward_std": 0.11965147778391838, + "rewards/accuracy_reward": 0.10714286123402417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 3005 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.8192443847656, + "epoch": 0.8979165110895377, + "grad_norm": 2.762784957885742, + "kl": 1.4453125, + "learning_rate": 6.258719155999637e-07, + "loss": 0.0616, + "reward": 0.6372767984867096, + "reward_std": 0.13367256708443165, + "rewards/accuracy_reward": 0.14732143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 3006 + }, + { + "clip_ratio": 0.0, + "completion_length": 988.2210235595703, + "epoch": 0.8982152191770593, + "grad_norm": 1.0406132936477661, + "kl": 0.99462890625, + "learning_rate": 6.222449912336859e-07, + "loss": 0.0418, + "reward": 0.598214328289032, + "reward_std": 0.13808834180235863, + "rewards/accuracy_reward": 0.10937500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 3007 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.2232513427734, + "epoch": 0.8985139272645807, + "grad_norm": 1.1207934617996216, + "kl": 1.5087890625, + "learning_rate": 6.18628268945739e-07, + "loss": 0.0796, + "reward": 0.651785746216774, + "reward_std": 0.1436621230095625, + "rewards/accuracy_reward": 0.16294644260779023, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 3008 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.7254791259766, + "epoch": 0.8988126353521022, + "grad_norm": 1.02571702003479, + "kl": 1.52734375, + "learning_rate": 6.150217526707636e-07, + "loss": 0.0752, + "reward": 0.5669643133878708, + "reward_std": 0.10720442794263363, + "rewards/accuracy_reward": 0.08258928847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4843750149011612, + "step": 3009 + }, + { + "clip_ratio": 0.0, + "completion_length": 971.6607513427734, + "epoch": 0.8991113434396236, + "grad_norm": 0.6845847368240356, + "kl": 1.498046875, + "learning_rate": 6.114254463322933e-07, + "loss": 0.0692, + "reward": 0.5792410969734192, + "reward_std": 0.19170472584664822, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910895228386, + "step": 3010 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.107177734375, + "epoch": 0.8994100515271451, + "grad_norm": 1.460266351699829, + "kl": 1.23828125, + "learning_rate": 6.078393538427574e-07, + "loss": 0.0536, + "reward": 0.5987723469734192, + "reward_std": 0.0821471493691206, + "rewards/accuracy_reward": 0.11160714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 3011 + }, + { + "clip_ratio": 0.0, + "completion_length": 983.5848541259766, + "epoch": 0.8997087596146666, + "grad_norm": 0.5150672197341919, + "kl": 1.1025390625, + "learning_rate": 6.042634791034763e-07, + "loss": 0.0409, + "reward": 0.6021205633878708, + "reward_std": 0.09728556242771447, + "rewards/accuracy_reward": 0.11160714644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 3012 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.2656707763672, + "epoch": 0.9000074677021881, + "grad_norm": 0.6420621275901794, + "kl": 0.55615234375, + "learning_rate": 6.00697826004647e-07, + "loss": 0.014, + "reward": 0.7248884290456772, + "reward_std": 0.13625280745327473, + "rewards/accuracy_reward": 0.22991072572767735, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 3013 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.388427734375, + "epoch": 0.9003061757897095, + "grad_norm": 0.8351077437400818, + "kl": 1.2568359375, + "learning_rate": 5.971423984253544e-07, + "loss": 0.0582, + "reward": 0.5797991305589676, + "reward_std": 0.1331538762897253, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 3014 + }, + { + "clip_ratio": 0.0, + "completion_length": 973.5312805175781, + "epoch": 0.9006048838772309, + "grad_norm": 1.251711368560791, + "kl": 1.291015625, + "learning_rate": 5.93597200233551e-07, + "loss": 0.0476, + "reward": 0.556919664144516, + "reward_std": 0.11987122613936663, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854910969734192, + "step": 3015 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.2321929931641, + "epoch": 0.9009035919647524, + "grad_norm": 0.48867735266685486, + "kl": 1.173828125, + "learning_rate": 5.900622352860675e-07, + "loss": 0.0561, + "reward": 0.6668527126312256, + "reward_std": 0.11922712996602058, + "rewards/accuracy_reward": 0.1808035857975483, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 3016 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.9687805175781, + "epoch": 0.9012023000522739, + "grad_norm": 0.5604292154312134, + "kl": 0.7783203125, + "learning_rate": 5.865375074286006e-07, + "loss": 0.0342, + "reward": 0.5859375149011612, + "reward_std": 0.0682257090229541, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3017 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.1741333007812, + "epoch": 0.9015010081397954, + "grad_norm": 0.78183913230896, + "kl": 1.609375, + "learning_rate": 5.830230204957044e-07, + "loss": 0.0763, + "reward": 0.6104910969734192, + "reward_std": 0.13934778235852718, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4832589477300644, + "step": 3018 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.1116485595703, + "epoch": 0.9017997162273168, + "grad_norm": 1.0623071193695068, + "kl": 1.13427734375, + "learning_rate": 5.795187783108003e-07, + "loss": 0.048, + "reward": 0.5680803880095482, + "reward_std": 0.08344213431701064, + "rewards/accuracy_reward": 0.0758928582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3019 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.2210235595703, + "epoch": 0.9020984243148383, + "grad_norm": 0.502132773399353, + "kl": 0.88720703125, + "learning_rate": 5.7602478468616e-07, + "loss": 0.0293, + "reward": 0.7087053954601288, + "reward_std": 0.16809117794036865, + "rewards/accuracy_reward": 0.2187500111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553880095482, + "step": 3020 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.2500457763672, + "epoch": 0.9023971324023597, + "grad_norm": 0.9869541525840759, + "kl": 1.3203125, + "learning_rate": 5.72541043422904e-07, + "loss": 0.0396, + "reward": 0.589285746216774, + "reward_std": 0.10158035159111023, + "rewards/accuracy_reward": 0.10267857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 3021 + }, + { + "clip_ratio": 0.0, + "completion_length": 935.2433319091797, + "epoch": 0.9026958404898813, + "grad_norm": 2.066906690597534, + "kl": 1.63671875, + "learning_rate": 5.690675583110028e-07, + "loss": 0.0713, + "reward": 0.6316964477300644, + "reward_std": 0.16599159315228462, + "rewards/accuracy_reward": 0.145089291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 3022 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.1674652099609, + "epoch": 0.9029945485774027, + "grad_norm": 0.6832757592201233, + "kl": 1.1826171875, + "learning_rate": 5.656043331292682e-07, + "loss": 0.0365, + "reward": 0.6037946790456772, + "reward_std": 0.10444340202957392, + "rewards/accuracy_reward": 0.113839291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 3023 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.7924652099609, + "epoch": 0.9032932566649242, + "grad_norm": 0.6782937049865723, + "kl": 1.3701171875, + "learning_rate": 5.621513716453475e-07, + "loss": 0.0577, + "reward": 0.5518973469734192, + "reward_std": 0.13335948809981346, + "rewards/accuracy_reward": 0.06696428940631449, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 3024 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.3281555175781, + "epoch": 0.9035919647524456, + "grad_norm": 1.0271466970443726, + "kl": 1.658203125, + "learning_rate": 5.58708677615728e-07, + "loss": 0.0747, + "reward": 0.6266741454601288, + "reward_std": 0.1387605518102646, + "rewards/accuracy_reward": 0.14062500977888703, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 3025 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.3951263427734, + "epoch": 0.9038906728399672, + "grad_norm": 0.9275954961776733, + "kl": 1.0, + "learning_rate": 5.552762547857194e-07, + "loss": 0.044, + "reward": 0.514508955180645, + "reward_std": 0.09893217030912638, + "rewards/accuracy_reward": 0.026785715948790312, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487723246216774, + "step": 3026 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.8013916015625, + "epoch": 0.9041893809274886, + "grad_norm": 0.9190865159034729, + "kl": 1.0849609375, + "learning_rate": 5.518541068894622e-07, + "loss": 0.0477, + "reward": 0.5736607313156128, + "reward_std": 0.10207655094563961, + "rewards/accuracy_reward": 0.08258928847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 3027 + }, + { + "clip_ratio": 0.0, + "completion_length": 964.7455749511719, + "epoch": 0.9044880890150101, + "grad_norm": 0.8912606835365295, + "kl": 1.41796875, + "learning_rate": 5.484422376499222e-07, + "loss": 0.064, + "reward": 0.577566996216774, + "reward_std": 0.05910502839833498, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486049123108387, + "step": 3028 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.7232666015625, + "epoch": 0.9047867971025315, + "grad_norm": 0.628538966178894, + "kl": 0.8271484375, + "learning_rate": 5.45040650778873e-07, + "loss": 0.0314, + "reward": 0.5753348469734192, + "reward_std": 0.13802815973758698, + "rewards/accuracy_reward": 0.08482143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 3029 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.7656707763672, + "epoch": 0.905085505190053, + "grad_norm": 0.9517306685447693, + "kl": 0.9619140625, + "learning_rate": 5.416493499769094e-07, + "loss": 0.0441, + "reward": 0.548549123108387, + "reward_std": 0.06040308764204383, + "rewards/accuracy_reward": 0.05803571827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 3030 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.2991485595703, + "epoch": 0.9053842132775745, + "grad_norm": 0.7224648594856262, + "kl": 0.77197265625, + "learning_rate": 5.382683389334375e-07, + "loss": 0.0412, + "reward": 0.667410746216774, + "reward_std": 0.1315438123419881, + "rewards/accuracy_reward": 0.1741071455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 3031 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.3817291259766, + "epoch": 0.905682921365096, + "grad_norm": 0.5741225481033325, + "kl": 0.9326171875, + "learning_rate": 5.348976213266621e-07, + "loss": 0.0417, + "reward": 0.5329241305589676, + "reward_std": 0.11198752373456955, + "rewards/accuracy_reward": 0.04017857299186289, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 3032 + }, + { + "clip_ratio": 0.0, + "completion_length": 981.8683471679688, + "epoch": 0.9059816294526174, + "grad_norm": 1.3820478916168213, + "kl": 0.7333984375, + "learning_rate": 5.315372008235941e-07, + "loss": 0.0442, + "reward": 0.529575914144516, + "reward_std": 0.09651313023641706, + "rewards/accuracy_reward": 0.03794643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 3033 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.8906555175781, + "epoch": 0.9062803375401389, + "grad_norm": 1.4880481958389282, + "kl": 0.73828125, + "learning_rate": 5.28187081080045e-07, + "loss": 0.0426, + "reward": 0.5429687798023224, + "reward_std": 0.0980567317456007, + "rewards/accuracy_reward": 0.04910714505240321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 3034 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.9286041259766, + "epoch": 0.9065790456276603, + "grad_norm": 1.06563138961792, + "kl": 1.3388671875, + "learning_rate": 5.248472657406123e-07, + "loss": 0.0849, + "reward": 0.6266741156578064, + "reward_std": 0.14238131325691938, + "rewards/accuracy_reward": 0.14062500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 3035 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.3192443847656, + "epoch": 0.9068777537151819, + "grad_norm": 0.8510280847549438, + "kl": 0.7841796875, + "learning_rate": 5.2151775843869e-07, + "loss": 0.0268, + "reward": 0.6010044813156128, + "reward_std": 0.09816442616283894, + "rewards/accuracy_reward": 0.10937500651925802, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 3036 + }, + { + "clip_ratio": 0.0, + "completion_length": 973.2746124267578, + "epoch": 0.9071764618027033, + "grad_norm": 1.663138747215271, + "kl": 1.068359375, + "learning_rate": 5.181985627964559e-07, + "loss": 0.0494, + "reward": 0.6601562798023224, + "reward_std": 0.1920577436685562, + "rewards/accuracy_reward": 0.16964286845177412, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 3037 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.4286193847656, + "epoch": 0.9074751698902248, + "grad_norm": 0.773502767086029, + "kl": 0.9912109375, + "learning_rate": 5.148896824248683e-07, + "loss": 0.045, + "reward": 0.6127232313156128, + "reward_std": 0.08574800472706556, + "rewards/accuracy_reward": 0.12276786426082253, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 3038 + }, + { + "clip_ratio": 0.0, + "completion_length": 937.7768249511719, + "epoch": 0.9077738779777462, + "grad_norm": 1.4146097898483276, + "kl": 1.064453125, + "learning_rate": 5.115911209236669e-07, + "loss": 0.0539, + "reward": 0.5937500298023224, + "reward_std": 0.09762310422956944, + "rewards/accuracy_reward": 0.10491072107106447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393133878708, + "step": 3039 + }, + { + "clip_ratio": 0.0, + "completion_length": 927.0446929931641, + "epoch": 0.9080725860652678, + "grad_norm": 1.0457763671875, + "kl": 1.326171875, + "learning_rate": 5.083028818813607e-07, + "loss": 0.0639, + "reward": 0.5502232387661934, + "reward_std": 0.09309038706123829, + "rewards/accuracy_reward": 0.06250000302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 3040 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.1094055175781, + "epoch": 0.9083712941527892, + "grad_norm": 0.6280717253684998, + "kl": 0.8525390625, + "learning_rate": 5.050249688752329e-07, + "loss": 0.0324, + "reward": 0.5574777126312256, + "reward_std": 0.10077444277703762, + "rewards/accuracy_reward": 0.06696428777649999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 3041 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.1629943847656, + "epoch": 0.9086700022403107, + "grad_norm": 1.1407926082611084, + "kl": 1.1630859375, + "learning_rate": 5.01757385471332e-07, + "loss": 0.0577, + "reward": 0.6484375298023224, + "reward_std": 0.10841815918684006, + "rewards/accuracy_reward": 0.1584821492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 3042 + }, + { + "clip_ratio": 0.0, + "completion_length": 938.154052734375, + "epoch": 0.9089687103278321, + "grad_norm": 1.1398319005966187, + "kl": 0.87890625, + "learning_rate": 4.985001352244667e-07, + "loss": 0.0372, + "reward": 0.636160746216774, + "reward_std": 0.11372539959847927, + "rewards/accuracy_reward": 0.14285715389996767, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 3043 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.8237152099609, + "epoch": 0.9092674184153536, + "grad_norm": 0.8719400763511658, + "kl": 1.3310546875, + "learning_rate": 4.95253221678208e-07, + "loss": 0.0576, + "reward": 0.5770089700818062, + "reward_std": 0.11895668134093285, + "rewards/accuracy_reward": 0.08705357555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 3044 + }, + { + "clip_ratio": 0.0, + "completion_length": 935.8772735595703, + "epoch": 0.909566126502875, + "grad_norm": 0.6086107492446899, + "kl": 0.908203125, + "learning_rate": 4.920166483648792e-07, + "loss": 0.0488, + "reward": 0.607700914144516, + "reward_std": 0.10433235764503479, + "rewards/accuracy_reward": 0.1160714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491629496216774, + "step": 3045 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.4263916015625, + "epoch": 0.9098648345903966, + "grad_norm": 0.597335934638977, + "kl": 0.830078125, + "learning_rate": 4.887904188055537e-07, + "loss": 0.0429, + "reward": 0.521763414144516, + "reward_std": 0.05068964418023825, + "rewards/accuracy_reward": 0.0290178582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 3046 + }, + { + "clip_ratio": 0.0, + "completion_length": 970.5870819091797, + "epoch": 0.910163542677918, + "grad_norm": 1.09651780128479, + "kl": 1.47607421875, + "learning_rate": 4.855745365100539e-07, + "loss": 0.0579, + "reward": 0.5385044813156128, + "reward_std": 0.10121782496571541, + "rewards/accuracy_reward": 0.051339287078008056, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 3047 + }, + { + "clip_ratio": 0.0, + "completion_length": 937.0536193847656, + "epoch": 0.9104622507654395, + "grad_norm": 0.5116015672683716, + "kl": 1.123046875, + "learning_rate": 4.823690049769448e-07, + "loss": 0.0569, + "reward": 0.6406250298023224, + "reward_std": 0.10725001245737076, + "rewards/accuracy_reward": 0.14955357555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 3048 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.716552734375, + "epoch": 0.9107609588529609, + "grad_norm": 0.43305137753486633, + "kl": 0.77392578125, + "learning_rate": 4.791738276935299e-07, + "loss": 0.0376, + "reward": 0.5837053805589676, + "reward_std": 0.07918262388557196, + "rewards/accuracy_reward": 0.09151786030270159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 3049 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.3348541259766, + "epoch": 0.9110596669404825, + "grad_norm": 0.9251819849014282, + "kl": 1.0986328125, + "learning_rate": 4.759890081358487e-07, + "loss": 0.0501, + "reward": 0.6601562798023224, + "reward_std": 0.11938202381134033, + "rewards/accuracy_reward": 0.1696428619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 3050 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.4464721679688, + "epoch": 0.9113583750280039, + "grad_norm": 0.9046303033828735, + "kl": 1.291015625, + "learning_rate": 4.7281454976867535e-07, + "loss": 0.0514, + "reward": 0.5647321790456772, + "reward_std": 0.10839549265801907, + "rewards/accuracy_reward": 0.0736607180442661, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 3051 + }, + { + "clip_ratio": 0.0, + "completion_length": 982.4576416015625, + "epoch": 0.9116570831155254, + "grad_norm": 1.673284649848938, + "kl": 1.740234375, + "learning_rate": 4.696504560455051e-07, + "loss": 0.0554, + "reward": 0.5195312947034836, + "reward_std": 0.12398836389183998, + "rewards/accuracy_reward": 0.03571428754366934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4838169887661934, + "step": 3052 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.7567443847656, + "epoch": 0.9119557912030468, + "grad_norm": 0.8142789006233215, + "kl": 1.333984375, + "learning_rate": 4.664967304085655e-07, + "loss": 0.0493, + "reward": 0.624441996216774, + "reward_std": 0.09857665002346039, + "rewards/accuracy_reward": 0.1361607196740806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 3053 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.4553985595703, + "epoch": 0.9122544992905683, + "grad_norm": 0.42913275957107544, + "kl": 1.28662109375, + "learning_rate": 4.6335337628879874e-07, + "loss": 0.0522, + "reward": 0.5390625223517418, + "reward_std": 0.08919381257146597, + "rewards/accuracy_reward": 0.04910714412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 3054 + }, + { + "clip_ratio": 0.0, + "completion_length": 971.6719055175781, + "epoch": 0.9125532073780898, + "grad_norm": 1.2197167873382568, + "kl": 1.484375, + "learning_rate": 4.602203971058661e-07, + "loss": 0.0741, + "reward": 0.6339286118745804, + "reward_std": 0.10722060315310955, + "rewards/accuracy_reward": 0.14732143515720963, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486607164144516, + "step": 3055 + }, + { + "clip_ratio": 0.0, + "completion_length": 931.2745971679688, + "epoch": 0.9128519154656113, + "grad_norm": 0.4300071597099304, + "kl": 0.70556640625, + "learning_rate": 4.570977962681444e-07, + "loss": 0.038, + "reward": 0.6830357313156128, + "reward_std": 0.13537612301297486, + "rewards/accuracy_reward": 0.1941964365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393133878708, + "step": 3056 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.1674499511719, + "epoch": 0.9131506235531327, + "grad_norm": 0.7247112989425659, + "kl": 1.22265625, + "learning_rate": 4.539855771727131e-07, + "loss": 0.0438, + "reward": 0.6679687649011612, + "reward_std": 0.11400537006556988, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 3057 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.3192291259766, + "epoch": 0.9134493316406541, + "grad_norm": 0.49619191884994507, + "kl": 1.0986328125, + "learning_rate": 4.508837432053648e-07, + "loss": 0.0393, + "reward": 0.5357143133878708, + "reward_std": 0.05479595740325749, + "rewards/accuracy_reward": 0.04464285937137902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 3058 + }, + { + "clip_ratio": 0.0, + "completion_length": 987.9576416015625, + "epoch": 0.9137480397281756, + "grad_norm": 0.5624633431434631, + "kl": 0.8193359375, + "learning_rate": 4.477922977405913e-07, + "loss": 0.0411, + "reward": 0.5385044813156128, + "reward_std": 0.1072362158447504, + "rewards/accuracy_reward": 0.044642860535532236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 3059 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.9286193847656, + "epoch": 0.9140467478156971, + "grad_norm": 0.8861991167068481, + "kl": 1.0166015625, + "learning_rate": 4.4471124414157905e-07, + "loss": 0.0578, + "reward": 0.612723246216774, + "reward_std": 0.10113495029509068, + "rewards/accuracy_reward": 0.12053571688011289, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3060 + }, + { + "clip_ratio": 0.0, + "completion_length": 937.4576416015625, + "epoch": 0.9143454559032186, + "grad_norm": 0.978590190410614, + "kl": 1.0703125, + "learning_rate": 4.4164058576021464e-07, + "loss": 0.0554, + "reward": 0.6138393133878708, + "reward_std": 0.1670653447508812, + "rewards/accuracy_reward": 0.12276786379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 3061 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.6428985595703, + "epoch": 0.91464416399074, + "grad_norm": 0.40806519985198975, + "kl": 0.7470703125, + "learning_rate": 4.3858032593707357e-07, + "loss": 0.04, + "reward": 0.5306919813156128, + "reward_std": 0.0656590098515153, + "rewards/accuracy_reward": 0.035714288242161274, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 3062 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.4129791259766, + "epoch": 0.9149428720782615, + "grad_norm": 0.371139258146286, + "kl": 1.1708984375, + "learning_rate": 4.355304680014172e-07, + "loss": 0.0441, + "reward": 0.5742187574505806, + "reward_std": 0.1195173179730773, + "rewards/accuracy_reward": 0.08482143026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 3063 + }, + { + "clip_ratio": 0.0, + "completion_length": 936.2567443847656, + "epoch": 0.9152415801657829, + "grad_norm": 0.9405346512794495, + "kl": 0.8427734375, + "learning_rate": 4.3249101527119253e-07, + "loss": 0.0509, + "reward": 0.604352705180645, + "reward_std": 0.12464707437902689, + "rewards/accuracy_reward": 0.1116071492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 3064 + }, + { + "clip_ratio": 0.0, + "completion_length": 979.8281707763672, + "epoch": 0.9155402882533045, + "grad_norm": 0.7174206972122192, + "kl": 0.92578125, + "learning_rate": 4.29461971053029e-07, + "loss": 0.0381, + "reward": 0.6244419887661934, + "reward_std": 0.08741713687777519, + "rewards/accuracy_reward": 0.13169643748551607, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 3065 + }, + { + "clip_ratio": 0.0, + "completion_length": 979.8214874267578, + "epoch": 0.9158389963408259, + "grad_norm": 0.5990934371948242, + "kl": 0.8857421875, + "learning_rate": 4.264433386422251e-07, + "loss": 0.0408, + "reward": 0.5915178656578064, + "reward_std": 0.11067066807299852, + "rewards/accuracy_reward": 0.09821429289877415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 3066 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.2500457763672, + "epoch": 0.9161377044283474, + "grad_norm": 0.49960824847221375, + "kl": 0.902587890625, + "learning_rate": 4.2343512132276055e-07, + "loss": 0.0406, + "reward": 0.6462053805589676, + "reward_std": 0.08200768940150738, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 3067 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.3616485595703, + "epoch": 0.9164364125158688, + "grad_norm": 1.5284011363983154, + "kl": 1.173828125, + "learning_rate": 4.2043732236727973e-07, + "loss": 0.0644, + "reward": 0.588169664144516, + "reward_std": 0.12389116920530796, + "rewards/accuracy_reward": 0.09821428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 3068 + }, + { + "clip_ratio": 0.0, + "completion_length": 933.1830596923828, + "epoch": 0.9167351206033904, + "grad_norm": 2.157428026199341, + "kl": 1.173828125, + "learning_rate": 4.1744994503709277e-07, + "loss": 0.0569, + "reward": 0.6138393133878708, + "reward_std": 0.1471104733645916, + "rewards/accuracy_reward": 0.12276786286383867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 3069 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.5067291259766, + "epoch": 0.9170338286909118, + "grad_norm": 0.3677796721458435, + "kl": 0.7978515625, + "learning_rate": 4.144729925821767e-07, + "loss": 0.0256, + "reward": 0.563058078289032, + "reward_std": 0.08198198489844799, + "rewards/accuracy_reward": 0.07142857392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 3070 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.9353179931641, + "epoch": 0.9173325367784333, + "grad_norm": 0.944822371006012, + "kl": 0.63720703125, + "learning_rate": 4.115064682411607e-07, + "loss": 0.0351, + "reward": 0.603794664144516, + "reward_std": 0.12193467514589429, + "rewards/accuracy_reward": 0.1093750074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 3071 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.9152069091797, + "epoch": 0.9176312448659547, + "grad_norm": 0.6442558765411377, + "kl": 0.8447265625, + "learning_rate": 4.0855037524133443e-07, + "loss": 0.0392, + "reward": 0.5396205484867096, + "reward_std": 0.07497548614628613, + "rewards/accuracy_reward": 0.04464286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 3072 + }, + { + "clip_ratio": 0.0, + "completion_length": 993.0156707763672, + "epoch": 0.9179299529534762, + "grad_norm": 0.44090378284454346, + "kl": 0.9443359375, + "learning_rate": 4.0560471679863654e-07, + "loss": 0.0388, + "reward": 0.6277902126312256, + "reward_std": 0.11575744859874249, + "rewards/accuracy_reward": 0.13392857741564512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 3073 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.4777069091797, + "epoch": 0.9182286610409977, + "grad_norm": 0.6058793663978577, + "kl": 0.77978515625, + "learning_rate": 4.026694961176547e-07, + "loss": 0.0424, + "reward": 0.6272321790456772, + "reward_std": 0.12974157370626926, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 3074 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.9687805175781, + "epoch": 0.9185273691285192, + "grad_norm": 0.3590560555458069, + "kl": 0.7626953125, + "learning_rate": 3.9974471639162236e-07, + "loss": 0.0272, + "reward": 0.5976562798023224, + "reward_std": 0.09852251410484314, + "rewards/accuracy_reward": 0.1049107201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 3075 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.7768249511719, + "epoch": 0.9188260772160406, + "grad_norm": 1.508314609527588, + "kl": 0.564453125, + "learning_rate": 3.968303808024121e-07, + "loss": 0.0223, + "reward": 0.6690848618745804, + "reward_std": 0.1263184007257223, + "rewards/accuracy_reward": 0.1763392984867096, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 3076 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.0647888183594, + "epoch": 0.9191247853035621, + "grad_norm": 1.898552417755127, + "kl": 0.91943359375, + "learning_rate": 3.939264925205355e-07, + "loss": 0.0439, + "reward": 0.6250000298023224, + "reward_std": 0.15626974031329155, + "rewards/accuracy_reward": 0.13616071874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 3077 + }, + { + "clip_ratio": 0.0, + "completion_length": 933.4486846923828, + "epoch": 0.9194234933910835, + "grad_norm": 0.5412580966949463, + "kl": 0.669921875, + "learning_rate": 3.910330547051389e-07, + "loss": 0.0357, + "reward": 0.5636160969734192, + "reward_std": 0.09762244392186403, + "rewards/accuracy_reward": 0.07142857438884676, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875298023224, + "step": 3078 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.3750457763672, + "epoch": 0.9197222014786051, + "grad_norm": 0.7298544049263, + "kl": 1.134765625, + "learning_rate": 3.881500705039998e-07, + "loss": 0.0485, + "reward": 0.593750037252903, + "reward_std": 0.12308035604655743, + "rewards/accuracy_reward": 0.1026785746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 3079 + }, + { + "clip_ratio": 0.0, + "completion_length": 958.4397735595703, + "epoch": 0.9200209095661265, + "grad_norm": 0.35155388712882996, + "kl": 1.1337890625, + "learning_rate": 3.852775430535194e-07, + "loss": 0.0478, + "reward": 0.6529017984867096, + "reward_std": 0.1356817283667624, + "rewards/accuracy_reward": 0.16294643771834671, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 3080 + }, + { + "clip_ratio": 0.0, + "completion_length": 986.7054138183594, + "epoch": 0.920319617653648, + "grad_norm": 0.9772230386734009, + "kl": 0.941650390625, + "learning_rate": 3.8241547547873016e-07, + "loss": 0.0328, + "reward": 0.5128348544239998, + "reward_std": 0.08541021961718798, + "rewards/accuracy_reward": 0.022321430267766118, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 3081 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.0402221679688, + "epoch": 0.9206183257411694, + "grad_norm": 1.1142394542694092, + "kl": 1.2998046875, + "learning_rate": 3.795638708932781e-07, + "loss": 0.0401, + "reward": 0.580357164144516, + "reward_std": 0.15638529881834984, + "rewards/accuracy_reward": 0.09375000465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4866071715950966, + "step": 3082 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.7812957763672, + "epoch": 0.920917033828691, + "grad_norm": 0.4294557571411133, + "kl": 0.6220703125, + "learning_rate": 3.7672273239942936e-07, + "loss": 0.0276, + "reward": 0.659598246216774, + "reward_std": 0.12218674505129457, + "rewards/accuracy_reward": 0.16517857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 3083 + }, + { + "clip_ratio": 0.0, + "completion_length": 977.3616333007812, + "epoch": 0.9212157419162124, + "grad_norm": 0.44231748580932617, + "kl": 0.95703125, + "learning_rate": 3.738920630880671e-07, + "loss": 0.0335, + "reward": 0.6768973469734192, + "reward_std": 0.17728712037205696, + "rewards/accuracy_reward": 0.18750001303851604, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 3084 + }, + { + "clip_ratio": 0.0, + "completion_length": 940.0960083007812, + "epoch": 0.9215144500037339, + "grad_norm": 0.45505353808403015, + "kl": 0.79248046875, + "learning_rate": 3.7107186603867917e-07, + "loss": 0.0397, + "reward": 0.678013414144516, + "reward_std": 0.10129961930215359, + "rewards/accuracy_reward": 0.18526786379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 3085 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.8036193847656, + "epoch": 0.9218131580912553, + "grad_norm": 0.8631579279899597, + "kl": 0.8525390625, + "learning_rate": 3.682621443193635e-07, + "loss": 0.0323, + "reward": 0.589285746216774, + "reward_std": 0.10158853977918625, + "rewards/accuracy_reward": 0.09821428940631449, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 3086 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.6830902099609, + "epoch": 0.9221118661787768, + "grad_norm": 0.4235527813434601, + "kl": 1.10888671875, + "learning_rate": 3.654629009868249e-07, + "loss": 0.0526, + "reward": 0.5212053954601288, + "reward_std": 0.095060670748353, + "rewards/accuracy_reward": 0.033482143422588706, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 3087 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.6049499511719, + "epoch": 0.9224105742662982, + "grad_norm": 1.2673851251602173, + "kl": 1.234375, + "learning_rate": 3.6267413908636304e-07, + "loss": 0.0682, + "reward": 0.5223214402794838, + "reward_std": 0.07568964222446084, + "rewards/accuracy_reward": 0.03348214412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393133878708, + "step": 3088 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.8147735595703, + "epoch": 0.9227092823538198, + "grad_norm": 0.389459490776062, + "kl": 0.87353515625, + "learning_rate": 3.5989586165187884e-07, + "loss": 0.0374, + "reward": 0.5937500223517418, + "reward_std": 0.10783475171774626, + "rewards/accuracy_reward": 0.10044643003493547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 3089 + }, + { + "clip_ratio": 0.0, + "completion_length": 925.9844207763672, + "epoch": 0.9230079904413412, + "grad_norm": 0.4522457420825958, + "kl": 0.89892578125, + "learning_rate": 3.571280717058656e-07, + "loss": 0.0295, + "reward": 0.6021205633878708, + "reward_std": 0.08581337332725525, + "rewards/accuracy_reward": 0.1093750074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 3090 + }, + { + "clip_ratio": 0.0, + "completion_length": 971.9397735595703, + "epoch": 0.9233066985288627, + "grad_norm": 0.5683391690254211, + "kl": 0.605224609375, + "learning_rate": 3.54370772259407e-07, + "loss": 0.0257, + "reward": 0.611607164144516, + "reward_std": 0.06948084803298116, + "rewards/accuracy_reward": 0.11830357508733869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 3091 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.7366485595703, + "epoch": 0.9236054066163841, + "grad_norm": 1.0306594371795654, + "kl": 0.96484375, + "learning_rate": 3.5162396631217453e-07, + "loss": 0.0401, + "reward": 0.5602678805589676, + "reward_std": 0.147691321792081, + "rewards/accuracy_reward": 0.06919643003493547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 3092 + }, + { + "clip_ratio": 0.0, + "completion_length": 960.3214874267578, + "epoch": 0.9239041147039057, + "grad_norm": 0.3648022413253784, + "kl": 0.84912109375, + "learning_rate": 3.4888765685242465e-07, + "loss": 0.0499, + "reward": 0.650669664144516, + "reward_std": 0.12914189137518406, + "rewards/accuracy_reward": 0.1562500111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 3093 + }, + { + "clip_ratio": 0.0, + "completion_length": 901.2053985595703, + "epoch": 0.9242028227914271, + "grad_norm": 0.6274372339248657, + "kl": 0.447265625, + "learning_rate": 3.4616184685699273e-07, + "loss": 0.0249, + "reward": 0.576450914144516, + "reward_std": 0.051149213686585426, + "rewards/accuracy_reward": 0.07812500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4983258992433548, + "step": 3094 + }, + { + "clip_ratio": 0.0, + "completion_length": 919.8504943847656, + "epoch": 0.9245015308789486, + "grad_norm": 0.9356869459152222, + "kl": 0.86328125, + "learning_rate": 3.4344653929129554e-07, + "loss": 0.0475, + "reward": 0.5708705633878708, + "reward_std": 0.09431107435375452, + "rewards/accuracy_reward": 0.0758928619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 3095 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.3839721679688, + "epoch": 0.92480023896647, + "grad_norm": 1.111446499824524, + "kl": 0.763427734375, + "learning_rate": 3.4074173710931804e-07, + "loss": 0.033, + "reward": 0.5731026977300644, + "reward_std": 0.11894579976797104, + "rewards/accuracy_reward": 0.08035714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 3096 + }, + { + "clip_ratio": 0.0, + "completion_length": 940.0022888183594, + "epoch": 0.9250989470539915, + "grad_norm": 0.5479207038879395, + "kl": 1.1005859375, + "learning_rate": 3.380474432536207e-07, + "loss": 0.0519, + "reward": 0.5714286044239998, + "reward_std": 0.07649183692410588, + "rewards/accuracy_reward": 0.08035714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 3097 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.1741638183594, + "epoch": 0.925397655141513, + "grad_norm": 0.3992994427680969, + "kl": 0.5419921875, + "learning_rate": 3.3536366065533456e-07, + "loss": 0.0282, + "reward": 0.549107164144516, + "reward_std": 0.07916155084967613, + "rewards/accuracy_reward": 0.05357143236324191, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 3098 + }, + { + "clip_ratio": 0.0, + "completion_length": 927.2656555175781, + "epoch": 0.9256963632290345, + "grad_norm": 0.9750297665596008, + "kl": 1.16015625, + "learning_rate": 3.326903922341473e-07, + "loss": 0.0582, + "reward": 0.6880580633878708, + "reward_std": 0.10740339197218418, + "rewards/accuracy_reward": 0.2008928656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 3099 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.7634429931641, + "epoch": 0.9259950713165559, + "grad_norm": 0.5354942679405212, + "kl": 0.974609375, + "learning_rate": 3.30027640898315e-07, + "loss": 0.0456, + "reward": 0.7304687798023224, + "reward_std": 0.11138071957975626, + "rewards/accuracy_reward": 0.2388392947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 3100 + }, + { + "clip_ratio": 0.0, + "completion_length": 967.6272888183594, + "epoch": 0.9262937794040773, + "grad_norm": 0.7432148456573486, + "kl": 1.04052734375, + "learning_rate": 3.2737540954465244e-07, + "loss": 0.0487, + "reward": 0.592075914144516, + "reward_std": 0.09584900829941034, + "rewards/accuracy_reward": 0.10267857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 3101 + }, + { + "clip_ratio": 0.0, + "completion_length": 933.1786041259766, + "epoch": 0.9265924874915988, + "grad_norm": 0.37439805269241333, + "kl": 0.57080078125, + "learning_rate": 3.247337010585228e-07, + "loss": 0.0207, + "reward": 0.5530134290456772, + "reward_std": 0.05414706142619252, + "rewards/accuracy_reward": 0.05803571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 3102 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.7969055175781, + "epoch": 0.9268911955791203, + "grad_norm": 0.7643630504608154, + "kl": 0.951171875, + "learning_rate": 3.221025183138493e-07, + "loss": 0.0315, + "reward": 0.578683078289032, + "reward_std": 0.16406487300992012, + "rewards/accuracy_reward": 0.08482143376022577, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 3103 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.2723693847656, + "epoch": 0.9271899036666418, + "grad_norm": 0.7802721261978149, + "kl": 1.3330078125, + "learning_rate": 3.194818641731012e-07, + "loss": 0.0481, + "reward": 0.6422991305589676, + "reward_std": 0.13486678712069988, + "rewards/accuracy_reward": 0.15625000605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.486049123108387, + "step": 3104 + }, + { + "clip_ratio": 0.0, + "completion_length": 931.2344360351562, + "epoch": 0.9274886117541632, + "grad_norm": 0.7221817970275879, + "kl": 0.43408203125, + "learning_rate": 3.168717414872902e-07, + "loss": 0.0213, + "reward": 0.5591518133878708, + "reward_std": 0.10769847081974149, + "rewards/accuracy_reward": 0.0647321455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 3105 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.4754791259766, + "epoch": 0.9277873198416847, + "grad_norm": 1.2481627464294434, + "kl": 1.009765625, + "learning_rate": 3.1427215309597693e-07, + "loss": 0.0455, + "reward": 0.612723246216774, + "reward_std": 0.09700598753988743, + "rewards/accuracy_reward": 0.1205357164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3106 + }, + { + "clip_ratio": 0.0, + "completion_length": 915.6607666015625, + "epoch": 0.9280860279292061, + "grad_norm": 0.5204891562461853, + "kl": 0.76123046875, + "learning_rate": 3.1168310182725814e-07, + "loss": 0.052, + "reward": 0.6160714477300644, + "reward_std": 0.05971228634007275, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 3107 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.4085235595703, + "epoch": 0.9283847360167277, + "grad_norm": 0.9361897706985474, + "kl": 1.0693359375, + "learning_rate": 3.0910459049776633e-07, + "loss": 0.0506, + "reward": 0.5429687649011612, + "reward_std": 0.05958366207778454, + "rewards/accuracy_reward": 0.051339287078008056, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 3108 + }, + { + "clip_ratio": 0.0, + "completion_length": 935.6518249511719, + "epoch": 0.9286834441042491, + "grad_norm": 1.3455780744552612, + "kl": 0.7568359375, + "learning_rate": 3.0653662191267087e-07, + "loss": 0.0421, + "reward": 0.647879496216774, + "reward_std": 0.15083510428667068, + "rewards/accuracy_reward": 0.15625000488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 3109 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.4241638183594, + "epoch": 0.9289821521917706, + "grad_norm": 0.616645336151123, + "kl": 0.8193359375, + "learning_rate": 3.039791988656693e-07, + "loss": 0.0403, + "reward": 0.599330373108387, + "reward_std": 0.06049918895587325, + "rewards/accuracy_reward": 0.10491071827709675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 3110 + }, + { + "clip_ratio": 0.0, + "completion_length": 926.9687957763672, + "epoch": 0.929280860279292, + "grad_norm": 0.5699665546417236, + "kl": 0.8876953125, + "learning_rate": 3.0143232413898607e-07, + "loss": 0.0422, + "reward": 0.6143973618745804, + "reward_std": 0.10764958150684834, + "rewards/accuracy_reward": 0.12276786309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 3111 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.1518096923828, + "epoch": 0.9295795683668135, + "grad_norm": 0.9790916442871094, + "kl": 0.7412109375, + "learning_rate": 2.9889600050337363e-07, + "loss": 0.0354, + "reward": 0.6964286118745804, + "reward_std": 0.1549595221877098, + "rewards/accuracy_reward": 0.2031250074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 3112 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.325927734375, + "epoch": 0.929878276454335, + "grad_norm": 1.1712745428085327, + "kl": 0.7822265625, + "learning_rate": 2.9637023071810155e-07, + "loss": 0.0411, + "reward": 0.631138414144516, + "reward_std": 0.12023325311020017, + "rewards/accuracy_reward": 0.13839286286383867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 3113 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.2210388183594, + "epoch": 0.9301769845418565, + "grad_norm": 1.2569948434829712, + "kl": 0.8076171875, + "learning_rate": 2.938550175309607e-07, + "loss": 0.0364, + "reward": 0.5758928805589676, + "reward_std": 0.121150525752455, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 3114 + }, + { + "clip_ratio": 0.0, + "completion_length": 938.4062957763672, + "epoch": 0.9304756926293779, + "grad_norm": 0.5082544684410095, + "kl": 1.013671875, + "learning_rate": 2.9135036367825773e-07, + "loss": 0.054, + "reward": 0.5719866454601288, + "reward_std": 0.10609344765543938, + "rewards/accuracy_reward": 0.08258928824216127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 3115 + }, + { + "clip_ratio": 0.0, + "completion_length": 925.3058471679688, + "epoch": 0.9307744007168994, + "grad_norm": 1.883124828338623, + "kl": 1.53125, + "learning_rate": 2.888562718848076e-07, + "loss": 0.0717, + "reward": 0.6729911118745804, + "reward_std": 0.16740089654922485, + "rewards/accuracy_reward": 0.18526786379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 3116 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.8303985595703, + "epoch": 0.9310731088044208, + "grad_norm": 1.0075831413269043, + "kl": 1.00830078125, + "learning_rate": 2.863727448639386e-07, + "loss": 0.0478, + "reward": 0.6902902126312256, + "reward_std": 0.14473623037338257, + "rewards/accuracy_reward": 0.1986607238650322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 3117 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.857177734375, + "epoch": 0.9313718168919424, + "grad_norm": 0.7922288775444031, + "kl": 1.0263671875, + "learning_rate": 2.838997853174874e-07, + "loss": 0.0344, + "reward": 0.5306919813156128, + "reward_std": 0.06785525009036064, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4860491305589676, + "step": 3118 + }, + { + "clip_ratio": 0.0, + "completion_length": 971.341552734375, + "epoch": 0.9316705249794638, + "grad_norm": 0.37654703855514526, + "kl": 0.757568359375, + "learning_rate": 2.8143739593578854e-07, + "loss": 0.0355, + "reward": 0.647879496216774, + "reward_std": 0.11550401244312525, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 3119 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.7589721679688, + "epoch": 0.9319692330669853, + "grad_norm": 1.2812097072601318, + "kl": 1.32421875, + "learning_rate": 2.7898557939768254e-07, + "loss": 0.0678, + "reward": 0.584263414144516, + "reward_std": 0.08791511505842209, + "rewards/accuracy_reward": 0.0959821492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 3120 + }, + { + "clip_ratio": 0.0, + "completion_length": 978.0960083007812, + "epoch": 0.9322679411545067, + "grad_norm": 1.5157902240753174, + "kl": 0.693359375, + "learning_rate": 2.7654433837050245e-07, + "loss": 0.0342, + "reward": 0.589285746216774, + "reward_std": 0.1271588746458292, + "rewards/accuracy_reward": 0.09598214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 3121 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.1451416015625, + "epoch": 0.9325666492420283, + "grad_norm": 1.2880183458328247, + "kl": 1.041015625, + "learning_rate": 2.741136755100815e-07, + "loss": 0.0726, + "reward": 0.6194196790456772, + "reward_std": 0.1429321044124663, + "rewards/accuracy_reward": 0.1294642947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 3122 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.9777374267578, + "epoch": 0.9328653573295497, + "grad_norm": 1.0133841037750244, + "kl": 0.7822265625, + "learning_rate": 2.7169359346074344e-07, + "loss": 0.0286, + "reward": 0.6434152126312256, + "reward_std": 0.1520934123545885, + "rewards/accuracy_reward": 0.1517857201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 3123 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.9420013427734, + "epoch": 0.9331640654170712, + "grad_norm": 0.4721980392932892, + "kl": 0.58642578125, + "learning_rate": 2.6928409485529773e-07, + "loss": 0.0228, + "reward": 0.590959832072258, + "reward_std": 0.10648142546415329, + "rewards/accuracy_reward": 0.0959821455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 3124 + }, + { + "clip_ratio": 0.0, + "completion_length": 922.0111999511719, + "epoch": 0.9334627735045926, + "grad_norm": 1.2022863626480103, + "kl": 0.7021484375, + "learning_rate": 2.6688518231504535e-07, + "loss": 0.0276, + "reward": 0.655691996216774, + "reward_std": 0.11674227006733418, + "rewards/accuracy_reward": 0.16294643841683865, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.8370971679688, + "epoch": 0.9337614815921141, + "grad_norm": 0.49083420634269714, + "kl": 0.754638671875, + "learning_rate": 2.6449685844976645e-07, + "loss": 0.0176, + "reward": 0.6015625298023224, + "reward_std": 0.08523244503885508, + "rewards/accuracy_reward": 0.10714285913854837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 3126 + }, + { + "clip_ratio": 0.0, + "completion_length": 938.8661193847656, + "epoch": 0.9340601896796356, + "grad_norm": 0.517970860004425, + "kl": 1.2646484375, + "learning_rate": 2.621191258577238e-07, + "loss": 0.0578, + "reward": 0.5279018133878708, + "reward_std": 0.07901435671374202, + "rewards/accuracy_reward": 0.0401785746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 3127 + }, + { + "clip_ratio": 0.0, + "completion_length": 935.6986999511719, + "epoch": 0.9343588977671571, + "grad_norm": 0.48946309089660645, + "kl": 1.08251953125, + "learning_rate": 2.5975198712565706e-07, + "loss": 0.0508, + "reward": 0.7767857611179352, + "reward_std": 0.10445700399577618, + "rewards/accuracy_reward": 0.2857142947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 3128 + }, + { + "clip_ratio": 0.0, + "completion_length": 973.6094207763672, + "epoch": 0.9346576058546785, + "grad_norm": 0.4241443872451782, + "kl": 0.900390625, + "learning_rate": 2.573954448287819e-07, + "loss": 0.0336, + "reward": 0.5111607387661934, + "reward_std": 0.09120816271752119, + "rewards/accuracy_reward": 0.022321429569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 3129 + }, + { + "clip_ratio": 0.0, + "completion_length": 934.6116333007812, + "epoch": 0.9349563139422, + "grad_norm": 0.4710473120212555, + "kl": 0.615234375, + "learning_rate": 2.5504950153078413e-07, + "loss": 0.0216, + "reward": 0.5742187723517418, + "reward_std": 0.09600192029029131, + "rewards/accuracy_reward": 0.082589291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294738650322, + "step": 3130 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.810302734375, + "epoch": 0.9352550220297214, + "grad_norm": 1.8738495111465454, + "kl": 1.06640625, + "learning_rate": 2.527141597838212e-07, + "loss": 0.0452, + "reward": 0.515066996216774, + "reward_std": 0.08522841334342957, + "rewards/accuracy_reward": 0.024553573224693537, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 3131 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.341552734375, + "epoch": 0.935553730117243, + "grad_norm": 0.678744375705719, + "kl": 1.3388671875, + "learning_rate": 2.5038942212851637e-07, + "loss": 0.0614, + "reward": 0.5530134066939354, + "reward_std": 0.09694909863173962, + "rewards/accuracy_reward": 0.06473214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812649011612, + "step": 3132 + }, + { + "clip_ratio": 0.0, + "completion_length": 931.4286193847656, + "epoch": 0.9358524382047644, + "grad_norm": 1.1526715755462646, + "kl": 0.66650390625, + "learning_rate": 2.4807529109395544e-07, + "loss": 0.038, + "reward": 0.654575914144516, + "reward_std": 0.11526789888739586, + "rewards/accuracy_reward": 0.16071429289877415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 3133 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.1027221679688, + "epoch": 0.9361511462922859, + "grad_norm": 0.44632503390312195, + "kl": 0.74853515625, + "learning_rate": 2.4577176919768687e-07, + "loss": 0.0347, + "reward": 0.5814732313156128, + "reward_std": 0.05105562973767519, + "rewards/accuracy_reward": 0.0870535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 3134 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.8013763427734, + "epoch": 0.9364498543798073, + "grad_norm": 1.0368989706039429, + "kl": 0.66259765625, + "learning_rate": 2.4347885894571487e-07, + "loss": 0.0145, + "reward": 0.6210937649011612, + "reward_std": 0.12821803474798799, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 3135 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.3683471679688, + "epoch": 0.9367485624673288, + "grad_norm": 0.9305571913719177, + "kl": 0.70361328125, + "learning_rate": 2.4119656283250304e-07, + "loss": 0.0529, + "reward": 0.6099330708384514, + "reward_std": 0.10850715171545744, + "rewards/accuracy_reward": 0.11830357648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 3136 + }, + { + "clip_ratio": 0.0, + "completion_length": 928.6830749511719, + "epoch": 0.9370472705548503, + "grad_norm": 1.3361915349960327, + "kl": 1.19580078125, + "learning_rate": 2.389248833409663e-07, + "loss": 0.0537, + "reward": 0.6333705708384514, + "reward_std": 0.10395967401564121, + "rewards/accuracy_reward": 0.14285714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 3137 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.6473693847656, + "epoch": 0.9373459786423718, + "grad_norm": 1.8459842205047607, + "kl": 0.99951171875, + "learning_rate": 2.366638229424667e-07, + "loss": 0.0421, + "reward": 0.5613839626312256, + "reward_std": 0.09926139190793037, + "rewards/accuracy_reward": 0.06919643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3138 + }, + { + "clip_ratio": 0.0, + "completion_length": 930.2723693847656, + "epoch": 0.9376446867298932, + "grad_norm": 0.7243342399597168, + "kl": 0.765625, + "learning_rate": 2.344133840968188e-07, + "loss": 0.0442, + "reward": 0.616629496216774, + "reward_std": 0.11219715885818005, + "rewards/accuracy_reward": 0.12500000605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 3139 + }, + { + "clip_ratio": 0.0, + "completion_length": 958.2120971679688, + "epoch": 0.9379433948174147, + "grad_norm": 0.4513695240020752, + "kl": 0.77001953125, + "learning_rate": 2.3217356925227973e-07, + "loss": 0.0267, + "reward": 0.545758955180645, + "reward_std": 0.07472506444901228, + "rewards/accuracy_reward": 0.055803573690354824, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 3140 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.7411193847656, + "epoch": 0.9382421029049361, + "grad_norm": 0.5870475172996521, + "kl": 1.171875, + "learning_rate": 2.2994438084554594e-07, + "loss": 0.0657, + "reward": 0.5736607313156128, + "reward_std": 0.1071918373927474, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 3141 + }, + { + "clip_ratio": 0.0, + "completion_length": 910.8839721679688, + "epoch": 0.9385408109924577, + "grad_norm": 0.5931692123413086, + "kl": 0.654296875, + "learning_rate": 2.2772582130175747e-07, + "loss": 0.032, + "reward": 0.7338169813156128, + "reward_std": 0.09835939155891538, + "rewards/accuracy_reward": 0.2433035857975483, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 3142 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.5134429931641, + "epoch": 0.9388395190799791, + "grad_norm": 0.7206888794898987, + "kl": 0.89599609375, + "learning_rate": 2.2551789303449034e-07, + "loss": 0.0434, + "reward": 0.5340401977300644, + "reward_std": 0.09388164430856705, + "rewards/accuracy_reward": 0.042410716880112886, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 3143 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.3326263427734, + "epoch": 0.9391382271675005, + "grad_norm": 1.9443392753601074, + "kl": 1.08203125, + "learning_rate": 2.2332059844575317e-07, + "loss": 0.0457, + "reward": 0.6478794813156128, + "reward_std": 0.08954030647873878, + "rewards/accuracy_reward": 0.1562500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 3144 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.6786193847656, + "epoch": 0.939436935255022, + "grad_norm": 0.8977212309837341, + "kl": 0.9921875, + "learning_rate": 2.2113393992598596e-07, + "loss": 0.0458, + "reward": 0.5083705559372902, + "reward_std": 0.08074554149061441, + "rewards/accuracy_reward": 0.017857143422588706, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 3145 + }, + { + "clip_ratio": 0.0, + "completion_length": 910.3616485595703, + "epoch": 0.9397356433425434, + "grad_norm": 1.7673739194869995, + "kl": 0.61083984375, + "learning_rate": 2.1895791985406257e-07, + "loss": 0.0394, + "reward": 0.6763393133878708, + "reward_std": 0.08757604518905282, + "rewards/accuracy_reward": 0.18080358393490314, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 3146 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.9955902099609, + "epoch": 0.940034351430065, + "grad_norm": 0.9242091774940491, + "kl": 0.8837890625, + "learning_rate": 2.1679254059727594e-07, + "loss": 0.0426, + "reward": 0.6194196790456772, + "reward_std": 0.16219795495271683, + "rewards/accuracy_reward": 0.12723214412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3147 + }, + { + "clip_ratio": 0.0, + "completion_length": 889.4777221679688, + "epoch": 0.9403330595175864, + "grad_norm": 0.5639451742172241, + "kl": 0.79443359375, + "learning_rate": 2.1463780451134841e-07, + "loss": 0.0626, + "reward": 0.590401828289032, + "reward_std": 0.08820712566375732, + "rewards/accuracy_reward": 0.098214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875298023224, + "step": 3148 + }, + { + "clip_ratio": 0.0, + "completion_length": 976.4420166015625, + "epoch": 0.9406317676051079, + "grad_norm": 1.1745915412902832, + "kl": 0.8623046875, + "learning_rate": 2.124937139404204e-07, + "loss": 0.0403, + "reward": 0.5396205484867096, + "reward_std": 0.11825938895344734, + "rewards/accuracy_reward": 0.0491071455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 3149 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.0245971679688, + "epoch": 0.9409304756926293, + "grad_norm": 1.5508838891983032, + "kl": 1.0400390625, + "learning_rate": 2.103602712170527e-07, + "loss": 0.0604, + "reward": 0.6568080633878708, + "reward_std": 0.15362652763724327, + "rewards/accuracy_reward": 0.16517857694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 3150 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.8861999511719, + "epoch": 0.9412291837801509, + "grad_norm": 0.42195627093315125, + "kl": 0.5419921875, + "learning_rate": 2.0823747866222322e-07, + "loss": 0.029, + "reward": 0.5452009290456772, + "reward_std": 0.1278087105602026, + "rewards/accuracy_reward": 0.05357143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 3151 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.3393249511719, + "epoch": 0.9415278918676723, + "grad_norm": 0.4047963619232178, + "kl": 0.6298828125, + "learning_rate": 2.0612533858531902e-07, + "loss": 0.033, + "reward": 0.624441996216774, + "reward_std": 0.06236724299378693, + "rewards/accuracy_reward": 0.12946428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 3152 + }, + { + "clip_ratio": 0.0, + "completion_length": 970.2254943847656, + "epoch": 0.9418265999551938, + "grad_norm": 0.8298485279083252, + "kl": 1.0419921875, + "learning_rate": 2.0402385328414543e-07, + "loss": 0.0445, + "reward": 0.5033482313156128, + "reward_std": 0.08599713258445263, + "rewards/accuracy_reward": 0.015625000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 3153 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.1004943847656, + "epoch": 0.9421253080427152, + "grad_norm": 0.40340685844421387, + "kl": 0.661865234375, + "learning_rate": 2.019330250449103e-07, + "loss": 0.0313, + "reward": 0.5909598469734192, + "reward_std": 0.11797734349966049, + "rewards/accuracy_reward": 0.0959821492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 3154 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.7478179931641, + "epoch": 0.9424240161302367, + "grad_norm": 1.1058624982833862, + "kl": 0.826171875, + "learning_rate": 1.998528561422297e-07, + "loss": 0.0251, + "reward": 0.5731026977300644, + "reward_std": 0.10729106422513723, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 3155 + }, + { + "clip_ratio": 0.0, + "completion_length": 967.1429138183594, + "epoch": 0.9427227242177582, + "grad_norm": 0.502281904220581, + "kl": 0.646240234375, + "learning_rate": 1.9778334883912342e-07, + "loss": 0.0297, + "reward": 0.628348246216774, + "reward_std": 0.12310335645452142, + "rewards/accuracy_reward": 0.1361607238650322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3156 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.497802734375, + "epoch": 0.9430214323052797, + "grad_norm": 0.2942585349082947, + "kl": 0.4697265625, + "learning_rate": 1.9572450538701493e-07, + "loss": 0.014, + "reward": 0.5686384290456772, + "reward_std": 0.08853007014840841, + "rewards/accuracy_reward": 0.07366071874275804, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 3157 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.7254791259766, + "epoch": 0.9433201403928011, + "grad_norm": 1.6559441089630127, + "kl": 1.07958984375, + "learning_rate": 1.9367632802572033e-07, + "loss": 0.0379, + "reward": 0.5658482313156128, + "reward_std": 0.09910184727050364, + "rewards/accuracy_reward": 0.07589286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 3158 + }, + { + "clip_ratio": 0.0, + "completion_length": 930.2768402099609, + "epoch": 0.9436188484803226, + "grad_norm": 0.8740105032920837, + "kl": 0.916015625, + "learning_rate": 1.9163881898345836e-07, + "loss": 0.0596, + "reward": 0.6283482387661934, + "reward_std": 0.1331726349890232, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 3159 + }, + { + "clip_ratio": 0.0, + "completion_length": 936.8326568603516, + "epoch": 0.943917556567844, + "grad_norm": 1.2841898202896118, + "kl": 0.6806640625, + "learning_rate": 1.8961198047683926e-07, + "loss": 0.0303, + "reward": 0.6015625149011612, + "reward_std": 0.03366912016645074, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3160 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.3125610351562, + "epoch": 0.9442162646553656, + "grad_norm": 0.6802109479904175, + "kl": 1.160400390625, + "learning_rate": 1.8759581471086363e-07, + "loss": 0.0582, + "reward": 0.5747768133878708, + "reward_std": 0.13630108162760735, + "rewards/accuracy_reward": 0.08258928963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 3161 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.8638916015625, + "epoch": 0.944514972742887, + "grad_norm": 1.652927279472351, + "kl": 0.7890625, + "learning_rate": 1.855903238789225e-07, + "loss": 0.0335, + "reward": 0.565848246216774, + "reward_std": 0.08905975054949522, + "rewards/accuracy_reward": 0.07142857392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 3162 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.8951110839844, + "epoch": 0.9448136808304085, + "grad_norm": 0.48537886142730713, + "kl": 0.5078125, + "learning_rate": 1.8359551016279398e-07, + "loss": 0.0348, + "reward": 0.5658482313156128, + "reward_std": 0.08475705049932003, + "rewards/accuracy_reward": 0.06919643119908869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 3163 + }, + { + "clip_ratio": 0.0, + "completion_length": 929.6808471679688, + "epoch": 0.9451123889179299, + "grad_norm": 1.0724153518676758, + "kl": 1.099609375, + "learning_rate": 1.8161137573263877e-07, + "loss": 0.0572, + "reward": 0.55636166036129, + "reward_std": 0.13068855181336403, + "rewards/accuracy_reward": 0.06696428777649999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 3164 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.2545166015625, + "epoch": 0.9454110970054515, + "grad_norm": 1.1911735534667969, + "kl": 0.72265625, + "learning_rate": 1.7963792274700242e-07, + "loss": 0.0314, + "reward": 0.5842634290456772, + "reward_std": 0.1174231469631195, + "rewards/accuracy_reward": 0.08928571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 3165 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.8504943847656, + "epoch": 0.9457098050929729, + "grad_norm": 0.6994132399559021, + "kl": 1.0380859375, + "learning_rate": 1.7767515335280538e-07, + "loss": 0.0591, + "reward": 0.6032366305589676, + "reward_std": 0.150896979495883, + "rewards/accuracy_reward": 0.11383929289877415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 3166 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.8170013427734, + "epoch": 0.9460085131804944, + "grad_norm": 0.6503378748893738, + "kl": 0.60693359375, + "learning_rate": 1.757230696853518e-07, + "loss": 0.021, + "reward": 0.5251116305589676, + "reward_std": 0.08458347991108894, + "rewards/accuracy_reward": 0.031250000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 3167 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.1361999511719, + "epoch": 0.9463072212680158, + "grad_norm": 0.48729658126831055, + "kl": 0.58740234375, + "learning_rate": 1.7378167386831512e-07, + "loss": 0.0311, + "reward": 0.5585937798023224, + "reward_std": 0.06840655440464616, + "rewards/accuracy_reward": 0.06250000465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 3168 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.8527221679688, + "epoch": 0.9466059293555373, + "grad_norm": 0.47682854533195496, + "kl": 0.760498046875, + "learning_rate": 1.7185096801374368e-07, + "loss": 0.0428, + "reward": 0.6595982313156128, + "reward_std": 0.13403502851724625, + "rewards/accuracy_reward": 0.1674107201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3169 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.8683471679688, + "epoch": 0.9469046374430587, + "grad_norm": 0.9304136037826538, + "kl": 0.5576171875, + "learning_rate": 1.699309542220584e-07, + "loss": 0.0226, + "reward": 0.5479910969734192, + "reward_std": 0.08076103590428829, + "rewards/accuracy_reward": 0.053571430034935474, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 3170 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.2031707763672, + "epoch": 0.9472033455305803, + "grad_norm": 0.4663383662700653, + "kl": 1.130859375, + "learning_rate": 1.68021634582044e-07, + "loss": 0.0598, + "reward": 0.5809152126312256, + "reward_std": 0.10486333072185516, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 3171 + }, + { + "clip_ratio": 0.0, + "completion_length": 929.3638916015625, + "epoch": 0.9475020536181017, + "grad_norm": 1.0881402492523193, + "kl": 0.86376953125, + "learning_rate": 1.661230111708534e-07, + "loss": 0.0438, + "reward": 0.6188616156578064, + "reward_std": 0.10859744250774384, + "rewards/accuracy_reward": 0.12946428824216127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 3172 + }, + { + "clip_ratio": 0.0, + "completion_length": 924.4464874267578, + "epoch": 0.9478007617056232, + "grad_norm": 1.1155281066894531, + "kl": 1.806640625, + "learning_rate": 1.6423508605400318e-07, + "loss": 0.092, + "reward": 0.638950914144516, + "reward_std": 0.14858944434672594, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.482700914144516, + "step": 3173 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.5558471679688, + "epoch": 0.9480994697931446, + "grad_norm": 0.6346709132194519, + "kl": 0.794921875, + "learning_rate": 1.6235786128537046e-07, + "loss": 0.0493, + "reward": 0.5407366305589676, + "reward_std": 0.08831583824940026, + "rewards/accuracy_reward": 0.04910714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 3174 + }, + { + "clip_ratio": 0.0, + "completion_length": 923.1652069091797, + "epoch": 0.9483981778806662, + "grad_norm": 1.063740611076355, + "kl": 0.370361328125, + "learning_rate": 1.604913389071927e-07, + "loss": 0.0221, + "reward": 0.6802455484867096, + "reward_std": 0.14423212897963822, + "rewards/accuracy_reward": 0.18303572200238705, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 3175 + }, + { + "clip_ratio": 0.0, + "completion_length": 918.1049652099609, + "epoch": 0.9486968859681876, + "grad_norm": 0.37833884358406067, + "kl": 0.56689453125, + "learning_rate": 1.586355209500634e-07, + "loss": 0.0402, + "reward": 0.5837053805589676, + "reward_std": 0.17487938702106476, + "rewards/accuracy_reward": 0.08928571944124997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 3176 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.7768402099609, + "epoch": 0.9489955940557091, + "grad_norm": 0.4438292682170868, + "kl": 0.5009765625, + "learning_rate": 1.5679040943292867e-07, + "loss": 0.0155, + "reward": 0.5463169813156128, + "reward_std": 0.11282436735928059, + "rewards/accuracy_reward": 0.05133928661234677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 3177 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.2545318603516, + "epoch": 0.9492943021432305, + "grad_norm": 0.547390341758728, + "kl": 1.0703125, + "learning_rate": 1.549560063630906e-07, + "loss": 0.0346, + "reward": 0.5976562798023224, + "reward_std": 0.1060391841456294, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 3178 + }, + { + "clip_ratio": 0.0, + "completion_length": 938.2277374267578, + "epoch": 0.949593010230752, + "grad_norm": 1.44923996925354, + "kl": 0.9765625, + "learning_rate": 1.5313231373619953e-07, + "loss": 0.0619, + "reward": 0.5781250223517418, + "reward_std": 0.07587395422160625, + "rewards/accuracy_reward": 0.0892857201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393133878708, + "step": 3179 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.2835388183594, + "epoch": 0.9498917183182735, + "grad_norm": 1.4542235136032104, + "kl": 0.56201171875, + "learning_rate": 1.5131933353625394e-07, + "loss": 0.0217, + "reward": 0.6780134290456772, + "reward_std": 0.10827783867716789, + "rewards/accuracy_reward": 0.180803582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 3180 + }, + { + "clip_ratio": 0.0, + "completion_length": 931.7835388183594, + "epoch": 0.950190426405795, + "grad_norm": 0.9933051466941833, + "kl": 0.9453125, + "learning_rate": 1.495170677356006e-07, + "loss": 0.051, + "reward": 0.6646205633878708, + "reward_std": 0.1284193294122815, + "rewards/accuracy_reward": 0.17410715040750802, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 3181 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.9241485595703, + "epoch": 0.9504891344933164, + "grad_norm": 0.6252902746200562, + "kl": 0.78369140625, + "learning_rate": 1.4772551829492444e-07, + "loss": 0.038, + "reward": 0.6277901977300644, + "reward_std": 0.08888908382505178, + "rewards/accuracy_reward": 0.1339285783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 3182 + }, + { + "clip_ratio": 0.0, + "completion_length": 920.3460235595703, + "epoch": 0.9507878425808379, + "grad_norm": 2.3632562160491943, + "kl": 0.8310546875, + "learning_rate": 1.459446871632586e-07, + "loss": 0.0596, + "reward": 0.6199776977300644, + "reward_std": 0.06743778847157955, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 3183 + }, + { + "clip_ratio": 0.0, + "completion_length": 936.9420166015625, + "epoch": 0.9510865506683593, + "grad_norm": 0.6176393032073975, + "kl": 0.875, + "learning_rate": 1.4417457627797226e-07, + "loss": 0.0498, + "reward": 0.6199777126312256, + "reward_std": 0.1483887042850256, + "rewards/accuracy_reward": 0.12723214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 3184 + }, + { + "clip_ratio": 0.0, + "completion_length": 932.5826416015625, + "epoch": 0.9513852587558809, + "grad_norm": 0.557157576084137, + "kl": 0.517333984375, + "learning_rate": 1.424151875647717e-07, + "loss": 0.0349, + "reward": 0.5987723469734192, + "reward_std": 0.09653660212643445, + "rewards/accuracy_reward": 0.10267857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 3185 + }, + { + "clip_ratio": 0.0, + "completion_length": 943.2254943847656, + "epoch": 0.9516839668434023, + "grad_norm": 0.3058072328567505, + "kl": 0.536865234375, + "learning_rate": 1.406665229377002e-07, + "loss": 0.0326, + "reward": 0.5429687798023224, + "reward_std": 0.1076715737581253, + "rewards/accuracy_reward": 0.046875003492459655, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 3186 + }, + { + "clip_ratio": 0.0, + "completion_length": 927.8169860839844, + "epoch": 0.9519826749309237, + "grad_norm": 0.8551336526870728, + "kl": 0.8798828125, + "learning_rate": 1.389285842991339e-07, + "loss": 0.0165, + "reward": 0.5625000298023224, + "reward_std": 0.1259972508996725, + "rewards/accuracy_reward": 0.07366071594879031, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 3187 + }, + { + "clip_ratio": 0.0, + "completion_length": 928.7321929931641, + "epoch": 0.9522813830184452, + "grad_norm": 0.3770377039909363, + "kl": 0.64111328125, + "learning_rate": 1.3720137353977814e-07, + "loss": 0.0317, + "reward": 0.6010044813156128, + "reward_std": 0.08709233743138611, + "rewards/accuracy_reward": 0.10714286309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 3188 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.7723693847656, + "epoch": 0.9525800911059666, + "grad_norm": 0.7582630515098572, + "kl": 0.5126953125, + "learning_rate": 1.354848925386698e-07, + "loss": 0.03, + "reward": 0.6579241454601288, + "reward_std": 0.07836880441755056, + "rewards/accuracy_reward": 0.1607142947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.497209832072258, + "step": 3189 + }, + { + "clip_ratio": 0.0, + "completion_length": 944.4687957763672, + "epoch": 0.9528787991934882, + "grad_norm": 0.47408702969551086, + "kl": 0.77294921875, + "learning_rate": 1.3377914316317186e-07, + "loss": 0.0554, + "reward": 0.6395089477300644, + "reward_std": 0.11316875601187348, + "rewards/accuracy_reward": 0.1473214402794838, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 3190 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.1071929931641, + "epoch": 0.9531775072810096, + "grad_norm": 1.0323776006698608, + "kl": 1.0283203125, + "learning_rate": 1.3208412726897324e-07, + "loss": 0.044, + "reward": 0.6328125298023224, + "reward_std": 0.08090389892458916, + "rewards/accuracy_reward": 0.14062500931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 3191 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.4799499511719, + "epoch": 0.9534762153685311, + "grad_norm": 0.5503929853439331, + "kl": 0.7998046875, + "learning_rate": 1.3039984670008443e-07, + "loss": 0.0292, + "reward": 0.6506696492433548, + "reward_std": 0.10291948914527893, + "rewards/accuracy_reward": 0.15625000488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 3192 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.8214721679688, + "epoch": 0.9537749234560525, + "grad_norm": 1.1118327379226685, + "kl": 0.780029296875, + "learning_rate": 1.2872630328883752e-07, + "loss": 0.0383, + "reward": 0.6060268133878708, + "reward_std": 0.03550924826413393, + "rewards/accuracy_reward": 0.11160714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196492433548, + "step": 3193 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.8147735595703, + "epoch": 0.954073631543574, + "grad_norm": 0.9141556620597839, + "kl": 1.04736328125, + "learning_rate": 1.2706349885588276e-07, + "loss": 0.0413, + "reward": 0.5708705633878708, + "reward_std": 0.1295553706586361, + "rewards/accuracy_reward": 0.0803571492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134215950966, + "step": 3194 + }, + { + "clip_ratio": 0.0, + "completion_length": 929.8058471679688, + "epoch": 0.9543723396310955, + "grad_norm": 0.6213966608047485, + "kl": 1.0146484375, + "learning_rate": 1.2541143521019093e-07, + "loss": 0.0604, + "reward": 0.7170759290456772, + "reward_std": 0.1535583883523941, + "rewards/accuracy_reward": 0.2254464402794838, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 3195 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.6428985595703, + "epoch": 0.954671047718617, + "grad_norm": 0.9245670437812805, + "kl": 1.0, + "learning_rate": 1.2377011414904327e-07, + "loss": 0.0238, + "reward": 0.5915178805589676, + "reward_std": 0.09971383586525917, + "rewards/accuracy_reward": 0.10044643399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 3196 + }, + { + "clip_ratio": 0.0, + "completion_length": 926.5357666015625, + "epoch": 0.9549697558061384, + "grad_norm": 0.48406511545181274, + "kl": 0.863525390625, + "learning_rate": 1.2213953745803587e-07, + "loss": 0.0618, + "reward": 0.6953125447034836, + "reward_std": 0.14888792857527733, + "rewards/accuracy_reward": 0.2031250111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3197 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.6585235595703, + "epoch": 0.9552684638936599, + "grad_norm": 0.8896274566650391, + "kl": 0.8115234375, + "learning_rate": 1.2051970691107972e-07, + "loss": 0.0213, + "reward": 0.6316964477300644, + "reward_std": 0.0942430105060339, + "rewards/accuracy_reward": 0.14062500931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 3198 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.5536041259766, + "epoch": 0.9555671719811814, + "grad_norm": 0.5823484659194946, + "kl": 0.465576171875, + "learning_rate": 1.1891062427038746e-07, + "loss": 0.0295, + "reward": 0.6088169813156128, + "reward_std": 0.11031033284962177, + "rewards/accuracy_reward": 0.11607143469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 3199 + }, + { + "clip_ratio": 0.0, + "completion_length": 938.9978179931641, + "epoch": 0.9558658800687029, + "grad_norm": 1.0146082639694214, + "kl": 0.308349609375, + "learning_rate": 1.1731229128648546e-07, + "loss": -0.0009, + "reward": 0.6060267984867096, + "reward_std": 0.09161270130425692, + "rewards/accuracy_reward": 0.1093750074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 3200 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.6339569091797, + "epoch": 0.9561645881562243, + "grad_norm": 0.6109718680381775, + "kl": 1.1982421875, + "learning_rate": 1.1572470969820282e-07, + "loss": 0.0487, + "reward": 0.5719866305589676, + "reward_std": 0.10357636585831642, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 3201 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.1205749511719, + "epoch": 0.9564632962437458, + "grad_norm": 0.5842647552490234, + "kl": 0.45751953125, + "learning_rate": 1.1414788123267351e-07, + "loss": 0.0184, + "reward": 0.5948660969734192, + "reward_std": 0.11554662371054292, + "rewards/accuracy_reward": 0.10044643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 3202 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.0312957763672, + "epoch": 0.9567620043312672, + "grad_norm": 0.8592960834503174, + "kl": 0.9921875, + "learning_rate": 1.1258180760533089e-07, + "loss": 0.0564, + "reward": 0.5926339477300644, + "reward_std": 0.12135454453527927, + "rewards/accuracy_reward": 0.10044643469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3203 + }, + { + "clip_ratio": 0.0, + "completion_length": 940.9844055175781, + "epoch": 0.9570607124187888, + "grad_norm": 0.5368284583091736, + "kl": 0.530517578125, + "learning_rate": 1.110264905199121e-07, + "loss": 0.0265, + "reward": 0.5407366305589676, + "reward_std": 0.11568738613277674, + "rewards/accuracy_reward": 0.04687500232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 3204 + }, + { + "clip_ratio": 0.0, + "completion_length": 959.6116485595703, + "epoch": 0.9573594205063102, + "grad_norm": 0.7116967439651489, + "kl": 0.9052734375, + "learning_rate": 1.0948193166844701e-07, + "loss": 0.0396, + "reward": 0.5948660969734192, + "reward_std": 0.14308394119143486, + "rewards/accuracy_reward": 0.10267857694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3205 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.9286193847656, + "epoch": 0.9576581285938317, + "grad_norm": 0.5100167989730835, + "kl": 0.6513671875, + "learning_rate": 1.0794813273126592e-07, + "loss": 0.03, + "reward": 0.5825892984867096, + "reward_std": 0.12507288623601198, + "rewards/accuracy_reward": 0.0870535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 3206 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.2857666015625, + "epoch": 0.9579568366813531, + "grad_norm": 0.7665013074874878, + "kl": 0.38818359375, + "learning_rate": 1.0642509537698964e-07, + "loss": 0.0206, + "reward": 0.5379464477300644, + "reward_std": 0.04936607135459781, + "rewards/accuracy_reward": 0.042410717345774174, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357238650322, + "step": 3207 + }, + { + "clip_ratio": 0.0, + "completion_length": 924.3281707763672, + "epoch": 0.9582555447688746, + "grad_norm": 1.0629717111587524, + "kl": 0.508056640625, + "learning_rate": 1.049128212625361e-07, + "loss": 0.0356, + "reward": 0.5870535969734192, + "reward_std": 0.06493196310475469, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 3208 + }, + { + "clip_ratio": 0.0, + "completion_length": 938.3438110351562, + "epoch": 0.9585542528563961, + "grad_norm": 0.71890789270401, + "kl": 0.52294921875, + "learning_rate": 1.0341131203311039e-07, + "loss": 0.0206, + "reward": 0.6383928805589676, + "reward_std": 0.09000659384764731, + "rewards/accuracy_reward": 0.14508929592557251, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 3209 + }, + { + "clip_ratio": 0.0, + "completion_length": 944.0045013427734, + "epoch": 0.9588529609439176, + "grad_norm": 0.6305485963821411, + "kl": 0.87158203125, + "learning_rate": 1.0192056932220695e-07, + "loss": 0.0336, + "reward": 0.6261160969734192, + "reward_std": 0.08017919026315212, + "rewards/accuracy_reward": 0.13392857508733869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 3210 + }, + { + "clip_ratio": 0.0, + "completion_length": 984.4062805175781, + "epoch": 0.959151669031439, + "grad_norm": 0.39778271317481995, + "kl": 0.966796875, + "learning_rate": 1.004405947516085e-07, + "loss": 0.0512, + "reward": 0.5775669813156128, + "reward_std": 0.11174479871988297, + "rewards/accuracy_reward": 0.08705357555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 3211 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.4509429931641, + "epoch": 0.9594503771189605, + "grad_norm": 0.5103617310523987, + "kl": 0.6865234375, + "learning_rate": 9.897138993138156e-08, + "loss": 0.0398, + "reward": 0.594866082072258, + "reward_std": 0.14489764533936977, + "rewards/accuracy_reward": 0.10044643189758062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 3212 + }, + { + "clip_ratio": 0.0, + "completion_length": 975.7255096435547, + "epoch": 0.9597490852064819, + "grad_norm": 0.5941271781921387, + "kl": 1.1337890625, + "learning_rate": 9.751295645987647e-08, + "loss": 0.0559, + "reward": 0.5664062947034836, + "reward_std": 0.10121986828744411, + "rewards/accuracy_reward": 0.07589286030270159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 3213 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.0268096923828, + "epoch": 0.9600477932940035, + "grad_norm": 0.4588782787322998, + "kl": 0.5615234375, + "learning_rate": 9.606529592372738e-08, + "loss": 0.0213, + "reward": 0.5876116305589676, + "reward_std": 0.07702687662094831, + "rewards/accuracy_reward": 0.09375000302679837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 3214 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.1719207763672, + "epoch": 0.9603465013815249, + "grad_norm": 0.8815662860870361, + "kl": 0.74169921875, + "learning_rate": 9.462840989784671e-08, + "loss": 0.0327, + "reward": 0.597098246216774, + "reward_std": 0.08506209868937731, + "rewards/accuracy_reward": 0.10267857578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 3215 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.4420166015625, + "epoch": 0.9606452094690464, + "grad_norm": 0.4984791576862335, + "kl": 0.962890625, + "learning_rate": 9.320229994542518e-08, + "loss": 0.0437, + "reward": 0.5797991305589676, + "reward_std": 0.09237229451537132, + "rewards/accuracy_reward": 0.08705357508733869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 3216 + }, + { + "clip_ratio": 0.0, + "completion_length": 919.700927734375, + "epoch": 0.9609439175565678, + "grad_norm": 0.8314155340194702, + "kl": 0.931640625, + "learning_rate": 9.17869676179306e-08, + "loss": 0.0173, + "reward": 0.5323660969734192, + "reward_std": 0.02983540273271501, + "rewards/accuracy_reward": 0.03794643026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 3217 + }, + { + "clip_ratio": 0.0, + "completion_length": 934.3393249511719, + "epoch": 0.9612426256440894, + "grad_norm": 0.9385867118835449, + "kl": 1.19580078125, + "learning_rate": 9.038241445510687e-08, + "loss": 0.0654, + "reward": 0.5574776977300644, + "reward_std": 0.10988635756075382, + "rewards/accuracy_reward": 0.06919643119908869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 3218 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.1451263427734, + "epoch": 0.9615413337316108, + "grad_norm": 1.1096971035003662, + "kl": 0.9697265625, + "learning_rate": 8.898864198496837e-08, + "loss": 0.0546, + "reward": 0.5401785895228386, + "reward_std": 0.07629283983260393, + "rewards/accuracy_reward": 0.05133928847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393133878708, + "step": 3219 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.7478179931641, + "epoch": 0.9618400418191323, + "grad_norm": 0.3676908314228058, + "kl": 0.527587890625, + "learning_rate": 8.760565172380443e-08, + "loss": 0.046, + "reward": 0.6015625149011612, + "reward_std": 0.12420297786593437, + "rewards/accuracy_reward": 0.10937500558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3220 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.9263916015625, + "epoch": 0.9621387499066537, + "grad_norm": 1.425166130065918, + "kl": 0.8818359375, + "learning_rate": 8.62334451761715e-08, + "loss": 0.0418, + "reward": 0.5691964477300644, + "reward_std": 0.081162229180336, + "rewards/accuracy_reward": 0.07589286169968545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 3221 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.7366485595703, + "epoch": 0.9624374579941752, + "grad_norm": 1.2185665369033813, + "kl": 0.912109375, + "learning_rate": 8.487202383489656e-08, + "loss": 0.0463, + "reward": 0.6104910895228386, + "reward_std": 0.06337486300617456, + "rewards/accuracy_reward": 0.12053572130389512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 3222 + }, + { + "clip_ratio": 0.0, + "completion_length": 915.3013763427734, + "epoch": 0.9627361660816967, + "grad_norm": 0.5300157070159912, + "kl": 0.4560546875, + "learning_rate": 8.352138918107377e-08, + "loss": 0.0232, + "reward": 0.5602678805589676, + "reward_std": 0.0936560183763504, + "rewards/accuracy_reward": 0.06473214598372579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 3223 + }, + { + "clip_ratio": 0.0, + "completion_length": 900.8594055175781, + "epoch": 0.9630348741692182, + "grad_norm": 0.9010494351387024, + "kl": 0.9111328125, + "learning_rate": 8.218154268405998e-08, + "loss": 0.0473, + "reward": 0.6813616156578064, + "reward_std": 0.13121903128921986, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 3224 + }, + { + "clip_ratio": 0.0, + "completion_length": 948.5201263427734, + "epoch": 0.9633335822567396, + "grad_norm": 1.1022679805755615, + "kl": 0.91015625, + "learning_rate": 8.085248580147586e-08, + "loss": 0.0415, + "reward": 0.6690848469734192, + "reward_std": 0.11712811514735222, + "rewards/accuracy_reward": 0.17857143841683865, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 3225 + }, + { + "clip_ratio": 0.0, + "completion_length": 940.669677734375, + "epoch": 0.9636322903442611, + "grad_norm": 0.6731881499290466, + "kl": 1.0205078125, + "learning_rate": 7.953421997920818e-08, + "loss": 0.0618, + "reward": 0.5703125149011612, + "reward_std": 0.08406350994482636, + "rewards/accuracy_reward": 0.07812500116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3226 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.4844207763672, + "epoch": 0.9639309984317825, + "grad_norm": 1.0383880138397217, + "kl": 1.2431640625, + "learning_rate": 7.822674665139751e-08, + "loss": 0.0446, + "reward": 0.585379496216774, + "reward_std": 0.07562888506799936, + "rewards/accuracy_reward": 0.09598214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973469734192, + "step": 3227 + }, + { + "clip_ratio": 0.0, + "completion_length": 935.3147735595703, + "epoch": 0.9642297065193041, + "grad_norm": 1.552093744277954, + "kl": 0.53125, + "learning_rate": 7.693006724044827e-08, + "loss": 0.0224, + "reward": 0.6462053954601288, + "reward_std": 0.12114622723311186, + "rewards/accuracy_reward": 0.1540178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 3228 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.0960235595703, + "epoch": 0.9645284146068255, + "grad_norm": 0.4800511300563812, + "kl": 0.64990234375, + "learning_rate": 7.564418315702093e-08, + "loss": 0.0283, + "reward": 0.5898437798023224, + "reward_std": 0.1176135204732418, + "rewards/accuracy_reward": 0.09598214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 3229 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.2656555175781, + "epoch": 0.9648271226943469, + "grad_norm": 0.5575625896453857, + "kl": 1.0439453125, + "learning_rate": 7.436909580003093e-08, + "loss": 0.0518, + "reward": 0.5764509215950966, + "reward_std": 0.13437431957572699, + "rewards/accuracy_reward": 0.0848214291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 3230 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.935302734375, + "epoch": 0.9651258307818684, + "grad_norm": 1.5085346698760986, + "kl": 0.525390625, + "learning_rate": 7.310480655664864e-08, + "loss": 0.0151, + "reward": 0.5831473469734192, + "reward_std": 0.06763019459322095, + "rewards/accuracy_reward": 0.08928571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 3231 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.5312805175781, + "epoch": 0.9654245388693898, + "grad_norm": 0.8689988851547241, + "kl": 0.84375, + "learning_rate": 7.185131680229606e-08, + "loss": 0.0366, + "reward": 0.550223246216774, + "reward_std": 0.10994695127010345, + "rewards/accuracy_reward": 0.058035716880112886, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3232 + }, + { + "clip_ratio": 0.0, + "completion_length": 928.7991485595703, + "epoch": 0.9657232469569114, + "grad_norm": 0.6410739421844482, + "kl": 0.5791015625, + "learning_rate": 7.060862790064793e-08, + "loss": 0.0202, + "reward": 0.5524553805589676, + "reward_std": 0.1118574496358633, + "rewards/accuracy_reward": 0.05803571664728224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 3233 + }, + { + "clip_ratio": 0.0, + "completion_length": 963.482177734375, + "epoch": 0.9660219550444328, + "grad_norm": 1.3296897411346436, + "kl": 1.2919921875, + "learning_rate": 6.937674120362725e-08, + "loss": 0.0506, + "reward": 0.6216518133878708, + "reward_std": 0.12502482533454895, + "rewards/accuracy_reward": 0.12946429569274187, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3234 + }, + { + "clip_ratio": 0.0, + "completion_length": 957.5647735595703, + "epoch": 0.9663206631319543, + "grad_norm": 0.49484017491340637, + "kl": 0.761962890625, + "learning_rate": 6.815565805140645e-08, + "loss": 0.0263, + "reward": 0.6328125298023224, + "reward_std": 0.11230873689055443, + "rewards/accuracy_reward": 0.14062500931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3235 + }, + { + "clip_ratio": 0.0, + "completion_length": 929.3080749511719, + "epoch": 0.9666193712194757, + "grad_norm": 0.47233009338378906, + "kl": 0.8330078125, + "learning_rate": 6.694537977240512e-08, + "loss": 0.0464, + "reward": 0.5814732313156128, + "reward_std": 0.08498237654566765, + "rewards/accuracy_reward": 0.08928571734577417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3236 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.3281707763672, + "epoch": 0.9669180793069972, + "grad_norm": 0.5944632887840271, + "kl": 0.6337890625, + "learning_rate": 6.574590768328559e-08, + "loss": 0.021, + "reward": 0.6121652126312256, + "reward_std": 0.10255143418908119, + "rewards/accuracy_reward": 0.11830357951112092, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 3237 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.4486846923828, + "epoch": 0.9672167873945187, + "grad_norm": 1.1726160049438477, + "kl": 1.49072265625, + "learning_rate": 6.45572430889574e-08, + "loss": 0.0638, + "reward": 0.5161830559372902, + "reward_std": 0.10208459384739399, + "rewards/accuracy_reward": 0.031250000931322575, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 3238 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.5848541259766, + "epoch": 0.9675154954820402, + "grad_norm": 1.300161600112915, + "kl": 0.86083984375, + "learning_rate": 6.337938728257054e-08, + "loss": 0.0388, + "reward": 0.6428571492433548, + "reward_std": 0.11717861890792847, + "rewards/accuracy_reward": 0.1495535783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 3239 + }, + { + "clip_ratio": 0.0, + "completion_length": 925.8728179931641, + "epoch": 0.9678142035695616, + "grad_norm": 1.3292909860610962, + "kl": 1.1767578125, + "learning_rate": 6.221234154551781e-08, + "loss": 0.0584, + "reward": 0.5345982313156128, + "reward_std": 0.08106379210948944, + "rewards/accuracy_reward": 0.04464286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 3240 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.8884429931641, + "epoch": 0.9681129116570831, + "grad_norm": 1.6262375116348267, + "kl": 0.814697265625, + "learning_rate": 6.105610714742805e-08, + "loss": 0.0375, + "reward": 0.5407366305589676, + "reward_std": 0.08693214133381844, + "rewards/accuracy_reward": 0.04464285867288709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4960937649011612, + "step": 3241 + }, + { + "clip_ratio": 0.0, + "completion_length": 922.4263916015625, + "epoch": 0.9684116197446045, + "grad_norm": 1.4439008235931396, + "kl": 0.809814453125, + "learning_rate": 5.991068534617394e-08, + "loss": 0.0385, + "reward": 0.5820312723517418, + "reward_std": 0.08867156133055687, + "rewards/accuracy_reward": 0.0915178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4905134066939354, + "step": 3242 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.0446929931641, + "epoch": 0.9687103278321261, + "grad_norm": 0.6328553557395935, + "kl": 1.060546875, + "learning_rate": 5.8776077387859845e-08, + "loss": 0.0465, + "reward": 0.6283482313156128, + "reward_std": 0.09943273104727268, + "rewards/accuracy_reward": 0.13616072316654027, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3243 + }, + { + "clip_ratio": 0.0, + "completion_length": 969.7388916015625, + "epoch": 0.9690090359196475, + "grad_norm": 0.7073116898536682, + "kl": 1.00146484375, + "learning_rate": 5.765228450682947e-08, + "loss": 0.0419, + "reward": 0.644531287252903, + "reward_std": 0.057778088841587305, + "rewards/accuracy_reward": 0.15178572130389512, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 3244 + }, + { + "clip_ratio": 0.0, + "completion_length": 966.0781707763672, + "epoch": 0.969307744007169, + "grad_norm": 0.8733079433441162, + "kl": 1.46435546875, + "learning_rate": 5.653930792565821e-08, + "loss": 0.0675, + "reward": 0.5630580633878708, + "reward_std": 0.11142788268625736, + "rewards/accuracy_reward": 0.07589286053553224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4871651977300644, + "step": 3245 + }, + { + "clip_ratio": 0.0, + "completion_length": 937.6406707763672, + "epoch": 0.9696064520946904, + "grad_norm": 0.9082719683647156, + "kl": 0.672607421875, + "learning_rate": 5.5437148855156387e-08, + "loss": 0.0308, + "reward": 0.5329241305589676, + "reward_std": 0.08712942898273468, + "rewards/accuracy_reward": 0.03794643050059676, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 3246 + }, + { + "clip_ratio": 0.0, + "completion_length": 934.544677734375, + "epoch": 0.969905160182212, + "grad_norm": 1.117292046546936, + "kl": 1.17919921875, + "learning_rate": 5.434580849436377e-08, + "loss": 0.0353, + "reward": 0.589285746216774, + "reward_std": 0.0970095694065094, + "rewards/accuracy_reward": 0.09821429289877415, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 3247 + }, + { + "clip_ratio": 0.0, + "completion_length": 974.9308471679688, + "epoch": 0.9702038682697334, + "grad_norm": 0.3658195436000824, + "kl": 0.59130859375, + "learning_rate": 5.3265288030553974e-08, + "loss": 0.0239, + "reward": 0.5412946790456772, + "reward_std": 0.043448752257972956, + "rewards/accuracy_reward": 0.044642860535532236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 3248 + }, + { + "clip_ratio": 0.0, + "completion_length": 925.4353179931641, + "epoch": 0.9705025763572549, + "grad_norm": 0.3155656158924103, + "kl": 0.58935546875, + "learning_rate": 5.2195588639225584e-08, + "loss": 0.0275, + "reward": 0.5206473469734192, + "reward_std": 0.06717439880594611, + "rewards/accuracy_reward": 0.0267857164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 3249 + }, + { + "clip_ratio": 0.0, + "completion_length": 956.2745971679688, + "epoch": 0.9708012844447763, + "grad_norm": 1.136747121810913, + "kl": 1.173828125, + "learning_rate": 5.1136711484106594e-08, + "loss": 0.05, + "reward": 0.5100446790456772, + "reward_std": 0.08607277180999517, + "rewards/accuracy_reward": 0.020089286845177412, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553880095482, + "step": 3250 + }, + { + "clip_ratio": 0.0, + "completion_length": 962.1808471679688, + "epoch": 0.9710999925322978, + "grad_norm": 1.224306583404541, + "kl": 0.58837890625, + "learning_rate": 5.008865771715221e-08, + "loss": 0.0407, + "reward": 0.5892857313156128, + "reward_std": 0.07445696229115129, + "rewards/accuracy_reward": 0.09598214644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 3251 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.075927734375, + "epoch": 0.9713987006198193, + "grad_norm": 0.6265223026275635, + "kl": 0.73974609375, + "learning_rate": 4.9051428478542604e-08, + "loss": 0.0331, + "reward": 0.6746651977300644, + "reward_std": 0.1133632161654532, + "rewards/accuracy_reward": 0.1808035857975483, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 3252 + }, + { + "clip_ratio": 0.0, + "completion_length": 912.1295013427734, + "epoch": 0.9716974087073408, + "grad_norm": 0.41029077768325806, + "kl": 0.91259765625, + "learning_rate": 4.802502489668071e-08, + "loss": 0.0617, + "reward": 0.6562500149011612, + "reward_std": 0.12368509406223893, + "rewards/accuracy_reward": 0.16517857927829027, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 3253 + }, + { + "clip_ratio": 0.0, + "completion_length": 932.3147735595703, + "epoch": 0.9719961167948622, + "grad_norm": 1.1586240530014038, + "kl": 0.8935546875, + "learning_rate": 4.700944808819441e-08, + "loss": 0.0499, + "reward": 0.6261161118745804, + "reward_std": 0.14751281961798668, + "rewards/accuracy_reward": 0.1361607201397419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 3254 + }, + { + "clip_ratio": 0.0, + "completion_length": 924.3638916015625, + "epoch": 0.9722948248823837, + "grad_norm": 1.0987504720687866, + "kl": 0.82275390625, + "learning_rate": 4.6004699157928824e-08, + "loss": 0.0347, + "reward": 0.593191996216774, + "reward_std": 0.10613285936415195, + "rewards/accuracy_reward": 0.10267857508733869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 3255 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.4933471679688, + "epoch": 0.9725935329699051, + "grad_norm": 1.1994041204452515, + "kl": 0.62353515625, + "learning_rate": 4.501077919895513e-08, + "loss": 0.0206, + "reward": 0.589285746216774, + "reward_std": 0.10451915115118027, + "rewards/accuracy_reward": 0.09375000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 3256 + }, + { + "clip_ratio": 0.0, + "completion_length": 958.7656555175781, + "epoch": 0.9728922410574267, + "grad_norm": 0.45982688665390015, + "kl": 0.43212890625, + "learning_rate": 4.4027689292560626e-08, + "loss": 0.0188, + "reward": 0.6183035969734192, + "reward_std": 0.11852527223527431, + "rewards/accuracy_reward": 0.12276786286383867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 3257 + }, + { + "clip_ratio": 0.0, + "completion_length": 952.4129791259766, + "epoch": 0.9731909491449481, + "grad_norm": 1.3905863761901855, + "kl": 0.86572265625, + "learning_rate": 4.3055430508248675e-08, + "loss": 0.0303, + "reward": 0.550223246216774, + "reward_std": 0.11874322593212128, + "rewards/accuracy_reward": 0.058035715483129025, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3258 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.4018402099609, + "epoch": 0.9734896572324696, + "grad_norm": 1.161939024925232, + "kl": 1.0517578125, + "learning_rate": 4.2094003903743183e-08, + "loss": 0.0451, + "reward": 0.5613839477300644, + "reward_std": 0.10658497922122478, + "rewards/accuracy_reward": 0.0736607164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232387661934, + "step": 3259 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.9174499511719, + "epoch": 0.973788365319991, + "grad_norm": 0.447524756193161, + "kl": 0.4130859375, + "learning_rate": 4.114341052498194e-08, + "loss": 0.0294, + "reward": 0.6512276977300644, + "reward_std": 0.15559024550020695, + "rewards/accuracy_reward": 0.15848215110599995, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 3260 + }, + { + "clip_ratio": 0.0, + "completion_length": 958.0045013427734, + "epoch": 0.9740870734075125, + "grad_norm": 0.42442846298217773, + "kl": 0.55615234375, + "learning_rate": 4.020365140611771e-08, + "loss": 0.0297, + "reward": 0.628348246216774, + "reward_std": 0.11189555376768112, + "rewards/accuracy_reward": 0.13392857555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 3261 + }, + { + "clip_ratio": 0.0, + "completion_length": 951.5000610351562, + "epoch": 0.974385781495034, + "grad_norm": 0.7938007712364197, + "kl": 0.8291015625, + "learning_rate": 3.927472756951489e-08, + "loss": 0.037, + "reward": 0.5507812723517418, + "reward_std": 0.1399762062355876, + "rewards/accuracy_reward": 0.06026786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 3262 }, { - "completion_length": 611.3987148284912, - "epoch": 0.2915194346289753, - "grad_norm": 37.171165466308594, - "kl": 326.39921875, - "learning_rate": 1.785937464889027e-05, - "loss": 13.071, - "reward": 0.4729166806442663, - "reward_std": 0.36482593989931045, - "rewards/accuracy_reward": 0.07187500176951289, - "rewards/format_reward": 0.40104167715180666, - "step": 165 + "clip_ratio": 0.0, + "completion_length": 938.0625305175781, + "epoch": 0.9746844895825555, + "grad_norm": 0.543318510055542, + "kl": 0.5986328125, + "learning_rate": 3.83566400257529e-08, + "loss": 0.0325, + "reward": 0.5128348469734192, + "reward_std": 0.08201212109997869, + "rewards/accuracy_reward": 0.020089287078008056, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 3263 }, { - "completion_length": 895.0153816223144, - "epoch": 0.3003533568904594, - "grad_norm": 13.124195098876953, - "kl": 2.1869384765625, - "learning_rate": 1.766485078671514e-05, - "loss": 0.0875, - "reward": 0.12682292009703816, - "reward_std": 0.21330046532675623, - "rewards/accuracy_reward": 0.0015625000465661286, - "rewards/format_reward": 0.12526041991077363, - "step": 170 + "clip_ratio": 0.0, + "completion_length": 935.0737152099609, + "epoch": 0.9749831976700769, + "grad_norm": 0.8563680648803711, + "kl": 0.78369140625, + "learning_rate": 3.744938977362056e-08, + "loss": 0.0399, + "reward": 0.663504496216774, + "reward_std": 0.10819004103541374, + "rewards/accuracy_reward": 0.1696428619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 3264 }, { - "completion_length": 230.1461009979248, - "epoch": 0.30918727915194344, - "grad_norm": 2.6990084648132324, - "kl": 4.179736328125, - "learning_rate": 1.746302775541467e-05, - "loss": 0.1674, - "reward": 0.8765625193715095, - "reward_std": 0.27116235313005743, - "rewards/accuracy_reward": 0.02057291704695672, - "rewards/format_reward": 0.8559896007180214, - "step": 175 + "clip_ratio": 0.0, + "completion_length": 920.5893249511719, + "epoch": 0.9752819057575984, + "grad_norm": 0.4658413827419281, + "kl": 0.73193359375, + "learning_rate": 3.655297780011724e-08, + "loss": 0.0309, + "reward": 0.623325914144516, + "reward_std": 0.10521363466978073, + "rewards/accuracy_reward": 0.12946428963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 3265 }, { - "completion_length": 293.1122478485107, - "epoch": 0.31802120141342755, - "grad_norm": 4.779168128967285, - "kl": 1.509521484375, - "learning_rate": 1.7254097749263735e-05, - "loss": 0.0604, - "reward": 1.0182291969656945, - "reward_std": 0.20373042849823833, - "rewards/accuracy_reward": 0.06796875179279596, - "rewards/format_reward": 0.9502604335546494, - "step": 180 + "clip_ratio": 0.0, + "completion_length": 969.9531555175781, + "epoch": 0.9755806138451198, + "grad_norm": 0.9753384590148926, + "kl": 0.8046875, + "learning_rate": 3.566740508045174e-08, + "loss": 0.0399, + "reward": 0.5574776977300644, + "reward_std": 0.10260039055719972, + "rewards/accuracy_reward": 0.0669642873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 3266 }, { - "completion_length": 432.6257915496826, - "epoch": 0.32685512367491165, - "grad_norm": 4.049810409545898, - "kl": 2.6603515625, - "learning_rate": 1.703825973044602e-05, - "loss": 0.1064, - "reward": 0.8968750223517418, - "reward_std": 0.41288822125643493, - "rewards/accuracy_reward": 0.07916666870005429, - "rewards/format_reward": 0.8177083529531955, - "step": 185 + "clip_ratio": 0.0, + "completion_length": 933.9554138183594, + "epoch": 0.9758793219326414, + "grad_norm": 0.6658555269241333, + "kl": 0.955078125, + "learning_rate": 3.4792672578038974e-08, + "loss": 0.0447, + "reward": 0.5775669813156128, + "reward_std": 0.13075675815343857, + "rewards/accuracy_reward": 0.08928571920841932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812798023224, + "step": 3267 }, { - "completion_length": 401.93907203674314, - "epoch": 0.33568904593639576, - "grad_norm": 14.11198616027832, - "kl": 4.8384765625, - "learning_rate": 1.681571923958416e-05, - "loss": 0.1936, - "reward": 0.9333333589136601, - "reward_std": 0.4148533625528216, - "rewards/accuracy_reward": 0.09973958611954004, - "rewards/format_reward": 0.8335937716066837, - "step": 190 + "clip_ratio": 0.0, + "completion_length": 944.5736999511719, + "epoch": 0.9761780300201628, + "grad_norm": 1.9624019861221313, + "kl": 1.529296875, + "learning_rate": 3.3928781244504384e-08, + "loss": 0.0699, + "reward": 0.5407366305589676, + "reward_std": 0.10278272442519665, + "rewards/accuracy_reward": 0.05580357206054032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4849330559372902, + "step": 3268 }, { - "completion_length": 330.86511516571045, - "epoch": 0.34452296819787986, - "grad_norm": 2.0772392749786377, - "kl": 1.749658203125, - "learning_rate": 1.6586688200005193e-05, - "loss": 0.07, - "reward": 1.0406250312924386, - "reward_std": 0.29810264529660346, - "rewards/accuracy_reward": 0.11770833667833358, - "rewards/format_reward": 0.9229166842997074, - "step": 195 + "clip_ratio": 0.0, + "completion_length": 960.9487152099609, + "epoch": 0.9764767381076843, + "grad_norm": 0.40008270740509033, + "kl": 0.8212890625, + "learning_rate": 3.3075732019675065e-08, + "loss": 0.0354, + "reward": 0.5725446715950966, + "reward_std": 0.10505452007055283, + "rewards/accuracy_reward": 0.08258928824216127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553880095482, + "step": 3269 }, { - "completion_length": 394.4031360626221, - "epoch": 0.35335689045936397, - "grad_norm": 1.5617283582687378, - "kl": 1.7335693359375, - "learning_rate": 1.6351384715927897e-05, - "loss": 0.0694, - "reward": 1.0929687805473804, - "reward_std": 0.28054872443899515, - "rewards/accuracy_reward": 0.1539062546333298, - "rewards/format_reward": 0.9390625178813934, - "step": 200 + "clip_ratio": 0.0, + "completion_length": 931.2366638183594, + "epoch": 0.9767754461952057, + "grad_norm": 0.7073755264282227, + "kl": 0.88720703125, + "learning_rate": 3.2233525831586455e-08, + "loss": 0.0443, + "reward": 0.6462053805589676, + "reward_std": 0.05430810758844018, + "rewards/accuracy_reward": 0.15401786286383867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 3270 }, { - "epoch": 0.35335689045936397, - "eval_completion_length": 485.9895978655134, - "eval_kl": 2.228794642857143, - "eval_loss": 0.08293063938617706, - "eval_reward": 1.0282738549368722, - "eval_reward_std": 0.4085059974874769, - "eval_rewards/accuracy_reward": 0.16517858100788935, - "eval_rewards/format_reward": 0.863095257963453, - "eval_runtime": 55.1993, - "eval_samples_per_second": 1.794, - "eval_steps_per_second": 0.036, - "step": 200 + "clip_ratio": 0.0, + "completion_length": 943.9442443847656, + "epoch": 0.9770741542827273, + "grad_norm": 3.557579755783081, + "kl": 0.93359375, + "learning_rate": 3.140216359647452e-08, + "loss": 0.0468, + "reward": 0.6534598618745804, + "reward_std": 0.08678633440285921, + "rewards/accuracy_reward": 0.1607142947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 3271 }, { - "completion_length": 570.369550704956, - "epoch": 0.3621908127208481, - "grad_norm": 3.386005401611328, - "kl": 5.7681640625, - "learning_rate": 1.611003286476406e-05, - "loss": 0.2308, - "reward": 0.9132812798023224, - "reward_std": 0.46581555213779213, - "rewards/accuracy_reward": 0.14010417060926555, - "rewards/format_reward": 0.7731771036982537, - "step": 205 + "clip_ratio": 0.0, + "completion_length": 950.8683471679688, + "epoch": 0.9773728623702487, + "grad_norm": 0.8002287745475769, + "kl": 0.70947265625, + "learning_rate": 3.0581646218781346e-08, + "loss": 0.0333, + "reward": 0.595982164144516, + "reward_std": 0.11791126243770123, + "rewards/accuracy_reward": 0.10044643515720963, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 3272 }, { - "completion_length": 691.1328323364257, - "epoch": 0.3710247349823322, - "grad_norm": 5.973910331726074, - "kl": 3.29013671875, - "learning_rate": 1.5862862483731574e-05, - "loss": 0.1317, - "reward": 0.7466146018356085, - "reward_std": 0.5668615996837616, - "rewards/accuracy_reward": 0.155989587912336, - "rewards/format_reward": 0.5906250175088644, - "step": 210 + "clip_ratio": 0.0, + "completion_length": 948.1607513427734, + "epoch": 0.9776715704577701, + "grad_norm": 0.5716285705566406, + "kl": 0.955078125, + "learning_rate": 2.9771974591149557e-08, + "loss": 0.0312, + "reward": 0.6171875149011612, + "reward_std": 0.09868330741301179, + "rewards/accuracy_reward": 0.1250000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 3273 }, { - "completion_length": 487.6937648773193, - "epoch": 0.37985865724381623, - "grad_norm": 0.7804630398750305, - "kl": 2.0060546875, - "learning_rate": 1.5610108950982494e-05, - "loss": 0.0802, - "reward": 1.0221354477107525, - "reward_std": 0.40744142825715246, - "rewards/accuracy_reward": 0.17526042207609863, - "rewards/format_reward": 0.8468750186264515, - "step": 215 + "clip_ratio": 0.0, + "completion_length": 967.0937805175781, + "epoch": 0.9779702785452916, + "grad_norm": 0.5250840187072754, + "kl": 0.605712890625, + "learning_rate": 2.8973149594422323e-08, + "loss": 0.0169, + "reward": 0.5719866305589676, + "reward_std": 0.1245256804395467, + "rewards/accuracy_reward": 0.082589291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 3274 }, { - "completion_length": 457.1224075317383, - "epoch": 0.38869257950530034, - "grad_norm": 0.9766826629638672, - "kl": 1.4112548828125, - "learning_rate": 1.535201296145451e-05, - "loss": 0.0565, - "reward": 1.0976562827825547, - "reward_std": 0.39641969576478003, - "rewards/accuracy_reward": 0.21458333877380936, - "rewards/format_reward": 0.8830729387700558, - "step": 220 + "clip_ratio": 0.0, + "completion_length": 915.0937957763672, + "epoch": 0.978268986632813, + "grad_norm": 0.8431305289268494, + "kl": 0.61083984375, + "learning_rate": 2.8185172097641156e-08, + "loss": 0.0352, + "reward": 0.5954241305589676, + "reward_std": 0.078178730327636, + "rewards/accuracy_reward": 0.10044643469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776902794838, + "step": 3275 }, { - "completion_length": 441.93542633056643, - "epoch": 0.39752650176678445, - "grad_norm": 0.3449498116970062, - "kl": 0.56259765625, - "learning_rate": 1.5088820297659314e-05, - "loss": 0.0225, - "reward": 1.179687537252903, - "reward_std": 0.37575065195560453, - "rewards/accuracy_reward": 0.2713541740551591, - "rewards/format_reward": 0.9083333566784859, - "step": 225 + "clip_ratio": 0.0, + "completion_length": 970.5513916015625, + "epoch": 0.9785676947203346, + "grad_norm": 0.42443597316741943, + "kl": 0.67333984375, + "learning_rate": 2.740804295805144e-08, + "loss": 0.036, + "reward": 0.600446455180645, + "reward_std": 0.09478327631950378, + "rewards/accuracy_reward": 0.1093750037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 3276 }, { - "completion_length": 452.71407318115234, - "epoch": 0.40636042402826855, - "grad_norm": 0.519928514957428, - "kl": 0.4630126953125, - "learning_rate": 1.4820781595626116e-05, - "loss": 0.0185, - "reward": 1.2335937932133674, - "reward_std": 0.36431988701224327, - "rewards/accuracy_reward": 0.3156250100582838, - "rewards/format_reward": 0.9179687701165676, - "step": 230 + "clip_ratio": 0.0, + "completion_length": 922.5736999511719, + "epoch": 0.978866402807856, + "grad_norm": 1.0811432600021362, + "kl": 0.84765625, + "learning_rate": 2.6641763021091337e-08, + "loss": 0.0406, + "reward": 0.6339286118745804, + "reward_std": 0.09819843247532845, + "rewards/accuracy_reward": 0.14062500605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 3277 }, { - "completion_length": 534.6705894470215, - "epoch": 0.41519434628975266, - "grad_norm": 0.3681395649909973, - "kl": 0.767236328125, - "learning_rate": 1.4548152106223157e-05, - "loss": 0.0307, - "reward": 1.0226562790572644, - "reward_std": 0.5260338146239519, - "rewards/accuracy_reward": 0.23229167382232846, - "rewards/format_reward": 0.7903646051883697, - "step": 235 + "clip_ratio": 0.0, + "completion_length": 965.4286041259766, + "epoch": 0.9791651108953775, + "grad_norm": 1.6404937505722046, + "kl": 0.859375, + "learning_rate": 2.5886333120398456e-08, + "loss": 0.0437, + "reward": 0.5993303954601288, + "reward_std": 0.14630636759102345, + "rewards/accuracy_reward": 0.10937500605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 3278 }, { - "completion_length": 448.2255340576172, - "epoch": 0.42402826855123676, - "grad_norm": 0.5592738389968872, - "kl": 0.9706298828125, - "learning_rate": 1.4271191452084598e-05, - "loss": 0.0388, - "reward": 1.059635452926159, - "reward_std": 0.45823672600090504, - "rewards/accuracy_reward": 0.2283854230772704, - "rewards/format_reward": 0.8312500193715096, - "step": 240 + "clip_ratio": 0.0, + "completion_length": 935.7031555175781, + "epoch": 0.9794638189828989, + "grad_norm": 0.47686198353767395, + "kl": 0.677734375, + "learning_rate": 2.514175407780761e-08, + "loss": 0.0291, + "reward": 0.5753348618745804, + "reward_std": 0.09872804512269795, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 3279 }, { - "completion_length": 328.0093835830688, - "epoch": 0.43286219081272087, - "grad_norm": 0.3892843425273895, - "kl": 0.99150390625, - "learning_rate": 1.3990163380374195e-05, - "loss": 0.0397, - "reward": 1.1539062917232514, - "reward_std": 0.2934054052922875, - "rewards/accuracy_reward": 0.21223958893679082, - "rewards/format_reward": 0.9416666865348816, - "step": 245 + "clip_ratio": 0.0, + "completion_length": 939.0469360351562, + "epoch": 0.9797625270704204, + "grad_norm": 0.42110297083854675, + "kl": 0.779296875, + "learning_rate": 2.440802670334641e-08, + "loss": 0.0419, + "reward": 0.6160714477300644, + "reward_std": 0.0790243586525321, + "rewards/accuracy_reward": 0.12276786379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 3280 }, { - "completion_length": 381.8747497558594, - "epoch": 0.4416961130742049, - "grad_norm": 0.20731879770755768, - "kl": 0.63095703125, - "learning_rate": 1.3705335511621229e-05, - "loss": 0.0252, - "reward": 1.1802083745598793, - "reward_std": 0.3383705548942089, - "rewards/accuracy_reward": 0.2419270913582295, - "rewards/format_reward": 0.9382812716066837, - "step": 250 + "clip_ratio": 0.0, + "completion_length": 954.3817443847656, + "epoch": 0.9800612351579419, + "grad_norm": 1.4291521310806274, + "kl": 0.8798828125, + "learning_rate": 2.368515179523967e-08, + "loss": 0.0405, + "reward": 0.556919664144516, + "reward_std": 0.11177602410316467, + "rewards/accuracy_reward": 0.06473214691504836, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875298023224, + "step": 3281 }, { - "completion_length": 495.92293014526365, - "epoch": 0.450530035335689, - "grad_norm": 0.281121164560318, - "kl": 0.6699951171875, - "learning_rate": 1.3416979084867851e-05, - "loss": 0.0268, - "reward": 1.1559896245598793, - "reward_std": 0.4141297750174999, - "rewards/accuracy_reward": 0.27031250651925803, - "rewards/format_reward": 0.8856771036982536, - "step": 255 + "clip_ratio": 0.0, + "completion_length": 972.9643249511719, + "epoch": 0.9803599432454634, + "grad_norm": 0.333818644285202, + "kl": 0.922607421875, + "learning_rate": 2.2973130139903878e-08, + "loss": 0.0437, + "reward": 0.5691964626312256, + "reward_std": 0.08740221057087183, + "rewards/accuracy_reward": 0.0781250016298145, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 3282 }, { - "completion_length": 541.2487155914307, - "epoch": 0.45936395759717313, - "grad_norm": 0.3925512135028839, - "kl": 0.7656982421875, - "learning_rate": 1.3125368699370567e-05, - "loss": 0.0306, - "reward": 1.1682292096316815, - "reward_std": 0.4320328576490283, - "rewards/accuracy_reward": 0.3013020925223827, - "rewards/format_reward": 0.8669271022081375, - "step": 260 + "clip_ratio": 0.0, + "completion_length": 930.9732666015625, + "epoch": 0.9806586513329848, + "grad_norm": 0.821327805519104, + "kl": 0.9482421875, + "learning_rate": 2.22719625119483e-08, + "loss": 0.0372, + "reward": 0.6406250298023224, + "reward_std": 0.12317700684070587, + "rewards/accuracy_reward": 0.14955357951112092, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 3283 }, { - "completion_length": 528.2901248931885, - "epoch": 0.46819787985865724, - "grad_norm": 0.7729207277297974, - "kl": 0.8477783203125, - "learning_rate": 1.2830782053101807e-05, - "loss": 0.0339, - "reward": 1.2143229521811008, - "reward_std": 0.402412174642086, - "rewards/accuracy_reward": 0.31536459205672146, - "rewards/format_reward": 0.8989583536982536, - "step": 265 + "clip_ratio": 0.0, + "completion_length": 948.0848693847656, + "epoch": 0.9809573594205063, + "grad_norm": 0.6611117124557495, + "kl": 0.779296875, + "learning_rate": 2.1581649674176086e-08, + "loss": 0.0453, + "reward": 0.5479910969734192, + "reward_std": 0.10433615790680051, + "rewards/accuracy_reward": 0.058035715483129025, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 3284 }, { - "completion_length": 546.829706954956, - "epoch": 0.47703180212014135, - "grad_norm": 0.5180284380912781, - "kl": 1.0993408203125, - "learning_rate": 1.2533499678300618e-05, - "loss": 0.044, - "reward": 1.201822955161333, - "reward_std": 0.4039957173168659, - "rewards/accuracy_reward": 0.3072916761506349, - "rewards/format_reward": 0.8945312716066838, - "step": 270 + "clip_ratio": 0.0, + "completion_length": 938.2344055175781, + "epoch": 0.9812560675080277, + "grad_norm": 0.9769347310066223, + "kl": 1.0615234375, + "learning_rate": 2.0902192377577624e-08, + "loss": 0.0586, + "reward": 0.569196455180645, + "reward_std": 0.0511832176707685, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 3285 }, { - "completion_length": 551.7250205993653, - "epoch": 0.48586572438162545, - "grad_norm": 0.45238202810287476, - "kl": 0.7359130859375, - "learning_rate": 1.223380467432432e-05, - "loss": 0.0294, - "reward": 1.2083333745598792, - "reward_std": 0.3825568653643131, - "rewards/accuracy_reward": 0.30156250814907254, - "rewards/format_reward": 0.9067708551883698, - "step": 275 + "clip_ratio": 0.0, + "completion_length": 940.4553985595703, + "epoch": 0.9815547755955493, + "grad_norm": 0.5017314553260803, + "kl": 0.7060546875, + "learning_rate": 2.023359136133829e-08, + "loss": 0.0298, + "reward": 0.544084832072258, + "reward_std": 0.12397108040750027, + "rewards/accuracy_reward": 0.05357143119908869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 3286 }, { - "completion_length": 574.8323146820069, - "epoch": 0.49469964664310956, - "grad_norm": 1.1250256299972534, - "kl": 1.625048828125, - "learning_rate": 1.1931982438055506e-05, - "loss": 0.065, - "reward": 1.144010452926159, - "reward_std": 0.4556888522580266, - "rewards/accuracy_reward": 0.2822916739620268, - "rewards/format_reward": 0.8617187723517418, - "step": 280 + "clip_ratio": 0.0, + "completion_length": 965.8214721679688, + "epoch": 0.9818534836830707, + "grad_norm": 0.9736894369125366, + "kl": 1.0859375, + "learning_rate": 1.957584735282847e-08, + "loss": 0.0403, + "reward": 0.553571455180645, + "reward_std": 0.09358323272317648, + "rewards/accuracy_reward": 0.0647321455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 3287 }, { - "completion_length": 514.8932434082031, - "epoch": 0.5035335689045937, - "grad_norm": 0.2622203230857849, - "kl": 0.7667724609375, - "learning_rate": 1.1628320392121118e-05, - "loss": 0.0307, - "reward": 1.2359375432133675, - "reward_std": 0.3458207995630801, - "rewards/accuracy_reward": 0.30755209350027146, - "rewards/format_reward": 0.9283854357898236, - "step": 285 + "clip_ratio": 0.0, + "completion_length": 947.2924499511719, + "epoch": 0.9821521917705922, + "grad_norm": 0.6511564254760742, + "kl": 0.9228515625, + "learning_rate": 1.8928961067610217e-08, + "loss": 0.0432, + "reward": 0.6445312649011612, + "reward_std": 0.12570507009513676, + "rewards/accuracy_reward": 0.1495535783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 3288 }, { - "completion_length": 521.0554836273193, - "epoch": 0.5123674911660777, - "grad_norm": 0.6569560766220093, - "kl": 0.6212646484375, - "learning_rate": 1.1323107711182474e-05, - "loss": 0.0249, - "reward": 1.2632812917232514, - "reward_std": 0.36140933344140647, - "rewards/accuracy_reward": 0.3291666769422591, - "rewards/format_reward": 0.9341146036982536, - "step": 290 + "clip_ratio": 0.0, + "completion_length": 932.9129943847656, + "epoch": 0.9824508998581136, + "grad_norm": 0.5466786623001099, + "kl": 0.9560546875, + "learning_rate": 1.8292933209432816e-08, + "loss": 0.0527, + "reward": 0.5496651977300644, + "reward_std": 0.08346330560743809, + "rewards/accuracy_reward": 0.05803571571595967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294887661934, + "step": 3289 }, { - "completion_length": 546.5757999420166, - "epoch": 0.5212014134275619, - "grad_norm": 0.8339012265205383, - "kl": 1.341357421875, - "learning_rate": 1.1016635046556773e-05, - "loss": 0.0537, - "reward": 1.189583370834589, - "reward_std": 0.4341584246605635, - "rewards/accuracy_reward": 0.307291676197201, - "rewards/format_reward": 0.8822916850447655, - "step": 295 + "clip_ratio": 0.0, + "completion_length": 927.3839721679688, + "epoch": 0.9827496079456352, + "grad_norm": 0.5426170229911804, + "kl": 0.7626953125, + "learning_rate": 1.7667764470230553e-08, + "loss": 0.0662, + "reward": 0.611607164144516, + "reward_std": 0.1902102194726467, + "rewards/accuracy_reward": 0.12053572200238705, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 3290 }, { - "completion_length": 505.27032737731935, - "epoch": 0.5300353356890459, - "grad_norm": 0.5498542785644531, - "kl": 1.0662353515625, - "learning_rate": 1.0709194249432471e-05, - "loss": 0.0426, - "reward": 1.223697953671217, - "reward_std": 0.38470860905945303, - "rewards/accuracy_reward": 0.31640625838190317, - "rewards/format_reward": 0.9072916865348816, - "step": 300 + "clip_ratio": 0.0, + "completion_length": 952.9062957763672, + "epoch": 0.9830483160331566, + "grad_norm": 0.4919239282608032, + "kl": 0.554931640625, + "learning_rate": 1.705345553012716e-08, + "loss": 0.0203, + "reward": 0.6138393133878708, + "reward_std": 0.07437950558960438, + "rewards/accuracy_reward": 0.12053571571595967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 3291 }, { - "epoch": 0.5300353356890459, - "eval_completion_length": 469.9256112234933, - "eval_kl": 0.6702008928571429, - "eval_loss": 0.02665134333074093, - "eval_reward": 1.2633929082325526, - "eval_reward_std": 0.3754622382777078, - "eval_rewards/accuracy_reward": 0.34375000851494925, - "eval_rewards/format_reward": 0.9196428741727557, - "eval_runtime": 53.9484, - "eval_samples_per_second": 1.835, - "eval_steps_per_second": 0.037, - "step": 300 + "clip_ratio": 0.0, + "completion_length": 941.7902069091797, + "epoch": 0.9833470241206781, + "grad_norm": 2.2712254524230957, + "kl": 0.94775390625, + "learning_rate": 1.6450007057431382e-08, + "loss": 0.0589, + "reward": 0.5931920036673546, + "reward_std": 0.08121617138385773, + "rewards/accuracy_reward": 0.1026785783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 3292 }, { - "completion_length": 492.0828289031982, - "epoch": 0.5388692579505301, - "grad_norm": 0.4185556173324585, - "kl": 0.8048583984375, - "learning_rate": 1.0401078092941972e-05, - "loss": 0.0322, - "reward": 1.2166667029261589, - "reward_std": 0.3806588628794998, - "rewards/accuracy_reward": 0.30312500847503543, - "rewards/format_reward": 0.9135416872799397, - "step": 305 + "clip_ratio": 0.0, + "completion_length": 936.4531707763672, + "epoch": 0.9836457322081995, + "grad_norm": 0.5349833369255066, + "kl": 0.81640625, + "learning_rate": 1.5857419708633636e-08, + "loss": 0.0333, + "reward": 0.6149553805589676, + "reward_std": 0.11991378106176853, + "rewards/accuracy_reward": 0.1227678656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 3293 }, { - "completion_length": 485.6471481323242, - "epoch": 0.5477031802120141, - "grad_norm": 0.7252722382545471, - "kl": 0.7559814453125, - "learning_rate": 1.0092579993356386e-05, - "loss": 0.0303, - "reward": 1.2145833745598793, - "reward_std": 0.37445320282131433, - "rewards/accuracy_reward": 0.29427084147464483, - "rewards/format_reward": 0.9203125178813935, - "step": 310 + "clip_ratio": 0.0, + "completion_length": 959.1295013427734, + "epoch": 0.983944440295721, + "grad_norm": 0.4689333736896515, + "kl": 0.81884765625, + "learning_rate": 1.5275694128412675e-08, + "loss": 0.0336, + "reward": 0.5753348469734192, + "reward_std": 0.12316901981830597, + "rewards/accuracy_reward": 0.08258928963914514, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 3294 }, { - "completion_length": 490.91199226379393, - "epoch": 0.5565371024734982, - "grad_norm": 0.2333078235387802, - "kl": 0.6938720703125, - "learning_rate": 9.783993730667833e-06, - "loss": 0.0278, - "reward": 1.2690104454755784, - "reward_std": 0.3585425109602511, - "rewards/accuracy_reward": 0.3322916769422591, - "rewards/format_reward": 0.9367187716066837, - "step": 315 + "clip_ratio": 0.0, + "completion_length": 934.4442443847656, + "epoch": 0.9842431483832424, + "grad_norm": 0.8455535769462585, + "kl": 0.60400390625, + "learning_rate": 1.4704830949627825e-08, + "loss": 0.0417, + "reward": 0.5926339477300644, + "reward_std": 0.07783204992301762, + "rewards/accuracy_reward": 0.0982142873108387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 3295 }, { - "completion_length": 526.1885570526123, - "epoch": 0.5653710247349824, - "grad_norm": 0.7570303678512573, - "kl": 0.6832763671875, - "learning_rate": 9.475613168825374e-06, - "loss": 0.0273, - "reward": 1.2013021253049374, - "reward_std": 0.3837351520545781, - "rewards/accuracy_reward": 0.29192709079943596, - "rewards/format_reward": 0.9093750216066837, - "step": 320 + "clip_ratio": 0.0, + "completion_length": 952.0536041259766, + "epoch": 0.984541856470764, + "grad_norm": 0.9568902850151062, + "kl": 0.834716796875, + "learning_rate": 1.4144830793323406e-08, + "loss": 0.0401, + "reward": 0.5987723469734192, + "reward_std": 0.09982769377529621, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 3296 }, { - "completion_length": 546.7075695037842, - "epoch": 0.5742049469964664, - "grad_norm": 0.32359421253204346, - "kl": 0.54560546875, - "learning_rate": 9.167731975890977e-06, - "loss": 0.0218, - "reward": 1.1994792081415653, - "reward_std": 0.4060095700901002, - "rewards/accuracy_reward": 0.2968750075437129, - "rewards/format_reward": 0.9026041902601719, - "step": 325 + "clip_ratio": 0.0, + "completion_length": 948.5067291259766, + "epoch": 0.9848405645582854, + "grad_norm": 0.6568217277526855, + "kl": 0.93017578125, + "learning_rate": 1.3595694268723202e-08, + "loss": 0.0432, + "reward": 0.6389509066939354, + "reward_std": 0.14485501311719418, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616156578064, + "step": 3297 }, { - "completion_length": 575.3255393981933, - "epoch": 0.5830388692579506, - "grad_norm": 0.4345127046108246, - "kl": 0.6108642578125, - "learning_rate": 8.860643344382057e-06, - "loss": 0.0244, - "reward": 1.1312500394880771, - "reward_std": 0.4220845863223076, - "rewards/accuracy_reward": 0.2533854244975373, - "rewards/format_reward": 0.8778646014630794, - "step": 330 + "clip_ratio": 0.0, + "completion_length": 954.154052734375, + "epoch": 0.9851392726458069, + "grad_norm": 0.5934439897537231, + "kl": 0.578125, + "learning_rate": 1.3057421973236007e-08, + "loss": 0.0307, + "reward": 0.5904018133878708, + "reward_std": 0.1086134072393179, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4966517984867096, + "step": 3298 }, { - "completion_length": 542.5843936920166, - "epoch": 0.5918727915194346, - "grad_norm": 0.5992975831031799, - "kl": 0.5624267578125, - "learning_rate": 8.554639712066837e-06, - "loss": 0.0225, - "reward": 1.2007812820374966, - "reward_std": 0.3856398138217628, - "rewards/accuracy_reward": 0.292187507590279, - "rewards/format_reward": 0.9085937693715096, - "step": 335 + "clip_ratio": 0.0, + "completion_length": 956.3482666015625, + "epoch": 0.9854379807333283, + "grad_norm": 0.4323152005672455, + "kl": 0.735595703125, + "learning_rate": 1.253001449244673e-08, + "loss": 0.036, + "reward": 0.563616082072258, + "reward_std": 0.07001051609404385, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 3299 }, { - "completion_length": 503.3919399261475, - "epoch": 0.6007067137809188, - "grad_norm": 0.2095309942960739, - "kl": 0.529736328125, - "learning_rate": 8.250012483478478e-06, - "loss": 0.0212, - "reward": 1.2250000357627868, - "reward_std": 0.3707302604801953, - "rewards/accuracy_reward": 0.2986979266395792, - "rewards/format_reward": 0.9263021007180214, - "step": 340 + "clip_ratio": 0.0, + "completion_length": 947.1317443847656, + "epoch": 0.9857366888208499, + "grad_norm": 0.4345482587814331, + "kl": 0.73876953125, + "learning_rate": 1.2013472400125293e-08, + "loss": 0.0296, + "reward": 0.5440848618745804, + "reward_std": 0.10777911730110645, + "rewards/accuracy_reward": 0.04910714481957257, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 3300 }, { - "completion_length": 482.84766883850097, - "epoch": 0.6095406360424028, - "grad_norm": 0.20496566593647003, - "kl": 0.505419921875, - "learning_rate": 7.947051752413131e-06, - "loss": 0.0202, - "reward": 1.2750000424683094, - "reward_std": 0.3539407839998603, - "rewards/accuracy_reward": 0.33046875982545315, - "rewards/format_reward": 0.9445312693715096, - "step": 345 + "clip_ratio": 0.0, + "completion_length": 930.9844207763672, + "epoch": 0.9860353969083713, + "grad_norm": 0.7366137504577637, + "kl": 0.52587890625, + "learning_rate": 1.150779625821885e-08, + "loss": 0.0281, + "reward": 0.6473214477300644, + "reward_std": 0.06472904654219747, + "rewards/accuracy_reward": 0.1540178619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 3301 }, { - "completion_length": 531.3984535217285, - "epoch": 0.6183745583038869, - "grad_norm": 0.3090152144432068, - "kl": 0.629296875, - "learning_rate": 7.646046025676198e-06, - "loss": 0.0252, - "reward": 1.2320312917232514, - "reward_std": 0.3668346595019102, - "rewards/accuracy_reward": 0.3072916756384075, - "rewards/format_reward": 0.9247396036982536, - "step": 350 + "clip_ratio": 0.0, + "completion_length": 940.3170166015625, + "epoch": 0.9863341049958928, + "grad_norm": 0.9634992480278015, + "kl": 0.91455078125, + "learning_rate": 1.1012986616850685e-08, + "loss": 0.0341, + "reward": 0.5446428805589676, + "reward_std": 0.10867509432137012, + "rewards/accuracy_reward": 0.051339287078008056, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 3302 }, { - "completion_length": 552.1784030914307, - "epoch": 0.627208480565371, - "grad_norm": 0.45702025294303894, - "kl": 0.6481689453125, - "learning_rate": 7.34728194833988e-06, - "loss": 0.0259, - "reward": 1.2125000417232514, - "reward_std": 0.36545806713402273, - "rewards/accuracy_reward": 0.30052084126509726, - "rewards/format_reward": 0.9119791857898235, - "step": 355 + "clip_ratio": 0.0, + "completion_length": 974.9531707763672, + "epoch": 0.9866328130834142, + "grad_norm": 0.5524348020553589, + "kl": 1.080078125, + "learning_rate": 1.0529044014329081e-08, + "loss": 0.0548, + "reward": 0.6049107611179352, + "reward_std": 0.12175105512142181, + "rewards/accuracy_reward": 0.11607143399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 3303 }, { - "completion_length": 529.9958492279053, - "epoch": 0.6360424028268551, - "grad_norm": 0.33211594820022583, - "kl": 0.704541015625, - "learning_rate": 7.051044030773619e-06, - "loss": 0.0282, - "reward": 1.2580729566514492, - "reward_std": 0.3836502737365663, - "rewards/accuracy_reward": 0.3377604259643704, - "rewards/format_reward": 0.9203125230967999, - "step": 360 + "clip_ratio": 0.0, + "completion_length": 944.1763916015625, + "epoch": 0.9869315211709357, + "grad_norm": 1.2095417976379395, + "kl": 0.68115234375, + "learning_rate": 1.0055968977132902e-08, + "loss": 0.0415, + "reward": 0.6015625298023224, + "reward_std": 0.09213258512318134, + "rewards/accuracy_reward": 0.10937500605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875298023224, + "step": 3304 }, { - "completion_length": 544.3924690246582, - "epoch": 0.6448763250883393, - "grad_norm": 1.3437328338623047, - "kl": 0.77060546875, - "learning_rate": 6.757614377707409e-06, - "loss": 0.0308, - "reward": 1.2088542021811008, - "reward_std": 0.408264291100204, - "rewards/accuracy_reward": 0.3151041756384075, - "rewards/format_reward": 0.8937500201165676, - "step": 365 + "clip_ratio": 0.0, + "completion_length": 947.2701416015625, + "epoch": 0.9872302292584572, + "grad_norm": 0.9270266890525818, + "kl": 0.9873046875, + "learning_rate": 9.593762019922681e-09, + "loss": 0.0404, + "reward": 0.5758928805589676, + "reward_std": 0.07019779365509748, + "rewards/accuracy_reward": 0.082589291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 3305 }, { - "completion_length": 532.4552253723144, - "epoch": 0.6537102473498233, - "grad_norm": 0.5647250413894653, - "kl": 0.7380126953125, - "learning_rate": 6.467272419585984e-06, - "loss": 0.0295, - "reward": 1.2164062887430191, - "reward_std": 0.4184632558375597, - "rewards/accuracy_reward": 0.31953125898726287, - "rewards/format_reward": 0.8968750178813935, - "step": 370 + "clip_ratio": 0.0, + "completion_length": 974.4665832519531, + "epoch": 0.9875289373459787, + "grad_norm": 1.2449707984924316, + "kl": 1.2783203125, + "learning_rate": 9.142423645535081e-09, + "loss": 0.049, + "reward": 0.5837053805589676, + "reward_std": 0.10666595702059567, + "rewards/accuracy_reward": 0.09151786100119352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875149011612, + "step": 3306 }, { - "completion_length": 536.7328311920166, - "epoch": 0.6625441696113075, - "grad_norm": 0.6853923201560974, - "kl": 0.9218505859375, - "learning_rate": 6.18029464646968e-06, - "loss": 0.0369, - "reward": 1.2013021230697631, - "reward_std": 0.41806467771530154, - "rewards/accuracy_reward": 0.3039062582189217, - "rewards/format_reward": 0.8973958499729633, - "step": 375 + "clip_ratio": 0.0, + "completion_length": 964.4486999511719, + "epoch": 0.9878276454335001, + "grad_norm": 0.5206459760665894, + "kl": 0.322265625, + "learning_rate": 8.701954344980668e-09, + "loss": 0.0137, + "reward": 0.5814732313156128, + "reward_std": 0.06700764154084027, + "rewards/accuracy_reward": 0.08705357555299997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.494419664144516, + "step": 3307 }, { - "completion_length": 508.79246101379397, - "epoch": 0.6713780918727915, - "grad_norm": 0.3235747218132019, - "kl": 0.659375, - "learning_rate": 5.896954344735426e-06, - "loss": 0.0264, - "reward": 1.249739622324705, - "reward_std": 0.37400244316086173, - "rewards/accuracy_reward": 0.3302083401940763, - "rewards/format_reward": 0.9195312686264515, - "step": 380 + "clip_ratio": 0.0, + "completion_length": 938.1763916015625, + "epoch": 0.9881263535210216, + "grad_norm": 1.034024953842163, + "kl": 0.92626953125, + "learning_rate": 8.272354597448351e-09, + "loss": 0.0582, + "reward": 0.6378348469734192, + "reward_std": 0.1476863007992506, + "rewards/accuracy_reward": 0.14508929406292737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 3308 }, { - "completion_length": 480.9989696502686, - "epoch": 0.6802120141342756, - "grad_norm": 0.45692071318626404, - "kl": 0.5900146484375, - "learning_rate": 5.617521336828556e-06, - "loss": 0.0236, - "reward": 1.262239618599415, - "reward_std": 0.35354847051203253, - "rewards/accuracy_reward": 0.32812500894069674, - "rewards/format_reward": 0.9341146044433117, - "step": 385 + "clip_ratio": 0.0, + "completion_length": 898.3036193847656, + "epoch": 0.988425061608543, + "grad_norm": 0.6479416489601135, + "kl": 0.611328125, + "learning_rate": 7.853624870298727e-09, + "loss": 0.0314, + "reward": 0.6651786118745804, + "reward_std": 0.11181948520243168, + "rewards/accuracy_reward": 0.17187500651925802, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493303582072258, + "step": 3309 }, { - "completion_length": 472.4760540008545, - "epoch": 0.6890459363957597, - "grad_norm": 0.32815152406692505, - "kl": 0.5199462890625, - "learning_rate": 5.342261724313292e-06, - "loss": 0.0208, - "reward": 1.2812500447034836, - "reward_std": 0.33566789580509065, - "rewards/accuracy_reward": 0.3382812598720193, - "rewards/format_reward": 0.9429687730967998, - "step": 390 + "clip_ratio": 0.0, + "completion_length": 957.1295166015625, + "epoch": 0.9887237696960646, + "grad_norm": 0.9220425486564636, + "kl": 0.782470703125, + "learning_rate": 7.4457656190707324e-09, + "loss": 0.0344, + "reward": 0.6132812798023224, + "reward_std": 0.13750748336315155, + "rewards/accuracy_reward": 0.1205357164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 3310 }, { - "completion_length": 513.5192852020264, - "epoch": 0.6978798586572438, - "grad_norm": 0.44857704639434814, - "kl": 0.6975341796875, - "learning_rate": 5.0714376344666095e-06, - "loss": 0.0279, - "reward": 1.2565104596316814, - "reward_std": 0.38013526052236557, - "rewards/accuracy_reward": 0.3427083441987634, - "rewards/format_reward": 0.9138021022081375, - "step": 395 + "clip_ratio": 0.0, + "completion_length": 940.8661193847656, + "epoch": 0.989022477783586, + "grad_norm": 0.420410692691803, + "kl": 0.57177734375, + "learning_rate": 7.048777287472774e-09, + "loss": 0.021, + "reward": 0.5362723469734192, + "reward_std": 0.11554627865552902, + "rewards/accuracy_reward": 0.04241071664728224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 3311 }, { - "completion_length": 568.7135593414307, - "epoch": 0.7067137809187279, - "grad_norm": 0.47048985958099365, - "kl": 0.6981689453125, - "learning_rate": 4.8053069706567555e-06, - "loss": 0.0279, - "reward": 1.2117187917232513, - "reward_std": 0.4218774849548936, - "rewards/accuracy_reward": 0.33229167610406873, - "rewards/format_reward": 0.8794271029531956, - "step": 400 + "clip_ratio": 0.0, + "completion_length": 944.7656860351562, + "epoch": 0.9893211858711075, + "grad_norm": 0.829038679599762, + "kl": 0.892578125, + "learning_rate": 6.6626603073916e-09, + "loss": 0.0353, + "reward": 0.5156250149011612, + "reward_std": 0.06297942041419446, + "rewards/accuracy_reward": 0.022321430034935474, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035969734192, + "step": 3312 }, { - "epoch": 0.7067137809187279, - "eval_completion_length": 591.4181692940848, - "eval_kl": 1.0301339285714286, - "eval_loss": 0.04000028595328331, - "eval_reward": 1.2321428912026542, - "eval_reward_std": 0.49129511628832134, - "eval_rewards/accuracy_reward": 0.38988095947674345, - "eval_rewards/format_reward": 0.8422619274684361, - "eval_runtime": 57.521, - "eval_samples_per_second": 1.721, - "eval_steps_per_second": 0.035, - "step": 400 + "clip_ratio": 0.0, + "completion_length": 944.0580749511719, + "epoch": 0.9896198939586289, + "grad_norm": 0.5296908020973206, + "kl": 1.10986328125, + "learning_rate": 6.287415098883421e-09, + "loss": 0.0486, + "reward": 0.5926339626312256, + "reward_std": 0.1409473679959774, + "rewards/accuracy_reward": 0.10267857694998384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.489955373108387, + "step": 3313 }, { - "completion_length": 589.264342880249, - "epoch": 0.715547703180212, - "grad_norm": 0.4592438042163849, - "kl": 0.9536865234375, - "learning_rate": 4.5441231667441724e-06, - "loss": 0.0381, - "reward": 1.167968787252903, - "reward_std": 0.4488849970512092, - "rewards/accuracy_reward": 0.31666667517274616, - "rewards/format_reward": 0.8513021014630795, - "step": 405 + "clip_ratio": 0.0, + "completion_length": 961.1763916015625, + "epoch": 0.9899186020461505, + "grad_norm": 0.7024483680725098, + "kl": 1.0693359375, + "learning_rate": 5.923042070178353e-09, + "loss": 0.0452, + "reward": 0.545758955180645, + "reward_std": 0.08992115268483758, + "rewards/accuracy_reward": 0.05580357275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553880095482, + "step": 3314 }, { - "completion_length": 556.8291862487793, - "epoch": 0.7243816254416962, - "grad_norm": 0.6357600092887878, - "kl": 0.7573974609375, - "learning_rate": 4.288134945738684e-06, - "loss": 0.0303, - "reward": 1.1882812924683095, - "reward_std": 0.43695366848260164, - "rewards/accuracy_reward": 0.3119791763136163, - "rewards/format_reward": 0.8763021044433117, - "step": 410 + "clip_ratio": 0.0, + "completion_length": 966.935302734375, + "epoch": 0.9902173101336719, + "grad_norm": 1.1556239128112793, + "kl": 1.0595703125, + "learning_rate": 5.569541617679308e-09, + "loss": 0.0416, + "reward": 0.5513392984867096, + "reward_std": 0.07232599332928658, + "rewards/accuracy_reward": 0.060267860535532236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491071455180645, + "step": 3315 }, { - "completion_length": 506.94715003967286, - "epoch": 0.7332155477031802, - "grad_norm": 0.49249598383903503, - "kl": 0.6909423828125, - "learning_rate": 4.037586082942805e-06, - "loss": 0.0276, - "reward": 1.2596354559063911, - "reward_std": 0.41586904488503934, - "rewards/accuracy_reward": 0.3580729281529784, - "rewards/format_reward": 0.9015625178813934, - "step": 415 + "clip_ratio": 0.0, + "completion_length": 957.6875457763672, + "epoch": 0.9905160182211933, + "grad_norm": 0.9687811136245728, + "kl": 0.87109375, + "learning_rate": 5.22691412595866e-09, + "loss": 0.0437, + "reward": 0.5837053805589676, + "reward_std": 0.09439345821738243, + "rewards/accuracy_reward": 0.09375000605359674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 3316 }, { - "completion_length": 505.1260581970215, - "epoch": 0.7420494699646644, - "grad_norm": 0.7220035195350647, - "kl": 0.683154296875, - "learning_rate": 3.7927151738066693e-06, - "loss": 0.0273, - "reward": 1.2005208738148212, - "reward_std": 0.39248473905026915, - "rewards/accuracy_reward": 0.3106770905666053, - "rewards/format_reward": 0.8898437686264515, - "step": 420 + "clip_ratio": 0.0, + "completion_length": 920.4174499511719, + "epoch": 0.9908147263087148, + "grad_norm": 0.6394659280776978, + "kl": 0.521484375, + "learning_rate": 4.895159967762686e-09, + "loss": 0.0254, + "reward": 0.6004464626312256, + "reward_std": 0.12290199659764767, + "rewards/accuracy_reward": 0.10714286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 3317 }, { - "completion_length": 498.1062644958496, - "epoch": 0.7508833922261484, - "grad_norm": 0.31572696566581726, - "kl": 0.77158203125, - "learning_rate": 3.553755406715724e-06, - "loss": 0.0309, - "reward": 1.2153646171092987, - "reward_std": 0.4389189792796969, - "rewards/accuracy_reward": 0.3322916753590107, - "rewards/format_reward": 0.8830729372799396, - "step": 425 + "clip_ratio": 0.0, + "completion_length": 960.4888763427734, + "epoch": 0.9911134343962362, + "grad_norm": 1.818381428718567, + "kl": 1.1484375, + "learning_rate": 4.574279504007128e-09, + "loss": 0.0334, + "reward": 0.574218787252903, + "reward_std": 0.07065250724554062, + "rewards/accuracy_reward": 0.0870535746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.487165205180645, + "step": 3318 }, { - "completion_length": 477.43178367614746, - "epoch": 0.7597173144876325, - "grad_norm": 0.28930729627609253, - "kl": 0.7544921875, - "learning_rate": 3.320934340927513e-06, - "loss": 0.0302, - "reward": 1.2364583685994148, - "reward_std": 0.39876444116234777, - "rewards/accuracy_reward": 0.3304687602445483, - "rewards/format_reward": 0.9059895999729634, - "step": 430 + "clip_ratio": 0.0, + "completion_length": 949.0960235595703, + "epoch": 0.9914121424837578, + "grad_norm": 1.1663293838500977, + "kl": 0.577392578125, + "learning_rate": 4.264273083778303e-09, + "loss": 0.0323, + "reward": 0.5742187649011612, + "reward_std": 0.060642533004283905, + "rewards/accuracy_reward": 0.0803571492433548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 3319 }, { - "completion_length": 469.1546977996826, - "epoch": 0.7685512367491166, - "grad_norm": 0.5157067775726318, - "kl": 0.6148681640625, - "learning_rate": 3.094473689869002e-06, - "loss": 0.0246, - "reward": 1.2536458775401116, - "reward_std": 0.36776905208826066, - "rewards/accuracy_reward": 0.32734375898726287, - "rewards/format_reward": 0.9263021051883698, - "step": 435 + "clip_ratio": 0.0, + "completion_length": 947.1272735595703, + "epoch": 0.9917108505712792, + "grad_norm": 0.8710744976997375, + "kl": 1.416015625, + "learning_rate": 3.965141044333099e-09, + "loss": 0.0615, + "reward": 0.611607164144516, + "reward_std": 0.0687430864199996, + "rewards/accuracy_reward": 0.1227678619325161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393059372902, + "step": 3320 }, { - "completion_length": 470.22579460144044, - "epoch": 0.7773851590106007, - "grad_norm": 0.5000882744789124, - "kl": 0.5715576171875, - "learning_rate": 2.8745891100008683e-06, - "loss": 0.0229, - "reward": 1.2776041999459267, - "reward_std": 0.3550225287675858, - "rewards/accuracy_reward": 0.34166667724493893, - "rewards/format_reward": 0.9359375186264515, - "step": 440 + "clip_ratio": 0.0, + "completion_length": 930.4442291259766, + "epoch": 0.9920095586588007, + "grad_norm": 0.5803068280220032, + "kl": 0.7333984375, + "learning_rate": 3.676883711097867e-09, + "loss": 0.0397, + "reward": 0.6551339626312256, + "reward_std": 0.1553910132497549, + "rewards/accuracy_reward": 0.1629464365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3321 }, { - "completion_length": 496.6867332458496, - "epoch": 0.7862190812720848, - "grad_norm": 0.7162412405014038, - "kl": 0.6553955078125, - "learning_rate": 2.6614899954497797e-06, - "loss": 0.0262, - "reward": 1.2505208760499955, - "reward_std": 0.37483210051432253, - "rewards/accuracy_reward": 0.33515625952277334, - "rewards/format_reward": 0.9153645984828472, - "step": 445 + "clip_ratio": 0.0, + "completion_length": 952.7433471679688, + "epoch": 0.9923082667463221, + "grad_norm": 1.2223000526428223, + "kl": 1.0283203125, + "learning_rate": 3.3995013976684253e-09, + "loss": 0.0461, + "reward": 0.5524553805589676, + "reward_std": 0.11175016406923532, + "rewards/accuracy_reward": 0.06250000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4899553805589676, + "step": 3322 }, { - "completion_length": 523.0612133026123, - "epoch": 0.7950530035335689, - "grad_norm": 0.40043678879737854, - "kl": 0.737939453125, - "learning_rate": 2.455379278604226e-06, - "loss": 0.0295, - "reward": 1.2184896238148213, - "reward_std": 0.4309634905308485, - "rewards/accuracy_reward": 0.336718759406358, - "rewards/format_reward": 0.8817708536982536, - "step": 450 + "clip_ratio": 0.0, + "completion_length": 938.8861999511719, + "epoch": 0.9926069748338436, + "grad_norm": 0.645093560218811, + "kl": 0.71923828125, + "learning_rate": 3.132994405808942e-09, + "loss": 0.0387, + "reward": 0.6406250298023224, + "reward_std": 0.1659430731087923, + "rewards/accuracy_reward": 0.149553582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 3323 }, { - "completion_length": 517.6575660705566, - "epoch": 0.803886925795053, - "grad_norm": 0.43137258291244507, - "kl": 0.714453125, - "learning_rate": 2.256453236863815e-06, - "loss": 0.0286, - "reward": 1.2213542066514491, - "reward_std": 0.40591246346011756, - "rewards/accuracy_reward": 0.33515626038424673, - "rewards/format_reward": 0.8861979372799397, - "step": 455 + "clip_ratio": 0.0, + "completion_length": 978.5536041259766, + "epoch": 0.992905682921365, + "grad_norm": 0.59635329246521, + "kl": 0.54345703125, + "learning_rate": 2.877363025454161e-09, + "loss": 0.0164, + "reward": 0.5329241305589676, + "reward_std": 0.11077826959080994, + "rewards/accuracy_reward": 0.0379464291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 3324 }, { - "completion_length": 518.142724609375, - "epoch": 0.8127208480565371, - "grad_norm": 0.8827040195465088, - "kl": 0.732080078125, - "learning_rate": 2.064901305726055e-06, - "loss": 0.0293, - "reward": 1.2278646230697632, - "reward_std": 0.4181942055001855, - "rewards/accuracy_reward": 0.33802084233611823, - "rewards/format_reward": 0.8898437708616257, - "step": 460 + "clip_ratio": 0.0, + "completion_length": 940.6920318603516, + "epoch": 0.9932043910088866, + "grad_norm": 0.40652981400489807, + "kl": 0.8505859375, + "learning_rate": 2.632607534703846e-09, + "loss": 0.0376, + "reward": 0.5714285969734192, + "reward_std": 0.07928062067367136, + "rewards/accuracy_reward": 0.0781250037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 3325 }, { - "completion_length": 520.0903812408447, - "epoch": 0.8215547703180212, - "grad_norm": 0.9919424653053284, - "kl": 0.90341796875, - "learning_rate": 1.880905898388612e-06, - "loss": 0.0362, - "reward": 1.204166703671217, - "reward_std": 0.45391905400902033, - "rewards/accuracy_reward": 0.33750001061707735, - "rewards/format_reward": 0.8666666887700558, - "step": 465 + "clip_ratio": 0.0, + "completion_length": 936.1830749511719, + "epoch": 0.993503099096408, + "grad_norm": 1.1894493103027344, + "kl": 0.921875, + "learning_rate": 2.3987281998294477e-09, + "loss": 0.046, + "reward": 0.6210937649011612, + "reward_std": 0.09660216327756643, + "rewards/accuracy_reward": 0.13169643399305642, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 3326 }, { - "completion_length": 526.2109565734863, - "epoch": 0.8303886925795053, - "grad_norm": 1.2576853036880493, - "kl": 1.0386962890625, - "learning_rate": 1.7046422320388556e-06, - "loss": 0.0416, - "reward": 1.1648437902331352, - "reward_std": 0.47298653740435836, - "rewards/accuracy_reward": 0.31015625963918864, - "rewards/format_reward": 0.8546875223517418, - "step": 470 + "clip_ratio": 0.0, + "completion_length": 968.3683624267578, + "epoch": 0.9938018071839295, + "grad_norm": 0.6024178862571716, + "kl": 0.7685546875, + "learning_rate": 2.1757252752685475e-09, + "loss": 0.0338, + "reward": 0.5719866305589676, + "reward_std": 0.11432175617665052, + "rewards/accuracy_reward": 0.08035714668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 3327 }, { - "completion_length": 494.23230781555174, - "epoch": 0.8392226148409894, - "grad_norm": 0.37666624784469604, - "kl": 0.851123046875, - "learning_rate": 1.5362781609960853e-06, - "loss": 0.0341, - "reward": 1.2177083760499954, - "reward_std": 0.4222365788649768, - "rewards/accuracy_reward": 0.3223958426620811, - "rewards/format_reward": 0.8953125178813934, - "step": 475 + "clip_ratio": 0.0, + "completion_length": 929.7969207763672, + "epoch": 0.9941005152714509, + "grad_norm": 0.867092490196228, + "kl": 0.5302734375, + "learning_rate": 1.9635990036270813e-09, + "loss": 0.0168, + "reward": 0.6071428805589676, + "reward_std": 0.13816221430897713, + "rewards/accuracy_reward": 0.1116071455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4955357313156128, + "step": 3328 }, { - "completion_length": 486.34949150085447, - "epoch": 0.8480565371024735, - "grad_norm": 0.33189892768859863, - "kl": 0.7993896484375, - "learning_rate": 1.375974016865359e-06, - "loss": 0.032, - "reward": 1.227083369344473, - "reward_std": 0.4056109145283699, - "rewards/accuracy_reward": 0.328385425824672, - "rewards/format_reward": 0.8986979387700558, - "step": 480 + "clip_ratio": 0.0, + "completion_length": 950.4442443847656, + "epoch": 0.9943992233589725, + "grad_norm": 0.4133750796318054, + "kl": 0.9404296875, + "learning_rate": 1.7623496156771169e-09, + "loss": 0.033, + "reward": 0.6322544887661934, + "reward_std": 0.0820098016411066, + "rewards/accuracy_reward": 0.14062500465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4916294813156128, + "step": 3329 }, { - "completion_length": 484.6830867767334, - "epoch": 0.8568904593639576, - "grad_norm": 0.334587424993515, - "kl": 0.7773681640625, - "learning_rate": 1.2238824558551365e-06, - "loss": 0.0311, - "reward": 1.2481771245598794, - "reward_std": 0.4028174251317978, - "rewards/accuracy_reward": 0.34244792610406877, - "rewards/format_reward": 0.9057291857898235, - "step": 485 + "clip_ratio": 0.0, + "completion_length": 968.7120971679688, + "epoch": 0.9946979314464939, + "grad_norm": 0.5425924062728882, + "kl": 0.849609375, + "learning_rate": 1.5719773303568553e-09, + "loss": 0.0399, + "reward": 0.5864955484867096, + "reward_std": 0.13398644607514143, + "rewards/accuracy_reward": 0.09375000349245965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455484867096, + "step": 3330 }, { - "completion_length": 475.6992298126221, - "epoch": 0.8657243816254417, - "grad_norm": 0.4910571873188019, - "kl": 0.7350830078125, - "learning_rate": 1.080148313404127e-06, - "loss": 0.0294, - "reward": 1.2268229596316815, - "reward_std": 0.37681172844022515, - "rewards/accuracy_reward": 0.3067708421032876, - "rewards/format_reward": 0.9200521044433116, - "step": 490 + "clip_ratio": 0.0, + "completion_length": 954.3817443847656, + "epoch": 0.9949966395340154, + "grad_norm": 0.7738576531410217, + "kl": 1.1259765625, + "learning_rate": 1.392482354775071e-09, + "loss": 0.0583, + "reward": 0.5267857313156128, + "reward_std": 0.09746246645227075, + "rewards/accuracy_reward": 0.03571428777649999, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4910714477300644, + "step": 3331 }, { - "completion_length": 470.992720413208, - "epoch": 0.8745583038869258, - "grad_norm": 0.4497652053833008, - "kl": 0.6725341796875, - "learning_rate": 9.449084662557984e-07, - "loss": 0.0269, - "reward": 1.2450521253049374, - "reward_std": 0.3653396725654602, - "rewards/accuracy_reward": 0.31562500838190316, - "rewards/format_reward": 0.9294271044433117, - "step": 495 + "clip_ratio": 0.0, + "completion_length": 966.4464721679688, + "epoch": 0.9952953476215368, + "grad_norm": 0.7691988348960876, + "kl": 1.0146484375, + "learning_rate": 1.2238648842033408e-09, + "loss": 0.0447, + "reward": 0.603794664144516, + "reward_std": 0.07231528963893652, + "rewards/accuracy_reward": 0.11160714738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3332 }, { - "completion_length": 470.85782585144045, - "epoch": 0.8833922261484098, - "grad_norm": 0.2721257209777832, - "kl": 0.6823486328125, - "learning_rate": 8.182917021118664e-07, - "loss": 0.0273, - "reward": 1.2682292021811008, - "reward_std": 0.3636104612611234, - "rewards/accuracy_reward": 0.3427083446644247, - "rewards/format_reward": 0.9255208522081375, - "step": 500 + "clip_ratio": 0.0, + "completion_length": 979.0245971679688, + "epoch": 0.9955940557090583, + "grad_norm": 0.8864338994026184, + "kl": 0.9072265625, + "learning_rate": 1.0661251020815944e-09, + "loss": 0.0374, + "reward": 0.5970982313156128, + "reward_std": 0.06831089756451547, + "rewards/accuracy_reward": 0.1026785746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4944196566939354, + "step": 3333 }, { - "epoch": 0.8833922261484098, - "eval_completion_length": 473.1770935058594, - "eval_kl": 0.6358816964285714, - "eval_loss": 0.02628379687666893, - "eval_reward": 1.3065476417541504, - "eval_reward_std": 0.38953217438289095, - "eval_rewards/accuracy_reward": 0.3764881023338863, - "eval_rewards/format_reward": 0.9300595436777387, - "eval_runtime": 51.631, - "eval_samples_per_second": 1.917, - "eval_steps_per_second": 0.039, - "step": 500 + "clip_ratio": 0.0, + "completion_length": 948.3326263427734, + "epoch": 0.9958927637965798, + "grad_norm": 1.010763168334961, + "kl": 0.74072265625, + "learning_rate": 9.192631800147844e-10, + "loss": 0.0416, + "reward": 0.5909598469734192, + "reward_std": 0.0720328763127327, + "rewards/accuracy_reward": 0.09598214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4949776977300644, + "step": 3334 }, { - "completion_length": 484.1507957458496, - "epoch": 0.892226148409894, - "grad_norm": 0.3219813108444214, - "kl": 0.6318603515625, - "learning_rate": 7.004185969889188e-07, - "loss": 0.0253, - "reward": 1.2557292029261589, - "reward_std": 0.3643105070106685, - "rewards/accuracy_reward": 0.3278645928483456, - "rewards/format_reward": 0.9278646051883698, - "step": 505 + "clip_ratio": 0.0, + "completion_length": 963.2344055175781, + "epoch": 0.9961914718841013, + "grad_norm": 0.894925594329834, + "kl": 0.953125, + "learning_rate": 7.832792777739962e-10, + "loss": 0.048, + "reward": 0.540178582072258, + "reward_std": 0.07906406186521053, + "rewards/accuracy_reward": 0.051339287078008056, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4888393133878708, + "step": 3335 }, { - "completion_length": 477.00678329467775, - "epoch": 0.901060070671378, - "grad_norm": 0.4054728150367737, - "kl": 0.6594482421875, - "learning_rate": 5.914014003949408e-07, - "loss": 0.0264, - "reward": 1.277083372324705, - "reward_std": 0.3729406754486263, - "rewards/accuracy_reward": 0.3463541771983728, - "rewards/format_reward": 0.9307291865348816, - "step": 510 + "clip_ratio": 0.0, + "completion_length": 951.2254943847656, + "epoch": 0.9964901799716227, + "grad_norm": 1.6550697088241577, + "kl": 0.8876953125, + "learning_rate": 6.581735432964476e-10, + "loss": 0.0416, + "reward": 0.6411830633878708, + "reward_std": 0.14809241704642773, + "rewards/accuracy_reward": 0.14732143469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.493861623108387, + "step": 3336 }, { - "completion_length": 481.5070434570313, - "epoch": 0.9098939929328622, - "grad_norm": 0.6569808721542358, - "kl": 0.7029541015625, - "learning_rate": 4.913439284351207e-07, - "loss": 0.0281, - "reward": 1.2552083715796472, - "reward_std": 0.36309111285954715, - "rewards/accuracy_reward": 0.33177084345370533, - "rewards/format_reward": 0.9234375193715095, - "step": 515 + "clip_ratio": 0.0, + "completion_length": 951.1562957763672, + "epoch": 0.9967888880591442, + "grad_norm": 0.604431688785553, + "kl": 0.81396484375, + "learning_rate": 5.439461126854894e-10, + "loss": 0.0514, + "reward": 0.6601562798023224, + "reward_std": 0.11656972020864487, + "rewards/accuracy_reward": 0.16964286752045155, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 3337 }, { - "completion_length": 484.1567832946777, - "epoch": 0.9187279151943463, - "grad_norm": 0.3494890332221985, - "kl": 0.6611083984375, - "learning_rate": 4.003414649486892e-07, - "loss": 0.0265, - "reward": 1.2908854551613331, - "reward_std": 0.3768584240227938, - "rewards/accuracy_reward": 0.36510417722165583, - "rewards/format_reward": 0.9257812723517418, - "step": 520 + "clip_ratio": 0.0, + "completion_length": 967.8013763427734, + "epoch": 0.9970875961466656, + "grad_norm": 0.5434059500694275, + "kl": 0.71875, + "learning_rate": 4.4059711020949523e-10, + "loss": 0.0321, + "reward": 0.573660746216774, + "reward_std": 0.10922924242913723, + "rewards/accuracy_reward": 0.08035714412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4933035895228386, + "step": 3338 }, { - "completion_length": 493.15131454467775, - "epoch": 0.9275618374558304, - "grad_norm": 0.35475271940231323, - "kl": 0.7049560546875, - "learning_rate": 3.184806707709698e-07, - "loss": 0.0282, - "reward": 1.2507812805473804, - "reward_std": 0.37382183149456977, - "rewards/accuracy_reward": 0.33359375889413057, - "rewards/format_reward": 0.9171875201165676, - "step": 525 + "clip_ratio": 0.0, + "completion_length": 937.3973693847656, + "epoch": 0.9973863042341872, + "grad_norm": 0.8377710580825806, + "kl": 0.68408203125, + "learning_rate": 3.4812664830186084e-10, + "loss": 0.0415, + "reward": 0.5820312649011612, + "reward_std": 0.06652617454528809, + "rewards/accuracy_reward": 0.08928571734577417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455633878708, + "step": 3339 }, { - "completion_length": 499.4250141143799, - "epoch": 0.9363957597173145, - "grad_norm": 0.5438317060470581, - "kl": 0.79248046875, - "learning_rate": 2.458395012070369e-07, - "loss": 0.0317, - "reward": 1.2453125409781933, - "reward_std": 0.38421452324837446, - "rewards/accuracy_reward": 0.3304687574040145, - "rewards/format_reward": 0.9148437693715096, - "step": 530 + "clip_ratio": 0.0, + "completion_length": 907.1942291259766, + "epoch": 0.9976850123217086, + "grad_norm": 0.7404647469520569, + "kl": 0.9296875, + "learning_rate": 2.665348275610047e-10, + "loss": 0.0484, + "reward": 0.631138414144516, + "reward_std": 0.1265859603881836, + "rewards/accuracy_reward": 0.1383928656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927455559372902, + "step": 3340 }, { - "completion_length": 505.2278835296631, - "epoch": 0.9452296819787986, - "grad_norm": 0.3518020510673523, - "kl": 0.718017578125, - "learning_rate": 1.8248713179557788e-07, - "loss": 0.0287, - "reward": 1.257552120089531, - "reward_std": 0.41283271964639423, - "rewards/accuracy_reward": 0.3481770919635892, - "rewards/format_reward": 0.9093750171363354, - "step": 535 + "clip_ratio": 0.0, + "completion_length": 948.7969207763672, + "epoch": 0.9979837204092301, + "grad_norm": 1.2576568126678467, + "kl": 0.70703125, + "learning_rate": 1.958217367514781e-10, + "loss": 0.0226, + "reward": 0.5887277126312256, + "reward_std": 0.09028939250856638, + "rewards/accuracy_reward": 0.09821428847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 3341 }, { - "completion_length": 506.74063835144045, - "epoch": 0.9540636042402827, - "grad_norm": 0.43968465924263, - "kl": 0.8108642578125, - "learning_rate": 1.2848389243363514e-07, - "loss": 0.0324, - "reward": 1.2395833745598792, - "reward_std": 0.39432696914300325, - "rewards/accuracy_reward": 0.32552084382623436, - "rewards/format_reward": 0.9140625201165676, - "step": 540 + "clip_ratio": 0.0, + "completion_length": 958.1897888183594, + "epoch": 0.9982824284967515, + "grad_norm": 0.8784641027450562, + "kl": 1.4248046875, + "learning_rate": 1.359874528006344e-10, + "loss": 0.0629, + "reward": 0.6439732611179352, + "reward_std": 0.13895309157669544, + "rewards/accuracy_reward": 0.1562500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4877232313156128, + "step": 3342 }, { - "completion_length": 501.75417976379396, - "epoch": 0.9628975265017667, - "grad_norm": 0.49120408296585083, - "kl": 0.73642578125, - "learning_rate": 8.388120992499083e-08, - "loss": 0.0295, - "reward": 1.2252604506909848, - "reward_std": 0.38105701059103014, - "rewards/accuracy_reward": 0.3171875092666596, - "rewards/format_reward": 0.9080729350447655, - "step": 545 + "clip_ratio": 0.0, + "completion_length": 965.5781555175781, + "epoch": 0.998581136584273, + "grad_norm": 1.0037137269973755, + "kl": 1.51171875, + "learning_rate": 8.703204080418026e-11, + "loss": 0.0518, + "reward": 0.5507812798023224, + "reward_std": 0.10685666743665934, + "rewards/accuracy_reward": 0.06250000488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4882812723517418, + "step": 3343 }, { - "completion_length": 500.1635566711426, - "epoch": 0.9717314487632509, - "grad_norm": 0.4883018136024475, - "kl": 0.7296142578125, - "learning_rate": 4.8721559006873473e-08, - "loss": 0.0292, - "reward": 1.260677120089531, - "reward_std": 0.38784099034965036, - "rewards/accuracy_reward": 0.3515625098254532, - "rewards/format_reward": 0.9091146029531956, - "step": 550 + "clip_ratio": 0.0, + "completion_length": 925.8437957763672, + "epoch": 0.9988798446717945, + "grad_norm": 0.44397562742233276, + "kl": 0.90478515625, + "learning_rate": 4.895555402062435e-11, + "loss": 0.026, + "reward": 0.5909598469734192, + "reward_std": 0.1376759596168995, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.490513414144516, + "step": 3344 }, { - "completion_length": 501.52787895202636, - "epoch": 0.980565371024735, - "grad_norm": 0.28192296624183655, - "kl": 0.7337890625, - "learning_rate": 2.3038421901651064e-08, - "loss": 0.0294, - "reward": 1.2546875417232513, - "reward_std": 0.4058391135185957, - "rewards/accuracy_reward": 0.33906250912696123, - "rewards/format_reward": 0.9156250216066837, - "step": 555 + "clip_ratio": 0.0, + "completion_length": 955.8750610351562, + "epoch": 0.999178552759316, + "grad_norm": 0.44204282760620117, + "kl": 0.8291015625, + "learning_rate": 2.1758033871277507e-11, + "loss": 0.0328, + "reward": 0.5680803805589676, + "reward_std": 0.10021161893382668, + "rewards/accuracy_reward": 0.07589286053553224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4921875223517418, + "step": 3345 }, { - "completion_length": 508.4052215576172, - "epoch": 0.9893992932862191, - "grad_norm": 0.634943425655365, - "kl": 0.7984130859375, - "learning_rate": 6.856256432000719e-09, - "loss": 0.0319, - "reward": 1.2528646238148213, - "reward_std": 0.4318184578791261, - "rewards/accuracy_reward": 0.35208334370981903, - "rewards/format_reward": 0.9007812708616256, - "step": 560 + "clip_ratio": 0.0, + "completion_length": 950.3884429931641, + "epoch": 0.9994772608468374, + "grad_norm": 0.7998667359352112, + "kl": 0.7861328125, + "learning_rate": 5.439509946914001e-12, + "loss": 0.0433, + "reward": 0.6367187798023224, + "reward_std": 0.10677458345890045, + "rewards/accuracy_reward": 0.14285715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4938616305589676, + "step": 3346 }, { - "completion_length": 497.38673439025877, - "epoch": 0.9982332155477032, - "grad_norm": 0.5770408511161804, - "kl": 0.7656494140625, - "learning_rate": 1.904727299473219e-10, - "loss": 0.0306, - "reward": 1.2656250461935996, - "reward_std": 0.39005161710083486, - "rewards/accuracy_reward": 0.34973959196358917, - "rewards/format_reward": 0.9158854357898235, - "step": 565 + "clip_ratio": 0.0, + "completion_length": 942.5693054199219, + "epoch": 0.9997759689343589, + "grad_norm": 0.3975193500518799, + "kl": 0.806640625, + "learning_rate": 0.0, + "loss": 0.0481, + "reward": 0.5853794887661934, + "reward_std": 0.10773887578397989, + "rewards/accuracy_reward": 0.09598214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4893973395228386, + "step": 3347 }, { - "completion_length": 475.08334159851074, - "epoch": 1.0, - "kl": 0.8173828125, - "reward": 1.2356771230697632, - "reward_std": 0.39903966896235943, - "rewards/accuracy_reward": 0.330729172565043, - "rewards/format_reward": 0.9049479365348816, - "step": 566, + "epoch": 0.9997759689343589, + "step": 3347, "total_flos": 0.0, - "train_loss": 0.27599951057092365, - "train_runtime": 58865.7657, - "train_samples_per_second": 1.231, - "train_steps_per_second": 0.01 + "train_loss": 0.05178338478773718, + "train_runtime": 159624.3158, + "train_samples_per_second": 0.587, + "train_steps_per_second": 0.021 } ], - "logging_steps": 5, - "max_steps": 566, + "logging_steps": 1, + "max_steps": 3347, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, @@ -1574,8 +50234,8 @@ "should_epoch_stop": false, "should_evaluate": false, "should_log": false, - "should_save": false, - "should_training_stop": false + "should_save": true, + "should_training_stop": true }, "attributes": {} }