|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.5141388174807198, |
|
"eval_steps": 500, |
|
"global_step": 300, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2889.4444580078125, |
|
"epoch": 0.001713796058269066, |
|
"grad_norm": 0.1318138986825943, |
|
"kl": 0.0, |
|
"learning_rate": 1.6949152542372882e-08, |
|
"loss": 0.0468, |
|
"reward": 0.861111119389534, |
|
"reward_std": 0.38495802134275436, |
|
"rewards/accuracy_reward": 0.20833333674818277, |
|
"rewards/format_reward": 0.4444444440305233, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2912.0833129882812, |
|
"epoch": 0.003427592116538132, |
|
"grad_norm": 0.07943949103355408, |
|
"kl": 0.0, |
|
"learning_rate": 3.3898305084745764e-08, |
|
"loss": 0.0335, |
|
"reward": 0.47222224064171314, |
|
"reward_std": 0.1791159175336361, |
|
"rewards/accuracy_reward": 0.0694444477558136, |
|
"rewards/format_reward": 0.33333334513008595, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2928.4444580078125, |
|
"epoch": 0.005141388174807198, |
|
"grad_norm": 0.09856246411800385, |
|
"kl": 3.4332275390625e-05, |
|
"learning_rate": 5.0847457627118645e-08, |
|
"loss": 0.0448, |
|
"reward": 0.6388888880610466, |
|
"reward_std": 0.23899273201823235, |
|
"rewards/accuracy_reward": 0.13888889271765947, |
|
"rewards/format_reward": 0.3611111156642437, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2919.0555419921875, |
|
"epoch": 0.006855184233076264, |
|
"grad_norm": 0.13336561620235443, |
|
"kl": 3.319978713989258e-05, |
|
"learning_rate": 6.779661016949153e-08, |
|
"loss": 0.0937, |
|
"reward": 1.0416666865348816, |
|
"reward_std": 0.8419074863195419, |
|
"rewards/accuracy_reward": 0.3055555634200573, |
|
"rewards/format_reward": 0.4305555671453476, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2756.0000610351562, |
|
"epoch": 0.00856898029134533, |
|
"grad_norm": 0.20592202246189117, |
|
"kl": 1.940131187438965e-05, |
|
"learning_rate": 8.47457627118644e-08, |
|
"loss": 0.1041, |
|
"reward": 0.986111119389534, |
|
"reward_std": 0.4405064880847931, |
|
"rewards/accuracy_reward": 0.23611112032085657, |
|
"rewards/format_reward": 0.5138888955116272, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3220.4166870117188, |
|
"epoch": 0.010282776349614395, |
|
"grad_norm": 0.15423689782619476, |
|
"kl": 3.999471664428711e-05, |
|
"learning_rate": 1.0169491525423729e-07, |
|
"loss": 0.0714, |
|
"reward": 0.5972222350537777, |
|
"reward_std": 0.6300827041268349, |
|
"rewards/accuracy_reward": 0.13888888992369175, |
|
"rewards/format_reward": 0.31944444589316845, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2272.7222595214844, |
|
"epoch": 0.011996572407883462, |
|
"grad_norm": 0.1403103917837143, |
|
"kl": 3.629922866821289e-05, |
|
"learning_rate": 1.1864406779661017e-07, |
|
"loss": 0.0274, |
|
"reward": 1.0972222238779068, |
|
"reward_std": 0.3974733129143715, |
|
"rewards/accuracy_reward": 0.2500000074505806, |
|
"rewards/format_reward": 0.5972222238779068, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3010.4722290039062, |
|
"epoch": 0.013710368466152529, |
|
"grad_norm": 0.15772269666194916, |
|
"kl": 2.4080276489257812e-05, |
|
"learning_rate": 1.3559322033898305e-07, |
|
"loss": 0.0218, |
|
"reward": 1.3888889253139496, |
|
"reward_std": 0.8940589390695095, |
|
"rewards/accuracy_reward": 0.430555559694767, |
|
"rewards/format_reward": 0.5277777835726738, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2840.0556030273438, |
|
"epoch": 0.015424164524421594, |
|
"grad_norm": 0.10687297582626343, |
|
"kl": 2.6017427444458008e-05, |
|
"learning_rate": 1.5254237288135593e-07, |
|
"loss": 0.0778, |
|
"reward": 0.9305555522441864, |
|
"reward_std": 0.6126390919089317, |
|
"rewards/accuracy_reward": 0.2500000037252903, |
|
"rewards/format_reward": 0.43055555783212185, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2991.7083129882812, |
|
"epoch": 0.01713796058269066, |
|
"grad_norm": 0.15705247223377228, |
|
"kl": 3.451108932495117e-05, |
|
"learning_rate": 1.694915254237288e-07, |
|
"loss": 0.0238, |
|
"reward": 0.5833333283662796, |
|
"reward_std": 0.6530618071556091, |
|
"rewards/accuracy_reward": 0.15277777891606092, |
|
"rewards/format_reward": 0.27777778171002865, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2877.375030517578, |
|
"epoch": 0.018851756640959727, |
|
"grad_norm": 0.21513371169567108, |
|
"kl": 3.460049629211426e-05, |
|
"learning_rate": 1.8644067796610168e-07, |
|
"loss": 0.0466, |
|
"reward": 0.7916666800156236, |
|
"reward_std": 0.18172631040215492, |
|
"rewards/accuracy_reward": 0.2361111119389534, |
|
"rewards/format_reward": 0.3194444412365556, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3343.013916015625, |
|
"epoch": 0.02056555269922879, |
|
"grad_norm": 0.14181174337863922, |
|
"kl": 3.936886787414551e-05, |
|
"learning_rate": 2.0338983050847458e-07, |
|
"loss": 0.074, |
|
"reward": 0.3888888908550143, |
|
"reward_std": 0.5541914477944374, |
|
"rewards/accuracy_reward": 0.08333333395421505, |
|
"rewards/format_reward": 0.22222222667187452, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2895.25, |
|
"epoch": 0.022279348757497857, |
|
"grad_norm": 0.12315575033426285, |
|
"kl": 3.3020973205566406e-05, |
|
"learning_rate": 2.2033898305084743e-07, |
|
"loss": 0.0576, |
|
"reward": 0.833333358168602, |
|
"reward_std": 0.5088144801557064, |
|
"rewards/accuracy_reward": 0.18055555783212185, |
|
"rewards/format_reward": 0.4722222201526165, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3024.763916015625, |
|
"epoch": 0.023993144815766924, |
|
"grad_norm": 0.1697244644165039, |
|
"kl": 2.4959444999694824e-05, |
|
"learning_rate": 2.3728813559322033e-07, |
|
"loss": 0.0401, |
|
"reward": 0.666666679084301, |
|
"reward_std": 0.29541126638650894, |
|
"rewards/accuracy_reward": 0.18055556062608957, |
|
"rewards/format_reward": 0.3055555634200573, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3397.7916259765625, |
|
"epoch": 0.02570694087403599, |
|
"grad_norm": 0.14056538045406342, |
|
"kl": 4.1157007217407227e-05, |
|
"learning_rate": 2.542372881355932e-07, |
|
"loss": 0.0871, |
|
"reward": 0.33333333767950535, |
|
"reward_std": 0.616430725902319, |
|
"rewards/accuracy_reward": 0.08333333488553762, |
|
"rewards/format_reward": 0.16666666977107525, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2442.4722595214844, |
|
"epoch": 0.027420736932305057, |
|
"grad_norm": 0.15735530853271484, |
|
"kl": 2.2862106561660767e-05, |
|
"learning_rate": 2.711864406779661e-07, |
|
"loss": 0.0156, |
|
"reward": 1.4166666567325592, |
|
"reward_std": 0.86329685151577, |
|
"rewards/accuracy_reward": 0.4305555634200573, |
|
"rewards/format_reward": 0.5555555522441864, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2926.25, |
|
"epoch": 0.02913453299057412, |
|
"grad_norm": 0.0811009407043457, |
|
"kl": 2.7507543563842773e-05, |
|
"learning_rate": 2.88135593220339e-07, |
|
"loss": 0.0173, |
|
"reward": 0.8750000167638063, |
|
"reward_std": 0.2480051852762699, |
|
"rewards/accuracy_reward": 0.2777777798473835, |
|
"rewards/format_reward": 0.31944445334374905, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3358.9722900390625, |
|
"epoch": 0.030848329048843187, |
|
"grad_norm": 0.1565597802400589, |
|
"kl": 3.2454729080200195e-05, |
|
"learning_rate": 3.0508474576271186e-07, |
|
"loss": 0.0761, |
|
"reward": 0.3472222350537777, |
|
"reward_std": 0.47978880628943443, |
|
"rewards/accuracy_reward": 0.0972222238779068, |
|
"rewards/format_reward": 0.1527777798473835, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3248.263916015625, |
|
"epoch": 0.032562125107112254, |
|
"grad_norm": 0.06823915243148804, |
|
"kl": 3.1620264053344727e-05, |
|
"learning_rate": 3.220338983050847e-07, |
|
"loss": 0.0115, |
|
"reward": 0.611111119389534, |
|
"reward_std": 0.4586857333779335, |
|
"rewards/accuracy_reward": 0.18055555690079927, |
|
"rewards/format_reward": 0.2500000037252903, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3367.013916015625, |
|
"epoch": 0.03427592116538132, |
|
"grad_norm": 0.09324803203344345, |
|
"kl": 3.3974647521972656e-05, |
|
"learning_rate": 3.389830508474576e-07, |
|
"loss": 0.0378, |
|
"reward": 0.5, |
|
"reward_std": 0.47972556948661804, |
|
"rewards/accuracy_reward": 0.1527777798473835, |
|
"rewards/format_reward": 0.1944444514811039, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3031.4583740234375, |
|
"epoch": 0.03598971722365039, |
|
"grad_norm": 0.11860450357198715, |
|
"kl": 4.3004751205444336e-05, |
|
"learning_rate": 3.559322033898305e-07, |
|
"loss": 0.0817, |
|
"reward": 0.541666679084301, |
|
"reward_std": 0.5936876386404037, |
|
"rewards/accuracy_reward": 0.12500000186264515, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3335.4444580078125, |
|
"epoch": 0.037703513281919454, |
|
"grad_norm": 0.09448128193616867, |
|
"kl": 2.7835369110107422e-05, |
|
"learning_rate": 3.7288135593220336e-07, |
|
"loss": 0.0319, |
|
"reward": 0.5138888908550143, |
|
"reward_std": 0.37944628670811653, |
|
"rewards/accuracy_reward": 0.1388888917863369, |
|
"rewards/format_reward": 0.23611111473292112, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3147.4583129882812, |
|
"epoch": 0.03941730934018852, |
|
"grad_norm": 0.11545193940401077, |
|
"kl": 2.8133392333984375e-05, |
|
"learning_rate": 3.898305084745763e-07, |
|
"loss": 0.0274, |
|
"reward": 0.4166666753590107, |
|
"reward_std": 0.4431168958544731, |
|
"rewards/accuracy_reward": 0.08333333488553762, |
|
"rewards/format_reward": 0.2500000046566129, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2804.5416870117188, |
|
"epoch": 0.04113110539845758, |
|
"grad_norm": 0.22444979846477509, |
|
"kl": 2.9146671295166016e-05, |
|
"learning_rate": 4.0677966101694916e-07, |
|
"loss": 0.0787, |
|
"reward": 1.0416666716337204, |
|
"reward_std": 0.23116151615977287, |
|
"rewards/accuracy_reward": 0.3055555634200573, |
|
"rewards/format_reward": 0.4305555671453476, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3022.1527709960938, |
|
"epoch": 0.04284490145672665, |
|
"grad_norm": 0.11472934484481812, |
|
"kl": 3.597140312194824e-05, |
|
"learning_rate": 4.23728813559322e-07, |
|
"loss": 0.0722, |
|
"reward": 0.7638888955116272, |
|
"reward_std": 0.4787874035537243, |
|
"rewards/accuracy_reward": 0.1944444477558136, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3167.3195190429688, |
|
"epoch": 0.044558697514995714, |
|
"grad_norm": 0.23513932526111603, |
|
"kl": 3.221631050109863e-05, |
|
"learning_rate": 4.4067796610169486e-07, |
|
"loss": 0.1099, |
|
"reward": 0.5000000186264515, |
|
"reward_std": 0.6264622695744038, |
|
"rewards/accuracy_reward": 0.11111111473292112, |
|
"rewards/format_reward": 0.27777778916060925, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3249.5833129882812, |
|
"epoch": 0.04627249357326478, |
|
"grad_norm": 0.10312272608280182, |
|
"kl": 1.971423625946045e-05, |
|
"learning_rate": 4.576271186440678e-07, |
|
"loss": 0.036, |
|
"reward": 0.6250000037252903, |
|
"reward_std": 0.3617309741675854, |
|
"rewards/accuracy_reward": 0.15277778077870607, |
|
"rewards/format_reward": 0.3194444514811039, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2603.77783203125, |
|
"epoch": 0.04798628963153385, |
|
"grad_norm": 0.15025873482227325, |
|
"kl": 2.3640692234039307e-05, |
|
"learning_rate": 4.7457627118644066e-07, |
|
"loss": 0.0584, |
|
"reward": 1.1111111342906952, |
|
"reward_std": 0.4762897342443466, |
|
"rewards/accuracy_reward": 0.3055555634200573, |
|
"rewards/format_reward": 0.5000000149011612, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3161.8472900390625, |
|
"epoch": 0.049700085689802914, |
|
"grad_norm": 0.12228947132825851, |
|
"kl": 3.4302473068237305e-05, |
|
"learning_rate": 4.915254237288136e-07, |
|
"loss": 0.0351, |
|
"reward": 0.8333333302289248, |
|
"reward_std": 0.6312860958278179, |
|
"rewards/accuracy_reward": 0.236111119389534, |
|
"rewards/format_reward": 0.36111110635101795, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2754.902801513672, |
|
"epoch": 0.05141388174807198, |
|
"grad_norm": 0.24735864996910095, |
|
"kl": 1.800060272216797e-05, |
|
"learning_rate": 5.084745762711864e-07, |
|
"loss": 0.1167, |
|
"reward": 1.2638889253139496, |
|
"reward_std": 0.8531928509473801, |
|
"rewards/accuracy_reward": 0.361111119389534, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3002.125, |
|
"epoch": 0.05312767780634105, |
|
"grad_norm": 0.15179027616977692, |
|
"kl": 9.566545486450195e-06, |
|
"learning_rate": 5.254237288135593e-07, |
|
"loss": 0.0781, |
|
"reward": 0.6388889010995626, |
|
"reward_std": 0.28320711851119995, |
|
"rewards/accuracy_reward": 0.180555559694767, |
|
"rewards/format_reward": 0.27777778171002865, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3020.0694580078125, |
|
"epoch": 0.054841473864610114, |
|
"grad_norm": 0.2012336701154709, |
|
"kl": 2.09808349609375e-05, |
|
"learning_rate": 5.423728813559322e-07, |
|
"loss": 0.1265, |
|
"reward": 0.9861111342906952, |
|
"reward_std": 0.9004222899675369, |
|
"rewards/accuracy_reward": 0.26388888992369175, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2485.0555725097656, |
|
"epoch": 0.056555269922879174, |
|
"grad_norm": 0.13532550632953644, |
|
"kl": 9.387731552124023e-06, |
|
"learning_rate": 5.59322033898305e-07, |
|
"loss": 0.0014, |
|
"reward": 1.1250000149011612, |
|
"reward_std": 0.6304849497973919, |
|
"rewards/accuracy_reward": 0.305555559694767, |
|
"rewards/format_reward": 0.5138889029622078, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2882.5833740234375, |
|
"epoch": 0.05826906598114824, |
|
"grad_norm": 0.11332134157419205, |
|
"kl": 1.0229647159576416e-05, |
|
"learning_rate": 5.76271186440678e-07, |
|
"loss": 0.0378, |
|
"reward": 0.847222238779068, |
|
"reward_std": 0.5727614238858223, |
|
"rewards/accuracy_reward": 0.2500000074505806, |
|
"rewards/format_reward": 0.3472222276031971, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3271.638916015625, |
|
"epoch": 0.05998286203941731, |
|
"grad_norm": 0.10583982616662979, |
|
"kl": 2.9146671295166016e-05, |
|
"learning_rate": 5.932203389830508e-07, |
|
"loss": 0.0459, |
|
"reward": 0.37500000558793545, |
|
"reward_std": 0.22475946694612503, |
|
"rewards/accuracy_reward": 0.0694444477558136, |
|
"rewards/format_reward": 0.23611111752688885, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2812.0833740234375, |
|
"epoch": 0.061696658097686374, |
|
"grad_norm": 0.17242440581321716, |
|
"kl": 5.033612251281738e-05, |
|
"learning_rate": 6.101694915254237e-07, |
|
"loss": 0.086, |
|
"reward": 0.4583333432674408, |
|
"reward_std": 0.25616974383592606, |
|
"rewards/accuracy_reward": 0.0694444477558136, |
|
"rewards/format_reward": 0.3194444477558136, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3051.763916015625, |
|
"epoch": 0.06341045415595545, |
|
"grad_norm": 0.13607284426689148, |
|
"kl": 9.356439113616943e-05, |
|
"learning_rate": 6.271186440677966e-07, |
|
"loss": 0.0796, |
|
"reward": 0.4861111231148243, |
|
"reward_std": 0.4781397730112076, |
|
"rewards/accuracy_reward": 0.12500000279396772, |
|
"rewards/format_reward": 0.23611111752688885, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3268.375, |
|
"epoch": 0.06512425021422451, |
|
"grad_norm": 0.10063257068395615, |
|
"kl": 2.215057611465454e-05, |
|
"learning_rate": 6.440677966101694e-07, |
|
"loss": 0.075, |
|
"reward": 0.4166666716337204, |
|
"reward_std": 0.4407730996608734, |
|
"rewards/accuracy_reward": 0.0972222238779068, |
|
"rewards/format_reward": 0.22222222946584225, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3201.2083129882812, |
|
"epoch": 0.06683804627249357, |
|
"grad_norm": 0.0791403129696846, |
|
"kl": 4.489719867706299e-05, |
|
"learning_rate": 6.610169491525423e-07, |
|
"loss": 0.028, |
|
"reward": 0.5138889029622078, |
|
"reward_std": 0.3695859834551811, |
|
"rewards/accuracy_reward": 0.13888889085501432, |
|
"rewards/format_reward": 0.23611111380159855, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2954.5138549804688, |
|
"epoch": 0.06855184233076264, |
|
"grad_norm": 0.1909833401441574, |
|
"kl": 8.817017078399658e-05, |
|
"learning_rate": 6.779661016949152e-07, |
|
"loss": 0.0638, |
|
"reward": 0.8750000149011612, |
|
"reward_std": 0.3410548157989979, |
|
"rewards/accuracy_reward": 0.2777777872979641, |
|
"rewards/format_reward": 0.3194444514811039, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2738.6250610351562, |
|
"epoch": 0.0702656383890317, |
|
"grad_norm": 0.12753309309482574, |
|
"kl": 3.444775938987732e-05, |
|
"learning_rate": 6.949152542372881e-07, |
|
"loss": 0.0836, |
|
"reward": 1.1388889104127884, |
|
"reward_std": 0.6430224105715752, |
|
"rewards/accuracy_reward": 0.3472222276031971, |
|
"rewards/format_reward": 0.4444444477558136, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2710.8750610351562, |
|
"epoch": 0.07197943444730077, |
|
"grad_norm": 0.1831730157136917, |
|
"kl": 0.0002748072147369385, |
|
"learning_rate": 7.11864406779661e-07, |
|
"loss": 0.0698, |
|
"reward": 0.9305555857717991, |
|
"reward_std": 0.4780263118445873, |
|
"rewards/accuracy_reward": 0.236111119389534, |
|
"rewards/format_reward": 0.4583333469927311, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2921.9861450195312, |
|
"epoch": 0.07369323050556983, |
|
"grad_norm": 0.15687353909015656, |
|
"kl": 8.583441376686096e-05, |
|
"learning_rate": 7.288135593220338e-07, |
|
"loss": 0.0681, |
|
"reward": 1.0277777910232544, |
|
"reward_std": 0.5702280625700951, |
|
"rewards/accuracy_reward": 0.2916666716337204, |
|
"rewards/format_reward": 0.4444444477558136, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3053.9166870117188, |
|
"epoch": 0.07540702656383891, |
|
"grad_norm": 0.16944237053394318, |
|
"kl": 3.604590892791748e-05, |
|
"learning_rate": 7.457627118644067e-07, |
|
"loss": 0.1266, |
|
"reward": 0.6527777910232544, |
|
"reward_std": 0.6824748627841473, |
|
"rewards/accuracy_reward": 0.13888889271765947, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3097.3194580078125, |
|
"epoch": 0.07712082262210797, |
|
"grad_norm": 0.12692783772945404, |
|
"kl": 4.616379737854004e-05, |
|
"learning_rate": 7.627118644067796e-07, |
|
"loss": 0.0334, |
|
"reward": 0.6944444626569748, |
|
"reward_std": 0.5105769783258438, |
|
"rewards/accuracy_reward": 0.16666666977107525, |
|
"rewards/format_reward": 0.361111119389534, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2547.388916015625, |
|
"epoch": 0.07883461868037704, |
|
"grad_norm": 0.17742452025413513, |
|
"kl": 0.00026345252990722656, |
|
"learning_rate": 7.796610169491526e-07, |
|
"loss": 0.0976, |
|
"reward": 0.6527777761220932, |
|
"reward_std": 0.34457773715257645, |
|
"rewards/accuracy_reward": 0.09722222574055195, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3137.90283203125, |
|
"epoch": 0.0805484147386461, |
|
"grad_norm": 0.15140484273433685, |
|
"kl": 0.0002467818558216095, |
|
"learning_rate": 7.966101694915253e-07, |
|
"loss": 0.0735, |
|
"reward": 0.888888917863369, |
|
"reward_std": 0.7274487726390362, |
|
"rewards/accuracy_reward": 0.23611111752688885, |
|
"rewards/format_reward": 0.4166666753590107, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2775.9722290039062, |
|
"epoch": 0.08226221079691516, |
|
"grad_norm": 0.23574158549308777, |
|
"kl": 0.00021022558212280273, |
|
"learning_rate": 8.135593220338983e-07, |
|
"loss": 0.1454, |
|
"reward": 1.2777777910232544, |
|
"reward_std": 0.6754349321126938, |
|
"rewards/accuracy_reward": 0.3888888955116272, |
|
"rewards/format_reward": 0.5000000074505806, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2843.5972290039062, |
|
"epoch": 0.08397600685518423, |
|
"grad_norm": 0.1440126597881317, |
|
"kl": 0.0003483295440673828, |
|
"learning_rate": 8.305084745762712e-07, |
|
"loss": 0.0911, |
|
"reward": 1.1250000447034836, |
|
"reward_std": 0.5149499028921127, |
|
"rewards/accuracy_reward": 0.3472222238779068, |
|
"rewards/format_reward": 0.430555559694767, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2877.02783203125, |
|
"epoch": 0.0856898029134533, |
|
"grad_norm": 0.16242656111717224, |
|
"kl": 0.0005815029144287109, |
|
"learning_rate": 8.47457627118644e-07, |
|
"loss": 0.1154, |
|
"reward": 0.7361111342906952, |
|
"reward_std": 0.497927188873291, |
|
"rewards/accuracy_reward": 0.1527777835726738, |
|
"rewards/format_reward": 0.4305555745959282, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3075.5833129882812, |
|
"epoch": 0.08740359897172237, |
|
"grad_norm": 0.09776900708675385, |
|
"kl": 0.00047463178634643555, |
|
"learning_rate": 8.64406779661017e-07, |
|
"loss": 0.0336, |
|
"reward": 0.9305555820465088, |
|
"reward_std": 0.6370660364627838, |
|
"rewards/accuracy_reward": 0.3055555634200573, |
|
"rewards/format_reward": 0.3194444477558136, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3223.0000610351562, |
|
"epoch": 0.08911739502999143, |
|
"grad_norm": 0.09731809794902802, |
|
"kl": 0.00039637088775634766, |
|
"learning_rate": 8.813559322033897e-07, |
|
"loss": 0.0465, |
|
"reward": 0.5833333358168602, |
|
"reward_std": 0.5373477786779404, |
|
"rewards/accuracy_reward": 0.16666666977107525, |
|
"rewards/format_reward": 0.25000000931322575, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3072.638916015625, |
|
"epoch": 0.0908311910882605, |
|
"grad_norm": 0.11242014914751053, |
|
"kl": 0.0001506805419921875, |
|
"learning_rate": 8.983050847457627e-07, |
|
"loss": 0.0368, |
|
"reward": 0.5694444589316845, |
|
"reward_std": 0.3695859871804714, |
|
"rewards/accuracy_reward": 0.09722222480922937, |
|
"rewards/format_reward": 0.37500000186264515, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3340.65283203125, |
|
"epoch": 0.09254498714652956, |
|
"grad_norm": 0.09306000173091888, |
|
"kl": 0.00031280517578125, |
|
"learning_rate": 9.152542372881356e-07, |
|
"loss": 0.0025, |
|
"reward": 0.6111111342906952, |
|
"reward_std": 0.5472080707550049, |
|
"rewards/accuracy_reward": 0.1944444477558136, |
|
"rewards/format_reward": 0.2222222238779068, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3444.361083984375, |
|
"epoch": 0.09425878320479864, |
|
"grad_norm": 0.10728821158409119, |
|
"kl": 0.0005254745483398438, |
|
"learning_rate": 9.322033898305083e-07, |
|
"loss": 0.0437, |
|
"reward": 0.3888888955116272, |
|
"reward_std": 0.4680837541818619, |
|
"rewards/accuracy_reward": 0.08333333395421505, |
|
"rewards/format_reward": 0.22222222667187452, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3232.0416870117188, |
|
"epoch": 0.0959725792630677, |
|
"grad_norm": 0.12119881063699722, |
|
"kl": 0.00034499168395996094, |
|
"learning_rate": 9.491525423728813e-07, |
|
"loss": 0.0786, |
|
"reward": 0.7916666828095913, |
|
"reward_std": 0.4137357361614704, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3270.388916015625, |
|
"epoch": 0.09768637532133675, |
|
"grad_norm": 0.11568433046340942, |
|
"kl": 0.0003733634948730469, |
|
"learning_rate": 9.661016949152542e-07, |
|
"loss": 0.0212, |
|
"reward": 0.652777798473835, |
|
"reward_std": 0.6071257442235947, |
|
"rewards/accuracy_reward": 0.16666666883975267, |
|
"rewards/format_reward": 0.3194444552063942, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3276.7777709960938, |
|
"epoch": 0.09940017137960583, |
|
"grad_norm": 0.1174827516078949, |
|
"kl": 0.002979278564453125, |
|
"learning_rate": 9.830508474576272e-07, |
|
"loss": 0.0607, |
|
"reward": 0.3611111156642437, |
|
"reward_std": 0.4297382980585098, |
|
"rewards/accuracy_reward": 0.06944444496184587, |
|
"rewards/format_reward": 0.2222222276031971, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3178.65283203125, |
|
"epoch": 0.10111396743787489, |
|
"grad_norm": 0.13718360662460327, |
|
"kl": 0.00142669677734375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0701, |
|
"reward": 0.9861111342906952, |
|
"reward_std": 0.7938041463494301, |
|
"rewards/accuracy_reward": 0.31944444961845875, |
|
"rewards/format_reward": 0.3472222276031971, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3385.9166259765625, |
|
"epoch": 0.10282776349614396, |
|
"grad_norm": 0.1017572283744812, |
|
"kl": 0.0005638599395751953, |
|
"learning_rate": 9.999919124237425e-07, |
|
"loss": 0.0466, |
|
"reward": 0.5833333283662796, |
|
"reward_std": 0.5943029820919037, |
|
"rewards/accuracy_reward": 0.180555559694767, |
|
"rewards/format_reward": 0.22222222480922937, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3338.388916015625, |
|
"epoch": 0.10454155955441302, |
|
"grad_norm": 0.1060507744550705, |
|
"kl": 0.0013508796691894531, |
|
"learning_rate": 9.999676499856762e-07, |
|
"loss": 0.057, |
|
"reward": 0.3888888880610466, |
|
"reward_std": 0.2651822194457054, |
|
"rewards/accuracy_reward": 0.0694444477558136, |
|
"rewards/format_reward": 0.2500000074505806, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2797.0416870117188, |
|
"epoch": 0.1062553556126821, |
|
"grad_norm": 0.13327208161354065, |
|
"kl": 0.004306793212890625, |
|
"learning_rate": 9.999272135579094e-07, |
|
"loss": 0.0093, |
|
"reward": 0.9722222238779068, |
|
"reward_std": 0.4413543753325939, |
|
"rewards/accuracy_reward": 0.2638888955116272, |
|
"rewards/format_reward": 0.4444444552063942, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3424.4861450195312, |
|
"epoch": 0.10796915167095116, |
|
"grad_norm": 0.057017724961042404, |
|
"kl": 0.0010051727294921875, |
|
"learning_rate": 9.998706045939205e-07, |
|
"loss": 0.0094, |
|
"reward": 0.5416666641831398, |
|
"reward_std": 0.360260970890522, |
|
"rewards/accuracy_reward": 0.1527777798473835, |
|
"rewards/format_reward": 0.23611111380159855, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2833.3334045410156, |
|
"epoch": 0.10968294772922023, |
|
"grad_norm": 0.1504010707139969, |
|
"kl": 0.0013284683227539062, |
|
"learning_rate": 9.997978251285065e-07, |
|
"loss": 0.0029, |
|
"reward": 0.6527778003364801, |
|
"reward_std": 0.4071078971028328, |
|
"rewards/accuracy_reward": 0.1250000037252903, |
|
"rewards/format_reward": 0.40277778543531895, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3199.5972290039062, |
|
"epoch": 0.11139674378748929, |
|
"grad_norm": 0.11405781656503677, |
|
"kl": 0.001636505126953125, |
|
"learning_rate": 9.997088777777095e-07, |
|
"loss": 0.0488, |
|
"reward": 0.7222222210839391, |
|
"reward_std": 0.5615956597030163, |
|
"rewards/accuracy_reward": 0.23611111659556627, |
|
"rewards/format_reward": 0.24999999906867743, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2915.388916015625, |
|
"epoch": 0.11311053984575835, |
|
"grad_norm": 0.11168493330478668, |
|
"kl": 0.000568389892578125, |
|
"learning_rate": 9.99603765738723e-07, |
|
"loss": -0.0035, |
|
"reward": 0.7777777910232544, |
|
"reward_std": 0.2721655145287514, |
|
"rewards/accuracy_reward": 0.23611112032085657, |
|
"rewards/format_reward": 0.3055555671453476, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3189.40283203125, |
|
"epoch": 0.11482433590402742, |
|
"grad_norm": 0.12389721721410751, |
|
"kl": 0.001224517822265625, |
|
"learning_rate": 9.994824927897762e-07, |
|
"loss": 0.0351, |
|
"reward": 0.6111111044883728, |
|
"reward_std": 0.5111561641097069, |
|
"rewards/accuracy_reward": 0.15277778450399637, |
|
"rewards/format_reward": 0.3055555671453476, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3199.2777709960938, |
|
"epoch": 0.11653813196229648, |
|
"grad_norm": 0.04368606582283974, |
|
"kl": 0.0010256767272949219, |
|
"learning_rate": 9.993450632899989e-07, |
|
"loss": 0.0049, |
|
"reward": 0.4722222238779068, |
|
"reward_std": 0.2300211265683174, |
|
"rewards/accuracy_reward": 0.12500000186264515, |
|
"rewards/format_reward": 0.22222222574055195, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3365.9583740234375, |
|
"epoch": 0.11825192802056556, |
|
"grad_norm": 0.10794109106063843, |
|
"kl": 0.0010347366333007812, |
|
"learning_rate": 9.99191482179265e-07, |
|
"loss": 0.0407, |
|
"reward": 0.6388889029622078, |
|
"reward_std": 0.6728046834468842, |
|
"rewards/accuracy_reward": 0.1944444477558136, |
|
"rewards/format_reward": 0.25000000558793545, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2993.1527709960938, |
|
"epoch": 0.11996572407883462, |
|
"grad_norm": 0.08366014063358307, |
|
"kl": 0.0018405914306640625, |
|
"learning_rate": 9.99021754978014e-07, |
|
"loss": -0.0295, |
|
"reward": 0.8194444589316845, |
|
"reward_std": 0.4187307730317116, |
|
"rewards/accuracy_reward": 0.25000000186264515, |
|
"rewards/format_reward": 0.3194444486871362, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3297.041748046875, |
|
"epoch": 0.12167952013710369, |
|
"grad_norm": 0.12551282346248627, |
|
"kl": 0.0020809173583984375, |
|
"learning_rate": 9.988358877870534e-07, |
|
"loss": 0.057, |
|
"reward": 0.8750000111758709, |
|
"reward_std": 0.766589842736721, |
|
"rewards/accuracy_reward": 0.27777778450399637, |
|
"rewards/format_reward": 0.3194444486871362, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2669.166717529297, |
|
"epoch": 0.12339331619537275, |
|
"grad_norm": 0.07140244543552399, |
|
"kl": 0.0021390914916992188, |
|
"learning_rate": 9.986338872873393e-07, |
|
"loss": 0.0464, |
|
"reward": 1.2638888955116272, |
|
"reward_std": 0.31082576885819435, |
|
"rewards/accuracy_reward": 0.3750000027939677, |
|
"rewards/format_reward": 0.5138888955116272, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3122.9166870117188, |
|
"epoch": 0.12510711225364182, |
|
"grad_norm": 0.1383487433195114, |
|
"kl": 0.0014562606811523438, |
|
"learning_rate": 9.984157607397357e-07, |
|
"loss": 0.0568, |
|
"reward": 0.8472222201526165, |
|
"reward_std": 0.5111991167068481, |
|
"rewards/accuracy_reward": 0.23611112125217915, |
|
"rewards/format_reward": 0.37500000838190317, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2634.4306030273438, |
|
"epoch": 0.1268209083119109, |
|
"grad_norm": 0.16660131514072418, |
|
"kl": 0.00234222412109375, |
|
"learning_rate": 9.981815159847542e-07, |
|
"loss": 0.0609, |
|
"reward": 1.2638888955116272, |
|
"reward_std": 0.599253699183464, |
|
"rewards/accuracy_reward": 0.3472222276031971, |
|
"rewards/format_reward": 0.5694444477558136, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3220.75, |
|
"epoch": 0.12853470437017994, |
|
"grad_norm": 0.05945681035518646, |
|
"kl": 0.00293731689453125, |
|
"learning_rate": 9.979311614422718e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8055555671453476, |
|
"reward_std": 0.34192486852407455, |
|
"rewards/accuracy_reward": 0.2500000037252903, |
|
"rewards/format_reward": 0.305555553175509, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2547.402801513672, |
|
"epoch": 0.13024850042844902, |
|
"grad_norm": 0.12682628631591797, |
|
"kl": 0.0037593841552734375, |
|
"learning_rate": 9.976647061112284e-07, |
|
"loss": 0.0589, |
|
"reward": 1.3055555522441864, |
|
"reward_std": 0.5543116554617882, |
|
"rewards/accuracy_reward": 0.3472222276031971, |
|
"rewards/format_reward": 0.6111111119389534, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3289.8056030273438, |
|
"epoch": 0.1319622964867181, |
|
"grad_norm": 0.07940449565649033, |
|
"kl": 0.00164031982421875, |
|
"learning_rate": 9.973821595693026e-07, |
|
"loss": 0.0351, |
|
"reward": 0.7777777910232544, |
|
"reward_std": 0.5557935982942581, |
|
"rewards/accuracy_reward": 0.2361111156642437, |
|
"rewards/format_reward": 0.3055555634200573, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3333.75, |
|
"epoch": 0.13367609254498714, |
|
"grad_norm": 0.1280481368303299, |
|
"kl": 0.00507354736328125, |
|
"learning_rate": 9.970835319725696e-07, |
|
"loss": -0.0008, |
|
"reward": 0.27777778450399637, |
|
"reward_std": 0.42627324536442757, |
|
"rewards/accuracy_reward": 0.06944444589316845, |
|
"rewards/format_reward": 0.13888889271765947, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2680.7361450195312, |
|
"epoch": 0.1353898886032562, |
|
"grad_norm": 0.25829118490219116, |
|
"kl": 0.0030956268310546875, |
|
"learning_rate": 9.967688340551327e-07, |
|
"loss": 0.1441, |
|
"reward": 0.9999999925494194, |
|
"reward_std": 0.46356454864144325, |
|
"rewards/accuracy_reward": 0.2500000027939677, |
|
"rewards/format_reward": 0.5, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3091.388916015625, |
|
"epoch": 0.13710368466152528, |
|
"grad_norm": 0.2998567819595337, |
|
"kl": 0.0024929046630859375, |
|
"learning_rate": 9.96438077128741e-07, |
|
"loss": 0.0411, |
|
"reward": 0.5555555690079927, |
|
"reward_std": 0.6594638489186764, |
|
"rewards/accuracy_reward": 0.12500000186264515, |
|
"rewards/format_reward": 0.3055555624887347, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3257.0416870117188, |
|
"epoch": 0.13881748071979436, |
|
"grad_norm": 0.06869164109230042, |
|
"kl": 0.0032901763916015625, |
|
"learning_rate": 9.960912730823802e-07, |
|
"loss": 0.0225, |
|
"reward": 0.3888888927176595, |
|
"reward_std": 0.1994824968278408, |
|
"rewards/accuracy_reward": 0.11111111380159855, |
|
"rewards/format_reward": 0.16666667070239782, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3114.875, |
|
"epoch": 0.1405312767780634, |
|
"grad_norm": 0.246004119515419, |
|
"kl": 0.0038585662841796875, |
|
"learning_rate": 9.95728434381847e-07, |
|
"loss": 0.1002, |
|
"reward": 0.6527777910232544, |
|
"reward_std": 0.4963437169790268, |
|
"rewards/accuracy_reward": 0.1944444477558136, |
|
"rewards/format_reward": 0.2638888955116272, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2820.8612060546875, |
|
"epoch": 0.14224507283633248, |
|
"grad_norm": 0.1362665593624115, |
|
"kl": 0.0025300979614257812, |
|
"learning_rate": 9.953495740692994e-07, |
|
"loss": 0.0321, |
|
"reward": 1.152777798473835, |
|
"reward_std": 0.4870201013982296, |
|
"rewards/accuracy_reward": 0.3194444552063942, |
|
"rewards/format_reward": 0.5138888955116272, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3047.2361450195312, |
|
"epoch": 0.14395886889460155, |
|
"grad_norm": 0.12096895277500153, |
|
"kl": 0.0046234130859375, |
|
"learning_rate": 9.949547057627897e-07, |
|
"loss": 0.06, |
|
"reward": 0.7361111156642437, |
|
"reward_std": 0.4963437244296074, |
|
"rewards/accuracy_reward": 0.1944444514811039, |
|
"rewards/format_reward": 0.3472222266718745, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3108.5833740234375, |
|
"epoch": 0.1456726649528706, |
|
"grad_norm": 0.10810334980487823, |
|
"kl": 0.0030975341796875, |
|
"learning_rate": 9.945438436557734e-07, |
|
"loss": 0.037, |
|
"reward": 0.8472222238779068, |
|
"reward_std": 0.4963437356054783, |
|
"rewards/accuracy_reward": 0.2500000074505806, |
|
"rewards/format_reward": 0.3472222238779068, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2940.791717529297, |
|
"epoch": 0.14738646101113967, |
|
"grad_norm": 0.19223813712596893, |
|
"kl": 0.011386871337890625, |
|
"learning_rate": 9.941170025166e-07, |
|
"loss": 0.0879, |
|
"reward": 0.7638889048248529, |
|
"reward_std": 0.48003676161170006, |
|
"rewards/accuracy_reward": 0.20833333395421505, |
|
"rewards/format_reward": 0.3472222276031971, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3504.5138549804688, |
|
"epoch": 0.14910025706940874, |
|
"grad_norm": 0.058835774660110474, |
|
"kl": 0.0025482177734375, |
|
"learning_rate": 9.93674197687982e-07, |
|
"loss": 0.006, |
|
"reward": 0.4166666679084301, |
|
"reward_std": 0.3332236036658287, |
|
"rewards/accuracy_reward": 0.13888889271765947, |
|
"rewards/format_reward": 0.13888889271765947, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3452.3472290039062, |
|
"epoch": 0.15081405312767782, |
|
"grad_norm": 0.10622096061706543, |
|
"kl": 0.00501251220703125, |
|
"learning_rate": 9.932154450864423e-07, |
|
"loss": 0.0373, |
|
"reward": 0.36111112125217915, |
|
"reward_std": 0.30703413113951683, |
|
"rewards/accuracy_reward": 0.1111111119389534, |
|
"rewards/format_reward": 0.13888888992369175, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2943.8472290039062, |
|
"epoch": 0.15252784918594686, |
|
"grad_norm": 0.24842512607574463, |
|
"kl": 0.00630950927734375, |
|
"learning_rate": 9.927407612017446e-07, |
|
"loss": 0.1236, |
|
"reward": 0.4583333432674408, |
|
"reward_std": 0.36548174917697906, |
|
"rewards/accuracy_reward": 0.06944444589316845, |
|
"rewards/format_reward": 0.3194444440305233, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3144.0416870117188, |
|
"epoch": 0.15424164524421594, |
|
"grad_norm": 0.08785022795200348, |
|
"kl": 0.0038299560546875, |
|
"learning_rate": 9.92250163096298e-07, |
|
"loss": 0.0214, |
|
"reward": 0.7916666716337204, |
|
"reward_std": 0.29556556046009064, |
|
"rewards/accuracy_reward": 0.2500000074505806, |
|
"rewards/format_reward": 0.2916666753590107, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3134.5833740234375, |
|
"epoch": 0.155955441302485, |
|
"grad_norm": 0.12506471574306488, |
|
"kl": 0.007457733154296875, |
|
"learning_rate": 9.91743668404545e-07, |
|
"loss": 0.0601, |
|
"reward": 0.6944444645196199, |
|
"reward_std": 0.4941713698208332, |
|
"rewards/accuracy_reward": 0.18055555783212185, |
|
"rewards/format_reward": 0.3333333386108279, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2990.5972290039062, |
|
"epoch": 0.15766923736075408, |
|
"grad_norm": 0.09640727192163467, |
|
"kl": 0.004512786865234375, |
|
"learning_rate": 9.912212953323279e-07, |
|
"loss": -0.0125, |
|
"reward": 0.8611110979691148, |
|
"reward_std": 0.4442981481552124, |
|
"rewards/accuracy_reward": 0.25000000186264515, |
|
"rewards/format_reward": 0.36111112032085657, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2980.263916015625, |
|
"epoch": 0.15938303341902313, |
|
"grad_norm": 0.1393478512763977, |
|
"kl": 0.00595855712890625, |
|
"learning_rate": 9.906830626562331e-07, |
|
"loss": 0.0749, |
|
"reward": 1.1250000149011612, |
|
"reward_std": 0.7443297058343887, |
|
"rewards/accuracy_reward": 0.33333333767950535, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3000.0694580078125, |
|
"epoch": 0.1610968294772922, |
|
"grad_norm": 0.1266450732946396, |
|
"kl": 0.003871917724609375, |
|
"learning_rate": 9.90128989722918e-07, |
|
"loss": 0.0799, |
|
"reward": 0.7916666753590107, |
|
"reward_std": 0.40012458711862564, |
|
"rewards/accuracy_reward": 0.22222222853451967, |
|
"rewards/format_reward": 0.34722223319113255, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3058.1112060546875, |
|
"epoch": 0.16281062553556128, |
|
"grad_norm": 0.14600639045238495, |
|
"kl": 0.005794525146484375, |
|
"learning_rate": 9.89559096448414e-07, |
|
"loss": 0.0668, |
|
"reward": 0.5972222313284874, |
|
"reward_std": 0.3742276132106781, |
|
"rewards/accuracy_reward": 0.15277778450399637, |
|
"rewards/format_reward": 0.2916666753590107, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3196.2361450195312, |
|
"epoch": 0.16452442159383032, |
|
"grad_norm": 0.22430367767810822, |
|
"kl": 0.003925323486328125, |
|
"learning_rate": 9.889734033174114e-07, |
|
"loss": 0.0566, |
|
"reward": 1.263888880610466, |
|
"reward_std": 0.7597223073244095, |
|
"rewards/accuracy_reward": 0.3888888880610466, |
|
"rewards/format_reward": 0.486111119389534, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3478.7916870117188, |
|
"epoch": 0.1662382176520994, |
|
"grad_norm": 0.11037839204072952, |
|
"kl": 0.005950927734375, |
|
"learning_rate": 9.883719313825227e-07, |
|
"loss": 0.0342, |
|
"reward": 0.583333345130086, |
|
"reward_std": 0.6006828360259533, |
|
"rewards/accuracy_reward": 0.1666666716337204, |
|
"rewards/format_reward": 0.25000000558793545, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3026.5277709960938, |
|
"epoch": 0.16795201371036847, |
|
"grad_norm": 0.1222737729549408, |
|
"kl": 0.00817108154296875, |
|
"learning_rate": 9.877547022635267e-07, |
|
"loss": 0.042, |
|
"reward": 1.0555555894970894, |
|
"reward_std": 0.5561864748597145, |
|
"rewards/accuracy_reward": 0.3055555559694767, |
|
"rewards/format_reward": 0.4444444589316845, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3361.638916015625, |
|
"epoch": 0.16966580976863754, |
|
"grad_norm": 0.1510419100522995, |
|
"kl": 0.00934600830078125, |
|
"learning_rate": 9.871217381465902e-07, |
|
"loss": 0.0842, |
|
"reward": 0.4444444617256522, |
|
"reward_std": 0.3906443119049072, |
|
"rewards/accuracy_reward": 0.1111111119389534, |
|
"rewards/format_reward": 0.22222222294658422, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3250.8194580078125, |
|
"epoch": 0.1713796058269066, |
|
"grad_norm": 0.15683910250663757, |
|
"kl": 0.0079803466796875, |
|
"learning_rate": 9.864730617834712e-07, |
|
"loss": 0.0871, |
|
"reward": 0.722222225740552, |
|
"reward_std": 0.5869148373603821, |
|
"rewards/accuracy_reward": 0.19444444589316845, |
|
"rewards/format_reward": 0.33333334140479565, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3126.6806030273438, |
|
"epoch": 0.17309340188517566, |
|
"grad_norm": 0.0631570890545845, |
|
"kl": 0.005035400390625, |
|
"learning_rate": 9.85808696490701e-07, |
|
"loss": 0.0344, |
|
"reward": 0.5555555559694767, |
|
"reward_std": 0.04303314909338951, |
|
"rewards/accuracy_reward": 0.1666666716337204, |
|
"rewards/format_reward": 0.2222222276031971, |
|
"step": 101 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3082.9166870117188, |
|
"epoch": 0.17480719794344474, |
|
"grad_norm": 0.0892498642206192, |
|
"kl": 0.00846099853515625, |
|
"learning_rate": 9.851286661487463e-07, |
|
"loss": 0.0089, |
|
"reward": 0.8194444477558136, |
|
"reward_std": 0.5157952532172203, |
|
"rewards/accuracy_reward": 0.1944444477558136, |
|
"rewards/format_reward": 0.4305555522441864, |
|
"step": 102 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3149.3333129882812, |
|
"epoch": 0.17652099400171378, |
|
"grad_norm": 0.1086692363023758, |
|
"kl": 0.00589752197265625, |
|
"learning_rate": 9.844329952011504e-07, |
|
"loss": 0.0502, |
|
"reward": 0.7777777798473835, |
|
"reward_std": 0.2971716746687889, |
|
"rewards/accuracy_reward": 0.2083333395421505, |
|
"rewards/format_reward": 0.3611111156642437, |
|
"step": 103 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3506.9722290039062, |
|
"epoch": 0.17823479005998286, |
|
"grad_norm": 0.10262292623519897, |
|
"kl": 0.005126953125, |
|
"learning_rate": 9.837217086536547e-07, |
|
"loss": 0.0261, |
|
"reward": 0.4583333320915699, |
|
"reward_std": 0.6020341292023659, |
|
"rewards/accuracy_reward": 0.13888889364898205, |
|
"rewards/format_reward": 0.18055556155741215, |
|
"step": 104 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2822.2222290039062, |
|
"epoch": 0.17994858611825193, |
|
"grad_norm": 0.11509492248296738, |
|
"kl": 0.00792694091796875, |
|
"learning_rate": 9.829948320733e-07, |
|
"loss": 0.0501, |
|
"reward": 0.8333333283662796, |
|
"reward_std": 0.6226537488400936, |
|
"rewards/accuracy_reward": 0.18055556062608957, |
|
"rewards/format_reward": 0.4722222238779068, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3389.6527709960938, |
|
"epoch": 0.181662382176521, |
|
"grad_norm": 0.11453983187675476, |
|
"kl": 0.00846099853515625, |
|
"learning_rate": 9.822523915875077e-07, |
|
"loss": 0.0623, |
|
"reward": 0.6666666679084301, |
|
"reward_std": 0.7060182616114616, |
|
"rewards/accuracy_reward": 0.20833334047347307, |
|
"rewards/format_reward": 0.25000000838190317, |
|
"step": 106 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3259.6112060546875, |
|
"epoch": 0.18337617823479005, |
|
"grad_norm": 0.09651084989309311, |
|
"kl": 0.007171630859375, |
|
"learning_rate": 9.8149441388314e-07, |
|
"loss": 0.0375, |
|
"reward": 0.5833333441987634, |
|
"reward_std": 0.3762567453086376, |
|
"rewards/accuracy_reward": 0.16666667349636555, |
|
"rewards/format_reward": 0.2500000027939677, |
|
"step": 107 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3319.1944580078125, |
|
"epoch": 0.18508997429305912, |
|
"grad_norm": 0.11137856543064117, |
|
"kl": 0.00550079345703125, |
|
"learning_rate": 9.807209262055415e-07, |
|
"loss": 0.03, |
|
"reward": 0.736111112870276, |
|
"reward_std": 0.6152903288602829, |
|
"rewards/accuracy_reward": 0.19444444589316845, |
|
"rewards/format_reward": 0.3472222210839391, |
|
"step": 108 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3011.9443969726562, |
|
"epoch": 0.1868037703513282, |
|
"grad_norm": 0.1661546230316162, |
|
"kl": 0.00968170166015625, |
|
"learning_rate": 9.799319563575593e-07, |
|
"loss": 0.0582, |
|
"reward": 0.5555555559694767, |
|
"reward_std": 0.40824829787015915, |
|
"rewards/accuracy_reward": 0.11111111380159855, |
|
"rewards/format_reward": 0.33333333022892475, |
|
"step": 109 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3077.3472290039062, |
|
"epoch": 0.18851756640959727, |
|
"grad_norm": 0.1285841166973114, |
|
"kl": 0.0084228515625, |
|
"learning_rate": 9.791275326985434e-07, |
|
"loss": 0.0404, |
|
"reward": 0.7638888955116272, |
|
"reward_std": 0.8195342868566513, |
|
"rewards/accuracy_reward": 0.19444445241242647, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2989.02783203125, |
|
"epoch": 0.19023136246786632, |
|
"grad_norm": 0.14638738334178925, |
|
"kl": 0.00787353515625, |
|
"learning_rate": 9.783076841433279e-07, |
|
"loss": 0.0678, |
|
"reward": 0.6805555634200573, |
|
"reward_std": 0.540618360042572, |
|
"rewards/accuracy_reward": 0.15277778171002865, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 111 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3064.2222290039062, |
|
"epoch": 0.1919451585261354, |
|
"grad_norm": 0.11260731518268585, |
|
"kl": 0.0069580078125, |
|
"learning_rate": 9.774724401611918e-07, |
|
"loss": 0.0528, |
|
"reward": 0.624999993480742, |
|
"reward_std": 0.45954934880137444, |
|
"rewards/accuracy_reward": 0.12500000186264515, |
|
"rewards/format_reward": 0.37500000838190317, |
|
"step": 112 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3019.5138549804688, |
|
"epoch": 0.19365895458440446, |
|
"grad_norm": 0.1483815312385559, |
|
"kl": 0.0084686279296875, |
|
"learning_rate": 9.76621830774799e-07, |
|
"loss": 0.0892, |
|
"reward": 1.1250000149011612, |
|
"reward_std": 0.7124863266944885, |
|
"rewards/accuracy_reward": 0.3194444477558136, |
|
"rewards/format_reward": 0.4861111119389534, |
|
"step": 113 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2765.2083435058594, |
|
"epoch": 0.1953727506426735, |
|
"grad_norm": 0.16946516931056976, |
|
"kl": 0.00933837890625, |
|
"learning_rate": 9.757558865591196e-07, |
|
"loss": 0.0612, |
|
"reward": 0.8611110933125019, |
|
"reward_std": 0.5443289652466774, |
|
"rewards/accuracy_reward": 0.22222222853451967, |
|
"rewards/format_reward": 0.41666667349636555, |
|
"step": 114 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2844.77783203125, |
|
"epoch": 0.19708654670094258, |
|
"grad_norm": 0.3654565215110779, |
|
"kl": 0.00984954833984375, |
|
"learning_rate": 9.748746386403305e-07, |
|
"loss": 0.2113, |
|
"reward": 0.875, |
|
"reward_std": 0.7314287945628166, |
|
"rewards/accuracy_reward": 0.2361111156642437, |
|
"rewards/format_reward": 0.4027777910232544, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3334.25, |
|
"epoch": 0.19880034275921166, |
|
"grad_norm": 0.1866183876991272, |
|
"kl": 0.01085662841796875, |
|
"learning_rate": 9.739781186946978e-07, |
|
"loss": 0.0509, |
|
"reward": 0.5138888889923692, |
|
"reward_std": 0.3636069521307945, |
|
"rewards/accuracy_reward": 0.1111111156642437, |
|
"rewards/format_reward": 0.2916666688397527, |
|
"step": 116 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3231.9722290039062, |
|
"epoch": 0.20051413881748073, |
|
"grad_norm": 0.1388145536184311, |
|
"kl": 0.01290130615234375, |
|
"learning_rate": 9.730663589474364e-07, |
|
"loss": 0.0397, |
|
"reward": 0.9027777630835772, |
|
"reward_std": 0.766443993896246, |
|
"rewards/accuracy_reward": 0.2500000037252903, |
|
"rewards/format_reward": 0.40277778543531895, |
|
"step": 117 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2966.15283203125, |
|
"epoch": 0.20222793487574978, |
|
"grad_norm": 0.10939842462539673, |
|
"kl": 0.011932373046875, |
|
"learning_rate": 9.721393921715533e-07, |
|
"loss": 0.0173, |
|
"reward": 0.6527777835726738, |
|
"reward_std": 0.4645882025361061, |
|
"rewards/accuracy_reward": 0.12500000279396772, |
|
"rewards/format_reward": 0.4027777835726738, |
|
"step": 118 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2932.3333740234375, |
|
"epoch": 0.20394173093401885, |
|
"grad_norm": 0.10793702304363251, |
|
"kl": 0.01213836669921875, |
|
"learning_rate": 9.711972516866678e-07, |
|
"loss": 0.0573, |
|
"reward": 0.861111119389534, |
|
"reward_std": 0.21574701368808746, |
|
"rewards/accuracy_reward": 0.236111119389534, |
|
"rewards/format_reward": 0.3888888955116272, |
|
"step": 119 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3096.666748046875, |
|
"epoch": 0.20565552699228792, |
|
"grad_norm": 0.22288325428962708, |
|
"kl": 0.01317596435546875, |
|
"learning_rate": 9.70239971357816e-07, |
|
"loss": 0.1225, |
|
"reward": 0.8750000149011612, |
|
"reward_std": 0.8644967824220657, |
|
"rewards/accuracy_reward": 0.26388889644294977, |
|
"rewards/format_reward": 0.3472222276031971, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3175.625, |
|
"epoch": 0.207369323050557, |
|
"grad_norm": 0.11317700147628784, |
|
"kl": 0.0115814208984375, |
|
"learning_rate": 9.692675855942318e-07, |
|
"loss": 0.0349, |
|
"reward": 0.8611111342906952, |
|
"reward_std": 0.6463145054876804, |
|
"rewards/accuracy_reward": 0.2361111156642437, |
|
"rewards/format_reward": 0.38888888992369175, |
|
"step": 121 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3247.8333740234375, |
|
"epoch": 0.20908311910882604, |
|
"grad_norm": 0.18068967759609222, |
|
"kl": 0.014404296875, |
|
"learning_rate": 9.682801293481108e-07, |
|
"loss": 0.0904, |
|
"reward": 0.4305555671453476, |
|
"reward_std": 0.5079665929079056, |
|
"rewards/accuracy_reward": 0.041666666977107525, |
|
"rewards/format_reward": 0.3472222276031971, |
|
"step": 122 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2834.2222595214844, |
|
"epoch": 0.21079691516709512, |
|
"grad_norm": 0.15124721825122833, |
|
"kl": 0.012481689453125, |
|
"learning_rate": 9.67277638113354e-07, |
|
"loss": 0.0635, |
|
"reward": 1.2222222611308098, |
|
"reward_std": 0.6941533647477627, |
|
"rewards/accuracy_reward": 0.37500000186264515, |
|
"rewards/format_reward": 0.47222221828997135, |
|
"step": 123 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3222.3472900390625, |
|
"epoch": 0.2125107112253642, |
|
"grad_norm": 0.14290107786655426, |
|
"kl": 0.012481689453125, |
|
"learning_rate": 9.662601479242914e-07, |
|
"loss": 0.0404, |
|
"reward": 0.8611111268401146, |
|
"reward_std": 0.5691947191953659, |
|
"rewards/accuracy_reward": 0.25000000558793545, |
|
"rewards/format_reward": 0.3611111119389534, |
|
"step": 124 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3088.8333740234375, |
|
"epoch": 0.21422450728363324, |
|
"grad_norm": 0.08310513198375702, |
|
"kl": 0.0166168212890625, |
|
"learning_rate": 9.652276953543877e-07, |
|
"loss": 0.0133, |
|
"reward": 0.8055555820465088, |
|
"reward_std": 0.3680921755731106, |
|
"rewards/accuracy_reward": 0.2500000037252903, |
|
"rewards/format_reward": 0.3055555559694767, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3026.0555419921875, |
|
"epoch": 0.2159383033419023, |
|
"grad_norm": 0.20979027450084686, |
|
"kl": 0.017181396484375, |
|
"learning_rate": 9.641803175149264e-07, |
|
"loss": 0.076, |
|
"reward": 0.8611111156642437, |
|
"reward_std": 0.41928989440202713, |
|
"rewards/accuracy_reward": 0.22222222108393908, |
|
"rewards/format_reward": 0.416666672565043, |
|
"step": 126 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3460.875, |
|
"epoch": 0.21765209940017138, |
|
"grad_norm": 0.15380728244781494, |
|
"kl": 0.0160064697265625, |
|
"learning_rate": 9.631180520536777e-07, |
|
"loss": 0.0459, |
|
"reward": 0.2916666716337204, |
|
"reward_std": 0.5213519968092442, |
|
"rewards/accuracy_reward": 0.06944444496184587, |
|
"rewards/format_reward": 0.15277777891606092, |
|
"step": 127 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3339.1806030273438, |
|
"epoch": 0.21936589545844046, |
|
"grad_norm": 0.0945175364613533, |
|
"kl": 0.0178375244140625, |
|
"learning_rate": 9.62040937153543e-07, |
|
"loss": 0.0209, |
|
"reward": 0.6527777910232544, |
|
"reward_std": 0.46317092329263687, |
|
"rewards/accuracy_reward": 0.2083333395421505, |
|
"rewards/format_reward": 0.2361111156642437, |
|
"step": 128 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2740.3195190429688, |
|
"epoch": 0.2210796915167095, |
|
"grad_norm": 0.15982748568058014, |
|
"kl": 0.01837158203125, |
|
"learning_rate": 9.60949011531184e-07, |
|
"loss": 0.0037, |
|
"reward": 0.8750000149011612, |
|
"reward_std": 0.5248102210462093, |
|
"rewards/accuracy_reward": 0.2222222238779068, |
|
"rewards/format_reward": 0.4305555559694767, |
|
"step": 129 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2644.3472900390625, |
|
"epoch": 0.22279348757497858, |
|
"grad_norm": 0.10350772738456726, |
|
"kl": 0.0185546875, |
|
"learning_rate": 9.598423144356312e-07, |
|
"loss": 0.0416, |
|
"reward": 1.5833333153277636, |
|
"reward_std": 0.4627453200519085, |
|
"rewards/accuracy_reward": 0.4861111082136631, |
|
"rewards/format_reward": 0.6111111212521791, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3329.263916015625, |
|
"epoch": 0.22450728363324765, |
|
"grad_norm": 0.17227336764335632, |
|
"kl": 0.0158233642578125, |
|
"learning_rate": 9.587208856468713e-07, |
|
"loss": 0.0527, |
|
"reward": 0.4583333432674408, |
|
"reward_std": 0.4853021502494812, |
|
"rewards/accuracy_reward": 0.11111111473292112, |
|
"rewards/format_reward": 0.23611111380159855, |
|
"step": 131 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3231.4861450195312, |
|
"epoch": 0.2262210796915167, |
|
"grad_norm": 0.13778822124004364, |
|
"kl": 0.016204833984375, |
|
"learning_rate": 9.575847654744196e-07, |
|
"loss": 0.0544, |
|
"reward": 0.611111119389534, |
|
"reward_std": 0.541582465171814, |
|
"rewards/accuracy_reward": 0.16666667070239782, |
|
"rewards/format_reward": 0.2777777872979641, |
|
"step": 132 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3083.2222900390625, |
|
"epoch": 0.22793487574978577, |
|
"grad_norm": 0.12422922253608704, |
|
"kl": 0.0192108154296875, |
|
"learning_rate": 9.564339947558697e-07, |
|
"loss": 0.017, |
|
"reward": 1.0694444477558136, |
|
"reward_std": 0.5992536917328835, |
|
"rewards/accuracy_reward": 0.31944444961845875, |
|
"rewards/format_reward": 0.430555559694767, |
|
"step": 133 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2589.6806030273438, |
|
"epoch": 0.22964867180805484, |
|
"grad_norm": 0.24903494119644165, |
|
"kl": 0.01654052734375, |
|
"learning_rate": 9.552686148554252e-07, |
|
"loss": 0.1379, |
|
"reward": 1.3055555671453476, |
|
"reward_std": 0.8013041242957115, |
|
"rewards/accuracy_reward": 0.3611111165955663, |
|
"rewards/format_reward": 0.5833333469927311, |
|
"step": 134 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2682.638916015625, |
|
"epoch": 0.23136246786632392, |
|
"grad_norm": 0.16912828385829926, |
|
"kl": 0.0197601318359375, |
|
"learning_rate": 9.540886676624145e-07, |
|
"loss": 0.0699, |
|
"reward": 1.1805555671453476, |
|
"reward_std": 0.5904464609920979, |
|
"rewards/accuracy_reward": 0.30555556435137987, |
|
"rewards/format_reward": 0.5694444477558136, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3100.4444580078125, |
|
"epoch": 0.23307626392459296, |
|
"grad_norm": 0.14741510152816772, |
|
"kl": 0.0220794677734375, |
|
"learning_rate": 9.528941955897839e-07, |
|
"loss": 0.0595, |
|
"reward": 0.8750000232830644, |
|
"reward_std": 0.6136412769556046, |
|
"rewards/accuracy_reward": 0.2638888917863369, |
|
"rewards/format_reward": 0.3472222285345197, |
|
"step": 136 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3276.1805419921875, |
|
"epoch": 0.23479005998286204, |
|
"grad_norm": 0.12443236261606216, |
|
"kl": 0.020843505859375, |
|
"learning_rate": 9.516852415725732e-07, |
|
"loss": 0.0334, |
|
"reward": 1.1944444477558136, |
|
"reward_std": 0.5957729890942574, |
|
"rewards/accuracy_reward": 0.361111119389534, |
|
"rewards/format_reward": 0.4722222238779068, |
|
"step": 137 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3190.138916015625, |
|
"epoch": 0.2365038560411311, |
|
"grad_norm": 0.12088010460138321, |
|
"kl": 0.0197601318359375, |
|
"learning_rate": 9.504618490663726e-07, |
|
"loss": 0.0239, |
|
"reward": 0.9305555606260896, |
|
"reward_std": 0.39455174282193184, |
|
"rewards/accuracy_reward": 0.2777777910232544, |
|
"rewards/format_reward": 0.3750000009313226, |
|
"step": 138 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3037.0833129882812, |
|
"epoch": 0.23821765209940018, |
|
"grad_norm": 0.17049150168895721, |
|
"kl": 0.02288818359375, |
|
"learning_rate": 9.492240620457606e-07, |
|
"loss": 0.075, |
|
"reward": 0.6111111156642437, |
|
"reward_std": 0.35632818564772606, |
|
"rewards/accuracy_reward": 0.13888888992369175, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 139 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2627.3611450195312, |
|
"epoch": 0.23993144815766923, |
|
"grad_norm": 0.14227531850337982, |
|
"kl": 0.028778076171875, |
|
"learning_rate": 9.479719250027239e-07, |
|
"loss": 0.0286, |
|
"reward": 0.9027777910232544, |
|
"reward_std": 0.2613905444741249, |
|
"rewards/accuracy_reward": 0.18055556155741215, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3192.625, |
|
"epoch": 0.2416452442159383, |
|
"grad_norm": 0.16459161043167114, |
|
"kl": 0.02606201171875, |
|
"learning_rate": 9.467054829450571e-07, |
|
"loss": 0.0413, |
|
"reward": 0.7500000149011612, |
|
"reward_std": 0.4791666865348816, |
|
"rewards/accuracy_reward": 0.19444444496184587, |
|
"rewards/format_reward": 0.361111119389534, |
|
"step": 141 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3326.263916015625, |
|
"epoch": 0.24335904027420738, |
|
"grad_norm": 0.08248669654130936, |
|
"kl": 0.0257568359375, |
|
"learning_rate": 9.454247813947455e-07, |
|
"loss": -0.0021, |
|
"reward": 0.486111119389534, |
|
"reward_std": 0.24812544882297516, |
|
"rewards/accuracy_reward": 0.1527777835726738, |
|
"rewards/format_reward": 0.180555559694767, |
|
"step": 142 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3274.7500610351562, |
|
"epoch": 0.24507283633247642, |
|
"grad_norm": 0.22874736785888672, |
|
"kl": 0.027557373046875, |
|
"learning_rate": 9.441298663863289e-07, |
|
"loss": 0.0603, |
|
"reward": 0.6944444477558136, |
|
"reward_std": 0.5954620316624641, |
|
"rewards/accuracy_reward": 0.16666666511446238, |
|
"rewards/format_reward": 0.361111119389534, |
|
"step": 143 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3433.0416870117188, |
|
"epoch": 0.2467866323907455, |
|
"grad_norm": 0.1357790231704712, |
|
"kl": 0.025115966796875, |
|
"learning_rate": 9.428207844652466e-07, |
|
"loss": 0.0337, |
|
"reward": 0.611111119389534, |
|
"reward_std": 0.7063797116279602, |
|
"rewards/accuracy_reward": 0.180555559694767, |
|
"rewards/format_reward": 0.2500000074505806, |
|
"step": 144 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3047.5972900390625, |
|
"epoch": 0.24850042844901457, |
|
"grad_norm": 0.10790549218654633, |
|
"kl": 0.02679443359375, |
|
"learning_rate": 9.414975826861651e-07, |
|
"loss": -0.0015, |
|
"reward": 0.8194444347172976, |
|
"reward_std": 0.45097024738788605, |
|
"rewards/accuracy_reward": 0.2222222276031971, |
|
"rewards/format_reward": 0.37500000931322575, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2496.180633544922, |
|
"epoch": 0.25021422450728364, |
|
"grad_norm": 0.2421943098306656, |
|
"kl": 0.025360107421875, |
|
"learning_rate": 9.401603086112854e-07, |
|
"loss": 0.0966, |
|
"reward": 1.1666667014360428, |
|
"reward_std": 0.7269039005041122, |
|
"rewards/accuracy_reward": 0.3194444514811039, |
|
"rewards/format_reward": 0.5277777835726738, |
|
"step": 146 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3213.861083984375, |
|
"epoch": 0.2519280205655527, |
|
"grad_norm": 0.07842709124088287, |
|
"kl": 0.029083251953125, |
|
"learning_rate": 9.388090103086343e-07, |
|
"loss": 0.0114, |
|
"reward": 0.5833333358168602, |
|
"reward_std": 0.14478403329849243, |
|
"rewards/accuracy_reward": 0.1388888917863369, |
|
"rewards/format_reward": 0.3055555671453476, |
|
"step": 147 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3094.3611450195312, |
|
"epoch": 0.2536418166238218, |
|
"grad_norm": 0.08900830894708633, |
|
"kl": 0.02301025390625, |
|
"learning_rate": 9.374437363503368e-07, |
|
"loss": 0.0566, |
|
"reward": 0.5138888917863369, |
|
"reward_std": 0.17996380850672722, |
|
"rewards/accuracy_reward": 0.125, |
|
"rewards/format_reward": 0.2638888917863369, |
|
"step": 148 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2830.7361450195312, |
|
"epoch": 0.25535561268209084, |
|
"grad_norm": 0.5457747578620911, |
|
"kl": 0.05902099609375, |
|
"learning_rate": 9.360645358108695e-07, |
|
"loss": 0.0158, |
|
"reward": 1.0694444477558136, |
|
"reward_std": 0.6218058541417122, |
|
"rewards/accuracy_reward": 0.2916666716337204, |
|
"rewards/format_reward": 0.486111119389534, |
|
"step": 149 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2843.3333740234375, |
|
"epoch": 0.2570694087403599, |
|
"grad_norm": 0.3755730986595154, |
|
"kl": 0.02838134765625, |
|
"learning_rate": 9.34671458265297e-07, |
|
"loss": 0.1583, |
|
"reward": 1.2083333432674408, |
|
"reward_std": 0.6026154272258282, |
|
"rewards/accuracy_reward": 0.36111111380159855, |
|
"rewards/format_reward": 0.486111119389534, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3075.763916015625, |
|
"epoch": 0.258783204798629, |
|
"grad_norm": 0.418891578912735, |
|
"kl": 0.0416259765625, |
|
"learning_rate": 9.332645537874899e-07, |
|
"loss": 0.0858, |
|
"reward": 0.5277777947485447, |
|
"reward_std": 0.4114787243306637, |
|
"rewards/accuracy_reward": 0.09722222574055195, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 151 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3111.8056030273438, |
|
"epoch": 0.26049700085689803, |
|
"grad_norm": 0.24780075252056122, |
|
"kl": 0.03948974609375, |
|
"learning_rate": 9.318438729483249e-07, |
|
"loss": 0.0987, |
|
"reward": 0.6527777761220932, |
|
"reward_std": 0.5695527195930481, |
|
"rewards/accuracy_reward": 0.1944444477558136, |
|
"rewards/format_reward": 0.2638888992369175, |
|
"step": 152 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3033.763916015625, |
|
"epoch": 0.2622107969151671, |
|
"grad_norm": 0.31493860483169556, |
|
"kl": 0.036956787109375, |
|
"learning_rate": 9.304094668138669e-07, |
|
"loss": 0.0483, |
|
"reward": 0.7777777798473835, |
|
"reward_std": 0.7054093405604362, |
|
"rewards/accuracy_reward": 0.180555559694767, |
|
"rewards/format_reward": 0.4166666828095913, |
|
"step": 153 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2864.388916015625, |
|
"epoch": 0.2639245929734362, |
|
"grad_norm": 0.17504319548606873, |
|
"kl": 0.03851318359375, |
|
"learning_rate": 9.289613869435336e-07, |
|
"loss": 0.0353, |
|
"reward": 0.7083333432674408, |
|
"reward_std": 0.5801292359828949, |
|
"rewards/accuracy_reward": 0.19444444682449102, |
|
"rewards/format_reward": 0.31944444961845875, |
|
"step": 154 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3274.7222290039062, |
|
"epoch": 0.2656383890317052, |
|
"grad_norm": 0.130802720785141, |
|
"kl": 0.04754638671875, |
|
"learning_rate": 9.274996853882425e-07, |
|
"loss": 0.0159, |
|
"reward": 0.8472222350537777, |
|
"reward_std": 0.5149499326944351, |
|
"rewards/accuracy_reward": 0.25000000186264515, |
|
"rewards/format_reward": 0.3472222248092294, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2833.4444580078125, |
|
"epoch": 0.26735218508997427, |
|
"grad_norm": 0.24871142208576202, |
|
"kl": 0.04052734375, |
|
"learning_rate": 9.260244146885391e-07, |
|
"loss": 0.0251, |
|
"reward": 0.9861111305654049, |
|
"reward_std": 0.5550614818930626, |
|
"rewards/accuracy_reward": 0.2500000046566129, |
|
"rewards/format_reward": 0.486111112870276, |
|
"step": 156 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2903.638885498047, |
|
"epoch": 0.26906598114824337, |
|
"grad_norm": 0.158583402633667, |
|
"kl": 0.04901123046875, |
|
"learning_rate": 9.245356278727093e-07, |
|
"loss": -0.0172, |
|
"reward": 1.0555555894970894, |
|
"reward_std": 0.6695602312684059, |
|
"rewards/accuracy_reward": 0.30555555783212185, |
|
"rewards/format_reward": 0.4444444514811039, |
|
"step": 157 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2771.236114501953, |
|
"epoch": 0.2707797772065124, |
|
"grad_norm": 0.2626900374889374, |
|
"kl": 0.051513671875, |
|
"learning_rate": 9.230333784548726e-07, |
|
"loss": 0.0563, |
|
"reward": 0.8055555671453476, |
|
"reward_std": 0.6322761699557304, |
|
"rewards/accuracy_reward": 0.1805555559694767, |
|
"rewards/format_reward": 0.4444444477558136, |
|
"step": 158 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2995.0694580078125, |
|
"epoch": 0.27249357326478146, |
|
"grad_norm": 0.7491080164909363, |
|
"kl": 0.06500244140625, |
|
"learning_rate": 9.215177204330587e-07, |
|
"loss": 0.1298, |
|
"reward": 0.847222201526165, |
|
"reward_std": 0.8498382568359375, |
|
"rewards/accuracy_reward": 0.22222222667187452, |
|
"rewards/format_reward": 0.4027777835726738, |
|
"step": 159 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2974.9166870117188, |
|
"epoch": 0.27420736932305056, |
|
"grad_norm": 0.2567845284938812, |
|
"kl": 0.06201171875, |
|
"learning_rate": 9.199887082872672e-07, |
|
"loss": 0.0524, |
|
"reward": 0.9583333283662796, |
|
"reward_std": 0.5167308412492275, |
|
"rewards/accuracy_reward": 0.30555556155741215, |
|
"rewards/format_reward": 0.3472222285345197, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2585.77783203125, |
|
"epoch": 0.2759211653813196, |
|
"grad_norm": 0.2760615050792694, |
|
"kl": 0.0694580078125, |
|
"learning_rate": 9.184463969775083e-07, |
|
"loss": 0.0275, |
|
"reward": 0.9166666567325592, |
|
"reward_std": 0.587900884449482, |
|
"rewards/accuracy_reward": 0.30555556155741215, |
|
"rewards/format_reward": 0.30555556155741215, |
|
"step": 161 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2197.25, |
|
"epoch": 0.2776349614395887, |
|
"grad_norm": 0.5792744755744934, |
|
"kl": 0.0704345703125, |
|
"learning_rate": 9.168908419418278e-07, |
|
"loss": 0.0347, |
|
"reward": 0.611111119389534, |
|
"reward_std": 0.4948512986302376, |
|
"rewards/accuracy_reward": 0.15277778077870607, |
|
"rewards/format_reward": 0.3055555671453476, |
|
"step": 162 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2838.041748046875, |
|
"epoch": 0.27934875749785776, |
|
"grad_norm": 0.39986100792884827, |
|
"kl": 0.0880126953125, |
|
"learning_rate": 9.153220990943145e-07, |
|
"loss": 0.0897, |
|
"reward": 0.625, |
|
"reward_std": 0.6566977351903915, |
|
"rewards/accuracy_reward": 0.1944444477558136, |
|
"rewards/format_reward": 0.23611111287027597, |
|
"step": 163 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2120.2916870117188, |
|
"epoch": 0.2810625535561268, |
|
"grad_norm": 0.5093821287155151, |
|
"kl": 0.0892333984375, |
|
"learning_rate": 9.137402248230903e-07, |
|
"loss": 0.0789, |
|
"reward": 0.7777777835726738, |
|
"reward_std": 0.4542044475674629, |
|
"rewards/accuracy_reward": 0.236111119389534, |
|
"rewards/format_reward": 0.305555559694767, |
|
"step": 164 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2339.6666870117188, |
|
"epoch": 0.2827763496143959, |
|
"grad_norm": 0.3379420340061188, |
|
"kl": 0.095947265625, |
|
"learning_rate": 9.121452759882831e-07, |
|
"loss": -0.01, |
|
"reward": 1.0555555745959282, |
|
"reward_std": 0.7722257673740387, |
|
"rewards/accuracy_reward": 0.3472222285345197, |
|
"rewards/format_reward": 0.3611111119389534, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2644.361114501953, |
|
"epoch": 0.28449014567266495, |
|
"grad_norm": 0.4091556668281555, |
|
"kl": 0.105224609375, |
|
"learning_rate": 9.105373099199835e-07, |
|
"loss": 0.0473, |
|
"reward": 0.9166666716337204, |
|
"reward_std": 0.8636802136898041, |
|
"rewards/accuracy_reward": 0.2777777807787061, |
|
"rewards/format_reward": 0.361111112870276, |
|
"step": 166 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2401.9722595214844, |
|
"epoch": 0.286203941730934, |
|
"grad_norm": 0.6180943250656128, |
|
"kl": 0.1217041015625, |
|
"learning_rate": 9.08916384416183e-07, |
|
"loss": 0.1078, |
|
"reward": 1.0138888955116272, |
|
"reward_std": 1.0287161767482758, |
|
"rewards/accuracy_reward": 0.3888888955116272, |
|
"rewards/format_reward": 0.23611111659556627, |
|
"step": 167 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2471.9584045410156, |
|
"epoch": 0.2879177377892031, |
|
"grad_norm": 0.3897995352745056, |
|
"kl": 0.119384765625, |
|
"learning_rate": 9.072825577406981e-07, |
|
"loss": 0.0262, |
|
"reward": 1.0555555522441864, |
|
"reward_std": 0.5160078145563602, |
|
"rewards/accuracy_reward": 0.3333333432674408, |
|
"rewards/format_reward": 0.3888888917863369, |
|
"step": 168 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2073.916717529297, |
|
"epoch": 0.28963153384747214, |
|
"grad_norm": 0.74821537733078, |
|
"kl": 0.1395263671875, |
|
"learning_rate": 9.056358886210747e-07, |
|
"loss": 0.0509, |
|
"reward": 0.902777798473835, |
|
"reward_std": 0.7328798174858093, |
|
"rewards/accuracy_reward": 0.2916666688397527, |
|
"rewards/format_reward": 0.3194444514811039, |
|
"step": 169 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1999.2499694824219, |
|
"epoch": 0.2913453299057412, |
|
"grad_norm": 0.5790112018585205, |
|
"kl": 0.1368408203125, |
|
"learning_rate": 9.039764362464775e-07, |
|
"loss": -0.0285, |
|
"reward": 0.9444444701075554, |
|
"reward_std": 0.6172067150473595, |
|
"rewards/accuracy_reward": 0.34722222946584225, |
|
"rewards/format_reward": 0.2500000074505806, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2040.3472595214844, |
|
"epoch": 0.2930591259640103, |
|
"grad_norm": 0.6991499066352844, |
|
"kl": 0.13037109375, |
|
"learning_rate": 9.023042602655623e-07, |
|
"loss": -0.0061, |
|
"reward": 0.8750000186264515, |
|
"reward_std": 0.5527190193533897, |
|
"rewards/accuracy_reward": 0.2361111119389534, |
|
"rewards/format_reward": 0.40277778916060925, |
|
"step": 171 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2682.625, |
|
"epoch": 0.29477292202227934, |
|
"grad_norm": 0.4274706244468689, |
|
"kl": 0.1610107421875, |
|
"learning_rate": 9.00619420784333e-07, |
|
"loss": 0.0389, |
|
"reward": 0.8888888955116272, |
|
"reward_std": 0.9098298698663712, |
|
"rewards/accuracy_reward": 0.26388889737427235, |
|
"rewards/format_reward": 0.361111119389534, |
|
"step": 172 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2140.3333435058594, |
|
"epoch": 0.29648671808054844, |
|
"grad_norm": 1.2440991401672363, |
|
"kl": 0.156494140625, |
|
"learning_rate": 8.989219783639795e-07, |
|
"loss": 0.1369, |
|
"reward": 0.7916666865348816, |
|
"reward_std": 0.7525217533111572, |
|
"rewards/accuracy_reward": 0.1666666679084301, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 173 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2162.361083984375, |
|
"epoch": 0.2982005141388175, |
|
"grad_norm": 2.159301519393921, |
|
"kl": 0.1800537109375, |
|
"learning_rate": 8.972119940187017e-07, |
|
"loss": 0.1231, |
|
"reward": 0.6666666716337204, |
|
"reward_std": 0.6304600611329079, |
|
"rewards/accuracy_reward": 0.13888889364898205, |
|
"rewards/format_reward": 0.3888888955116272, |
|
"step": 174 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2967.8333740234375, |
|
"epoch": 0.29991431019708653, |
|
"grad_norm": 0.6146534085273743, |
|
"kl": 0.29296875, |
|
"learning_rate": 8.95489529213517e-07, |
|
"loss": 0.0622, |
|
"reward": 0.4027777872979641, |
|
"reward_std": 0.4733867570757866, |
|
"rewards/accuracy_reward": 0.0555555559694767, |
|
"rewards/format_reward": 0.2916666753590107, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2475.7777709960938, |
|
"epoch": 0.30162810625535563, |
|
"grad_norm": 0.9705228209495544, |
|
"kl": 0.2490234375, |
|
"learning_rate": 8.93754645862049e-07, |
|
"loss": 0.0784, |
|
"reward": 0.8750000149011612, |
|
"reward_std": 0.6140558868646622, |
|
"rewards/accuracy_reward": 0.1805555559694767, |
|
"rewards/format_reward": 0.5138888880610466, |
|
"step": 176 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3104.0139770507812, |
|
"epoch": 0.3033419023136247, |
|
"grad_norm": 0.41929054260253906, |
|
"kl": 0.32421875, |
|
"learning_rate": 8.920074063243045e-07, |
|
"loss": 0.0399, |
|
"reward": 0.5277777845039964, |
|
"reward_std": 0.3559510372579098, |
|
"rewards/accuracy_reward": 0.11111111287027597, |
|
"rewards/format_reward": 0.30555556435137987, |
|
"step": 177 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2869.6527709960938, |
|
"epoch": 0.3050556983718937, |
|
"grad_norm": 0.4340916574001312, |
|
"kl": 0.351318359375, |
|
"learning_rate": 8.902478734044297e-07, |
|
"loss": 0.0451, |
|
"reward": 0.8750000074505806, |
|
"reward_std": 0.37828588113188744, |
|
"rewards/accuracy_reward": 0.2222222238779068, |
|
"rewards/format_reward": 0.430555559694767, |
|
"step": 178 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3403.9444580078125, |
|
"epoch": 0.3067694944301628, |
|
"grad_norm": 0.5245348811149597, |
|
"kl": 0.4765625, |
|
"learning_rate": 8.884761103484547e-07, |
|
"loss": 0.0707, |
|
"reward": 0.5416666753590107, |
|
"reward_std": 0.47885870188474655, |
|
"rewards/accuracy_reward": 0.15277778450399637, |
|
"rewards/format_reward": 0.23611111380159855, |
|
"step": 179 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2249.7083740234375, |
|
"epoch": 0.30848329048843187, |
|
"grad_norm": 1.083481788635254, |
|
"kl": 0.35009765625, |
|
"learning_rate": 8.866921808420184e-07, |
|
"loss": 0.1105, |
|
"reward": 1.4305555820465088, |
|
"reward_std": 0.4940013214945793, |
|
"rewards/accuracy_reward": 0.3333333395421505, |
|
"rewards/format_reward": 0.763888880610466, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3113.90283203125, |
|
"epoch": 0.3101970865467009, |
|
"grad_norm": 0.5710152983665466, |
|
"kl": 0.525390625, |
|
"learning_rate": 8.848961490080805e-07, |
|
"loss": 0.0798, |
|
"reward": 0.7916666716337204, |
|
"reward_std": 0.4283023402094841, |
|
"rewards/accuracy_reward": 0.2222222238779068, |
|
"rewards/format_reward": 0.3472222313284874, |
|
"step": 181 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3370.9166870117188, |
|
"epoch": 0.31191088260497, |
|
"grad_norm": 0.744907557964325, |
|
"kl": 0.662109375, |
|
"learning_rate": 8.830880794046162e-07, |
|
"loss": 0.0642, |
|
"reward": 0.22222222853451967, |
|
"reward_std": 0.27216554805636406, |
|
"rewards/accuracy_reward": 0.013888888992369175, |
|
"rewards/format_reward": 0.19444444961845875, |
|
"step": 182 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3109.4861450195312, |
|
"epoch": 0.31362467866323906, |
|
"grad_norm": 0.9323793649673462, |
|
"kl": 0.5927734375, |
|
"learning_rate": 8.81268037022296e-07, |
|
"loss": 0.0847, |
|
"reward": 0.5972222201526165, |
|
"reward_std": 0.518895335495472, |
|
"rewards/accuracy_reward": 0.12500000279396772, |
|
"rewards/format_reward": 0.34722223225980997, |
|
"step": 183 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3036.0833129882812, |
|
"epoch": 0.31533847472150817, |
|
"grad_norm": 0.7612030506134033, |
|
"kl": 0.666015625, |
|
"learning_rate": 8.794360872821486e-07, |
|
"loss": 0.0732, |
|
"reward": 0.7777777686715126, |
|
"reward_std": 0.781570628285408, |
|
"rewards/accuracy_reward": 0.15277778171002865, |
|
"rewards/format_reward": 0.47222222574055195, |
|
"step": 184 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3103.138916015625, |
|
"epoch": 0.3170522707797772, |
|
"grad_norm": 0.8818280100822449, |
|
"kl": 0.7119140625, |
|
"learning_rate": 8.775922960332108e-07, |
|
"loss": 0.0613, |
|
"reward": 0.6388889141380787, |
|
"reward_std": 0.4765222370624542, |
|
"rewards/accuracy_reward": 0.13888888992369175, |
|
"rewards/format_reward": 0.3611111156642437, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2773.90283203125, |
|
"epoch": 0.31876606683804626, |
|
"grad_norm": 0.6143203377723694, |
|
"kl": 0.6201171875, |
|
"learning_rate": 8.757367295501594e-07, |
|
"loss": 0.0686, |
|
"reward": 0.791666679084301, |
|
"reward_std": 0.4058704674243927, |
|
"rewards/accuracy_reward": 0.1666666679084301, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 186 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2821.4583740234375, |
|
"epoch": 0.32047986289631536, |
|
"grad_norm": 2.1052632331848145, |
|
"kl": 0.5546875, |
|
"learning_rate": 8.738694545309298e-07, |
|
"loss": 0.1962, |
|
"reward": 0.7638888955116272, |
|
"reward_std": 0.47831880301237106, |
|
"rewards/accuracy_reward": 0.16666667349636555, |
|
"rewards/format_reward": 0.4305555634200573, |
|
"step": 187 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3268.40283203125, |
|
"epoch": 0.3221936589545844, |
|
"grad_norm": 0.7343023419380188, |
|
"kl": 0.6640625, |
|
"learning_rate": 8.719905380943182e-07, |
|
"loss": 0.0963, |
|
"reward": 0.3888888955116272, |
|
"reward_std": 0.5786792561411858, |
|
"rewards/accuracy_reward": 0.0555555559694767, |
|
"rewards/format_reward": 0.2777777872979641, |
|
"step": 188 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2894.0972900390625, |
|
"epoch": 0.32390745501285345, |
|
"grad_norm": 1.0675281286239624, |
|
"kl": 0.6748046875, |
|
"learning_rate": 8.701000477775687e-07, |
|
"loss": 0.1042, |
|
"reward": 0.4444444626569748, |
|
"reward_std": 0.4907895401120186, |
|
"rewards/accuracy_reward": 0.013888888992369175, |
|
"rewards/format_reward": 0.416666679084301, |
|
"step": 189 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3079.3056030273438, |
|
"epoch": 0.32562125107112255, |
|
"grad_norm": 0.7137811183929443, |
|
"kl": 0.6826171875, |
|
"learning_rate": 8.681980515339463e-07, |
|
"loss": 0.0989, |
|
"reward": 0.541666679084301, |
|
"reward_std": 0.5043292306363583, |
|
"rewards/accuracy_reward": 0.06944444589316845, |
|
"rewards/format_reward": 0.4027777835726738, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3071.763916015625, |
|
"epoch": 0.3273350471293916, |
|
"grad_norm": 0.7186347842216492, |
|
"kl": 0.6953125, |
|
"learning_rate": 8.662846177302938e-07, |
|
"loss": 0.0886, |
|
"reward": 0.5833333358168602, |
|
"reward_std": 0.5841793119907379, |
|
"rewards/accuracy_reward": 0.08333333488553762, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 191 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2750.3194580078125, |
|
"epoch": 0.32904884318766064, |
|
"grad_norm": 0.8925949931144714, |
|
"kl": 0.6806640625, |
|
"learning_rate": 8.643598151445749e-07, |
|
"loss": 0.1256, |
|
"reward": 1.0694444328546524, |
|
"reward_std": 0.6757483929395676, |
|
"rewards/accuracy_reward": 0.20833334047347307, |
|
"rewards/format_reward": 0.6527777910232544, |
|
"step": 192 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2392.611083984375, |
|
"epoch": 0.33076263924592975, |
|
"grad_norm": 0.5500184297561646, |
|
"kl": 0.59814453125, |
|
"learning_rate": 8.624237129634014e-07, |
|
"loss": 0.0753, |
|
"reward": 0.9861111082136631, |
|
"reward_std": 0.6858525797724724, |
|
"rewards/accuracy_reward": 0.2083333395421505, |
|
"rewards/format_reward": 0.5694444365799427, |
|
"step": 193 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2629.9305419921875, |
|
"epoch": 0.3324764353041988, |
|
"grad_norm": 1.4036595821380615, |
|
"kl": 0.8115234375, |
|
"learning_rate": 8.604763807795471e-07, |
|
"loss": 0.0248, |
|
"reward": 1.222222238779068, |
|
"reward_std": 0.7275054007768631, |
|
"rewards/accuracy_reward": 0.2500000046566129, |
|
"rewards/format_reward": 0.722222238779068, |
|
"step": 194 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2822.6527709960938, |
|
"epoch": 0.3341902313624679, |
|
"grad_norm": 0.9383707642555237, |
|
"kl": 0.7744140625, |
|
"learning_rate": 8.58517888589445e-07, |
|
"loss": 0.0842, |
|
"reward": 0.9583333358168602, |
|
"reward_std": 0.6252798363566399, |
|
"rewards/accuracy_reward": 0.16666666977107525, |
|
"rewards/format_reward": 0.6250000037252903, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2831.65283203125, |
|
"epoch": 0.33590402742073694, |
|
"grad_norm": 0.9345480799674988, |
|
"kl": 0.7021484375, |
|
"learning_rate": 8.56548306790673e-07, |
|
"loss": 0.1095, |
|
"reward": 0.9444444626569748, |
|
"reward_std": 0.6248819530010223, |
|
"rewards/accuracy_reward": 0.13888888899236917, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 196 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2750.7777709960938, |
|
"epoch": 0.337617823479006, |
|
"grad_norm": 0.6670540571212769, |
|
"kl": 0.685546875, |
|
"learning_rate": 8.54567706179422e-07, |
|
"loss": 0.1075, |
|
"reward": 0.9861111119389534, |
|
"reward_std": 0.5248102322220802, |
|
"rewards/accuracy_reward": 0.180555559694767, |
|
"rewards/format_reward": 0.6250000074505806, |
|
"step": 197 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2877.9861450195312, |
|
"epoch": 0.3393316195372751, |
|
"grad_norm": 0.6243178844451904, |
|
"kl": 0.7509765625, |
|
"learning_rate": 8.525761579479519e-07, |
|
"loss": 0.0843, |
|
"reward": 1.0555555671453476, |
|
"reward_std": 0.4111253023147583, |
|
"rewards/accuracy_reward": 0.19444444868713617, |
|
"rewards/format_reward": 0.6666666865348816, |
|
"step": 198 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2674.138916015625, |
|
"epoch": 0.34104541559554413, |
|
"grad_norm": 1.4835889339447021, |
|
"kl": 0.7109375, |
|
"learning_rate": 8.505737336820326e-07, |
|
"loss": 0.1377, |
|
"reward": 0.875, |
|
"reward_std": 0.4817770943045616, |
|
"rewards/accuracy_reward": 0.1111111156642437, |
|
"rewards/format_reward": 0.6527777761220932, |
|
"step": 199 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2610.1666870117188, |
|
"epoch": 0.3427592116538132, |
|
"grad_norm": 1.3861608505249023, |
|
"kl": 0.7060546875, |
|
"learning_rate": 8.485605053583704e-07, |
|
"loss": 0.1008, |
|
"reward": 0.986111119389534, |
|
"reward_std": 0.6297199055552483, |
|
"rewards/accuracy_reward": 0.15277777798473835, |
|
"rewards/format_reward": 0.6805555671453476, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2719.513916015625, |
|
"epoch": 0.3444730077120823, |
|
"grad_norm": 1.214066982269287, |
|
"kl": 0.755859375, |
|
"learning_rate": 8.465365453420214e-07, |
|
"loss": 0.0994, |
|
"reward": 1.097222238779068, |
|
"reward_std": 0.511199101805687, |
|
"rewards/accuracy_reward": 0.1388888917863369, |
|
"rewards/format_reward": 0.8194444477558136, |
|
"step": 201 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2794.7500610351562, |
|
"epoch": 0.3461868037703513, |
|
"grad_norm": 0.9259114861488342, |
|
"kl": 0.85546875, |
|
"learning_rate": 8.445019263837897e-07, |
|
"loss": 0.0963, |
|
"reward": 1.2083333730697632, |
|
"reward_std": 0.45388972014188766, |
|
"rewards/accuracy_reward": 0.2500000009313226, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 202 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2657.65283203125, |
|
"epoch": 0.34790059982862037, |
|
"grad_norm": 0.9675770998001099, |
|
"kl": 0.9267578125, |
|
"learning_rate": 8.42456721617613e-07, |
|
"loss": 0.1232, |
|
"reward": 1.2500000149011612, |
|
"reward_std": 0.8653819561004639, |
|
"rewards/accuracy_reward": 0.23611111659556627, |
|
"rewards/format_reward": 0.7777777761220932, |
|
"step": 203 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2665.486083984375, |
|
"epoch": 0.3496143958868895, |
|
"grad_norm": 1.2522883415222168, |
|
"kl": 0.87109375, |
|
"learning_rate": 8.404010045579339e-07, |
|
"loss": 0.0604, |
|
"reward": 0.8888888955116272, |
|
"reward_std": 0.5926736369729042, |
|
"rewards/accuracy_reward": 0.09722222574055195, |
|
"rewards/format_reward": 0.6944444626569748, |
|
"step": 204 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2659.9722290039062, |
|
"epoch": 0.3513281919451585, |
|
"grad_norm": 1.2460371255874634, |
|
"kl": 0.9208984375, |
|
"learning_rate": 8.383348490970566e-07, |
|
"loss": 0.143, |
|
"reward": 1.3611111491918564, |
|
"reward_std": 0.7807096689939499, |
|
"rewards/accuracy_reward": 0.26388889644294977, |
|
"rewards/format_reward": 0.8333333283662796, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2711.4445190429688, |
|
"epoch": 0.35304198800342756, |
|
"grad_norm": 1.2466338872909546, |
|
"kl": 0.9326171875, |
|
"learning_rate": 8.362583295024916e-07, |
|
"loss": 0.0888, |
|
"reward": 0.8888888955116272, |
|
"reward_std": 0.4819314032793045, |
|
"rewards/accuracy_reward": 0.055555556900799274, |
|
"rewards/format_reward": 0.7777777761220932, |
|
"step": 206 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2571.138916015625, |
|
"epoch": 0.35475578406169667, |
|
"grad_norm": 0.7316084504127502, |
|
"kl": 0.8232421875, |
|
"learning_rate": 8.341715204142854e-07, |
|
"loss": 0.0954, |
|
"reward": 1.1527777761220932, |
|
"reward_std": 0.5541091933846474, |
|
"rewards/accuracy_reward": 0.22222223225980997, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 207 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2744.5556030273438, |
|
"epoch": 0.3564695801199657, |
|
"grad_norm": 0.9308451414108276, |
|
"kl": 0.8291015625, |
|
"learning_rate": 8.320744968423391e-07, |
|
"loss": 0.0961, |
|
"reward": 1.1666667014360428, |
|
"reward_std": 0.4660206064581871, |
|
"rewards/accuracy_reward": 0.15277777798473835, |
|
"rewards/format_reward": 0.861111119389534, |
|
"step": 208 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2710.3055419921875, |
|
"epoch": 0.3581833761782348, |
|
"grad_norm": 0.861219584941864, |
|
"kl": 0.826171875, |
|
"learning_rate": 8.299673341637108e-07, |
|
"loss": 0.0934, |
|
"reward": 0.9722222238779068, |
|
"reward_std": 0.5195175558328629, |
|
"rewards/accuracy_reward": 0.06944444589316845, |
|
"rewards/format_reward": 0.8333333134651184, |
|
"step": 209 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2682.2361450195312, |
|
"epoch": 0.35989717223650386, |
|
"grad_norm": 0.9901695251464844, |
|
"kl": 0.705078125, |
|
"learning_rate": 8.278501081199061e-07, |
|
"loss": 0.0675, |
|
"reward": 1.0277778059244156, |
|
"reward_std": 0.5829172991216183, |
|
"rewards/accuracy_reward": 0.09722222294658422, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3082.5833129882812, |
|
"epoch": 0.3616109682947729, |
|
"grad_norm": 0.9780495762825012, |
|
"kl": 0.740234375, |
|
"learning_rate": 8.257228948141567e-07, |
|
"loss": 0.0635, |
|
"reward": 0.8750000149011612, |
|
"reward_std": 0.4285326674580574, |
|
"rewards/accuracy_reward": 0.041666666977107525, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 211 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2808.0, |
|
"epoch": 0.363324764353042, |
|
"grad_norm": 0.7250691652297974, |
|
"kl": 0.6376953125, |
|
"learning_rate": 8.23585770708684e-07, |
|
"loss": 0.0783, |
|
"reward": 1.4861111342906952, |
|
"reward_std": 0.5571966022253036, |
|
"rewards/accuracy_reward": 0.3472222350537777, |
|
"rewards/format_reward": 0.7916666567325592, |
|
"step": 212 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3058.8611450195312, |
|
"epoch": 0.36503856041131105, |
|
"grad_norm": 0.9827658534049988, |
|
"kl": 0.619140625, |
|
"learning_rate": 8.214388126219512e-07, |
|
"loss": 0.0416, |
|
"reward": 0.9583333432674408, |
|
"reward_std": 0.6335423514246941, |
|
"rewards/accuracy_reward": 0.11111111380159855, |
|
"rewards/format_reward": 0.7361111044883728, |
|
"step": 213 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3149.3333740234375, |
|
"epoch": 0.3667523564695801, |
|
"grad_norm": 0.5304873585700989, |
|
"kl": 0.59521484375, |
|
"learning_rate": 8.192820977259012e-07, |
|
"loss": 0.0755, |
|
"reward": 0.8472222238779068, |
|
"reward_std": 0.5018017217516899, |
|
"rewards/accuracy_reward": 0.09722222574055195, |
|
"rewards/format_reward": 0.6527777761220932, |
|
"step": 214 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3244.77783203125, |
|
"epoch": 0.3684661525278492, |
|
"grad_norm": 0.5707263350486755, |
|
"kl": 0.53076171875, |
|
"learning_rate": 8.17115703543184e-07, |
|
"loss": 0.0701, |
|
"reward": 1.1388888955116272, |
|
"reward_std": 0.9169216901063919, |
|
"rewards/accuracy_reward": 0.23611112032085657, |
|
"rewards/format_reward": 0.6666666641831398, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3028.6528930664062, |
|
"epoch": 0.37017994858611825, |
|
"grad_norm": 0.5431216359138489, |
|
"kl": 0.4140625, |
|
"learning_rate": 8.149397079443693e-07, |
|
"loss": 0.0184, |
|
"reward": 1.0, |
|
"reward_std": 0.5379247963428497, |
|
"rewards/accuracy_reward": 0.1666666716337204, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 216 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2745.5972900390625, |
|
"epoch": 0.3718937446443873, |
|
"grad_norm": 0.5233212113380432, |
|
"kl": 0.376953125, |
|
"learning_rate": 8.127541891451473e-07, |
|
"loss": 0.0288, |
|
"reward": 1.1388889104127884, |
|
"reward_std": 0.37009186670184135, |
|
"rewards/accuracy_reward": 0.19444444868713617, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 217 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2928.5972290039062, |
|
"epoch": 0.3736075407026564, |
|
"grad_norm": 0.640915036201477, |
|
"kl": 0.35791015625, |
|
"learning_rate": 8.105592257035178e-07, |
|
"loss": 0.0105, |
|
"reward": 1.2083333134651184, |
|
"reward_std": 0.3840879425406456, |
|
"rewards/accuracy_reward": 0.23611111752688885, |
|
"rewards/format_reward": 0.736111119389534, |
|
"step": 218 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3421.0972290039062, |
|
"epoch": 0.37532133676092544, |
|
"grad_norm": 0.39103758335113525, |
|
"kl": 0.4453125, |
|
"learning_rate": 8.083548965169663e-07, |
|
"loss": 0.0357, |
|
"reward": 0.666666679084301, |
|
"reward_std": 0.63420694693923, |
|
"rewards/accuracy_reward": 0.06944444496184587, |
|
"rewards/format_reward": 0.5277777910232544, |
|
"step": 219 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3180.263916015625, |
|
"epoch": 0.37703513281919454, |
|
"grad_norm": 0.5637526512145996, |
|
"kl": 0.385009765625, |
|
"learning_rate": 8.061412808196279e-07, |
|
"loss": 0.0577, |
|
"reward": 0.7916666939854622, |
|
"reward_std": 0.5208309143781662, |
|
"rewards/accuracy_reward": 0.0972222238779068, |
|
"rewards/format_reward": 0.5972222313284874, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3347.8750610351562, |
|
"epoch": 0.3787489288774636, |
|
"grad_norm": 0.6664404273033142, |
|
"kl": 0.38037109375, |
|
"learning_rate": 8.039184581794389e-07, |
|
"loss": 0.0478, |
|
"reward": 0.6527777761220932, |
|
"reward_std": 0.6700074002146721, |
|
"rewards/accuracy_reward": 0.08333333395421505, |
|
"rewards/format_reward": 0.486111119389534, |
|
"step": 221 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2943.40283203125, |
|
"epoch": 0.38046272493573263, |
|
"grad_norm": 0.4021422863006592, |
|
"kl": 0.302490234375, |
|
"learning_rate": 8.016865084952783e-07, |
|
"loss": 0.0219, |
|
"reward": 1.2083333358168602, |
|
"reward_std": 0.29920290410518646, |
|
"rewards/accuracy_reward": 0.2500000074505806, |
|
"rewards/format_reward": 0.7083333358168602, |
|
"step": 222 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3259.875, |
|
"epoch": 0.38217652099400173, |
|
"grad_norm": 0.33430805802345276, |
|
"kl": 0.34228515625, |
|
"learning_rate": 7.994455119940934e-07, |
|
"loss": 0.0304, |
|
"reward": 0.6527777872979641, |
|
"reward_std": 0.27941545471549034, |
|
"rewards/accuracy_reward": 0.0833333358168602, |
|
"rewards/format_reward": 0.4861111156642437, |
|
"step": 223 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3342.0694580078125, |
|
"epoch": 0.3838903170522708, |
|
"grad_norm": 0.6777502298355103, |
|
"kl": 0.40234375, |
|
"learning_rate": 7.971955492280181e-07, |
|
"loss": 0.0796, |
|
"reward": 0.5277777835726738, |
|
"reward_std": 0.5462377443909645, |
|
"rewards/accuracy_reward": 0.041666666977107525, |
|
"rewards/format_reward": 0.4444444552063942, |
|
"step": 224 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3149.8194580078125, |
|
"epoch": 0.3856041131105398, |
|
"grad_norm": 0.5160468220710754, |
|
"kl": 0.340087890625, |
|
"learning_rate": 7.949367010714766e-07, |
|
"loss": 0.0212, |
|
"reward": 1.2500000149011612, |
|
"reward_std": 0.7483058720827103, |
|
"rewards/accuracy_reward": 0.2500000046566129, |
|
"rewards/format_reward": 0.75, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3071.7777709960938, |
|
"epoch": 0.3873179091688089, |
|
"grad_norm": 0.4626597762107849, |
|
"kl": 0.4052734375, |
|
"learning_rate": 7.926690487182766e-07, |
|
"loss": 0.016, |
|
"reward": 0.8194444552063942, |
|
"reward_std": 0.5300310403108597, |
|
"rewards/accuracy_reward": 0.06944444589316845, |
|
"rewards/format_reward": 0.6805555447936058, |
|
"step": 226 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3198.5277709960938, |
|
"epoch": 0.389031705227078, |
|
"grad_norm": 0.5830844640731812, |
|
"kl": 0.3916015625, |
|
"learning_rate": 7.903926736786907e-07, |
|
"loss": 0.0462, |
|
"reward": 0.8611111268401146, |
|
"reward_std": 0.7832111120223999, |
|
"rewards/accuracy_reward": 0.15277778264135122, |
|
"rewards/format_reward": 0.5555555559694767, |
|
"step": 227 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3048.5556030273438, |
|
"epoch": 0.390745501285347, |
|
"grad_norm": 0.43366798758506775, |
|
"kl": 0.3828125, |
|
"learning_rate": 7.881076577765265e-07, |
|
"loss": 0.024, |
|
"reward": 1.3472222238779068, |
|
"reward_std": 0.5785843208432198, |
|
"rewards/accuracy_reward": 0.3333333292976022, |
|
"rewards/format_reward": 0.6805555522441864, |
|
"step": 228 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3198.2777709960938, |
|
"epoch": 0.3924592973436161, |
|
"grad_norm": 0.721635103225708, |
|
"kl": 0.421875, |
|
"learning_rate": 7.858140831461858e-07, |
|
"loss": 0.0741, |
|
"reward": 1.0694444626569748, |
|
"reward_std": 0.6419415846467018, |
|
"rewards/accuracy_reward": 0.2222222276031971, |
|
"rewards/format_reward": 0.625, |
|
"step": 229 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3199.9306030273438, |
|
"epoch": 0.39417309340188517, |
|
"grad_norm": 0.5975983738899231, |
|
"kl": 0.50439453125, |
|
"learning_rate": 7.835120322297115e-07, |
|
"loss": 0.0397, |
|
"reward": 1.125, |
|
"reward_std": 0.6707755327224731, |
|
"rewards/accuracy_reward": 0.2500000037252903, |
|
"rewards/format_reward": 0.625, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3298.90283203125, |
|
"epoch": 0.39588688946015427, |
|
"grad_norm": 0.5230339169502258, |
|
"kl": 0.44140625, |
|
"learning_rate": 7.812015877738252e-07, |
|
"loss": 0.0553, |
|
"reward": 1.0972222238779068, |
|
"reward_std": 0.7510210201144218, |
|
"rewards/accuracy_reward": 0.2500000074505806, |
|
"rewards/format_reward": 0.5972222238779068, |
|
"step": 231 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2978.625, |
|
"epoch": 0.3976006855184233, |
|
"grad_norm": 0.4864640533924103, |
|
"kl": 0.42626953125, |
|
"learning_rate": 7.788828328269524e-07, |
|
"loss": 0.04, |
|
"reward": 1.3888889104127884, |
|
"reward_std": 0.4389140121638775, |
|
"rewards/accuracy_reward": 0.3194444552063942, |
|
"rewards/format_reward": 0.75, |
|
"step": 232 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2690.7083129882812, |
|
"epoch": 0.39931448157669236, |
|
"grad_norm": 0.41488051414489746, |
|
"kl": 0.330078125, |
|
"learning_rate": 7.765558507362374e-07, |
|
"loss": 0.0505, |
|
"reward": 1.6111111342906952, |
|
"reward_std": 0.6035833917558193, |
|
"rewards/accuracy_reward": 0.3888888992369175, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 233 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3084.0833740234375, |
|
"epoch": 0.40102827763496146, |
|
"grad_norm": 0.9714931845664978, |
|
"kl": 0.52490234375, |
|
"learning_rate": 7.742207251445473e-07, |
|
"loss": 0.0225, |
|
"reward": 0.7083333320915699, |
|
"reward_std": 0.3626026399433613, |
|
"rewards/accuracy_reward": 0.11111111287027597, |
|
"rewards/format_reward": 0.4861111231148243, |
|
"step": 234 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3456.4722290039062, |
|
"epoch": 0.4027420736932305, |
|
"grad_norm": 0.6735730767250061, |
|
"kl": 0.5576171875, |
|
"learning_rate": 7.718775399874654e-07, |
|
"loss": 0.0413, |
|
"reward": 0.7638889029622078, |
|
"reward_std": 0.6374682560563087, |
|
"rewards/accuracy_reward": 0.12500000465661287, |
|
"rewards/format_reward": 0.5138889029622078, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3396.0277709960938, |
|
"epoch": 0.40445586975149955, |
|
"grad_norm": 0.46086516976356506, |
|
"kl": 0.51416015625, |
|
"learning_rate": 7.69526379490275e-07, |
|
"loss": 0.0611, |
|
"reward": 0.7777777761220932, |
|
"reward_std": 0.7557945251464844, |
|
"rewards/accuracy_reward": 0.12500000279396772, |
|
"rewards/format_reward": 0.5277777910232544, |
|
"step": 236 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2660.041717529297, |
|
"epoch": 0.40616966580976865, |
|
"grad_norm": 0.49919629096984863, |
|
"kl": 0.375244140625, |
|
"learning_rate": 7.671673281649303e-07, |
|
"loss": 0.0153, |
|
"reward": 0.9305555373430252, |
|
"reward_std": 0.4616476893424988, |
|
"rewards/accuracy_reward": 0.1388888917863369, |
|
"rewards/format_reward": 0.6527777761220932, |
|
"step": 237 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3412.5833740234375, |
|
"epoch": 0.4078834618680377, |
|
"grad_norm": 0.4641401171684265, |
|
"kl": 0.44921875, |
|
"learning_rate": 7.648004708070207e-07, |
|
"loss": 0.0416, |
|
"reward": 0.9166666716337204, |
|
"reward_std": 0.6401529386639595, |
|
"rewards/accuracy_reward": 0.15277778077870607, |
|
"rewards/format_reward": 0.6111111044883728, |
|
"step": 238 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3040.27783203125, |
|
"epoch": 0.40959725792630675, |
|
"grad_norm": 0.3767884373664856, |
|
"kl": 0.4150390625, |
|
"learning_rate": 7.624258924927209e-07, |
|
"loss": 0.0663, |
|
"reward": 1.0555555745959282, |
|
"reward_std": 0.6340665742754936, |
|
"rewards/accuracy_reward": 0.26388890016824007, |
|
"rewards/format_reward": 0.5277777835726738, |
|
"step": 239 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2810.4583740234375, |
|
"epoch": 0.41131105398457585, |
|
"grad_norm": 0.7160300612449646, |
|
"kl": 0.27001953125, |
|
"learning_rate": 7.600436785757339e-07, |
|
"loss": 0.0574, |
|
"reward": 1.3194444477558136, |
|
"reward_std": 0.5911112986505032, |
|
"rewards/accuracy_reward": 0.2916666753590107, |
|
"rewards/format_reward": 0.736111119389534, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2855.4444885253906, |
|
"epoch": 0.4130248500428449, |
|
"grad_norm": 0.6084269285202026, |
|
"kl": 0.359375, |
|
"learning_rate": 7.57653914684223e-07, |
|
"loss": 0.0512, |
|
"reward": 1.0000000447034836, |
|
"reward_std": 0.5819622427225113, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.5, |
|
"step": 241 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3339.5694580078125, |
|
"epoch": 0.414738646101114, |
|
"grad_norm": 0.9977325201034546, |
|
"kl": 0.34228515625, |
|
"learning_rate": 7.552566867177336e-07, |
|
"loss": 0.0482, |
|
"reward": 1.0277778059244156, |
|
"reward_std": 0.8119450323283672, |
|
"rewards/accuracy_reward": 0.2500000037252903, |
|
"rewards/format_reward": 0.5277777910232544, |
|
"step": 242 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3133.416748046875, |
|
"epoch": 0.41645244215938304, |
|
"grad_norm": 0.47933000326156616, |
|
"kl": 0.3671875, |
|
"learning_rate": 7.528520808441057e-07, |
|
"loss": 0.0389, |
|
"reward": 1.0138888880610466, |
|
"reward_std": 0.8036867156624794, |
|
"rewards/accuracy_reward": 0.19444444496184587, |
|
"rewards/format_reward": 0.6250000074505806, |
|
"step": 243 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2973.013916015625, |
|
"epoch": 0.4181662382176521, |
|
"grad_norm": 0.7056116461753845, |
|
"kl": 0.3857421875, |
|
"learning_rate": 7.504401834963763e-07, |
|
"loss": 0.0406, |
|
"reward": 1.0416666716337204, |
|
"reward_std": 0.8239623233675957, |
|
"rewards/accuracy_reward": 0.23611111007630825, |
|
"rewards/format_reward": 0.569444440305233, |
|
"step": 244 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2305.291717529297, |
|
"epoch": 0.4198800342759212, |
|
"grad_norm": 0.8029626607894897, |
|
"kl": 0.284912109375, |
|
"learning_rate": 7.480210813696732e-07, |
|
"loss": 0.072, |
|
"reward": 1.5833333432674408, |
|
"reward_std": 0.43726158887147903, |
|
"rewards/accuracy_reward": 0.4027777789160609, |
|
"rewards/format_reward": 0.7777777910232544, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3021.0000610351562, |
|
"epoch": 0.42159383033419023, |
|
"grad_norm": 0.5367264151573181, |
|
"kl": 0.51416015625, |
|
"learning_rate": 7.455948614180983e-07, |
|
"loss": 0.069, |
|
"reward": 0.9305555447936058, |
|
"reward_std": 0.6338964849710464, |
|
"rewards/accuracy_reward": 0.25000000931322575, |
|
"rewards/format_reward": 0.4305555559694767, |
|
"step": 246 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3298.6250610351562, |
|
"epoch": 0.4233076263924593, |
|
"grad_norm": 0.7639601826667786, |
|
"kl": 0.70458984375, |
|
"learning_rate": 7.431616108516021e-07, |
|
"loss": 0.0682, |
|
"reward": 0.625, |
|
"reward_std": 0.7283531203866005, |
|
"rewards/accuracy_reward": 0.11111111287027597, |
|
"rewards/format_reward": 0.4027777835726738, |
|
"step": 247 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2943.1944580078125, |
|
"epoch": 0.4250214224507284, |
|
"grad_norm": 0.7967272400856018, |
|
"kl": 0.626953125, |
|
"learning_rate": 7.407214171328491e-07, |
|
"loss": 0.0337, |
|
"reward": 0.8055555671453476, |
|
"reward_std": 0.5930216982960701, |
|
"rewards/accuracy_reward": 0.1944444477558136, |
|
"rewards/format_reward": 0.416666679084301, |
|
"step": 248 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3075.5833740234375, |
|
"epoch": 0.4267352185089974, |
|
"grad_norm": 0.9339537024497986, |
|
"kl": 0.60400390625, |
|
"learning_rate": 7.382743679740741e-07, |
|
"loss": 0.0485, |
|
"reward": 0.6805555522441864, |
|
"reward_std": 0.3991912603378296, |
|
"rewards/accuracy_reward": 0.1250000037252903, |
|
"rewards/format_reward": 0.4305555671453476, |
|
"step": 249 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3042.9722900390625, |
|
"epoch": 0.4284490145672665, |
|
"grad_norm": 0.6018530130386353, |
|
"kl": 0.580078125, |
|
"learning_rate": 7.358205513339286e-07, |
|
"loss": 0.0693, |
|
"reward": 1.0138888955116272, |
|
"reward_std": 0.9293834567070007, |
|
"rewards/accuracy_reward": 0.2361111156642437, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 250 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3095.1527709960938, |
|
"epoch": 0.4301628106255356, |
|
"grad_norm": 0.6325446367263794, |
|
"kl": 0.607421875, |
|
"learning_rate": 7.333600554143203e-07, |
|
"loss": 0.1134, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 0.7987345978617668, |
|
"rewards/accuracy_reward": 0.20833333767950535, |
|
"rewards/format_reward": 0.5000000074505806, |
|
"step": 251 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3202.1111450195312, |
|
"epoch": 0.4318766066838046, |
|
"grad_norm": 0.7408186793327332, |
|
"kl": 0.6162109375, |
|
"learning_rate": 7.308929686572423e-07, |
|
"loss": 0.0686, |
|
"reward": 0.680555559694767, |
|
"reward_std": 0.486115962266922, |
|
"rewards/accuracy_reward": 0.12500000186264515, |
|
"rewards/format_reward": 0.4305555634200573, |
|
"step": 252 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3191.1250610351562, |
|
"epoch": 0.43359040274207367, |
|
"grad_norm": 0.5966067910194397, |
|
"kl": 0.51708984375, |
|
"learning_rate": 7.284193797415932e-07, |
|
"loss": 0.0627, |
|
"reward": 1.0555555671453476, |
|
"reward_std": 0.8806970864534378, |
|
"rewards/accuracy_reward": 0.2638888955116272, |
|
"rewards/format_reward": 0.5277777910232544, |
|
"step": 253 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3091.638916015625, |
|
"epoch": 0.43530419880034277, |
|
"grad_norm": 0.6850226521492004, |
|
"kl": 0.4912109375, |
|
"learning_rate": 7.25939377579991e-07, |
|
"loss": 0.0783, |
|
"reward": 0.9305555745959282, |
|
"reward_std": 0.7012817077338696, |
|
"rewards/accuracy_reward": 0.2361111156642437, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 254 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2927.9166870117188, |
|
"epoch": 0.4370179948586118, |
|
"grad_norm": 0.7290365099906921, |
|
"kl": 0.47216796875, |
|
"learning_rate": 7.234530513155761e-07, |
|
"loss": 0.0949, |
|
"reward": 0.9027777910232544, |
|
"reward_std": 0.7394577413797379, |
|
"rewards/accuracy_reward": 0.180555559694767, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 255 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2891.5277709960938, |
|
"epoch": 0.4387317909168809, |
|
"grad_norm": 0.7196898460388184, |
|
"kl": 0.38330078125, |
|
"learning_rate": 7.209604903188073e-07, |
|
"loss": 0.051, |
|
"reward": 0.8888888955116272, |
|
"reward_std": 0.6187723875045776, |
|
"rewards/accuracy_reward": 0.15277778171002865, |
|
"rewards/format_reward": 0.5833333283662796, |
|
"step": 256 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2936.9583129882812, |
|
"epoch": 0.44044558697514996, |
|
"grad_norm": 0.7010894417762756, |
|
"kl": 0.48193359375, |
|
"learning_rate": 7.184617841842498e-07, |
|
"loss": 0.1164, |
|
"reward": 0.8888888955116272, |
|
"reward_std": 0.7065365388989449, |
|
"rewards/accuracy_reward": 0.1944444514811039, |
|
"rewards/format_reward": 0.5, |
|
"step": 257 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3047.013916015625, |
|
"epoch": 0.442159383033419, |
|
"grad_norm": 0.5052915811538696, |
|
"kl": 0.4599609375, |
|
"learning_rate": 7.159570227273543e-07, |
|
"loss": 0.0301, |
|
"reward": 0.7916666641831398, |
|
"reward_std": 0.5421573370695114, |
|
"rewards/accuracy_reward": 0.1388888917863369, |
|
"rewards/format_reward": 0.5138888843357563, |
|
"step": 258 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2858.2084350585938, |
|
"epoch": 0.4438731790916881, |
|
"grad_norm": 1.2074464559555054, |
|
"kl": 0.42578125, |
|
"learning_rate": 7.134462959812286e-07, |
|
"loss": 0.1245, |
|
"reward": 1.1388889104127884, |
|
"reward_std": 0.7573273852467537, |
|
"rewards/accuracy_reward": 0.2638888955116272, |
|
"rewards/format_reward": 0.6111111044883728, |
|
"step": 259 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2916.7222290039062, |
|
"epoch": 0.44558697514995715, |
|
"grad_norm": 0.4421754777431488, |
|
"kl": 0.47509765625, |
|
"learning_rate": 7.10929694193402e-07, |
|
"loss": 0.0576, |
|
"reward": 1.1388888955116272, |
|
"reward_std": 0.7476281188428402, |
|
"rewards/accuracy_reward": 0.2500000037252903, |
|
"rewards/format_reward": 0.6388888955116272, |
|
"step": 260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2938.0972290039062, |
|
"epoch": 0.4473007712082262, |
|
"grad_norm": 0.6367775201797485, |
|
"kl": 0.537109375, |
|
"learning_rate": 7.084073078225803e-07, |
|
"loss": 0.0963, |
|
"reward": 0.972222238779068, |
|
"reward_std": 0.6995532214641571, |
|
"rewards/accuracy_reward": 0.19444444868713617, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 261 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2852.40283203125, |
|
"epoch": 0.4490145672664953, |
|
"grad_norm": 1.2071142196655273, |
|
"kl": 0.5927734375, |
|
"learning_rate": 7.05879227535395e-07, |
|
"loss": 0.0551, |
|
"reward": 1.208333358168602, |
|
"reward_std": 0.6593898758292198, |
|
"rewards/accuracy_reward": 0.33333333767950535, |
|
"rewards/format_reward": 0.5416666641831398, |
|
"step": 262 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2939.77783203125, |
|
"epoch": 0.45072836332476435, |
|
"grad_norm": 1.2289506196975708, |
|
"kl": 0.587890625, |
|
"learning_rate": 7.033455442031449e-07, |
|
"loss": 0.174, |
|
"reward": 0.8055555671453476, |
|
"reward_std": 0.6164307221770287, |
|
"rewards/accuracy_reward": 0.15277778171002865, |
|
"rewards/format_reward": 0.5, |
|
"step": 263 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2960.6806030273438, |
|
"epoch": 0.4524421593830334, |
|
"grad_norm": 0.7730751037597656, |
|
"kl": 0.68212890625, |
|
"learning_rate": 7.008063488985282e-07, |
|
"loss": 0.0753, |
|
"reward": 0.7083333283662796, |
|
"reward_std": 0.5493002012372017, |
|
"rewards/accuracy_reward": 0.09722222480922937, |
|
"rewards/format_reward": 0.5138888955116272, |
|
"step": 264 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3236.7500610351562, |
|
"epoch": 0.4541559554413025, |
|
"grad_norm": 0.514389157295227, |
|
"kl": 0.73046875, |
|
"learning_rate": 6.9826173289237e-07, |
|
"loss": 0.1, |
|
"reward": 1.111111119389534, |
|
"reward_std": 0.9851955845952034, |
|
"rewards/accuracy_reward": 0.31944444216787815, |
|
"rewards/format_reward": 0.4722222313284874, |
|
"step": 265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3163.75, |
|
"epoch": 0.45586975149957154, |
|
"grad_norm": 0.6007187366485596, |
|
"kl": 0.7626953125, |
|
"learning_rate": 6.957117876503413e-07, |
|
"loss": 0.1295, |
|
"reward": 0.694444440305233, |
|
"reward_std": 0.7118404507637024, |
|
"rewards/accuracy_reward": 0.16666666977107525, |
|
"rewards/format_reward": 0.3611111231148243, |
|
"step": 266 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3049.6111450195312, |
|
"epoch": 0.45758354755784064, |
|
"grad_norm": 0.9735589027404785, |
|
"kl": 0.712890625, |
|
"learning_rate": 6.931566048296717e-07, |
|
"loss": 0.0762, |
|
"reward": 0.8750000149011612, |
|
"reward_std": 0.6382264569401741, |
|
"rewards/accuracy_reward": 0.15277778077870607, |
|
"rewards/format_reward": 0.5694444552063942, |
|
"step": 267 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2990.4862060546875, |
|
"epoch": 0.4592973436161097, |
|
"grad_norm": 1.0147572755813599, |
|
"kl": 0.5908203125, |
|
"learning_rate": 6.90596276275854e-07, |
|
"loss": 0.1205, |
|
"reward": 0.569444440305233, |
|
"reward_std": 0.4945811964571476, |
|
"rewards/accuracy_reward": 0.0555555559694767, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 268 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3235.236083984375, |
|
"epoch": 0.46101113967437873, |
|
"grad_norm": 0.45656487345695496, |
|
"kl": 0.6708984375, |
|
"learning_rate": 6.880308940193435e-07, |
|
"loss": 0.1147, |
|
"reward": 0.8194444477558136, |
|
"reward_std": 0.9287731572985649, |
|
"rewards/accuracy_reward": 0.1666666716337204, |
|
"rewards/format_reward": 0.486111119389534, |
|
"step": 269 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2900.7083740234375, |
|
"epoch": 0.46272493573264784, |
|
"grad_norm": 0.8187825083732605, |
|
"kl": 0.5869140625, |
|
"learning_rate": 6.854605502722496e-07, |
|
"loss": 0.103, |
|
"reward": 0.9861111044883728, |
|
"reward_std": 0.6310887522995472, |
|
"rewards/accuracy_reward": 0.25000000558793545, |
|
"rewards/format_reward": 0.486111119389534, |
|
"step": 270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3006.5694580078125, |
|
"epoch": 0.4644387317909169, |
|
"grad_norm": 1.0085062980651855, |
|
"kl": 0.65869140625, |
|
"learning_rate": 6.828853374250211e-07, |
|
"loss": 0.055, |
|
"reward": 0.9166666716337204, |
|
"reward_std": 0.6481297686696053, |
|
"rewards/accuracy_reward": 0.19444444961845875, |
|
"rewards/format_reward": 0.5277777835726738, |
|
"step": 271 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2497.6944885253906, |
|
"epoch": 0.4661525278491859, |
|
"grad_norm": 0.7459636926651001, |
|
"kl": 0.38525390625, |
|
"learning_rate": 6.803053480431267e-07, |
|
"loss": -0.0088, |
|
"reward": 1.069444440305233, |
|
"reward_std": 0.6437568441033363, |
|
"rewards/accuracy_reward": 0.22222222574055195, |
|
"rewards/format_reward": 0.6250000149011612, |
|
"step": 272 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2684.9166870117188, |
|
"epoch": 0.46786632390745503, |
|
"grad_norm": 0.7339934706687927, |
|
"kl": 0.428955078125, |
|
"learning_rate": 6.777206748637253e-07, |
|
"loss": 0.058, |
|
"reward": 1.097222238779068, |
|
"reward_std": 0.6862121671438217, |
|
"rewards/accuracy_reward": 0.22222223225980997, |
|
"rewards/format_reward": 0.6527777910232544, |
|
"step": 273 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2835.638916015625, |
|
"epoch": 0.4695801199657241, |
|
"grad_norm": 0.6278461217880249, |
|
"kl": 0.34814453125, |
|
"learning_rate": 6.751314107923343e-07, |
|
"loss": 0.0337, |
|
"reward": 0.7638888880610466, |
|
"reward_std": 0.6684231236577034, |
|
"rewards/accuracy_reward": 0.06944444589316845, |
|
"rewards/format_reward": 0.6250000074505806, |
|
"step": 274 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3141.5833129882812, |
|
"epoch": 0.4712939160239931, |
|
"grad_norm": 0.5468356609344482, |
|
"kl": 0.5888671875, |
|
"learning_rate": 6.725376488994902e-07, |
|
"loss": 0.0968, |
|
"reward": 0.6527777761220932, |
|
"reward_std": 0.5906506031751633, |
|
"rewards/accuracy_reward": 0.1250000037252903, |
|
"rewards/format_reward": 0.4027777910232544, |
|
"step": 275 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2957.5139770507812, |
|
"epoch": 0.4730077120822622, |
|
"grad_norm": 0.6570262908935547, |
|
"kl": 0.46240234375, |
|
"learning_rate": 6.699394824174023e-07, |
|
"loss": 0.051, |
|
"reward": 0.8888888955116272, |
|
"reward_std": 0.7196170538663864, |
|
"rewards/accuracy_reward": 0.1666666679084301, |
|
"rewards/format_reward": 0.555555559694767, |
|
"step": 276 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3263.7777709960938, |
|
"epoch": 0.47472150814053127, |
|
"grad_norm": 0.5633208155632019, |
|
"kl": 0.50390625, |
|
"learning_rate": 6.673370047366016e-07, |
|
"loss": 0.0475, |
|
"reward": 0.7361111119389534, |
|
"reward_std": 0.7856484949588776, |
|
"rewards/accuracy_reward": 0.13888889271765947, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 277 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3001.0972900390625, |
|
"epoch": 0.47643530419880037, |
|
"grad_norm": 0.6549968719482422, |
|
"kl": 0.4775390625, |
|
"learning_rate": 6.647303094025848e-07, |
|
"loss": 0.102, |
|
"reward": 0.9027777910232544, |
|
"reward_std": 0.6888429000973701, |
|
"rewards/accuracy_reward": 0.16666667256504297, |
|
"rewards/format_reward": 0.5694444477558136, |
|
"step": 278 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3204.25, |
|
"epoch": 0.4781491002570694, |
|
"grad_norm": 0.4705544114112854, |
|
"kl": 0.4609375, |
|
"learning_rate": 6.621194901124511e-07, |
|
"loss": 0.053, |
|
"reward": 0.652777798473835, |
|
"reward_std": 0.755700945854187, |
|
"rewards/accuracy_reward": 0.09722222480922937, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 279 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2859.5, |
|
"epoch": 0.47986289631533846, |
|
"grad_norm": 0.7691783905029297, |
|
"kl": 0.38427734375, |
|
"learning_rate": 6.59504640711534e-07, |
|
"loss": 0.1071, |
|
"reward": 1.222222238779068, |
|
"reward_std": 0.7407273650169373, |
|
"rewards/accuracy_reward": 0.3055555606260896, |
|
"rewards/format_reward": 0.611111119389534, |
|
"step": 280 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2865.8194274902344, |
|
"epoch": 0.48157669237360756, |
|
"grad_norm": 0.3405319154262543, |
|
"kl": 0.420166015625, |
|
"learning_rate": 6.568858551900289e-07, |
|
"loss": 0.0734, |
|
"reward": 1.1527777910232544, |
|
"reward_std": 0.8395373225212097, |
|
"rewards/accuracy_reward": 0.2777777807787061, |
|
"rewards/format_reward": 0.5972222313284874, |
|
"step": 281 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2852.416717529297, |
|
"epoch": 0.4832904884318766, |
|
"grad_norm": 0.6291264295578003, |
|
"kl": 0.361572265625, |
|
"learning_rate": 6.542632276796142e-07, |
|
"loss": 0.0659, |
|
"reward": 1.0416666716337204, |
|
"reward_std": 0.6899608969688416, |
|
"rewards/accuracy_reward": 0.20833333488553762, |
|
"rewards/format_reward": 0.625, |
|
"step": 282 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3200.7777709960938, |
|
"epoch": 0.48500428449014565, |
|
"grad_norm": 0.5216673016548157, |
|
"kl": 0.474609375, |
|
"learning_rate": 6.516368524500672e-07, |
|
"loss": 0.0597, |
|
"reward": 0.819444477558136, |
|
"reward_std": 0.6402985602617264, |
|
"rewards/accuracy_reward": 0.16666666977107525, |
|
"rewards/format_reward": 0.4861111268401146, |
|
"step": 283 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3028.486083984375, |
|
"epoch": 0.48671808054841476, |
|
"grad_norm": 0.5700398087501526, |
|
"kl": 0.423828125, |
|
"learning_rate": 6.49006823905877e-07, |
|
"loss": 0.0731, |
|
"reward": 0.9722222313284874, |
|
"reward_std": 0.6919921040534973, |
|
"rewards/accuracy_reward": 0.1944444477558136, |
|
"rewards/format_reward": 0.5833333507180214, |
|
"step": 284 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3287.8333740234375, |
|
"epoch": 0.4884318766066838, |
|
"grad_norm": 0.46598854660987854, |
|
"kl": 0.4638671875, |
|
"learning_rate": 6.463732365828497e-07, |
|
"loss": 0.0631, |
|
"reward": 0.8194444440305233, |
|
"reward_std": 0.7012417390942574, |
|
"rewards/accuracy_reward": 0.1666666716337204, |
|
"rewards/format_reward": 0.4861111156642437, |
|
"step": 285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3113.0694580078125, |
|
"epoch": 0.49014567266495285, |
|
"grad_norm": 1.0307719707489014, |
|
"kl": 0.5625, |
|
"learning_rate": 6.437361851447111e-07, |
|
"loss": 0.067, |
|
"reward": 0.8750000223517418, |
|
"reward_std": 0.730349563062191, |
|
"rewards/accuracy_reward": 0.1666666679084301, |
|
"rewards/format_reward": 0.541666679084301, |
|
"step": 286 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3025.7083740234375, |
|
"epoch": 0.49185946872322195, |
|
"grad_norm": 0.6258834600448608, |
|
"kl": 0.46533203125, |
|
"learning_rate": 6.410957643797038e-07, |
|
"loss": 0.0885, |
|
"reward": 0.8888889029622078, |
|
"reward_std": 0.7127974927425385, |
|
"rewards/accuracy_reward": 0.13888889364898205, |
|
"rewards/format_reward": 0.6111111119389534, |
|
"step": 287 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3184.236083984375, |
|
"epoch": 0.493573264781491, |
|
"grad_norm": 0.7940736413002014, |
|
"kl": 0.62451171875, |
|
"learning_rate": 6.384520691971805e-07, |
|
"loss": 0.0912, |
|
"reward": 0.541666679084301, |
|
"reward_std": 0.5503551661968231, |
|
"rewards/accuracy_reward": 0.055555556900799274, |
|
"rewards/format_reward": 0.4305555745959282, |
|
"step": 288 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2996.0833435058594, |
|
"epoch": 0.4952870608397601, |
|
"grad_norm": 0.49851223826408386, |
|
"kl": 0.46435546875, |
|
"learning_rate": 6.358051946241914e-07, |
|
"loss": 0.0878, |
|
"reward": 1.069444466382265, |
|
"reward_std": 0.6114578023552895, |
|
"rewards/accuracy_reward": 0.25000000186264515, |
|
"rewards/format_reward": 0.5694444589316845, |
|
"step": 289 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2852.77783203125, |
|
"epoch": 0.49700085689802914, |
|
"grad_norm": 0.7826417684555054, |
|
"kl": 0.4990234375, |
|
"learning_rate": 6.331552358020698e-07, |
|
"loss": 0.1066, |
|
"reward": 1.0277777761220932, |
|
"reward_std": 0.7379468381404877, |
|
"rewards/accuracy_reward": 0.22222222946584225, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3192.125, |
|
"epoch": 0.4987146529562982, |
|
"grad_norm": 1.0998865365982056, |
|
"kl": 0.43701171875, |
|
"learning_rate": 6.305022879830115e-07, |
|
"loss": 0.0292, |
|
"reward": 1.013888880610466, |
|
"reward_std": 0.7805322334170341, |
|
"rewards/accuracy_reward": 0.180555559694767, |
|
"rewards/format_reward": 0.6527777910232544, |
|
"step": 291 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2944.2083129882812, |
|
"epoch": 0.5004284490145673, |
|
"grad_norm": 0.6076076626777649, |
|
"kl": 0.400634765625, |
|
"learning_rate": 6.278464465266511e-07, |
|
"loss": 0.0494, |
|
"reward": 1.222222238779068, |
|
"reward_std": 0.730344396084547, |
|
"rewards/accuracy_reward": 0.29166666977107525, |
|
"rewards/format_reward": 0.6388888955116272, |
|
"step": 292 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2930.5, |
|
"epoch": 0.5021422450728363, |
|
"grad_norm": 0.40151283144950867, |
|
"kl": 0.36669921875, |
|
"learning_rate": 6.251878068966345e-07, |
|
"loss": 0.0554, |
|
"reward": 0.9444444645196199, |
|
"reward_std": 0.6910195127129555, |
|
"rewards/accuracy_reward": 0.16666667349636555, |
|
"rewards/format_reward": 0.6111111212521791, |
|
"step": 293 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2904.4166870117188, |
|
"epoch": 0.5038560411311054, |
|
"grad_norm": 0.5597663521766663, |
|
"kl": 0.34423828125, |
|
"learning_rate": 6.225264646571872e-07, |
|
"loss": 0.0537, |
|
"reward": 1.0555555820465088, |
|
"reward_std": 0.6472389549016953, |
|
"rewards/accuracy_reward": 0.19444444868713617, |
|
"rewards/format_reward": 0.6666666641831398, |
|
"step": 294 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2847.15283203125, |
|
"epoch": 0.5055698371893744, |
|
"grad_norm": 0.8492727279663086, |
|
"kl": 0.300537109375, |
|
"learning_rate": 6.198625154696796e-07, |
|
"loss": 0.0718, |
|
"reward": 1.2361111342906952, |
|
"reward_std": 0.5978466831147671, |
|
"rewards/accuracy_reward": 0.31944445613771677, |
|
"rewards/format_reward": 0.5972222238779068, |
|
"step": 295 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3084.6388549804688, |
|
"epoch": 0.5072836332476436, |
|
"grad_norm": 0.5291158556938171, |
|
"kl": 0.41064453125, |
|
"learning_rate": 6.171960550891878e-07, |
|
"loss": 0.0733, |
|
"reward": 1.222222238779068, |
|
"reward_std": 0.7484849840402603, |
|
"rewards/accuracy_reward": 0.3333333432674408, |
|
"rewards/format_reward": 0.555555559694767, |
|
"step": 296 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2997.5277709960938, |
|
"epoch": 0.5089974293059126, |
|
"grad_norm": 0.37727290391921997, |
|
"kl": 0.372802734375, |
|
"learning_rate": 6.145271793610529e-07, |
|
"loss": 0.0517, |
|
"reward": 1.0555555522441864, |
|
"reward_std": 0.5442864708602428, |
|
"rewards/accuracy_reward": 0.1944444477558136, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 297 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2720.1666870117188, |
|
"epoch": 0.5107112253641817, |
|
"grad_norm": 1.110034704208374, |
|
"kl": 0.34375, |
|
"learning_rate": 6.118559842174344e-07, |
|
"loss": 0.0914, |
|
"reward": 1.2777777761220932, |
|
"reward_std": 0.8702788352966309, |
|
"rewards/accuracy_reward": 0.2916666669771075, |
|
"rewards/format_reward": 0.6944444477558136, |
|
"step": 298 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3106.1389770507812, |
|
"epoch": 0.5124250214224507, |
|
"grad_norm": 0.4852089583873749, |
|
"kl": 0.482421875, |
|
"learning_rate": 6.091825656738635e-07, |
|
"loss": 0.0899, |
|
"reward": 1.0416666716337204, |
|
"reward_std": 0.9716915190219879, |
|
"rewards/accuracy_reward": 0.26388889364898205, |
|
"rewards/format_reward": 0.5138888955116272, |
|
"step": 299 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3082.6944580078125, |
|
"epoch": 0.5141388174807198, |
|
"grad_norm": 1.2142449617385864, |
|
"kl": 0.52734375, |
|
"learning_rate": 6.065070198257903e-07, |
|
"loss": 0.111, |
|
"reward": 1.0416666716337204, |
|
"reward_std": 0.8163295686244965, |
|
"rewards/accuracy_reward": 0.22222222294658422, |
|
"rewards/format_reward": 0.5972222164273262, |
|
"step": 300 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 583, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|