Qwen-2.5-7B-Simple-RL / trainer_state.json
Tony042's picture
Model save
d035b8a verified
raw
history blame contribute delete
23.3 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984,
"eval_steps": 400,
"global_step": 234,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 997.4687957763672,
"epoch": 0.004266666666666667,
"grad_norm": 0.4476054012775421,
"kl": 0.0,
"learning_rate": 1.25e-07,
"loss": 0.2117,
"num_tokens": 254317.0,
"reward": 0.5580357387661934,
"reward_std": 0.39642567932605743,
"rewards/accuracy_reward": 0.558035708963871,
"rewards/format_reward": 0.0,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 833.5591859817505,
"epoch": 0.021333333333333333,
"grad_norm": 0.8373767733573914,
"kl": 0.00017589330673217773,
"learning_rate": 6.25e-07,
"loss": 0.1622,
"num_tokens": 1126402.0,
"reward": 0.5602678819559515,
"reward_std": 0.39950425596907735,
"rewards/accuracy_reward": 0.5602678558789194,
"rewards/format_reward": 0.0,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 842.7527206420898,
"epoch": 0.042666666666666665,
"grad_norm": 0.7621635794639587,
"kl": 0.0002469301223754883,
"learning_rate": 1.25e-06,
"loss": 0.1515,
"num_tokens": 2246671.0,
"reward": 0.5714285939931869,
"reward_std": 0.3970031665638089,
"rewards/accuracy_reward": 0.5714285712689161,
"rewards/format_reward": 0.0,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 904.8804016113281,
"epoch": 0.064,
"grad_norm": 1.0540382862091064,
"kl": 0.0004149436950683594,
"learning_rate": 1.875e-06,
"loss": 0.1689,
"num_tokens": 3425519.0,
"reward": 0.598214315623045,
"reward_std": 0.37588157430291175,
"rewards/accuracy_reward": 0.5982142850756645,
"rewards/format_reward": 0.0,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 803.2098541259766,
"epoch": 0.08533333333333333,
"grad_norm": 0.24481388926506042,
"kl": 0.0009911060333251953,
"learning_rate": 2.5e-06,
"loss": 0.1647,
"num_tokens": 4485645.0,
"reward": 0.6892857506871224,
"reward_std": 0.305409730784595,
"rewards/accuracy_reward": 0.6892857134342194,
"rewards/format_reward": 0.0,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 742.3911071777344,
"epoch": 0.10666666666666667,
"grad_norm": 0.3323942720890045,
"kl": 0.001956653594970703,
"learning_rate": 2.99983215271541e-06,
"loss": 0.1555,
"num_tokens": 5470801.0,
"reward": 0.7464286014437675,
"reward_std": 0.23826294410973786,
"rewards/accuracy_reward": 0.7464285716414452,
"rewards/format_reward": 0.0,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 698.3303901672364,
"epoch": 0.128,
"grad_norm": 0.2109682857990265,
"kl": 0.0024506568908691405,
"learning_rate": 2.993961440992859e-06,
"loss": 0.1116,
"num_tokens": 6403683.0,
"reward": 0.8080357491970063,
"reward_std": 0.19967383295297622,
"rewards/accuracy_reward": 0.8080357126891613,
"rewards/format_reward": 0.0,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 654.9661010742187,
"epoch": 0.14933333333333335,
"grad_norm": 0.1639912873506546,
"kl": 0.0032644271850585938,
"learning_rate": 2.979735890885282e-06,
"loss": 0.0765,
"num_tokens": 7290748.0,
"reward": 0.8232143156230449,
"reward_std": 0.18937840089201927,
"rewards/accuracy_reward": 0.8232142880558968,
"rewards/format_reward": 0.0,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 707.6750289916993,
"epoch": 0.17066666666666666,
"grad_norm": 0.1635686606168747,
"kl": 0.002868843078613281,
"learning_rate": 2.957235057439301e-06,
"loss": 0.0723,
"num_tokens": 8244687.0,
"reward": 0.7946428894996643,
"reward_std": 0.18750199358910322,
"rewards/accuracy_reward": 0.7946428544819355,
"rewards/format_reward": 0.0,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 676.180387878418,
"epoch": 0.192,
"grad_norm": 0.14903444051742554,
"kl": 0.003135490417480469,
"learning_rate": 2.9265847744427307e-06,
"loss": 0.0618,
"num_tokens": 9153419.0,
"reward": 0.8321428894996643,
"reward_std": 0.1699943056330085,
"rewards/accuracy_reward": 0.8321428552269936,
"rewards/format_reward": 0.0,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 623.0259208679199,
"epoch": 0.21333333333333335,
"grad_norm": 0.2258339673280716,
"kl": 0.0038270950317382812,
"learning_rate": 2.887956450710995e-06,
"loss": 0.0603,
"num_tokens": 10003724.0,
"reward": 0.8401786059141159,
"reward_std": 0.16125054527074098,
"rewards/accuracy_reward": 0.8401785671710968,
"rewards/format_reward": 0.0,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 659.4062774658203,
"epoch": 0.23466666666666666,
"grad_norm": 0.30922192335128784,
"kl": 0.003952980041503906,
"learning_rate": 2.8415661114995055e-06,
"loss": 0.0489,
"num_tokens": 10890953.0,
"reward": 0.808035746216774,
"reward_std": 0.154349534958601,
"rewards/accuracy_reward": 0.8080357141792774,
"rewards/format_reward": 0.0,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 639.1562812805175,
"epoch": 0.256,
"grad_norm": 0.1367289125919342,
"kl": 0.004214859008789063,
"learning_rate": 2.7876731904027993e-06,
"loss": 0.0428,
"num_tokens": 11784818.0,
"reward": 0.811607176065445,
"reward_std": 0.15857027508318425,
"rewards/accuracy_reward": 0.8116071425378323,
"rewards/format_reward": 0.0,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 642.5893127441407,
"epoch": 0.2773333333333333,
"grad_norm": 0.3067258298397064,
"kl": 0.004591751098632813,
"learning_rate": 2.726579078496647e-06,
"loss": 0.0456,
"num_tokens": 12662200.0,
"reward": 0.8410714611411094,
"reward_std": 0.15819188673049212,
"rewards/accuracy_reward": 0.8410714238882064,
"rewards/format_reward": 0.0,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 682.150032043457,
"epoch": 0.2986666666666667,
"grad_norm": 1.2549998760223389,
"kl": 0.005356979370117187,
"learning_rate": 2.6586254388368995e-06,
"loss": 0.0582,
"num_tokens": 13583589.0,
"reward": 0.7830357477068901,
"reward_std": 0.18484641518443823,
"rewards/accuracy_reward": 0.7830357119441033,
"rewards/format_reward": 0.0,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 705.7393188476562,
"epoch": 0.32,
"grad_norm": 0.2942885756492615,
"kl": 0.00424346923828125,
"learning_rate": 2.584192295741087e-06,
"loss": 0.0507,
"num_tokens": 14528325.0,
"reward": 0.797321455180645,
"reward_std": 0.17518679071217774,
"rewards/accuracy_reward": 0.7973214276134968,
"rewards/format_reward": 0.0,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 619.9482444763183,
"epoch": 0.3413333333333333,
"grad_norm": 0.17190858721733093,
"kl": 0.00408172607421875,
"learning_rate": 2.5036959095382875e-06,
"loss": 0.0474,
"num_tokens": 15383695.0,
"reward": 0.8223214603960514,
"reward_std": 0.13084097821265459,
"rewards/accuracy_reward": 0.8223214283585548,
"rewards/format_reward": 0.0,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 651.5598556518555,
"epoch": 0.3626666666666667,
"grad_norm": 0.46220502257347107,
"kl": 0.0038448333740234374,
"learning_rate": 2.4175864486725093e-06,
"loss": 0.0395,
"num_tokens": 16274673.0,
"reward": 0.8464286021888257,
"reward_std": 0.14037740416824818,
"rewards/accuracy_reward": 0.846428568661213,
"rewards/format_reward": 0.0,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 669.5187744140625,
"epoch": 0.384,
"grad_norm": 0.2906588315963745,
"kl": 0.004624176025390625,
"learning_rate": 2.3263454721781537e-06,
"loss": 0.045,
"num_tokens": 17179556.0,
"reward": 0.8419643200933933,
"reward_std": 0.16921061147004365,
"rewards/accuracy_reward": 0.8419642858207226,
"rewards/format_reward": 0.0,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 671.370565032959,
"epoch": 0.4053333333333333,
"grad_norm": 0.25627151131629944,
"kl": 0.00406341552734375,
"learning_rate": 2.230483236606551e-06,
"loss": 0.0304,
"num_tokens": 18091140.0,
"reward": 0.7982143193483353,
"reward_std": 0.1869086405262351,
"rewards/accuracy_reward": 0.7982142873108387,
"rewards/format_reward": 0.0,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 673.0446693420411,
"epoch": 0.4266666666666667,
"grad_norm": 0.8661001920700073,
"kl": 0.004367828369140625,
"learning_rate": 2.1305358424643485e-06,
"loss": 0.0367,
"num_tokens": 18998523.0,
"reward": 0.8035714536905288,
"reward_std": 0.17903811391443014,
"rewards/accuracy_reward": 0.8035714350640774,
"rewards/format_reward": 0.0,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 663.0018180847168,
"epoch": 0.448,
"grad_norm": 0.28504350781440735,
"kl": 0.004234695434570312,
"learning_rate": 2.027062236122014e-06,
"loss": 0.023,
"num_tokens": 19901861.0,
"reward": 0.8026786029338837,
"reward_std": 0.19563668854534627,
"rewards/accuracy_reward": 0.802678569406271,
"rewards/format_reward": 0.0,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 668.9250289916993,
"epoch": 0.4693333333333333,
"grad_norm": 0.23065027594566345,
"kl": 0.005011749267578125,
"learning_rate": 1.9206410839590043e-06,
"loss": 0.0265,
"num_tokens": 20801382.0,
"reward": 0.8017857454717159,
"reward_std": 0.1697748264297843,
"rewards/accuracy_reward": 0.8017857126891613,
"rewards/format_reward": 0.0,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 669.5696678161621,
"epoch": 0.49066666666666664,
"grad_norm": 0.25070255994796753,
"kl": 0.004204177856445312,
"learning_rate": 1.8118675362266389e-06,
"loss": 0.04,
"num_tokens": 21713070.0,
"reward": 0.8071428939700127,
"reward_std": 0.15041764192283152,
"rewards/accuracy_reward": 0.8071428596973419,
"rewards/format_reward": 0.0,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 745.8312896728515,
"epoch": 0.512,
"grad_norm": 0.2231619954109192,
"kl": 0.004381752014160157,
"learning_rate": 1.7013498987264833e-06,
"loss": 0.0291,
"num_tokens": 22706608.0,
"reward": 0.7928571775555611,
"reward_std": 0.15372475292533636,
"rewards/accuracy_reward": 0.7928571425378322,
"rewards/format_reward": 0.0,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 683.2152099609375,
"epoch": 0.5333333333333333,
"grad_norm": 0.22776393592357635,
"kl": 0.004504776000976563,
"learning_rate": 1.5897062309175513e-06,
"loss": 0.0457,
"num_tokens": 23648209.0,
"reward": 0.7892857491970062,
"reward_std": 0.17449938021600248,
"rewards/accuracy_reward": 0.7892857134342194,
"rewards/format_reward": 0.0,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 657.4553871154785,
"epoch": 0.5546666666666666,
"grad_norm": 0.16024665534496307,
"kl": 0.00522613525390625,
"learning_rate": 1.4775608894771048e-06,
"loss": 0.0366,
"num_tokens": 24549815.0,
"reward": 0.8125000387430191,
"reward_std": 0.1693068951368332,
"rewards/accuracy_reward": 0.8124999962747097,
"rewards/format_reward": 0.0,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 642.0161018371582,
"epoch": 0.576,
"grad_norm": 0.15500028431415558,
"kl": 0.004698562622070313,
"learning_rate": 1.3655410366448499e-06,
"loss": 0.0268,
"num_tokens": 25438763.0,
"reward": 0.8062500298023224,
"reward_std": 0.1475358337163925,
"rewards/accuracy_reward": 0.806250000745058,
"rewards/format_reward": 0.0,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 729.4768188476562,
"epoch": 0.5973333333333334,
"grad_norm": 0.19548794627189636,
"kl": 0.004253196716308594,
"learning_rate": 1.2542731328772936e-06,
"loss": 0.0278,
"num_tokens": 26409399.0,
"reward": 0.8008928880095482,
"reward_std": 0.14697162248194218,
"rewards/accuracy_reward": 0.8008928537368775,
"rewards/format_reward": 0.0,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 665.8884208679199,
"epoch": 0.6186666666666667,
"grad_norm": 0.23221784830093384,
"kl": 0.005530166625976563,
"learning_rate": 1.1443794334267539e-06,
"loss": 0.0404,
"num_tokens": 27322886.0,
"reward": 0.8187500342726708,
"reward_std": 0.15602440275251866,
"rewards/accuracy_reward": 0.8187499970197678,
"rewards/format_reward": 0.0,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 676.9500312805176,
"epoch": 0.64,
"grad_norm": 0.18050934374332428,
"kl": 0.0050506591796875,
"learning_rate": 1.036474508437579e-06,
"loss": 0.0525,
"num_tokens": 28241951.0,
"reward": 0.8491071745753288,
"reward_std": 0.16172744631767272,
"rewards/accuracy_reward": 0.8491071447730064,
"rewards/format_reward": 0.0,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 642.0652069091797,
"epoch": 0.6613333333333333,
"grad_norm": 0.12173999845981598,
"kl": 0.004921722412109375,
"learning_rate": 9.311618060206075e-07,
"loss": 0.0422,
"num_tokens": 29117563.0,
"reward": 0.805357176065445,
"reward_std": 0.17488674409687519,
"rewards/accuracy_reward": 0.8053571425378323,
"rewards/format_reward": 0.0,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 716.8884292602539,
"epoch": 0.6826666666666666,
"grad_norm": 0.2920779585838318,
"kl": 0.004403495788574218,
"learning_rate": 8.290302775265509e-07,
"loss": 0.0223,
"num_tokens": 30082031.0,
"reward": 0.7910714626312256,
"reward_std": 0.1914942855015397,
"rewards/accuracy_reward": 0.7910714291036129,
"rewards/format_reward": 0.0,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 701.8036018371582,
"epoch": 0.704,
"grad_norm": 0.9731239080429077,
"kl": 0.00501861572265625,
"learning_rate": 7.30651083891141e-07,
"loss": 0.0393,
"num_tokens": 31018320.0,
"reward": 0.8223214641213417,
"reward_std": 0.19152794536203147,
"rewards/accuracy_reward": 0.8223214253783226,
"rewards/format_reward": 0.0,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 686.908959197998,
"epoch": 0.7253333333333334,
"grad_norm": 0.21161052584648132,
"kl": 0.004227638244628906,
"learning_rate": 6.3657440147149e-07,
"loss": 0.0509,
"num_tokens": 31945256.0,
"reward": 0.8053571827709675,
"reward_std": 0.1896941650658846,
"rewards/accuracy_reward": 0.8110118992626667,
"rewards/format_reward": 0.0,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 729.9232475280762,
"epoch": 0.7466666666666667,
"grad_norm": 0.15954989194869995,
"kl": 0.004137992858886719,
"learning_rate": 5.473263452367318e-07,
"loss": 0.0326,
"num_tokens": 32932758.0,
"reward": 0.7705357417464256,
"reward_std": 0.18078458309173584,
"rewards/accuracy_reward": 0.7705357111990452,
"rewards/format_reward": 0.0,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 649.5544960021973,
"epoch": 0.768,
"grad_norm": 0.13724558055400848,
"kl": 0.004343414306640625,
"learning_rate": 4.63406026519703e-07,
"loss": 0.027,
"num_tokens": 33826439.0,
"reward": 0.8089286006987095,
"reward_std": 0.15814926102757454,
"rewards/accuracy_reward": 0.8089285723865032,
"rewards/format_reward": 0.0,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 718.4714622497559,
"epoch": 0.7893333333333333,
"grad_norm": 0.14736232161521912,
"kl": 0.004001426696777344,
"learning_rate": 3.852827617839085e-07,
"loss": 0.0637,
"num_tokens": 34789978.0,
"reward": 0.8017857491970062,
"reward_std": 0.1630306661128998,
"rewards/accuracy_reward": 0.8017857111990452,
"rewards/format_reward": 0.0,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 673.543782043457,
"epoch": 0.8106666666666666,
"grad_norm": 0.3745248019695282,
"kl": 0.004261016845703125,
"learning_rate": 3.133934480154885e-07,
"loss": 0.032,
"num_tokens": 35712907.0,
"reward": 0.8008928954601288,
"reward_std": 0.1807129830121994,
"rewards/accuracy_reward": 0.8008928552269936,
"rewards/format_reward": 0.0,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 664.7464561462402,
"epoch": 0.832,
"grad_norm": 0.2295093536376953,
"kl": 0.004295730590820312,
"learning_rate": 2.48140119418046e-07,
"loss": 0.0484,
"num_tokens": 36621153.0,
"reward": 0.8062500327825546,
"reward_std": 0.16623926647007464,
"rewards/accuracy_reward": 0.8122023768723011,
"rewards/format_reward": 0.0,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 727.7053886413574,
"epoch": 0.8533333333333334,
"grad_norm": 0.31535524129867554,
"kl": 0.003837394714355469,
"learning_rate": 1.8988769907430552e-07,
"loss": 0.0474,
"num_tokens": 37591786.0,
"reward": 0.7875000357627868,
"reward_std": 0.16348065678030252,
"rewards/accuracy_reward": 0.7875000007450581,
"rewards/format_reward": 0.0,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 673.6312782287598,
"epoch": 0.8746666666666667,
"grad_norm": 0.2970116138458252,
"kl": 0.0037149429321289063,
"learning_rate": 1.3896195814820269e-07,
"loss": 0.0408,
"num_tokens": 38502885.0,
"reward": 0.842857176065445,
"reward_std": 0.15663124285638333,
"rewards/accuracy_reward": 0.8428571417927742,
"rewards/format_reward": 0.0,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 658.6187789916992,
"epoch": 0.896,
"grad_norm": 0.3235113322734833,
"kl": 0.005364990234375,
"learning_rate": 9.564769404039419e-08,
"loss": 0.0322,
"num_tokens": 39400271.0,
"reward": 0.7964285999536515,
"reward_std": 0.19183696284890175,
"rewards/accuracy_reward": 0.7964285723865032,
"rewards/format_reward": 0.0,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 658.0768119812012,
"epoch": 0.9173333333333333,
"grad_norm": 0.19358721375465393,
"kl": 0.0041599273681640625,
"learning_rate": 6.018713768566658e-08,
"loss": 0.024,
"num_tokens": 40305926.0,
"reward": 0.8125000312924385,
"reward_std": 0.16783579289913178,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 0.0,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 672.805387878418,
"epoch": 0.9386666666666666,
"grad_norm": 0.2503073513507843,
"kl": 0.004539871215820312,
"learning_rate": 3.277859889929147e-08,
"loss": 0.0397,
"num_tokens": 41216982.0,
"reward": 0.8482143253087997,
"reward_std": 0.20670701321214438,
"rewards/accuracy_reward": 0.8482142858207226,
"rewards/format_reward": 0.0,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 654.6687858581543,
"epoch": 0.96,
"grad_norm": 0.17779560387134552,
"kl": 0.0040149688720703125,
"learning_rate": 1.357535734809795e-08,
"loss": 0.0372,
"num_tokens": 42114081.0,
"reward": 0.8000000335276127,
"reward_std": 0.16292541287839413,
"rewards/accuracy_reward": 0.7999999985098839,
"rewards/format_reward": 0.0,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 646.3321731567382,
"epoch": 0.9813333333333333,
"grad_norm": 0.2772572338581085,
"kl": 0.004202651977539063,
"learning_rate": 2.684805348397268e-09,
"loss": 0.0411,
"num_tokens": 43004377.0,
"reward": 0.805357177555561,
"reward_std": 0.17196922078728677,
"rewards/accuracy_reward": 0.8053571373224259,
"rewards/format_reward": 0.0,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 680.2171030044556,
"epoch": 0.9984,
"kl": 0.003808736801147461,
"num_tokens": 43742868.0,
"reward": 0.8024553880095482,
"reward_std": 0.17644294258207083,
"rewards/accuracy_reward": 0.802455360069871,
"rewards/format_reward": 0.0,
"step": 234,
"total_flos": 0.0,
"train_loss": 0.055906044398872264,
"train_runtime": 36259.8049,
"train_samples_per_second": 0.207,
"train_steps_per_second": 0.006
}
],
"logging_steps": 5,
"max_steps": 234,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}