q3-30b-rc1-kto-adpt / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9456264775413712,
"eval_steps": 500,
"global_step": 52,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.037825059101654845,
"grad_norm": 6.102660179138184,
"kl": 0.0,
"learning_rate": 0.0,
"logits/chosen": 196634302.17142856,
"logits/rejected": 99832523.03448276,
"logps/chosen": -983.6032366071429,
"logps/rejected": -592.0450565732758,
"loss": 0.5,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.07565011820330969,
"grad_norm": 5.946473598480225,
"kl": 0.0,
"learning_rate": 6.666666666666666e-07,
"logits/chosen": 174062182.4,
"logits/rejected": 42135149.176470585,
"logps/chosen": -1048.4553385416666,
"logps/rejected": -543.3271484375,
"loss": 0.5,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.11347517730496454,
"grad_norm": 6.028696537017822,
"kl": 1.4145872592926025,
"learning_rate": 1.3333333333333332e-06,
"logits/chosen": 149491836.12121212,
"logits/rejected": 16149931.35483871,
"logps/chosen": -1068.7826704545455,
"logps/rejected": -462.9348538306452,
"loss": 0.5007,
"rewards/chosen": 0.07321595423149341,
"rewards/margins": 0.1519100286161212,
"rewards/rejected": -0.0786940743846278,
"step": 3
},
{
"epoch": 0.15130023640661938,
"grad_norm": 6.901468276977539,
"kl": 1.32453191280365,
"learning_rate": 2e-06,
"logits/chosen": 169462896.0,
"logits/rejected": 43728952.0,
"logps/chosen": -1022.6226196289062,
"logps/rejected": -445.5266418457031,
"loss": 0.5168,
"rewards/chosen": -0.036316584795713425,
"rewards/margins": -0.012025408446788788,
"rewards/rejected": -0.024291176348924637,
"step": 4
},
{
"epoch": 0.18912529550827423,
"grad_norm": 4.817409515380859,
"kl": 0.5722990036010742,
"learning_rate": 1.9979453927503364e-06,
"logits/chosen": 149387924.6451613,
"logits/rejected": 72442670.54545455,
"logps/chosen": -950.062247983871,
"logps/rejected": -557.7066169507576,
"loss": 0.4998,
"rewards/chosen": 0.02832491167130009,
"rewards/margins": 0.0007265486908331897,
"rewards/rejected": 0.0275983629804669,
"step": 5
},
{
"epoch": 0.22695035460992907,
"grad_norm": 3.4966697692871094,
"kl": 0.5014443397521973,
"learning_rate": 1.991790013823246e-06,
"logits/chosen": 90473276.23529412,
"logits/rejected": 14355141.333333334,
"logps/chosen": -773.8732766544117,
"logps/rejected": -412.08531901041664,
"loss": 0.5008,
"rewards/chosen": 0.015493747066049016,
"rewards/margins": 0.07545245932597741,
"rewards/rejected": -0.05995871225992839,
"step": 6
},
{
"epoch": 0.2647754137115839,
"grad_norm": 4.85699462890625,
"kl": 0.9150105714797974,
"learning_rate": 1.9815591569910653e-06,
"logits/chosen": 177541861.5172414,
"logits/rejected": 74630538.97142857,
"logps/chosen": -1035.0591998922414,
"logps/rejected": -533.7266741071429,
"loss": 0.4996,
"rewards/chosen": 0.05604078440830625,
"rewards/margins": -0.02882181564575346,
"rewards/rejected": 0.08486260005405971,
"step": 7
},
{
"epoch": 0.30260047281323876,
"grad_norm": 5.519815444946289,
"kl": 1.0732237100601196,
"learning_rate": 1.9672948630390295e-06,
"logits/chosen": 143991003.42857143,
"logits/rejected": 37585928.827586204,
"logps/chosen": -981.2824776785715,
"logps/rejected": -516.1851427801724,
"loss": 0.4675,
"rewards/chosen": 0.1541661262512207,
"rewards/margins": 0.42498263326184504,
"rewards/rejected": -0.2708165070106243,
"step": 8
},
{
"epoch": 0.3404255319148936,
"grad_norm": 4.185125350952148,
"kl": 1.2695928812026978,
"learning_rate": 1.9490557470106686e-06,
"logits/chosen": 144893661.86666667,
"logits/rejected": 41582317.176470585,
"logps/chosen": -883.7143880208333,
"logps/rejected": -439.1708984375,
"loss": 0.4954,
"rewards/chosen": -0.04857488075892131,
"rewards/margins": -0.037722761257022036,
"rewards/rejected": -0.01085211950189927,
"step": 9
},
{
"epoch": 0.37825059101654845,
"grad_norm": 4.946338176727295,
"kl": 2.036167860031128,
"learning_rate": 1.9269167573460217e-06,
"logits/chosen": 119253806.54545455,
"logits/rejected": 4415281.548387097,
"logps/chosen": -917.6138139204545,
"logps/rejected": -416.56313004032256,
"loss": 0.4539,
"rewards/chosen": 0.2660622163252397,
"rewards/margins": 0.25140741458736204,
"rewards/rejected": 0.01465480173787763,
"step": 10
},
{
"epoch": 0.4160756501182033,
"grad_norm": 4.3735785484313965,
"kl": 2.626142978668213,
"learning_rate": 1.9009688679024189e-06,
"logits/chosen": 51137304.0,
"logits/rejected": 9401010.0,
"logps/chosen": -785.0097045898438,
"logps/rejected": -515.2562866210938,
"loss": 0.4788,
"rewards/chosen": 0.15124297142028809,
"rewards/margins": 0.3036640137434006,
"rewards/rejected": -0.1524210423231125,
"step": 11
},
{
"epoch": 0.45390070921985815,
"grad_norm": 4.936854839324951,
"kl": 2.3246853351593018,
"learning_rate": 1.8713187041233893e-06,
"logits/chosen": 181763699.61290324,
"logits/rejected": 38560000.0,
"logps/chosen": -970.0010080645161,
"logps/rejected": -475.65030184659093,
"loss": 0.4744,
"rewards/chosen": 0.06626310656147619,
"rewards/margins": 0.23985308752498086,
"rewards/rejected": -0.17358998096350467,
"step": 12
},
{
"epoch": 0.491725768321513,
"grad_norm": 4.4254326820373535,
"kl": 1.088865876197815,
"learning_rate": 1.8380881048918404e-06,
"logits/chosen": 136485074.82352942,
"logits/rejected": 83436902.4,
"logps/chosen": -846.1370634191177,
"logps/rejected": -561.441796875,
"loss": 0.4861,
"rewards/chosen": 0.18190832699046416,
"rewards/margins": 0.11596365535960479,
"rewards/rejected": 0.06594467163085938,
"step": 13
},
{
"epoch": 0.5295508274231678,
"grad_norm": 3.7348413467407227,
"kl": 2.592059850692749,
"learning_rate": 1.8014136218679566e-06,
"logits/chosen": 81516349.79310344,
"logits/rejected": 33273589.028571427,
"logps/chosen": -772.0715921336207,
"logps/rejected": -523.1571428571428,
"loss": 0.4858,
"rewards/chosen": 0.25091521493319807,
"rewards/margins": 0.031018516935151202,
"rewards/rejected": 0.21989669799804687,
"step": 14
},
{
"epoch": 0.5673758865248227,
"grad_norm": 5.629943370819092,
"kl": 2.344330072402954,
"learning_rate": 1.7614459583691342e-06,
"logits/chosen": 134785755.42857143,
"logits/rejected": 69182587.5862069,
"logps/chosen": -908.4704241071429,
"logps/rejected": -505.3213900862069,
"loss": 0.4707,
"rewards/chosen": 0.317371450151716,
"rewards/margins": 0.3183928394552522,
"rewards/rejected": -0.0010213893035362508,
"step": 15
},
{
"epoch": 0.6052009456264775,
"grad_norm": 4.68190860748291,
"kl": 1.7908133268356323,
"learning_rate": 1.7183493500977275e-06,
"logits/chosen": 170032571.73333332,
"logits/rejected": 67021959.52941176,
"logps/chosen": -948.2464192708334,
"logps/rejected": -565.9269301470588,
"loss": 0.4587,
"rewards/chosen": 0.22927993138631184,
"rewards/margins": 0.2928940473818311,
"rewards/rejected": -0.0636141159955193,
"step": 16
},
{
"epoch": 0.6430260047281324,
"grad_norm": 5.001438617706299,
"kl": 1.466516375541687,
"learning_rate": 1.6723008902613168e-06,
"logits/chosen": 149596454.78787878,
"logits/rejected": 51657533.93548387,
"logps/chosen": -934.278290719697,
"logps/rejected": -586.933845766129,
"loss": 0.4811,
"rewards/chosen": 0.2862388148452296,
"rewards/margins": 0.24233368356673013,
"rewards/rejected": 0.04390513127849948,
"step": 17
},
{
"epoch": 0.6808510638297872,
"grad_norm": 4.730621337890625,
"kl": 2.506091594696045,
"learning_rate": 1.6234898018587336e-06,
"logits/chosen": 222986325.33333334,
"logits/rejected": 85738066.8235294,
"logps/chosen": -1094.235546875,
"logps/rejected": -609.6760110294117,
"loss": 0.5094,
"rewards/chosen": 0.3892539342244466,
"rewards/margins": 0.1954242855894799,
"rewards/rejected": 0.19382964863496668,
"step": 18
},
{
"epoch": 0.7186761229314421,
"grad_norm": 4.429194450378418,
"kl": 3.0494582653045654,
"learning_rate": 1.5721166601221695e-06,
"logits/chosen": 53398761.4117647,
"logits/rejected": -3114230.933333333,
"logps/chosen": -779.4967830882352,
"logps/rejected": -421.95270182291665,
"loss": 0.4481,
"rewards/chosen": 0.40259605295517864,
"rewards/margins": 0.38857168204644144,
"rewards/rejected": 0.014024370908737182,
"step": 19
},
{
"epoch": 0.7565011820330969,
"grad_norm": 4.854764938354492,
"kl": 3.306687593460083,
"learning_rate": 1.5183925683105251e-06,
"logits/chosen": 176390912.0,
"logits/rejected": 69896576.0,
"logps/chosen": -942.5886840820312,
"logps/rejected": -547.885986328125,
"loss": 0.4655,
"rewards/chosen": 0.44881612062454224,
"rewards/margins": 0.4531639628112316,
"rewards/rejected": -0.004347842186689377,
"step": 20
},
{
"epoch": 0.7943262411347518,
"grad_norm": 4.21329927444458,
"kl": 2.6854774951934814,
"learning_rate": 1.4625382902408354e-06,
"logits/chosen": 135675238.4,
"logits/rejected": 1414048.4705882352,
"logps/chosen": -931.5176432291667,
"logps/rejected": -442.47047334558823,
"loss": 0.471,
"rewards/chosen": 0.4190946896870931,
"rewards/margins": 0.4137749505978005,
"rewards/rejected": 0.005319739089292639,
"step": 21
},
{
"epoch": 0.8321513002364066,
"grad_norm": 3.600161552429199,
"kl": 2.8906941413879395,
"learning_rate": 1.4047833431223936e-06,
"logits/chosen": 70284709.64705883,
"logits/rejected": 20018178.133333333,
"logps/chosen": -715.8906824448529,
"logps/rejected": -460.94720052083335,
"loss": 0.4582,
"rewards/chosen": 0.3729521527009852,
"rewards/margins": 0.4021145516750859,
"rewards/rejected": -0.029162398974100747,
"step": 22
},
{
"epoch": 0.8699763593380615,
"grad_norm": 3.66640043258667,
"kl": 2.6149392127990723,
"learning_rate": 1.3453650544213076e-06,
"logits/chosen": 140292640.0,
"logits/rejected": 47265416.0,
"logps/chosen": -893.296142578125,
"logps/rejected": -523.0548095703125,
"loss": 0.4231,
"rewards/chosen": 0.6247925162315369,
"rewards/margins": 0.7861275523900986,
"rewards/rejected": -0.1613350361585617,
"step": 23
},
{
"epoch": 0.9078014184397163,
"grad_norm": 3.6788175106048584,
"kl": 2.519148588180542,
"learning_rate": 1.2845275866310324e-06,
"logits/chosen": 145627733.33333334,
"logits/rejected": 55117643.294117644,
"logps/chosen": -871.0560546875,
"logps/rejected": -437.0091911764706,
"loss": 0.4735,
"rewards/chosen": 0.5424024581909179,
"rewards/margins": 0.49889986795537605,
"rewards/rejected": 0.04350259023554185,
"step": 24
},
{
"epoch": 0.9456264775413712,
"grad_norm": 4.228794574737549,
"kl": 3.256070137023926,
"learning_rate": 1.2225209339563143e-06,
"logits/chosen": 153259218.82352942,
"logits/rejected": 17615726.933333334,
"logps/chosen": -892.5439453125,
"logps/rejected": -430.61676432291665,
"loss": 0.4525,
"rewards/chosen": 0.7282369838041418,
"rewards/margins": 0.6316887285195145,
"rewards/rejected": 0.09654825528462728,
"step": 25
},
{
"epoch": 0.983451536643026,
"grad_norm": 4.561258792877197,
"kl": 3.3823060989379883,
"learning_rate": 1.1595998950333793e-06,
"logits/chosen": 148130304.0,
"logits/rejected": 17554260.0,
"logps/chosen": -1006.059814453125,
"logps/rejected": -537.9644165039062,
"loss": 0.4519,
"rewards/chosen": 0.49162358045578003,
"rewards/margins": 0.43815357238054276,
"rewards/rejected": 0.053470008075237274,
"step": 26
},
{
"epoch": 1.0,
"grad_norm": 1.6852926015853882,
"kl": 2.1286067962646484,
"learning_rate": 1.0960230259076817e-06,
"logits/chosen": 53759712.0,
"logits/rejected": 33787948.0,
"logps/chosen": -643.0557454427084,
"logps/rejected": -457.4282531738281,
"loss": 0.214,
"rewards/chosen": 0.28392742077509564,
"rewards/margins": 0.21173327664534253,
"rewards/rejected": 0.07219414412975311,
"step": 27
},
{
"epoch": 1.037825059101655,
"grad_norm": 5.137193202972412,
"kl": 4.823151588439941,
"learning_rate": 1.0320515775716554e-06,
"logits/chosen": 200775387.42857143,
"logits/rejected": 100983984.55172414,
"logps/chosen": -977.4030133928571,
"logps/rejected": -592.4806707974138,
"loss": 0.4626,
"rewards/chosen": 0.6200266156877791,
"rewards/margins": 0.6635866314319555,
"rewards/rejected": -0.0435600157441764,
"step": 28
},
{
"epoch": 1.0756501182033098,
"grad_norm": 4.594226837158203,
"kl": 3.3556861877441406,
"learning_rate": 9.679484224283447e-07,
"logits/chosen": 179323545.6,
"logits/rejected": 45320342.5882353,
"logps/chosen": -1038.7833333333333,
"logps/rejected": -542.6672794117648,
"loss": 0.4375,
"rewards/chosen": 0.9672001520792644,
"rewards/margins": 0.901208511988322,
"rewards/rejected": 0.06599164009094238,
"step": 29
},
{
"epoch": 1.1134751773049645,
"grad_norm": 4.856236457824707,
"kl": 4.625980377197266,
"learning_rate": 9.039769740923182e-07,
"logits/chosen": 153493752.24242425,
"logits/rejected": 17529911.741935484,
"logps/chosen": -1062.4770359848485,
"logps/rejected": -463.51165574596774,
"loss": 0.4398,
"rewards/chosen": 0.7037940169825698,
"rewards/margins": 0.8401704748121991,
"rewards/rejected": -0.13637645782962923,
"step": 30
},
{
"epoch": 1.1513002364066194,
"grad_norm": 4.414266586303711,
"kl": 4.7278618812561035,
"learning_rate": 8.40400104966621e-07,
"logits/chosen": 175260000.0,
"logits/rejected": 45709312.0,
"logps/chosen": -1013.2972412109375,
"logps/rejected": -444.91375732421875,
"loss": 0.4285,
"rewards/chosen": 0.8962305188179016,
"rewards/margins": 0.859231524169445,
"rewards/rejected": 0.036998994648456573,
"step": 31
},
{
"epoch": 1.1891252955082743,
"grad_norm": 3.963383913040161,
"kl": 4.444534778594971,
"learning_rate": 7.774790660436857e-07,
"logits/chosen": 153567876.12903225,
"logits/rejected": 75968279.27272727,
"logps/chosen": -942.1448462701613,
"logps/rejected": -556.363340435606,
"loss": 0.449,
"rewards/chosen": 0.8200658367526147,
"rewards/margins": 0.6581434252674628,
"rewards/rejected": 0.16192241148515182,
"step": 32
},
{
"epoch": 1.226950354609929,
"grad_norm": 2.951836585998535,
"kl": 3.6152563095092773,
"learning_rate": 7.154724133689676e-07,
"logits/chosen": 93397052.23529412,
"logits/rejected": 14595782.4,
"logps/chosen": -768.8926355698529,
"logps/rejected": -413.5569661458333,
"loss": 0.4498,
"rewards/chosen": 0.5135440826416016,
"rewards/margins": 0.7206665833791097,
"rewards/rejected": -0.20712250073750813,
"step": 33
},
{
"epoch": 1.2647754137115839,
"grad_norm": 4.550557613372803,
"kl": 3.669976234436035,
"learning_rate": 6.546349455786925e-07,
"logits/chosen": 183599315.86206895,
"logits/rejected": 75403607.77142857,
"logps/chosen": -1025.5631061422414,
"logps/rejected": -534.7680803571428,
"loss": 0.4358,
"rewards/chosen": 1.0056335843842605,
"rewards/margins": 1.024911846726986,
"rewards/rejected": -0.01927826234272548,
"step": 34
},
{
"epoch": 1.3026004728132388,
"grad_norm": 4.653792381286621,
"kl": 3.6588058471679688,
"learning_rate": 5.952166568776062e-07,
"logits/chosen": 148335016.22857141,
"logits/rejected": 40633794.20689655,
"logps/chosen": -974.4972098214286,
"logps/rejected": -514.6373922413793,
"loss": 0.4266,
"rewards/chosen": 0.8326939174107143,
"rewards/margins": 0.9487363669672624,
"rewards/rejected": -0.11604244955654802,
"step": 35
},
{
"epoch": 1.3404255319148937,
"grad_norm": 3.622363328933716,
"kl": 3.4761648178100586,
"learning_rate": 5.37461709759165e-07,
"logits/chosen": 148679406.93333334,
"logits/rejected": 42793573.64705882,
"logps/chosen": -877.1561197916667,
"logps/rejected": -439.0417911305147,
"loss": 0.4514,
"rewards/chosen": 0.6072582880655925,
"rewards/margins": 0.6052035388993282,
"rewards/rejected": 0.0020547491662642535,
"step": 36
},
{
"epoch": 1.3782505910165486,
"grad_norm": 3.795612335205078,
"kl": 4.738083362579346,
"learning_rate": 4.816074316894749e-07,
"logits/chosen": 122439517.0909091,
"logits/rejected": 6666082.064516129,
"logps/chosen": -911.3649976325758,
"logps/rejected": -416.80128528225805,
"loss": 0.3949,
"rewards/chosen": 0.8909483244924834,
"rewards/margins": 0.9001048640480023,
"rewards/rejected": -0.009156539555518858,
"step": 37
},
{
"epoch": 1.4160756501182032,
"grad_norm": 3.8737242221832275,
"kl": 4.693930625915527,
"learning_rate": 4.278833398778305e-07,
"logits/chosen": 53371148.0,
"logits/rejected": 11690896.0,
"logps/chosen": -780.3959350585938,
"logps/rejected": -514.53857421875,
"loss": 0.4622,
"rewards/chosen": 0.6126159429550171,
"rewards/margins": 0.6932726949453354,
"rewards/rejected": -0.0806567519903183,
"step": 38
},
{
"epoch": 1.4539007092198581,
"grad_norm": 4.137379169464111,
"kl": 3.699460983276367,
"learning_rate": 3.765101981412665e-07,
"logits/chosen": 185525049.80645162,
"logits/rejected": 40507283.39393939,
"logps/chosen": -962.7782888104839,
"logps/rejected": -474.48996803977275,
"loss": 0.4349,
"rewards/chosen": 0.788528811547064,
"rewards/margins": 0.8460882728516997,
"rewards/rejected": -0.05755946130463571,
"step": 39
},
{
"epoch": 1.491725768321513,
"grad_norm": 3.6001534461975098,
"kl": 3.141786575317383,
"learning_rate": 3.276991097386831e-07,
"logits/chosen": 139919826.82352942,
"logits/rejected": 84876014.93333334,
"logps/chosen": -839.6150045955883,
"logps/rejected": -562.3081380208333,
"loss": 0.4271,
"rewards/chosen": 0.8341112697825712,
"rewards/margins": 0.8548066302841785,
"rewards/rejected": -0.02069536050160726,
"step": 40
},
{
"epoch": 1.5295508274231677,
"grad_norm": 3.451026439666748,
"kl": 3.2268035411834717,
"learning_rate": 2.816506499022725e-07,
"logits/chosen": 84237550.34482759,
"logits/rejected": 33436748.8,
"logps/chosen": -769.4187769396551,
"logps/rejected": -524.4465401785715,
"loss": 0.4379,
"rewards/chosen": 0.5161945079935009,
"rewards/margins": 0.42524682547658543,
"rewards/rejected": 0.09094768251691546,
"step": 41
},
{
"epoch": 1.5673758865248226,
"grad_norm": 4.36342191696167,
"kl": 3.4994237422943115,
"learning_rate": 2.3855404163086556e-07,
"logits/chosen": 137653745.37142858,
"logits/rejected": 69890074.48275863,
"logps/chosen": -904.1540178571429,
"logps/rejected": -506.37173356681035,
"loss": 0.429,
"rewards/chosen": 0.7490062168666295,
"rewards/margins": 0.8550563135757823,
"rewards/rejected": -0.10605009670915275,
"step": 42
},
{
"epoch": 1.6052009456264775,
"grad_norm": 4.330196380615234,
"kl": 4.095585823059082,
"learning_rate": 1.9858637813204349e-07,
"logits/chosen": 171540189.86666667,
"logits/rejected": 67830979.76470588,
"logps/chosen": -944.7064453125,
"logps/rejected": -567.6101792279412,
"loss": 0.4334,
"rewards/chosen": 0.5832799911499024,
"rewards/margins": 0.815221789303948,
"rewards/rejected": -0.23194179815404556,
"step": 43
},
{
"epoch": 1.6430260047281324,
"grad_norm": 4.233847141265869,
"kl": 4.319591999053955,
"learning_rate": 1.619118951081594e-07,
"logits/chosen": 152153925.8181818,
"logits/rejected": 52702876.90322581,
"logps/chosen": -930.294862689394,
"logps/rejected": -587.3856476814516,
"loss": 0.4389,
"rewards/chosen": 0.6845672491824988,
"rewards/margins": 0.6858430385356541,
"rewards/rejected": -0.0012757893531553208,
"step": 44
},
{
"epoch": 1.6808510638297873,
"grad_norm": 4.332212924957275,
"kl": 4.509950637817383,
"learning_rate": 1.286812958766106e-07,
"logits/chosen": 224899908.26666668,
"logits/rejected": 84017543.52941176,
"logps/chosen": -1090.4815104166667,
"logps/rejected": -611.0609489889706,
"loss": 0.4422,
"rewards/chosen": 0.7646568298339844,
"rewards/margins": 0.7093157158178443,
"rewards/rejected": 0.05534111401614021,
"step": 45
},
{
"epoch": 1.7186761229314422,
"grad_norm": 3.7254793643951416,
"kl": 4.550510406494141,
"learning_rate": 9.903113209758096e-08,
"logits/chosen": 55315689.4117647,
"logits/rejected": -2547915.7333333334,
"logps/chosen": -776.220703125,
"logps/rejected": -422.91708984375,
"loss": 0.4112,
"rewards/chosen": 0.7302110896391028,
"rewards/margins": 0.8126279419543697,
"rewards/rejected": -0.08241685231526692,
"step": 46
},
{
"epoch": 1.756501182033097,
"grad_norm": 4.257685661315918,
"kl": 5.242433547973633,
"learning_rate": 7.308324265397836e-08,
"logits/chosen": 177834400.0,
"logits/rejected": 68700808.0,
"logps/chosen": -938.9208984375,
"logps/rejected": -549.43896484375,
"loss": 0.4154,
"rewards/chosen": 0.8155966401100159,
"rewards/margins": 0.9752314537763596,
"rewards/rejected": -0.1596348136663437,
"step": 47
},
{
"epoch": 1.7943262411347518,
"grad_norm": 3.6647257804870605,
"kl": 3.49682354927063,
"learning_rate": 5.094425298933136e-08,
"logits/chosen": 137424546.13333333,
"logits/rejected": 2488748.2352941176,
"logps/chosen": -927.8557291666667,
"logps/rejected": -442.87818818933823,
"loss": 0.4424,
"rewards/chosen": 0.7852853775024414,
"rewards/margins": 0.820735776424408,
"rewards/rejected": -0.03545039892196655,
"step": 48
},
{
"epoch": 1.8321513002364065,
"grad_norm": 3.1744253635406494,
"kl": 3.5714423656463623,
"learning_rate": 3.270513696097055e-08,
"logits/chosen": 72532555.29411764,
"logits/rejected": 21655136.0,
"logps/chosen": -712.5492302389706,
"logps/rejected": -460.46861979166664,
"loss": 0.4459,
"rewards/chosen": 0.7071007560281193,
"rewards/margins": 0.688404831348681,
"rewards/rejected": 0.018695924679438272,
"step": 49
},
{
"epoch": 1.8699763593380614,
"grad_norm": 3.220852851867676,
"kl": 3.0724895000457764,
"learning_rate": 1.844084300893456e-08,
"logits/chosen": 142174912.0,
"logits/rejected": 48436404.0,
"logps/chosen": -890.3675537109375,
"logps/rejected": -522.8453369140625,
"loss": 0.3967,
"rewards/chosen": 0.9176401495933533,
"rewards/margins": 1.058032900094986,
"rewards/rejected": -0.1403927505016327,
"step": 50
},
{
"epoch": 1.9078014184397163,
"grad_norm": 3.456094980239868,
"kl": 3.655463933944702,
"learning_rate": 8.209986176753948e-09,
"logits/chosen": 148127197.86666667,
"logits/rejected": 54668976.941176474,
"logps/chosen": -868.8671223958333,
"logps/rejected": -438.52404067095586,
"loss": 0.4497,
"rewards/chosen": 0.7612937927246094,
"rewards/margins": 0.8692790157654706,
"rewards/rejected": -0.10798522304086124,
"step": 51
},
{
"epoch": 1.9456264775413712,
"grad_norm": 4.0304388999938965,
"kl": 4.205752849578857,
"learning_rate": 2.054607249663665e-09,
"logits/chosen": 153849931.29411766,
"logits/rejected": 19226353.066666666,
"logps/chosen": -891.9115349264706,
"logps/rejected": -430.0306640625,
"loss": 0.4745,
"rewards/chosen": 0.7914738374597886,
"rewards/margins": 0.6363162396000881,
"rewards/rejected": 0.1551575978597005,
"step": 52
}
],
"logging_steps": 1,
"max_steps": 52,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
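
For reference, a minimal sketch of how this trainer state could be inspected offline. It assumes the file has been downloaded locally as trainer_state.json and that matplotlib is installed; the path, output filename, and the choice of plotted metrics are illustrative and not part of the checkpoint itself.

```python
import json

import matplotlib.pyplot as plt

# Load the trainer state written at the end of the run.
# The path is an assumption; point it at the downloaded trainer_state.json.
with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]
steps = [entry["step"] for entry in history]
loss = [entry["loss"] for entry in history]
margins = [entry["rewards/margins"] for entry in history]
kl = [entry["kl"] for entry in history]

# Plot the loss alongside the reward margin and KL estimate per logging step.
fig, axes = plt.subplots(3, 1, sharex=True, figsize=(6, 8))
axes[0].plot(steps, loss)
axes[0].set_ylabel("loss")
axes[1].plot(steps, margins)
axes[1].set_ylabel("rewards/margins")
axes[2].plot(steps, kl)
axes[2].set_ylabel("kl")
axes[2].set_xlabel("global step")
fig.tight_layout()
fig.savefig("kto_training_curves.png")
```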