chenggong1995 committed on
Commit
dff5975
·
verified ·
1 Parent(s): 51dd17c

Model save

Browse files
README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ model_name: Qwen-2.5-Base-7B-gen8-mix_hint50-grpo-CL-beta1e-3-epoch1-v2
4
+ tags:
5
+ - generated_from_trainer
6
+ - trl
7
+ - grpo
8
+ licence: license
9
+ ---
10
+
11
+ # Model Card for Qwen-2.5-Base-7B-gen8-mix_hint50-grpo-CL-beta1e-3-epoch1-v2
12
+
13
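+ This model is a fine-tuned version of an unspecified base model (the base model ID was not recorded at training time; the repository name suggests a Qwen2.5 7B base checkpoint).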
14
+ It has been trained using [TRL](https://github.com/huggingface/trl).
15
+
16
+ ## Quick start
17
+
18
+ ```python
19
+ from transformers import pipeline
20
+
21
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
22
+ generator = pipeline("text-generation", model="chenggong1995/Qwen-2.5-Base-7B-gen8-mix_hint50-grpo-CL-beta1e-3-epoch1-v2", device="cuda")
23
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
24
+ print(output["generated_text"])
25
+ ```
26
+
27
+ ## Training procedure
28
+
29
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/gongc1995-city-university-of-hong-kong/huggingface/runs/587x54lv)
30
+
31
+
32
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
33
+
34
+ ### Framework versions
35
+
36
+ - TRL: 0.16.0
37
+ - Transformers: 4.50.0
38
+ - PyTorch: 2.5.1
39
+ - Datasets: 3.5.0
40
+ - Tokenizers: 0.21.1
41
+
42
+ ## Citations
43
+
44
+ Cite GRPO as:
45
+
46
+ ```bibtex
47
+ @article{zhihong2024deepseekmath,
48
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
49
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
50
+ year = 2024,
51
+ eprint = {arXiv:2402.03300},
52
+ }
53
+
54
+ ```
55
+
56
+ Cite TRL as:
57
+
58
+ ```bibtex
59
+ @misc{vonwerra2022trl,
60
+ title = {{TRL: Transformer Reinforcement Learning}},
61
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
62
+ year = 2020,
63
+ journal = {GitHub repository},
64
+ publisher = {GitHub},
65
+ howpublished = {\url{https://github.com/huggingface/trl}}
66
+ }
67
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 0.024119453254814554,
4
+ "train_runtime": 33084.5686,
5
+ "train_samples": 18434,
6
+ "train_samples_per_second": 0.557,
7
+ "train_steps_per_second": 0.005
8
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.50.0"
6
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 0.024119453254814554,
4
+ "train_runtime": 33084.5686,
5
+ "train_samples": 18434,
6
+ "train_samples_per_second": 0.557,
7
+ "train_steps_per_second": 0.005
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,546 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.9962034927866363,
6
+ "eval_steps": 2000000,
7
+ "global_step": 164,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio": 0.0,
14
+ "completion_length": 803.0022583007812,
15
+ "epoch": 0.006074411541381929,
16
+ "grad_norm": 0.14674668166353436,
17
+ "kl": 0.0,
18
+ "learning_rate": 5.88235294117647e-08,
19
+ "loss": 0.0479,
20
+ "num_tokens": 918402.0,
21
+ "reward": 0.9815848618745804,
22
+ "reward_std": 0.23756355978548527,
23
+ "rewards/accuracy_reward": 0.4933035746216774,
24
+ "rewards/format_reward": 0.9765624925494194,
25
+ "step": 1
26
+ },
27
+ {
28
+ "clip_ratio": 0.0,
29
+ "completion_length": 734.3027648925781,
30
+ "epoch": 0.030372057706909643,
31
+ "grad_norm": 0.13057922454955487,
32
+ "kl": 5.84721565246582e-05,
33
+ "learning_rate": 2.941176470588235e-07,
34
+ "loss": 0.037,
35
+ "num_tokens": 4258679.0,
36
+ "reward": 1.0901228114962578,
37
+ "reward_std": 0.23264290555380285,
38
+ "rewards/accuracy_reward": 0.5965401763096452,
39
+ "rewards/format_reward": 0.9871651735156775,
40
+ "step": 5
41
+ },
42
+ {
43
+ "clip_ratio": 0.0,
44
+ "completion_length": 743.0277114868164,
45
+ "epoch": 0.060744115413819286,
46
+ "grad_norm": 0.1343510023679506,
47
+ "kl": 8.401274681091309e-05,
48
+ "learning_rate": 5.88235294117647e-07,
49
+ "loss": 0.0465,
50
+ "num_tokens": 8471139.0,
51
+ "reward": 1.062276841700077,
52
+ "reward_std": 0.23240854553878307,
53
+ "rewards/accuracy_reward": 0.5718749992549419,
54
+ "rewards/format_reward": 0.9808035627007484,
55
+ "step": 10
56
+ },
57
+ {
58
+ "clip_ratio": 0.0,
59
+ "completion_length": 739.1007049560546,
60
+ "epoch": 0.09111617312072894,
61
+ "grad_norm": 0.738570752534254,
62
+ "kl": 0.00017675161361694336,
63
+ "learning_rate": 8.823529411764705e-07,
64
+ "loss": 0.0358,
65
+ "num_tokens": 12734494.0,
66
+ "reward": 1.0722098737955092,
67
+ "reward_std": 0.2205923892557621,
68
+ "rewards/accuracy_reward": 0.5785714268684388,
69
+ "rewards/format_reward": 0.9872767761349678,
70
+ "step": 15
71
+ },
72
+ {
73
+ "clip_ratio": 0.0,
74
+ "completion_length": 745.5227966308594,
75
+ "epoch": 0.12148823082763857,
76
+ "grad_norm": 0.30158627332435983,
77
+ "kl": 0.0026874780654907227,
78
+ "learning_rate": 9.989726963751682e-07,
79
+ "loss": 0.037,
80
+ "num_tokens": 16990620.0,
81
+ "reward": 1.0786830827593803,
82
+ "reward_std": 0.23385403044521808,
83
+ "rewards/accuracy_reward": 0.5861607141792774,
84
+ "rewards/format_reward": 0.9850446373224259,
85
+ "step": 20
86
+ },
87
+ {
88
+ "clip_ratio": 0.0,
89
+ "completion_length": 731.8799407958984,
90
+ "epoch": 0.15186028853454822,
91
+ "grad_norm": 0.10368791276497527,
92
+ "kl": 0.0003843784332275391,
93
+ "learning_rate": 9.927100106776212e-07,
94
+ "loss": 0.0351,
95
+ "num_tokens": 21176106.0,
96
+ "reward": 1.106584869325161,
97
+ "reward_std": 0.21388941686600446,
98
+ "rewards/accuracy_reward": 0.6142857171595096,
99
+ "rewards/format_reward": 0.9845982104539871,
100
+ "step": 25
101
+ },
102
+ {
103
+ "clip_ratio": 0.0,
104
+ "completion_length": 716.5080688476562,
105
+ "epoch": 0.18223234624145787,
106
+ "grad_norm": 0.24334246130754558,
107
+ "kl": 0.0008969306945800781,
108
+ "learning_rate": 9.808267184205181e-07,
109
+ "loss": 0.0203,
110
+ "num_tokens": 25301046.0,
111
+ "reward": 1.0974330857396126,
112
+ "reward_std": 0.21846173331141472,
113
+ "rewards/accuracy_reward": 0.6022321447730065,
114
+ "rewards/format_reward": 0.9904017791152,
115
+ "step": 30
116
+ },
117
+ {
118
+ "clip_ratio": 0.0,
119
+ "completion_length": 718.258511352539,
120
+ "epoch": 0.2126044039483675,
121
+ "grad_norm": 0.13060657109261103,
122
+ "kl": 0.0011320114135742188,
123
+ "learning_rate": 9.634583786730108e-07,
124
+ "loss": 0.0247,
125
+ "num_tokens": 29447476.0,
126
+ "reward": 1.116071480512619,
127
+ "reward_std": 0.21333869993686677,
128
+ "rewards/accuracy_reward": 0.620758930593729,
129
+ "rewards/format_reward": 0.9906249955296517,
130
+ "step": 35
131
+ },
132
+ {
133
+ "clip_ratio": 0.0,
134
+ "completion_length": 734.5969055175781,
135
+ "epoch": 0.24297646165527714,
136
+ "grad_norm": 0.14790799582315342,
137
+ "kl": 0.0016210556030273437,
138
+ "learning_rate": 9.408031213740044e-07,
139
+ "loss": 0.0307,
140
+ "num_tokens": 33678894.0,
141
+ "reward": 1.0677455827593803,
142
+ "reward_std": 0.2126396529376507,
143
+ "rewards/accuracy_reward": 0.5736607156693936,
144
+ "rewards/format_reward": 0.9881696373224258,
145
+ "step": 40
146
+ },
147
+ {
148
+ "clip_ratio": 0.0,
149
+ "completion_length": 724.3696716308593,
150
+ "epoch": 0.2733485193621868,
151
+ "grad_norm": 0.1235294567362974,
152
+ "kl": 0.0030605316162109373,
153
+ "learning_rate": 9.131193871579974e-07,
154
+ "loss": 0.0288,
155
+ "num_tokens": 37860574.0,
156
+ "reward": 1.0801339834928512,
157
+ "reward_std": 0.22113933004438877,
158
+ "rewards/accuracy_reward": 0.5854910694062709,
159
+ "rewards/format_reward": 0.9892857074737549,
160
+ "step": 45
161
+ },
162
+ {
163
+ "clip_ratio": 0.0,
164
+ "completion_length": 719.0857437133789,
165
+ "epoch": 0.30372057706909644,
166
+ "grad_norm": 0.11260442217805001,
167
+ "kl": 0.004908370971679688,
168
+ "learning_rate": 8.807229791845671e-07,
169
+ "loss": 0.0309,
170
+ "num_tokens": 42021414.0,
171
+ "reward": 1.1001116633415222,
172
+ "reward_std": 0.2082567172124982,
173
+ "rewards/accuracy_reward": 0.6053571477532387,
174
+ "rewards/format_reward": 0.9895089223980904,
175
+ "step": 50
176
+ },
177
+ {
178
+ "clip_ratio": 0.0,
179
+ "completion_length": 710.4136505126953,
180
+ "epoch": 0.3340926347760061,
181
+ "grad_norm": 0.12446746182181773,
182
+ "kl": 0.00711669921875,
183
+ "learning_rate": 8.439834606028593e-07,
184
+ "loss": 0.03,
185
+ "num_tokens": 46149299.0,
186
+ "reward": 1.1018973752856254,
187
+ "reward_std": 0.20198939852416514,
188
+ "rewards/accuracy_reward": 0.6075892850756646,
189
+ "rewards/format_reward": 0.9886160641908646,
190
+ "step": 55
191
+ },
192
+ {
193
+ "clip_ratio": 0.0,
194
+ "completion_length": 711.5286056518555,
195
+ "epoch": 0.36446469248291574,
196
+ "grad_norm": 0.13140372700302483,
197
+ "kl": 0.01026763916015625,
198
+ "learning_rate": 8.033199387471276e-07,
199
+ "loss": 0.0255,
200
+ "num_tokens": 50248419.0,
201
+ "reward": 1.0939732655882835,
202
+ "reward_std": 0.20504674576222898,
203
+ "rewards/accuracy_reward": 0.6002232126891613,
204
+ "rewards/format_reward": 0.9874999925494194,
205
+ "step": 60
206
+ },
207
+ {
208
+ "clip_ratio": 0.0,
209
+ "completion_length": 693.0864105224609,
210
+ "epoch": 0.39483675018982534,
211
+ "grad_norm": 5.84593208956697,
212
+ "kl": 0.01532440185546875,
213
+ "learning_rate": 7.591962841552626e-07,
214
+ "loss": 0.0221,
215
+ "num_tokens": 54273302.0,
216
+ "reward": 1.102567011117935,
217
+ "reward_std": 0.19313923437148334,
218
+ "rewards/accuracy_reward": 0.6060267843306064,
219
+ "rewards/format_reward": 0.9930803507566452,
220
+ "step": 65
221
+ },
222
+ {
223
+ "clip_ratio": 0.0,
224
+ "completion_length": 698.3335159301757,
225
+ "epoch": 0.425208807896735,
226
+ "grad_norm": 0.18797262379226592,
227
+ "kl": 0.0115875244140625,
228
+ "learning_rate": 7.121158389495185e-07,
229
+ "loss": 0.0308,
230
+ "num_tokens": 58317828.0,
231
+ "reward": 1.0886161223053932,
232
+ "reward_std": 0.2050962893292308,
233
+ "rewards/accuracy_reward": 0.5941964283585548,
234
+ "rewards/format_reward": 0.9888392791152001,
235
+ "step": 70
236
+ },
237
+ {
238
+ "clip_ratio": 0.0,
239
+ "completion_length": 680.4533798217774,
240
+ "epoch": 0.45558086560364464,
241
+ "grad_norm": 0.12462934113614446,
242
+ "kl": 0.01414947509765625,
243
+ "learning_rate": 6.626156749437736e-07,
244
+ "loss": 0.0258,
245
+ "num_tokens": 62259643.0,
246
+ "reward": 1.1068080976605414,
247
+ "reward_std": 0.1903689544647932,
248
+ "rewards/accuracy_reward": 0.6107142798602581,
249
+ "rewards/format_reward": 0.9921874910593033,
250
+ "step": 75
251
+ },
252
+ {
253
+ "clip_ratio": 0.0,
254
+ "completion_length": 671.9855209350586,
255
+ "epoch": 0.4859529233105543,
256
+ "grad_norm": 0.12209526567257212,
257
+ "kl": 0.01604156494140625,
258
+ "learning_rate": 6.112604669781572e-07,
259
+ "loss": 0.0134,
260
+ "num_tokens": 66175850.0,
261
+ "reward": 1.0722098708152772,
262
+ "reward_std": 0.1989122748374939,
263
+ "rewards/accuracy_reward": 0.5743303574621678,
264
+ "rewards/format_reward": 0.9957589223980904,
265
+ "step": 80
266
+ },
267
+ {
268
+ "clip_ratio": 0.0,
269
+ "completion_length": 655.3560516357422,
270
+ "epoch": 0.5163249810174639,
271
+ "grad_norm": 0.12987027750323757,
272
+ "kl": 0.017596435546875,
273
+ "learning_rate": 5.586360513712009e-07,
274
+ "loss": 0.0202,
275
+ "num_tokens": 70039477.0,
276
+ "reward": 1.1373884499073028,
277
+ "reward_std": 0.17809431692585348,
278
+ "rewards/accuracy_reward": 0.6401785723865032,
279
+ "rewards/format_reward": 0.9944196373224259,
280
+ "step": 85
281
+ },
282
+ {
283
+ "clip_ratio": 0.0,
284
+ "completion_length": 679.969448852539,
285
+ "epoch": 0.5466970387243736,
286
+ "grad_norm": 0.11569644648116158,
287
+ "kl": 0.01629638671875,
288
+ "learning_rate": 5.053427429716866e-07,
289
+ "loss": 0.0261,
290
+ "num_tokens": 74010748.0,
291
+ "reward": 1.1206473752856254,
292
+ "reward_std": 0.16938802655786275,
293
+ "rewards/accuracy_reward": 0.62518887296319,
294
+ "rewards/format_reward": 0.9930803492665291,
295
+ "step": 90
296
+ },
297
+ {
298
+ "clip_ratio": 0.0,
299
+ "completion_length": 670.8172203063965,
300
+ "epoch": 0.5770690964312832,
301
+ "grad_norm": 0.14653189798827554,
302
+ "kl": 0.01604461669921875,
303
+ "learning_rate": 4.519884870461591e-07,
304
+ "loss": 0.0062,
305
+ "num_tokens": 77924425.0,
306
+ "reward": 1.1319196939468383,
307
+ "reward_std": 0.18193732015788555,
308
+ "rewards/accuracy_reward": 0.6334821462631226,
309
+ "rewards/format_reward": 0.9968749970197678,
310
+ "step": 95
311
+ },
312
+ {
313
+ "clip_ratio": 0.0,
314
+ "completion_length": 689.6049453735352,
315
+ "epoch": 0.6074411541381929,
316
+ "grad_norm": 0.1373443920915848,
317
+ "kl": 0.0151153564453125,
318
+ "learning_rate": 3.991819241221835e-07,
319
+ "loss": 0.0202,
320
+ "num_tokens": 81937855.0,
321
+ "reward": 1.1222098782658576,
322
+ "reward_std": 0.1860942555591464,
323
+ "rewards/accuracy_reward": 0.625,
324
+ "rewards/format_reward": 0.9944196373224259,
325
+ "step": 100
326
+ },
327
+ {
328
+ "clip_ratio": 0.0,
329
+ "completion_length": 675.8683364868164,
330
+ "epoch": 0.6378132118451025,
331
+ "grad_norm": 0.14762350547769187,
332
+ "kl": 0.015521240234375,
333
+ "learning_rate": 3.4752544690038643e-07,
334
+ "loss": 0.0151,
335
+ "num_tokens": 85899921.0,
336
+ "reward": 1.128125049173832,
337
+ "reward_std": 0.1950968151912093,
338
+ "rewards/accuracy_reward": 0.6305803559720516,
339
+ "rewards/format_reward": 0.9950892791152001,
340
+ "step": 105
341
+ },
342
+ {
343
+ "clip_ratio": 0.0,
344
+ "completion_length": 696.7239181518555,
345
+ "epoch": 0.6681852695520122,
346
+ "grad_norm": 0.12897190097851505,
347
+ "kl": 0.0144561767578125,
348
+ "learning_rate": 2.976083284388031e-07,
349
+ "loss": 0.0225,
350
+ "num_tokens": 89967132.0,
351
+ "reward": 1.0918527334928512,
352
+ "reward_std": 0.1823650782927871,
353
+ "rewards/accuracy_reward": 0.5953125059604645,
354
+ "rewards/format_reward": 0.9930803492665291,
355
+ "step": 110
356
+ },
357
+ {
358
+ "clip_ratio": 0.0,
359
+ "completion_length": 719.4877540588379,
360
+ "epoch": 0.6985573272589218,
361
+ "grad_norm": 0.12020301457071197,
362
+ "kl": 0.014898681640625,
363
+ "learning_rate": 2.500000000000001e-07,
364
+ "loss": 0.0153,
365
+ "num_tokens": 94129389.0,
366
+ "reward": 1.1255580872297286,
367
+ "reward_std": 0.19005396589636803,
368
+ "rewards/accuracy_reward": 0.6283482149243355,
369
+ "rewards/format_reward": 0.9944196343421936,
370
+ "step": 115
371
+ },
372
+ {
373
+ "clip_ratio": 0.0,
374
+ "completion_length": 681.8125282287598,
375
+ "epoch": 0.7289293849658315,
376
+ "grad_norm": 0.1344831996031667,
377
+ "kl": 0.01582489013671875,
378
+ "learning_rate": 2.0524355524417015e-07,
379
+ "loss": 0.0192,
380
+ "num_tokens": 98103221.0,
381
+ "reward": 1.1333705812692643,
382
+ "reward_std": 0.20433492437005044,
383
+ "rewards/accuracy_reward": 0.6354910746216774,
384
+ "rewards/format_reward": 0.9957589209079742,
385
+ "step": 120
386
+ },
387
+ {
388
+ "clip_ratio": 0.0,
389
+ "completion_length": 681.395115661621,
390
+ "epoch": 0.7593014426727411,
391
+ "grad_norm": 0.13865803911159388,
392
+ "kl": 0.0163787841796875,
393
+ "learning_rate": 1.6384955486934154e-07,
394
+ "loss": 0.0122,
395
+ "num_tokens": 102096279.0,
396
+ "reward": 1.1371652349829673,
397
+ "reward_std": 0.18053851332515478,
398
+ "rewards/accuracy_reward": 0.6404017873108387,
399
+ "rewards/format_reward": 0.9935267820954323,
400
+ "step": 125
401
+ },
402
+ {
403
+ "clip_ratio": 0.0,
404
+ "completion_length": 702.8656555175781,
405
+ "epoch": 0.7896735003796507,
406
+ "grad_norm": 0.10840464189916307,
407
+ "kl": 0.01602935791015625,
408
+ "learning_rate": 1.262902023724824e-07,
409
+ "loss": 0.0162,
410
+ "num_tokens": 106168389.0,
411
+ "reward": 1.1045759484171866,
412
+ "reward_std": 0.18122291592881085,
413
+ "rewards/accuracy_reward": 0.6062500029802322,
414
+ "rewards/format_reward": 0.9966517820954323,
415
+ "step": 130
416
+ },
417
+ {
418
+ "clip_ratio": 0.0,
419
+ "completion_length": 696.1946792602539,
420
+ "epoch": 0.8200455580865603,
421
+ "grad_norm": 0.1591189816361499,
422
+ "kl": 0.01680145263671875,
423
+ "learning_rate": 9.299395737170757e-08,
424
+ "loss": 0.0167,
425
+ "num_tokens": 110246405.0,
426
+ "reward": 1.0947545170783997,
427
+ "reward_std": 0.20664523243904115,
428
+ "rewards/accuracy_reward": 0.5973214291036129,
429
+ "rewards/format_reward": 0.9948660656809807,
430
+ "step": 135
431
+ },
432
+ {
433
+ "clip_ratio": 0.0,
434
+ "completion_length": 698.2578453063965,
435
+ "epoch": 0.85041761579347,
436
+ "grad_norm": 0.13052581661637774,
437
+ "kl": 0.0175323486328125,
438
+ "learning_rate": 6.43406479383053e-08,
439
+ "loss": 0.0206,
440
+ "num_tokens": 114326168.0,
441
+ "reward": 1.1349330857396125,
442
+ "reward_std": 0.19575350042432546,
443
+ "rewards/accuracy_reward": 0.6372767858207226,
444
+ "rewards/format_reward": 0.9953124925494194,
445
+ "step": 140
446
+ },
447
+ {
448
+ "clip_ratio": 0.0,
449
+ "completion_length": 678.3460144042969,
450
+ "epoch": 0.8807896735003796,
451
+ "grad_norm": 0.12342327662368928,
452
+ "kl": 0.01726837158203125,
453
+ "learning_rate": 4.065713769482082e-08,
454
+ "loss": 0.0217,
455
+ "num_tokens": 118268150.0,
456
+ "reward": 1.131250050663948,
457
+ "reward_std": 0.19892821311950684,
458
+ "rewards/accuracy_reward": 0.6337053582072258,
459
+ "rewards/format_reward": 0.9950892806053162,
460
+ "step": 145
461
+ },
462
+ {
463
+ "clip_ratio": 0.0,
464
+ "completion_length": 699.1832885742188,
465
+ "epoch": 0.9111617312072893,
466
+ "grad_norm": 0.11347140306400243,
467
+ "kl": 0.01734161376953125,
468
+ "learning_rate": 2.2213597106929605e-08,
469
+ "loss": 0.0206,
470
+ "num_tokens": 122323331.0,
471
+ "reward": 1.123325940966606,
472
+ "reward_std": 0.19308128226548432,
473
+ "rewards/accuracy_reward": 0.625669640302658,
474
+ "rewards/format_reward": 0.9953124925494194,
475
+ "step": 150
476
+ },
477
+ {
478
+ "clip_ratio": 0.0,
479
+ "completion_length": 699.7196716308594,
480
+ "epoch": 0.9415337889141989,
481
+ "grad_norm": 0.1475830222889839,
482
+ "kl": 0.01719207763671875,
483
+ "learning_rate": 9.22042150446728e-09,
484
+ "loss": 0.0185,
485
+ "num_tokens": 126366451.0,
486
+ "reward": 1.154241117835045,
487
+ "reward_std": 0.1731583815999329,
488
+ "rewards/accuracy_reward": 0.6560267820954323,
489
+ "rewards/format_reward": 0.9964285656809807,
490
+ "step": 155
491
+ },
492
+ {
493
+ "clip_ratio": 0.0,
494
+ "completion_length": 695.9060577392578,
495
+ "epoch": 0.9719058466211086,
496
+ "grad_norm": 0.13967302854618574,
497
+ "kl": 0.01683197021484375,
498
+ "learning_rate": 1.8258309893965374e-09,
499
+ "loss": 0.0191,
500
+ "num_tokens": 130430318.0,
501
+ "reward": 1.1508929118514062,
502
+ "reward_std": 0.20208097249269485,
503
+ "rewards/accuracy_reward": 0.6546875029802323,
504
+ "rewards/format_reward": 0.9924107119441032,
505
+ "step": 160
506
+ },
507
+ {
508
+ "clip_ratio": 0.0,
509
+ "completion_length": 684.7267169952393,
510
+ "epoch": 0.9962034927866363,
511
+ "kl": 0.017988204956054688,
512
+ "num_tokens": 133627034.0,
513
+ "reward": 1.118443138897419,
514
+ "reward_std": 0.1834622365422547,
515
+ "rewards/accuracy_reward": 0.6226820051670074,
516
+ "rewards/format_reward": 0.9946986511349678,
517
+ "step": 164,
518
+ "total_flos": 0.0,
519
+ "train_loss": 0.024119453254814554,
520
+ "train_runtime": 33084.5686,
521
+ "train_samples_per_second": 0.557,
522
+ "train_steps_per_second": 0.005
523
+ }
524
+ ],
525
+ "logging_steps": 5,
526
+ "max_steps": 164,
527
+ "num_input_tokens_seen": 0,
528
+ "num_train_epochs": 1,
529
+ "save_steps": 500,
530
+ "stateful_callbacks": {
531
+ "TrainerControl": {
532
+ "args": {
533
+ "should_epoch_stop": false,
534
+ "should_evaluate": false,
535
+ "should_log": false,
536
+ "should_save": true,
537
+ "should_training_stop": true
538
+ },
539
+ "attributes": {}
540
+ }
541
+ },
542
+ "total_flos": 0.0,
543
+ "train_batch_size": 16,
544
+ "trial_name": null,
545
+ "trial_params": null
546
+ }