chenggong1995 commited on
Commit
eef2cee
·
verified ·
1 Parent(s): 4df2947

Model save

Browse files
README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ model_name: Qwen-2.5-Base-7B-gen8-mix_hint50-grpo-CL-beta0-epoch1-v2
4
+ tags:
5
+ - generated_from_trainer
6
+ - trl
7
+ - grpo
8
+ licence: license
9
+ ---
10
+
11
+ # Model Card for Qwen-2.5-Base-7B-gen8-mix_hint50-grpo-CL-beta0-epoch1-v2
12
+
13
+ This model is a fine-tuned version of [Qwen/Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B) (base model inferred from the model name; it was not recorded in the training config).
14
+ It has been trained using [TRL](https://github.com/huggingface/trl).
15
+
16
+ ## Quick start
17
+
18
+ ```python
19
+ from transformers import pipeline
20
+
21
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
22
+ generator = pipeline("text-generation", model="chenggong1995/Qwen-2.5-Base-7B-gen8-mix_hint50-grpo-CL-beta0-epoch1-v2", device="cuda")
23
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
24
+ print(output["generated_text"])
25
+ ```
26
+
27
+ ## Training procedure
28
+
29
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/gongc1995-city-university-of-hong-kong/huggingface/runs/0urzu3fg)
30
+
31
+
32
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
33
+
34
+ ### Framework versions
35
+
36
+ - TRL: 0.16.0
37
+ - Transformers: 4.50.0
38
+ - Pytorch: 2.5.1
39
+ - Datasets: 3.5.0
40
+ - Tokenizers: 0.21.1
41
+
42
+ ## Citations
43
+
44
+ Cite GRPO as:
45
+
46
+ ```bibtex
47
+ @article{zhihong2024deepseekmath,
48
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
49
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
50
+ year = 2024,
51
+ eprint = {arXiv:2402.03300},
52
+ }
53
+
54
+ ```
55
+
56
+ Cite TRL as:
57
+
58
+ ```bibtex
59
+ @misc{vonwerra2022trl,
60
+ title = {{TRL: Transformer Reinforcement Learning}},
61
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
62
+ year = 2020,
63
+ journal = {GitHub repository},
64
+ publisher = {GitHub},
65
+ howpublished = {\url{https://github.com/huggingface/trl}}
66
+ }
67
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 0.019764071750659043,
4
+ "train_runtime": 31261.7596,
5
+ "train_samples": 18434,
6
+ "train_samples_per_second": 0.59,
7
+ "train_steps_per_second": 0.005
8
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.50.0"
6
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 0.019764071750659043,
4
+ "train_runtime": 31261.7596,
5
+ "train_samples": 18434,
6
+ "train_samples_per_second": 0.59,
7
+ "train_steps_per_second": 0.005
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,512 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.9962034927866363,
6
+ "eval_steps": 2000000,
7
+ "global_step": 164,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio": 0.0,
14
+ "completion_length": 803.0022583007812,
15
+ "epoch": 0.006074411541381929,
16
+ "grad_norm": 0.14656181705800808,
17
+ "learning_rate": 5.88235294117647e-08,
18
+ "loss": 0.0479,
19
+ "num_tokens": 918402.0,
20
+ "reward": 0.9815848618745804,
21
+ "reward_std": 0.23756355978548527,
22
+ "rewards/accuracy_reward": 0.4933035746216774,
23
+ "rewards/format_reward": 0.9765624925494194,
24
+ "step": 1
25
+ },
26
+ {
27
+ "clip_ratio": 0.0,
28
+ "completion_length": 732.8228569030762,
29
+ "epoch": 0.030372057706909643,
30
+ "grad_norm": 0.1360498469331386,
31
+ "learning_rate": 2.941176470588235e-07,
32
+ "loss": 0.0275,
33
+ "num_tokens": 4253375.0,
34
+ "reward": 1.0891462452709675,
35
+ "reward_std": 0.2294587183278054,
36
+ "rewards/accuracy_reward": 0.5954241026192904,
37
+ "rewards/format_reward": 0.9874441903084517,
38
+ "step": 5
39
+ },
40
+ {
41
+ "clip_ratio": 0.0,
42
+ "completion_length": 733.9051712036132,
43
+ "epoch": 0.060744115413819286,
44
+ "grad_norm": 2.020390330382682,
45
+ "learning_rate": 5.88235294117647e-07,
46
+ "loss": 0.0394,
47
+ "num_tokens": 8424966.0,
48
+ "reward": 1.074776826798916,
49
+ "reward_std": 0.2325986440293491,
50
+ "rewards/accuracy_reward": 0.5812499992549419,
51
+ "rewards/format_reward": 0.9870535641908645,
52
+ "step": 10
53
+ },
54
+ {
55
+ "clip_ratio": 0.0,
56
+ "completion_length": 730.850700378418,
57
+ "epoch": 0.09111617312072894,
58
+ "grad_norm": 0.6485429367394874,
59
+ "learning_rate": 8.823529411764705e-07,
60
+ "loss": 0.0319,
61
+ "num_tokens": 12651361.0,
62
+ "reward": 1.068080399930477,
63
+ "reward_std": 0.22116196975111962,
64
+ "rewards/accuracy_reward": 0.5736607149243355,
65
+ "rewards/format_reward": 0.988839277625084,
66
+ "step": 15
67
+ },
68
+ {
69
+ "clip_ratio": 0.0,
70
+ "completion_length": 749.2339599609375,
71
+ "epoch": 0.12148823082763857,
72
+ "grad_norm": 0.15121241250002537,
73
+ "learning_rate": 9.989726963751682e-07,
74
+ "loss": 0.0412,
75
+ "num_tokens": 16924113.0,
76
+ "reward": 1.0832589775323869,
77
+ "reward_std": 0.24419224373996257,
78
+ "rewards/accuracy_reward": 0.590625,
79
+ "rewards/format_reward": 0.9852678462862968,
80
+ "step": 20
81
+ },
82
+ {
83
+ "clip_ratio": 0.0,
84
+ "completion_length": 733.0969100952149,
85
+ "epoch": 0.15186028853454822,
86
+ "grad_norm": 0.1625406073177716,
87
+ "learning_rate": 9.927100106776212e-07,
88
+ "loss": 0.0385,
89
+ "num_tokens": 21115051.0,
90
+ "reward": 1.1054687976837159,
91
+ "reward_std": 0.22408477105200292,
92
+ "rewards/accuracy_reward": 0.6136160686612129,
93
+ "rewards/format_reward": 0.9837053477764129,
94
+ "step": 25
95
+ },
96
+ {
97
+ "clip_ratio": 0.0,
98
+ "completion_length": 720.6406631469727,
99
+ "epoch": 0.18223234624145787,
100
+ "grad_norm": 2.666467566757727,
101
+ "learning_rate": 9.808267184205181e-07,
102
+ "loss": 0.0239,
103
+ "num_tokens": 25258505.0,
104
+ "reward": 1.0918527334928512,
105
+ "reward_std": 0.22341929338872432,
106
+ "rewards/accuracy_reward": 0.5984375,
107
+ "rewards/format_reward": 0.9868303462862968,
108
+ "step": 30
109
+ },
110
+ {
111
+ "clip_ratio": 0.0,
112
+ "completion_length": 701.581282043457,
113
+ "epoch": 0.2126044039483675,
114
+ "grad_norm": 0.2580879090307691,
115
+ "learning_rate": 9.634583786730108e-07,
116
+ "loss": 0.0261,
117
+ "num_tokens": 29330221.0,
118
+ "reward": 1.1060268327593803,
119
+ "reward_std": 0.21581739820539952,
120
+ "rewards/accuracy_reward": 0.613616070151329,
121
+ "rewards/format_reward": 0.984821417927742,
122
+ "step": 35
123
+ },
124
+ {
125
+ "clip_ratio": 0.0,
126
+ "completion_length": 707.9558319091797,
127
+ "epoch": 0.24297646165527714,
128
+ "grad_norm": 0.8886692354776214,
129
+ "learning_rate": 9.408031213740044e-07,
130
+ "loss": 0.0272,
131
+ "num_tokens": 33442287.0,
132
+ "reward": 1.0679688140749932,
133
+ "reward_std": 0.21454464346170427,
134
+ "rewards/accuracy_reward": 0.5720982141792774,
135
+ "rewards/format_reward": 0.9917410612106323,
136
+ "step": 40
137
+ },
138
+ {
139
+ "clip_ratio": 0.0,
140
+ "completion_length": 699.1395431518555,
141
+ "epoch": 0.2733485193621868,
142
+ "grad_norm": 0.1839147765074166,
143
+ "learning_rate": 9.131193871579974e-07,
144
+ "loss": 0.0273,
145
+ "num_tokens": 37510936.0,
146
+ "reward": 1.0806920111179352,
147
+ "reward_std": 0.20978465098887683,
148
+ "rewards/accuracy_reward": 0.5868303567171097,
149
+ "rewards/format_reward": 0.987723208963871,
150
+ "step": 45
151
+ },
152
+ {
153
+ "clip_ratio": 0.0,
154
+ "completion_length": 688.0076187133789,
155
+ "epoch": 0.30372057706909644,
156
+ "grad_norm": 2.39827459380332,
157
+ "learning_rate": 8.807229791845671e-07,
158
+ "loss": 0.0254,
159
+ "num_tokens": 41532546.0,
160
+ "reward": 1.0955357611179353,
161
+ "reward_std": 0.2291064240038395,
162
+ "rewards/accuracy_reward": 0.6026785708963871,
163
+ "rewards/format_reward": 0.9857142746448517,
164
+ "step": 50
165
+ },
166
+ {
167
+ "clip_ratio": 0.0,
168
+ "completion_length": 705.5748062133789,
169
+ "epoch": 0.3340926347760061,
170
+ "grad_norm": 6.336128416350602,
171
+ "learning_rate": 8.439834606028593e-07,
172
+ "loss": 0.0312,
173
+ "num_tokens": 45638753.0,
174
+ "reward": 1.0700893342494964,
175
+ "reward_std": 0.22525846362113952,
176
+ "rewards/accuracy_reward": 0.5787946447730065,
177
+ "rewards/format_reward": 0.9825892791152,
178
+ "step": 55
179
+ },
180
+ {
181
+ "clip_ratio": 0.0,
182
+ "completion_length": 699.0966857910156,
183
+ "epoch": 0.36446469248291574,
184
+ "grad_norm": 2.8773101008786597,
185
+ "learning_rate": 8.033199387471276e-07,
186
+ "loss": 0.0257,
187
+ "num_tokens": 49682178.0,
188
+ "reward": 1.0860491678118707,
189
+ "reward_std": 0.19614009652286768,
190
+ "rewards/accuracy_reward": 0.5910714276134967,
191
+ "rewards/format_reward": 0.9899553537368775,
192
+ "step": 60
193
+ },
194
+ {
195
+ "clip_ratio": 0.0,
196
+ "completion_length": 676.4727966308594,
197
+ "epoch": 0.39483675018982534,
198
+ "grad_norm": 0.1564093879360907,
199
+ "learning_rate": 7.591962841552626e-07,
200
+ "loss": 0.0148,
201
+ "num_tokens": 53632632.0,
202
+ "reward": 1.104575951397419,
203
+ "reward_std": 0.1936511306092143,
204
+ "rewards/accuracy_reward": 0.6069196410477161,
205
+ "rewards/format_reward": 0.9953124940395355,
206
+ "step": 65
207
+ },
208
+ {
209
+ "clip_ratio": 0.0,
210
+ "completion_length": 693.0926666259766,
211
+ "epoch": 0.425208807896735,
212
+ "grad_norm": 0.21433648094895766,
213
+ "learning_rate": 7.121158389495185e-07,
214
+ "loss": 0.0179,
215
+ "num_tokens": 57653679.0,
216
+ "reward": 1.096875049173832,
217
+ "reward_std": 0.19877330139279364,
218
+ "rewards/accuracy_reward": 0.6008928552269935,
219
+ "rewards/format_reward": 0.9919642791152,
220
+ "step": 70
221
+ },
222
+ {
223
+ "clip_ratio": 0.0,
224
+ "completion_length": 677.7578414916992,
225
+ "epoch": 0.45558086560364464,
226
+ "grad_norm": 0.13423679492808266,
227
+ "learning_rate": 6.626156749437736e-07,
228
+ "loss": 0.023,
229
+ "num_tokens": 61583418.0,
230
+ "reward": 1.0986607670783997,
231
+ "reward_std": 0.19030132256448268,
232
+ "rewards/accuracy_reward": 0.6033482141792774,
233
+ "rewards/format_reward": 0.9906249925494194,
234
+ "step": 75
235
+ },
236
+ {
237
+ "clip_ratio": 0.0,
238
+ "completion_length": 677.3216796875,
239
+ "epoch": 0.4859529233105543,
240
+ "grad_norm": 0.12374303041671109,
241
+ "learning_rate": 6.112604669781572e-07,
242
+ "loss": 0.0139,
243
+ "num_tokens": 65523531.0,
244
+ "reward": 1.0838170185685159,
245
+ "reward_std": 0.20196059457957743,
246
+ "rewards/accuracy_reward": 0.5859375014901161,
247
+ "rewards/format_reward": 0.9957589223980904,
248
+ "step": 80
249
+ },
250
+ {
251
+ "clip_ratio": 0.0,
252
+ "completion_length": 661.0935562133789,
253
+ "epoch": 0.5163249810174639,
254
+ "grad_norm": 0.24633881049994122,
255
+ "learning_rate": 5.586360513712009e-07,
256
+ "loss": 0.0146,
257
+ "num_tokens": 69412862.0,
258
+ "reward": 1.1289062976837159,
259
+ "reward_std": 0.19050537403672935,
260
+ "rewards/accuracy_reward": 0.6308035746216774,
261
+ "rewards/format_reward": 0.9962053522467613,
262
+ "step": 85
263
+ },
264
+ {
265
+ "clip_ratio": 0.0,
266
+ "completion_length": 684.6299407958984,
267
+ "epoch": 0.5466970387243736,
268
+ "grad_norm": 0.20823664365461947,
269
+ "learning_rate": 5.053427429716866e-07,
270
+ "loss": 0.0183,
271
+ "num_tokens": 73405012.0,
272
+ "reward": 1.127790230512619,
273
+ "reward_std": 0.1836160296574235,
274
+ "rewards/accuracy_reward": 0.6328468464314938,
275
+ "rewards/format_reward": 0.9921874940395355,
276
+ "step": 90
277
+ },
278
+ {
279
+ "clip_ratio": 0.0,
280
+ "completion_length": 679.512525177002,
281
+ "epoch": 0.5770690964312832,
282
+ "grad_norm": 0.17725373837889702,
283
+ "learning_rate": 4.519884870461591e-07,
284
+ "loss": 0.013,
285
+ "num_tokens": 77357644.0,
286
+ "reward": 1.1372768431901932,
287
+ "reward_std": 0.1831109957769513,
288
+ "rewards/accuracy_reward": 0.6390624985098838,
289
+ "rewards/format_reward": 0.9964285656809807,
290
+ "step": 95
291
+ },
292
+ {
293
+ "clip_ratio": 0.0,
294
+ "completion_length": 691.0065063476562,
295
+ "epoch": 0.6074411541381929,
296
+ "grad_norm": 0.18903103595690637,
297
+ "learning_rate": 3.991819241221835e-07,
298
+ "loss": 0.015,
299
+ "num_tokens": 81377353.0,
300
+ "reward": 1.1179687947034835,
301
+ "reward_std": 0.18047410659492016,
302
+ "rewards/accuracy_reward": 0.6209821425378322,
303
+ "rewards/format_reward": 0.9939732044935227,
304
+ "step": 100
305
+ },
306
+ {
307
+ "clip_ratio": 0.0,
308
+ "completion_length": 678.3850769042969,
309
+ "epoch": 0.6378132118451025,
310
+ "grad_norm": 0.20779542924679995,
311
+ "learning_rate": 3.4752544690038643e-07,
312
+ "loss": 0.0112,
313
+ "num_tokens": 85350694.0,
314
+ "reward": 1.1294643417000771,
315
+ "reward_std": 0.18618108071386813,
316
+ "rewards/accuracy_reward": 0.6323660708963871,
317
+ "rewards/format_reward": 0.9941964238882065,
318
+ "step": 105
319
+ },
320
+ {
321
+ "clip_ratio": 0.0,
322
+ "completion_length": 688.9138717651367,
323
+ "epoch": 0.6681852695520122,
324
+ "grad_norm": 0.2676944642662668,
325
+ "learning_rate": 2.976083284388031e-07,
326
+ "loss": 0.0163,
327
+ "num_tokens": 89382916.0,
328
+ "reward": 1.101339329779148,
329
+ "reward_std": 0.1832605952396989,
330
+ "rewards/accuracy_reward": 0.6040178589522839,
331
+ "rewards/format_reward": 0.9946428507566452,
332
+ "step": 110
333
+ },
334
+ {
335
+ "clip_ratio": 0.0,
336
+ "completion_length": 709.0390884399415,
337
+ "epoch": 0.6985573272589218,
338
+ "grad_norm": 0.1723246791980809,
339
+ "learning_rate": 2.500000000000001e-07,
340
+ "loss": 0.0124,
341
+ "num_tokens": 93498363.0,
342
+ "reward": 1.123660759627819,
343
+ "reward_std": 0.18700322844088077,
344
+ "rewards/accuracy_reward": 0.6258928567171097,
345
+ "rewards/format_reward": 0.9955357074737549,
346
+ "step": 115
347
+ },
348
+ {
349
+ "clip_ratio": 0.0,
350
+ "completion_length": 674.1076217651367,
351
+ "epoch": 0.7289293849658315,
352
+ "grad_norm": 0.16904917847238007,
353
+ "learning_rate": 2.0524355524417015e-07,
354
+ "loss": 0.0146,
355
+ "num_tokens": 97437677.0,
356
+ "reward": 1.131361649930477,
357
+ "reward_std": 0.19161452651023864,
358
+ "rewards/accuracy_reward": 0.6328125,
359
+ "rewards/format_reward": 0.997098208963871,
360
+ "step": 120
361
+ },
362
+ {
363
+ "clip_ratio": 0.0,
364
+ "completion_length": 682.0279342651368,
365
+ "epoch": 0.7593014426727411,
366
+ "grad_norm": 0.17946193842059094,
367
+ "learning_rate": 1.6384955486934154e-07,
368
+ "loss": 0.0112,
369
+ "num_tokens": 101433570.0,
370
+ "reward": 1.1439732566475869,
371
+ "reward_std": 0.1751913372427225,
372
+ "rewards/accuracy_reward": 0.6462053626775741,
373
+ "rewards/format_reward": 0.9955357074737549,
374
+ "step": 125
375
+ },
376
+ {
377
+ "clip_ratio": 0.0,
378
+ "completion_length": 695.6339569091797,
379
+ "epoch": 0.7896735003796507,
380
+ "grad_norm": 0.1905658306829658,
381
+ "learning_rate": 1.262902023724824e-07,
382
+ "loss": 0.0081,
383
+ "num_tokens": 105473282.0,
384
+ "reward": 1.1212054073810578,
385
+ "reward_std": 0.17985087335109712,
386
+ "rewards/accuracy_reward": 0.6225446417927742,
387
+ "rewards/format_reward": 0.9973214253783226,
388
+ "step": 130
389
+ },
390
+ {
391
+ "clip_ratio": 0.0,
392
+ "completion_length": 689.9069519042969,
393
+ "epoch": 0.8200455580865603,
394
+ "grad_norm": 0.18740313988650945,
395
+ "learning_rate": 9.299395737170757e-08,
396
+ "loss": 0.0183,
397
+ "num_tokens": 109523129.0,
398
+ "reward": 1.0936384439468383,
399
+ "reward_std": 0.20308975782245398,
400
+ "rewards/accuracy_reward": 0.5962053589522839,
401
+ "rewards/format_reward": 0.9948660627007484,
402
+ "step": 135
403
+ },
404
+ {
405
+ "clip_ratio": 0.0,
406
+ "completion_length": 689.1723495483399,
407
+ "epoch": 0.85041761579347,
408
+ "grad_norm": 0.17643493300658672,
409
+ "learning_rate": 6.43406479383053e-08,
410
+ "loss": 0.0089,
411
+ "num_tokens": 113562189.0,
412
+ "reward": 1.1333705842494965,
413
+ "reward_std": 0.18020046018064023,
414
+ "rewards/accuracy_reward": 0.6352678582072258,
415
+ "rewards/format_reward": 0.9962053537368775,
416
+ "step": 140
417
+ },
418
+ {
419
+ "clip_ratio": 0.0,
420
+ "completion_length": 673.6042694091797,
421
+ "epoch": 0.8807896735003796,
422
+ "grad_norm": 3.201259451192229,
423
+ "learning_rate": 4.065713769482082e-08,
424
+ "loss": 0.0138,
425
+ "num_tokens": 117482928.0,
426
+ "reward": 1.1319196924567223,
427
+ "reward_std": 0.1873009530827403,
428
+ "rewards/accuracy_reward": 0.6339285716414451,
429
+ "rewards/format_reward": 0.9959821388125419,
430
+ "step": 145
431
+ },
432
+ {
433
+ "clip_ratio": 0.0,
434
+ "completion_length": 687.8794952392578,
435
+ "epoch": 0.9111617312072893,
436
+ "grad_norm": 2.7680554916216296,
437
+ "learning_rate": 2.2213597106929605e-08,
438
+ "loss": 0.0152,
439
+ "num_tokens": 121487468.0,
440
+ "reward": 1.1334821954369545,
441
+ "reward_std": 0.1863085398450494,
442
+ "rewards/accuracy_reward": 0.634821429848671,
443
+ "rewards/format_reward": 0.9973214238882064,
444
+ "step": 150
445
+ },
446
+ {
447
+ "clip_ratio": 0.0,
448
+ "completion_length": 691.2263702392578,
449
+ "epoch": 0.9415337889141989,
450
+ "grad_norm": 0.16790447183243948,
451
+ "learning_rate": 9.22042150446728e-09,
452
+ "loss": 0.0089,
453
+ "num_tokens": 125492538.0,
454
+ "reward": 1.1498884424567222,
455
+ "reward_std": 0.1872717458754778,
456
+ "rewards/accuracy_reward": 0.6511160723865033,
457
+ "rewards/format_reward": 0.997544638812542,
458
+ "step": 155
459
+ },
460
+ {
461
+ "clip_ratio": 0.0,
462
+ "completion_length": 689.0906539916992,
463
+ "epoch": 0.9719058466211086,
464
+ "grad_norm": 0.8754711405794506,
465
+ "learning_rate": 1.8258309893965374e-09,
466
+ "loss": 0.0103,
467
+ "num_tokens": 129525872.0,
468
+ "reward": 1.152567020058632,
469
+ "reward_std": 0.189371613971889,
470
+ "rewards/accuracy_reward": 0.6555803567171097,
471
+ "rewards/format_reward": 0.9939732134342194,
472
+ "step": 160
473
+ },
474
+ {
475
+ "clip_ratio": 0.0,
476
+ "completion_length": 673.1839790344238,
477
+ "epoch": 0.9962034927866363,
478
+ "num_tokens": 132694223.0,
479
+ "reward": 1.1247210260480642,
480
+ "reward_std": 0.1711470059817657,
481
+ "rewards/accuracy_reward": 0.6278545688837767,
482
+ "rewards/format_reward": 0.997209819033742,
483
+ "step": 164,
484
+ "total_flos": 0.0,
485
+ "train_loss": 0.019764071750659043,
486
+ "train_runtime": 31261.7596,
487
+ "train_samples_per_second": 0.59,
488
+ "train_steps_per_second": 0.005
489
+ }
490
+ ],
491
+ "logging_steps": 5,
492
+ "max_steps": 164,
493
+ "num_input_tokens_seen": 0,
494
+ "num_train_epochs": 1,
495
+ "save_steps": 500,
496
+ "stateful_callbacks": {
497
+ "TrainerControl": {
498
+ "args": {
499
+ "should_epoch_stop": false,
500
+ "should_evaluate": false,
501
+ "should_log": false,
502
+ "should_save": true,
503
+ "should_training_stop": true
504
+ },
505
+ "attributes": {}
506
+ }
507
+ },
508
+ "total_flos": 0.0,
509
+ "train_batch_size": 16,
510
+ "trial_name": null,
511
+ "trial_params": null
512
+ }