hazentr committed on
Commit
7cd7e07
·
verified ·
1 Parent(s): 3745bf8

End of training

Browse files
Files changed (5) hide show
  1. README.md +2 -2
  2. all_results.json +4 -4
  3. model.safetensors +1 -1
  4. train_results.json +4 -4
  5. trainer_state.json +83 -83
README.md CHANGED
@@ -5,10 +5,10 @@ model_name: Qwen2.5-0.5B-Instruct-Gensyn-Swarm-quick_timid_frog
5
  tags:
6
  - generated_from_trainer
7
  - rl-swarm
8
- - trl
9
- - I am quick timid frog
10
  - gensyn
11
  - grpo
 
 
12
  licence: license
13
  ---
14
 
 
5
  tags:
6
  - generated_from_trainer
7
  - rl-swarm
 
 
8
  - gensyn
9
  - grpo
10
+ - I am quick timid frog
11
+ - trl
12
  licence: license
13
  ---
14
 
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.05050143450498581,
4
- "train_runtime": 733.4022,
5
  "train_samples": 5,
6
- "train_samples_per_second": 0.218,
7
- "train_steps_per_second": 0.027
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.0874896764755249,
4
+ "train_runtime": 937.823,
5
  "train_samples": 5,
6
+ "train_samples_per_second": 0.171,
7
+ "train_steps_per_second": 0.021
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a00a688a4a482c3644a8ebeecdb6df4ad1b67cdeca0702cf7ef00d4ca1271b5
3
  size 1976163472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:713264c96749c24604d08d142b497451f9725614fb94d291a0e5e9dee6936d71
3
  size 1976163472
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.05050143450498581,
4
- "train_runtime": 733.4022,
5
  "train_samples": 5,
6
- "train_samples_per_second": 0.218,
7
- "train_steps_per_second": 0.027
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.0874896764755249,
4
+ "train_runtime": 937.823,
5
  "train_samples": 5,
6
+ "train_samples_per_second": 0.171,
7
+ "train_steps_per_second": 0.021
8
  }
trainer_state.json CHANGED
@@ -15,22 +15,22 @@
15
  "clip_ratio/low_mean": 0.0,
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
- "completions/clipped_ratio": 0.0625,
19
- "completions/max_length": 736.5,
20
- "completions/max_terminated_length": 517.5,
21
- "completions/mean_length": 234.125,
22
- "completions/mean_terminated_length": 181.95536041259766,
23
- "completions/min_length": 13.5,
24
- "completions/min_terminated_length": 13.5,
25
  "epoch": 1.8,
26
  "frac_reward_zero_std": 0.0,
27
- "grad_norm": 9.468099594116211,
28
- "kl": 0.0009500358792138286,
29
  "learning_rate": 5e-07,
30
- "loss": 0.1074,
31
- "num_tokens": 7842.0,
32
- "reward": 0.14829658716917038,
33
- "reward_std": 0.09615837037563324,
34
  "rewards/concensus_correctness_reward_func/mean": 0.0,
35
  "rewards/concensus_correctness_reward_func/std": 0.0,
36
  "rewards/consensus_reward_func/mean": 0.0,
@@ -39,14 +39,14 @@
39
  "rewards/cumulative_reward_2/std": 0.0,
40
  "rewards/final_correctness_reward_func/mean": 0.0,
41
  "rewards/final_correctness_reward_func/std": 0.0,
42
- "rewards/question_recreation_reward_func/mean": 0.1441090926527977,
43
- "rewards/question_recreation_reward_func/std": 0.0952618196606636,
44
  "rewards/soft_format_reward_func/mean": 0.0,
45
  "rewards/soft_format_reward_func/std": 0.0,
46
  "rewards/strict_format_reward_func/mean": 0.0,
47
  "rewards/strict_format_reward_func/std": 0.0,
48
- "rewards/xmlcount_reward_func/mean": 0.004187500104308128,
49
- "rewards/xmlcount_reward_func/std": 0.011844038963317871,
50
  "step": 2
51
  },
52
  {
@@ -57,20 +57,20 @@
57
  "clip_ratio/region_mean": 0.0,
58
  "completions/clipped_ratio": 0.125,
59
  "completions/max_length": 1024.0,
60
- "completions/max_terminated_length": 787.5,
61
- "completions/mean_length": 421.5,
62
- "completions/mean_terminated_length": 335.4285888671875,
63
- "completions/min_length": 18.0,
64
- "completions/min_terminated_length": 18.0,
65
  "epoch": 3.8,
66
  "frac_reward_zero_std": 0.0,
67
- "grad_norm": 4.018620491027832,
68
- "kl": 0.0006310820899670944,
69
  "learning_rate": 4.864543104251586e-07,
70
- "loss": 0.1916,
71
- "num_tokens": 18682.0,
72
- "reward": 0.022797180339694023,
73
- "reward_std": 0.007740819826722145,
74
  "rewards/concensus_correctness_reward_func/mean": 0.0,
75
  "rewards/concensus_correctness_reward_func/std": 0.0,
76
  "rewards/consensus_reward_func/mean": 0.0,
@@ -79,8 +79,8 @@
79
  "rewards/cumulative_reward_2/std": 0.0,
80
  "rewards/final_correctness_reward_func/mean": 0.0,
81
  "rewards/final_correctness_reward_func/std": 0.0,
82
- "rewards/question_recreation_reward_func/mean": 0.022797180339694023,
83
- "rewards/question_recreation_reward_func/std": 0.015481723938137293,
84
  "rewards/soft_format_reward_func/mean": 0.0,
85
  "rewards/soft_format_reward_func/std": 0.0,
86
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -95,22 +95,22 @@
95
  "clip_ratio/low_mean": 0.0,
96
  "clip_ratio/low_min": 0.0,
97
  "clip_ratio/region_mean": 0.0,
98
- "completions/clipped_ratio": 0.1875,
99
  "completions/max_length": 1024.0,
100
- "completions/max_terminated_length": 836.5,
101
- "completions/mean_length": 406.625,
102
- "completions/mean_terminated_length": 263.8690643310547,
103
- "completions/min_length": 8.5,
104
- "completions/min_terminated_length": 8.5,
105
  "epoch": 5.8,
106
  "frac_reward_zero_std": 0.0,
107
- "grad_norm": 9.984585762023926,
108
- "kl": 0.0017272870391025208,
109
  "learning_rate": 4.472851273490984e-07,
110
- "loss": 0.1226,
111
- "num_tokens": 29284.0,
112
- "reward": 0.05447566136717796,
113
- "reward_std": 0.040898644365370274,
114
  "rewards/concensus_correctness_reward_func/mean": 0.0,
115
  "rewards/concensus_correctness_reward_func/std": 0.0,
116
  "rewards/consensus_reward_func/mean": 0.0,
@@ -119,8 +119,8 @@
119
  "rewards/cumulative_reward_2/std": 0.0,
120
  "rewards/final_correctness_reward_func/mean": 0.0,
121
  "rewards/final_correctness_reward_func/std": 0.0,
122
- "rewards/question_recreation_reward_func/mean": 0.05447566136717796,
123
- "rewards/question_recreation_reward_func/std": 0.03653379296883941,
124
  "rewards/soft_format_reward_func/mean": 0.0,
125
  "rewards/soft_format_reward_func/std": 0.0,
126
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -135,22 +135,22 @@
135
  "clip_ratio/low_mean": 0.0,
136
  "clip_ratio/low_min": 0.0,
137
  "clip_ratio/region_mean": 0.0,
138
- "completions/clipped_ratio": 0.25,
139
- "completions/max_length": 1024.0,
140
- "completions/max_terminated_length": 389.5,
141
- "completions/mean_length": 363.3125,
142
- "completions/mean_terminated_length": 143.08333587646484,
143
- "completions/min_length": 17.0,
144
- "completions/min_terminated_length": 17.0,
145
  "epoch": 7.8,
146
  "frac_reward_zero_std": 0.0,
147
- "grad_norm": 9.14108657836914,
148
- "kl": 0.006233984004211379,
149
  "learning_rate": 3.867370395306068e-07,
150
- "loss": -0.1465,
151
- "num_tokens": 39193.0,
152
- "reward": 0.09087881818413734,
153
- "reward_std": 0.06694184988737106,
154
  "rewards/concensus_correctness_reward_func/mean": 0.0,
155
  "rewards/concensus_correctness_reward_func/std": 0.0,
156
  "rewards/consensus_reward_func/mean": 0.0,
@@ -159,14 +159,14 @@
159
  "rewards/cumulative_reward_2/std": 0.0,
160
  "rewards/final_correctness_reward_func/mean": 0.0,
161
  "rewards/final_correctness_reward_func/std": 0.0,
162
- "rewards/question_recreation_reward_func/mean": 0.1174413226544857,
163
- "rewards/question_recreation_reward_func/std": 0.07382831908762455,
164
  "rewards/soft_format_reward_func/mean": 0.0,
165
  "rewards/soft_format_reward_func/std": 0.0,
166
  "rewards/strict_format_reward_func/mean": 0.0,
167
  "rewards/strict_format_reward_func/std": 0.0,
168
- "rewards/xmlcount_reward_func/mean": -0.026562499813735485,
169
- "rewards/xmlcount_reward_func/std": 0.07513009570538998,
170
  "step": 8
171
  },
172
  {
@@ -175,22 +175,22 @@
175
  "clip_ratio/low_mean": 0.0,
176
  "clip_ratio/low_min": 0.0,
177
  "clip_ratio/region_mean": 0.0,
178
- "completions/clipped_ratio": 0.25,
179
- "completions/max_length": 1024.0,
180
- "completions/max_terminated_length": 272.0,
181
- "completions/mean_length": 344.125,
182
- "completions/mean_terminated_length": 117.50000381469727,
183
- "completions/min_length": 30.5,
184
- "completions/min_terminated_length": 30.5,
185
  "epoch": 9.8,
186
  "frac_reward_zero_std": 0.0,
187
- "grad_norm": 8.28457260131836,
188
- "kl": 0.005484735171194188,
189
  "learning_rate": 3.1137137178519977e-07,
190
- "loss": -0.0226,
191
- "num_tokens": 48795.0,
192
- "reward": 0.09757254645228386,
193
- "reward_std": 0.05229111574590206,
194
  "rewards/concensus_correctness_reward_func/mean": 0.0,
195
  "rewards/concensus_correctness_reward_func/std": 0.0,
196
  "rewards/consensus_reward_func/mean": 0.0,
@@ -199,29 +199,29 @@
199
  "rewards/cumulative_reward_2/std": 0.0,
200
  "rewards/final_correctness_reward_func/mean": 0.0,
201
  "rewards/final_correctness_reward_func/std": 0.0,
202
- "rewards/question_recreation_reward_func/mean": 0.0833225455135107,
203
- "rewards/question_recreation_reward_func/std": 0.056736329570412636,
204
  "rewards/soft_format_reward_func/mean": 0.0,
205
  "rewards/soft_format_reward_func/std": 0.0,
206
  "rewards/strict_format_reward_func/mean": 0.0,
207
  "rewards/strict_format_reward_func/std": 0.0,
208
- "rewards/xmlcount_reward_func/mean": 0.01425000000745058,
209
- "rewards/xmlcount_reward_func/std": 0.040305085480213165,
210
  "step": 10
211
  },
212
  {
213
  "epoch": 9.8,
214
  "step": 10,
215
  "total_flos": 0.0,
216
- "train_loss": 0.05050143450498581,
217
- "train_runtime": 733.4022,
218
- "train_samples_per_second": 0.218,
219
- "train_steps_per_second": 0.027
220
  }
221
  ],
222
  "logging_steps": 2,
223
  "max_steps": 20,
224
- "num_input_tokens_seen": 48795,
225
  "num_train_epochs": 10,
226
  "save_steps": 25,
227
  "stateful_callbacks": {
 
15
  "clip_ratio/low_mean": 0.0,
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 0.25,
19
+ "completions/max_length": 938.0,
20
+ "completions/max_terminated_length": 865.5,
21
+ "completions/mean_length": 656.125,
22
+ "completions/mean_terminated_length": 580.375,
23
+ "completions/min_length": 287.5,
24
+ "completions/min_terminated_length": 287.5,
25
  "epoch": 1.8,
26
  "frac_reward_zero_std": 0.0,
27
+ "grad_norm": 2.378174066543579,
28
+ "kl": 0.00024694142030057264,
29
  "learning_rate": 5e-07,
30
+ "loss": -0.0692,
31
+ "num_tokens": 14594.0,
32
+ "reward": -0.09812777116894722,
33
+ "reward_std": 0.16612516529858112,
34
  "rewards/concensus_correctness_reward_func/mean": 0.0,
35
  "rewards/concensus_correctness_reward_func/std": 0.0,
36
  "rewards/consensus_reward_func/mean": 0.0,
 
39
  "rewards/cumulative_reward_2/std": 0.0,
40
  "rewards/final_correctness_reward_func/mean": 0.0,
41
  "rewards/final_correctness_reward_func/std": 0.0,
42
+ "rewards/question_recreation_reward_func/mean": 0.01712222769856453,
43
+ "rewards/question_recreation_reward_func/std": 0.01072776922956109,
44
  "rewards/soft_format_reward_func/mean": 0.0,
45
  "rewards/soft_format_reward_func/std": 0.0,
46
  "rewards/strict_format_reward_func/mean": 0.0,
47
  "rewards/strict_format_reward_func/std": 0.0,
48
+ "rewards/xmlcount_reward_func/mean": -0.11524999886751175,
49
+ "rewards/xmlcount_reward_func/std": 0.32597625255584717,
50
  "step": 2
51
  },
52
  {
 
57
  "clip_ratio/region_mean": 0.0,
58
  "completions/clipped_ratio": 0.125,
59
  "completions/max_length": 1024.0,
60
+ "completions/max_terminated_length": 868.0,
61
+ "completions/mean_length": 470.0625,
62
+ "completions/mean_terminated_length": 390.9285888671875,
63
+ "completions/min_length": 60.0,
64
+ "completions/min_terminated_length": 60.0,
65
  "epoch": 3.8,
66
  "frac_reward_zero_std": 0.0,
67
+ "grad_norm": 5.8317060470581055,
68
+ "kl": 0.00036002036267746007,
69
  "learning_rate": 4.864543104251586e-07,
70
+ "loss": 0.136,
71
+ "num_tokens": 26211.0,
72
+ "reward": 0.07699444144964218,
73
+ "reward_std": 0.030455347150564194,
74
  "rewards/concensus_correctness_reward_func/mean": 0.0,
75
  "rewards/concensus_correctness_reward_func/std": 0.0,
76
  "rewards/consensus_reward_func/mean": 0.0,
 
79
  "rewards/cumulative_reward_2/std": 0.0,
80
  "rewards/final_correctness_reward_func/mean": 0.0,
81
  "rewards/final_correctness_reward_func/std": 0.0,
82
+ "rewards/question_recreation_reward_func/mean": 0.07699444144964218,
83
+ "rewards/question_recreation_reward_func/std": 0.04443136602640152,
84
  "rewards/soft_format_reward_func/mean": 0.0,
85
  "rewards/soft_format_reward_func/std": 0.0,
86
  "rewards/strict_format_reward_func/mean": 0.0,
 
95
  "clip_ratio/low_mean": 0.0,
96
  "clip_ratio/low_min": 0.0,
97
  "clip_ratio/region_mean": 0.0,
98
+ "completions/clipped_ratio": 0.125,
99
  "completions/max_length": 1024.0,
100
+ "completions/max_terminated_length": 1003.5,
101
+ "completions/mean_length": 548.625,
102
+ "completions/mean_terminated_length": 480.7143096923828,
103
+ "completions/min_length": 134.0,
104
+ "completions/min_terminated_length": 134.0,
105
  "epoch": 5.8,
106
  "frac_reward_zero_std": 0.0,
107
+ "grad_norm": 3.393465518951416,
108
+ "kl": 0.00036590506533684675,
109
  "learning_rate": 4.472851273490984e-07,
110
+ "loss": 0.1056,
111
+ "num_tokens": 39085.0,
112
+ "reward": 0.052806172519922256,
113
+ "reward_std": 0.017434499226510525,
114
  "rewards/concensus_correctness_reward_func/mean": 0.0,
115
  "rewards/concensus_correctness_reward_func/std": 0.0,
116
  "rewards/consensus_reward_func/mean": 0.0,
 
119
  "rewards/cumulative_reward_2/std": 0.0,
120
  "rewards/final_correctness_reward_func/mean": 0.0,
121
  "rewards/final_correctness_reward_func/std": 0.0,
122
+ "rewards/question_recreation_reward_func/mean": 0.052806172519922256,
123
+ "rewards/question_recreation_reward_func/std": 0.018543646670877934,
124
  "rewards/soft_format_reward_func/mean": 0.0,
125
  "rewards/soft_format_reward_func/std": 0.0,
126
  "rewards/strict_format_reward_func/mean": 0.0,
 
135
  "clip_ratio/low_mean": 0.0,
136
  "clip_ratio/low_min": 0.0,
137
  "clip_ratio/region_mean": 0.0,
138
+ "completions/clipped_ratio": 0.0,
139
+ "completions/max_length": 900.0,
140
+ "completions/max_terminated_length": 900.0,
141
+ "completions/mean_length": 458.8125,
142
+ "completions/mean_terminated_length": 458.8125,
143
+ "completions/min_length": 114.0,
144
+ "completions/min_terminated_length": 114.0,
145
  "epoch": 7.8,
146
  "frac_reward_zero_std": 0.0,
147
+ "grad_norm": 4.418057918548584,
148
+ "kl": 0.0005609585878119105,
149
  "learning_rate": 3.867370395306068e-07,
150
+ "loss": 0.0939,
151
+ "num_tokens": 50522.0,
152
+ "reward": 0.024604666978120804,
153
+ "reward_std": 0.009030941408127546,
154
  "rewards/concensus_correctness_reward_func/mean": 0.0,
155
  "rewards/concensus_correctness_reward_func/std": 0.0,
156
  "rewards/consensus_reward_func/mean": 0.0,
 
159
  "rewards/cumulative_reward_2/std": 0.0,
160
  "rewards/final_correctness_reward_func/mean": 0.0,
161
  "rewards/final_correctness_reward_func/std": 0.0,
162
+ "rewards/question_recreation_reward_func/mean": 0.024604666978120804,
163
+ "rewards/question_recreation_reward_func/std": 0.011531331343576312,
164
  "rewards/soft_format_reward_func/mean": 0.0,
165
  "rewards/soft_format_reward_func/std": 0.0,
166
  "rewards/strict_format_reward_func/mean": 0.0,
167
  "rewards/strict_format_reward_func/std": 0.0,
168
+ "rewards/xmlcount_reward_func/mean": 0.0,
169
+ "rewards/xmlcount_reward_func/std": 0.0,
170
  "step": 8
171
  },
172
  {
 
175
  "clip_ratio/low_mean": 0.0,
176
  "clip_ratio/low_min": 0.0,
177
  "clip_ratio/region_mean": 0.0,
178
+ "completions/clipped_ratio": 0.0625,
179
+ "completions/max_length": 928.5,
180
+ "completions/max_terminated_length": 824.0,
181
+ "completions/mean_length": 451.25,
182
+ "completions/mean_terminated_length": 412.8482208251953,
183
+ "completions/min_length": 56.0,
184
+ "completions/min_terminated_length": 56.0,
185
  "epoch": 9.8,
186
  "frac_reward_zero_std": 0.0,
187
+ "grad_norm": 4.358504772186279,
188
+ "kl": 0.0006784129072912037,
189
  "learning_rate": 3.1137137178519977e-07,
190
+ "loss": 0.1711,
191
+ "num_tokens": 61838.0,
192
+ "reward": 0.08084426820278168,
193
+ "reward_std": 0.05193536356091499,
194
  "rewards/concensus_correctness_reward_func/mean": 0.0,
195
  "rewards/concensus_correctness_reward_func/std": 0.0,
196
  "rewards/consensus_reward_func/mean": 0.0,
 
199
  "rewards/cumulative_reward_2/std": 0.0,
200
  "rewards/final_correctness_reward_func/mean": 0.0,
201
  "rewards/final_correctness_reward_func/std": 0.0,
202
+ "rewards/question_recreation_reward_func/mean": 0.08084426820278168,
203
+ "rewards/question_recreation_reward_func/std": 0.08748885244131088,
204
  "rewards/soft_format_reward_func/mean": 0.0,
205
  "rewards/soft_format_reward_func/std": 0.0,
206
  "rewards/strict_format_reward_func/mean": 0.0,
207
  "rewards/strict_format_reward_func/std": 0.0,
208
+ "rewards/xmlcount_reward_func/mean": 0.0,
209
+ "rewards/xmlcount_reward_func/std": 0.0,
210
  "step": 10
211
  },
212
  {
213
  "epoch": 9.8,
214
  "step": 10,
215
  "total_flos": 0.0,
216
+ "train_loss": 0.0874896764755249,
217
+ "train_runtime": 937.823,
218
+ "train_samples_per_second": 0.171,
219
+ "train_steps_per_second": 0.021
220
  }
221
  ],
222
  "logging_steps": 2,
223
  "max_steps": 20,
224
+ "num_input_tokens_seen": 61838,
225
  "num_train_epochs": 10,
226
  "save_steps": 25,
227
  "stateful_callbacks": {