hamedkharazmi commited on
Commit
ef2562b
·
verified ·
1 Parent(s): 22465ed

End of training

Browse files
README.md CHANGED
@@ -40,7 +40,7 @@ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing
40
 
41
  - TRL: 0.15.2
42
  - Transformers: 4.48.2
43
- - Pytorch: 2.5.1+cu121
44
  - Datasets: 3.6.0
45
  - Tokenizers: 0.21.1
46
 
 
40
 
41
  - TRL: 0.15.2
42
  - Transformers: 4.48.2
43
+ - Pytorch: 2.5.1
44
  - Datasets: 3.6.0
45
  - Tokenizers: 0.21.1
46
 
adapter_config.json CHANGED
@@ -24,13 +24,13 @@
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
27
- "gate_proj",
28
- "up_proj",
29
- "q_proj",
30
  "o_proj",
31
- "v_proj",
32
  "down_proj",
33
- "k_proj"
 
 
 
 
34
  ],
35
  "task_type": "CAUSAL_LM",
36
  "trainable_token_indices": null,
 
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
 
 
 
27
  "o_proj",
 
28
  "down_proj",
29
+ "up_proj",
30
+ "k_proj",
31
+ "gate_proj",
32
+ "v_proj",
33
+ "q_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
36
  "trainable_token_indices": null,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:619030f187f1a7ed66ae988b67627c6461f61d9ad09ec49864892babd1bae79f
3
  size 35237104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cd6afa37d03b192852e3643a7269121507a7535742557a47b0ec0ac94420aa3
3
  size 35237104
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 2.9364487659222505e-06,
4
- "train_runtime": 628.8319,
5
- "train_samples": 44,
6
- "train_samples_per_second": 0.509,
7
- "train_steps_per_second": 0.032
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 2.546286020788102e-06,
4
+ "train_runtime": 864.6962,
5
+ "train_samples": 20,
6
+ "train_samples_per_second": 0.37,
7
+ "train_steps_per_second": 0.023
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 2.9364487659222505e-06,
4
- "train_runtime": 628.8319,
5
- "train_samples": 44,
6
- "train_samples_per_second": 0.509,
7
- "train_steps_per_second": 0.032
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 2.546286020788102e-06,
4
+ "train_runtime": 864.6962,
5
+ "train_samples": 20,
6
+ "train_samples_per_second": 0.37,
7
+ "train_steps_per_second": 0.023
8
  }
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.8181818181818183,
5
  "eval_steps": 500,
6
  "global_step": 20,
7
  "is_hyper_param_search": false,
@@ -9,209 +9,209 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 372.25,
13
- "epoch": 0.18181818181818182,
14
- "grad_norm": 1.9300328493118286,
15
- "kl": 0.002685736268176697,
16
  "learning_rate": 4.965903258506806e-07,
17
  "loss": 0.0,
18
- "reward": 1.9535507448017597,
19
- "reward_std": 2.3369685858488083,
20
- "rewards/concensus_correctness_reward_func": 0.75,
21
- "rewards/consensus_reward_func": 0.0625,
22
  "rewards/cumulative_reward_2": 0.0,
23
- "rewards/final_correctness_reward_func": 0.375,
24
- "rewards/question_recreation_reward_func": 0.4927695244550705,
25
  "rewards/soft_format_reward_func": 0.0,
26
  "rewards/strict_format_reward_func": 0.0,
27
- "rewards/xmlcount_reward_func": 0.27328124828636646,
28
  "step": 2
29
  },
30
  {
31
- "completion_length": 343.03125,
32
- "epoch": 0.36363636363636365,
33
- "grad_norm": 3.323760747909546,
34
- "kl": 0.0038706637133145705,
35
  "learning_rate": 4.698684378016222e-07,
36
  "loss": 0.0,
37
- "reward": 1.1179207004606724,
38
- "reward_std": 0.9569476544857025,
39
- "rewards/concensus_correctness_reward_func": 0.09912500157952309,
40
- "rewards/consensus_reward_func": 0.125,
41
  "rewards/cumulative_reward_2": 0.0,
42
  "rewards/final_correctness_reward_func": 0.25,
43
- "rewards/question_recreation_reward_func": 0.36013949010521173,
44
  "rewards/soft_format_reward_func": 0.0,
45
- "rewards/strict_format_reward_func": 0.015625,
46
- "rewards/xmlcount_reward_func": 0.2680312469601631,
47
  "step": 4
48
  },
49
  {
50
- "completion_length": 275.65625,
51
- "epoch": 0.5454545454545454,
52
- "grad_norm": 1.9294134378433228,
53
- "kl": 0.0021318396175047383,
54
  "learning_rate": 4.193203929064353e-07,
55
  "loss": 0.0,
56
- "reward": 1.1408963352441788,
57
- "reward_std": 0.967832338064909,
58
- "rewards/concensus_correctness_reward_func": 0.125,
59
- "rewards/consensus_reward_func": 0.125,
60
  "rewards/cumulative_reward_2": 0.0,
61
- "rewards/final_correctness_reward_func": 0.0625,
62
- "rewards/question_recreation_reward_func": 0.4269276261329651,
63
- "rewards/soft_format_reward_func": 0.015625,
64
  "rewards/strict_format_reward_func": 0.0,
65
- "rewards/xmlcount_reward_func": 0.3858437556773424,
66
  "step": 6
67
  },
68
  {
69
- "completion_length": 319.96875,
70
- "epoch": 0.7272727272727273,
71
- "grad_norm": 1.2534875869750977,
72
- "kl": 0.0027487699990160763,
73
  "learning_rate": 3.5042385616324236e-07,
74
  "loss": 0.0,
75
- "reward": 0.9228200912475586,
76
- "reward_std": 1.5564873665571213,
77
- "rewards/concensus_correctness_reward_func": 0.125,
78
- "rewards/consensus_reward_func": 0.1875,
79
  "rewards/cumulative_reward_2": 0.0,
80
- "rewards/final_correctness_reward_func": 0.125,
81
- "rewards/question_recreation_reward_func": 0.3794763386249542,
82
  "rewards/soft_format_reward_func": 0.0,
83
  "rewards/strict_format_reward_func": 0.0,
84
- "rewards/xmlcount_reward_func": 0.10584374889731407,
85
  "step": 8
86
  },
87
  {
88
- "completion_length": 435.375,
89
- "epoch": 0.9090909090909091,
90
- "grad_norm": 1.2784631252288818,
91
- "kl": 0.002754525456111878,
92
  "learning_rate": 2.706448363680831e-07,
93
  "loss": 0.0,
94
- "reward": 0.8444329872727394,
95
- "reward_std": 1.0790864313021302,
96
- "rewards/concensus_correctness_reward_func": 0.125,
97
- "rewards/consensus_reward_func": 0.125,
98
  "rewards/cumulative_reward_2": 0.0,
99
- "rewards/final_correctness_reward_func": 0.1875,
100
- "rewards/question_recreation_reward_func": 0.28580794762820005,
101
- "rewards/soft_format_reward_func": 0.015625,
102
  "rewards/strict_format_reward_func": 0.0,
103
- "rewards/xmlcount_reward_func": 0.1055000051856041,
104
  "step": 10
105
  },
106
  {
107
- "completion_length": 383.21875,
108
- "epoch": 1.0909090909090908,
109
- "grad_norm": 3.725076913833618,
110
- "kl": 0.0038445138634415343,
111
  "learning_rate": 1.886286282148002e-07,
112
  "loss": 0.0,
113
- "reward": 1.3737196251749992,
114
- "reward_std": 1.564310446381569,
115
- "rewards/concensus_correctness_reward_func": 0.24268750101327896,
116
- "rewards/consensus_reward_func": 0.375,
117
  "rewards/cumulative_reward_2": 0.0,
118
- "rewards/final_correctness_reward_func": 0.125,
119
- "rewards/question_recreation_reward_func": 0.37809463776648045,
120
  "rewards/soft_format_reward_func": 0.0,
121
  "rewards/strict_format_reward_func": 0.0,
122
- "rewards/xmlcount_reward_func": 0.2529374994337559,
123
  "step": 12
124
  },
125
  {
126
- "completion_length": 408.4375,
127
- "epoch": 1.2727272727272727,
128
- "grad_norm": 3.1979212760925293,
129
- "kl": 0.002077052035019733,
130
  "learning_rate": 1.1326296046939333e-07,
131
  "loss": 0.0,
132
- "reward": 4.569394081830978,
133
- "reward_std": 5.111470676958561,
134
- "rewards/concensus_correctness_reward_func": 3.208000000566244,
135
- "rewards/consensus_reward_func": 0.3125,
136
  "rewards/cumulative_reward_2": 0.0,
137
- "rewards/final_correctness_reward_func": 0.375,
138
- "rewards/question_recreation_reward_func": 0.46067542023956776,
139
  "rewards/soft_format_reward_func": 0.015625,
140
  "rewards/strict_format_reward_func": 0.0,
141
- "rewards/xmlcount_reward_func": 0.19759375415742397,
142
  "step": 14
143
  },
144
  {
145
- "completion_length": 436.34375,
146
- "epoch": 1.4545454545454546,
147
- "grad_norm": 1.4435375928878784,
148
- "kl": 0.0014565634628525004,
149
  "learning_rate": 5.271487265090163e-08,
150
  "loss": 0.0,
151
- "reward": 1.052122674882412,
152
- "reward_std": 0.8749217577278614,
153
- "rewards/concensus_correctness_reward_func": 0.03125,
154
  "rewards/consensus_reward_func": 0.0,
155
  "rewards/cumulative_reward_2": 0.0,
156
- "rewards/final_correctness_reward_func": 0.0625,
157
- "rewards/question_recreation_reward_func": 0.4829976772889495,
158
  "rewards/soft_format_reward_func": 0.0,
159
  "rewards/strict_format_reward_func": 0.0,
160
- "rewards/xmlcount_reward_func": 0.4753750041127205,
161
  "step": 16
162
  },
163
  {
164
- "completion_length": 434.21875,
165
- "epoch": 1.6363636363636362,
166
- "grad_norm": 1.8603307008743286,
167
- "kl": 0.004560476292681415,
168
  "learning_rate": 1.3545689574841341e-08,
169
  "loss": 0.0,
170
- "reward": 1.5926124211400747,
171
- "reward_std": 2.343985839514062,
172
- "rewards/concensus_correctness_reward_func": 0.65625,
173
- "rewards/consensus_reward_func": 0.125,
174
  "rewards/cumulative_reward_2": 0.0,
175
  "rewards/final_correctness_reward_func": 0.1875,
176
- "rewards/question_recreation_reward_func": 0.3674874210264534,
177
  "rewards/soft_format_reward_func": 0.0,
178
- "rewards/strict_format_reward_func": 0.015625,
179
- "rewards/xmlcount_reward_func": 0.2407499998807907,
180
  "step": 18
181
  },
182
  {
183
- "completion_length": 310.65625,
184
- "epoch": 1.8181818181818183,
185
- "grad_norm": 1.8424596786499023,
186
- "kl": 0.0032100926473503932,
187
  "learning_rate": 0.0,
188
  "loss": 0.0,
189
- "reward": 0.7539666602388024,
190
- "reward_std": 0.9992944076657295,
191
- "rewards/concensus_correctness_reward_func": 0.03506249934434891,
192
  "rewards/consensus_reward_func": 0.125,
193
  "rewards/cumulative_reward_2": 0.0,
194
- "rewards/final_correctness_reward_func": 0.0,
195
- "rewards/question_recreation_reward_func": 0.31849788781255484,
196
  "rewards/soft_format_reward_func": 0.0,
197
- "rewards/strict_format_reward_func": 0.015625,
198
- "rewards/xmlcount_reward_func": 0.25978125259280205,
199
  "step": 20
200
  },
201
  {
202
- "epoch": 1.8181818181818183,
203
  "step": 20,
204
  "total_flos": 0.0,
205
- "train_loss": 2.9364487659222505e-06,
206
- "train_runtime": 628.8319,
207
- "train_samples_per_second": 0.509,
208
- "train_steps_per_second": 0.032
209
  }
210
  ],
211
  "logging_steps": 2,
212
  "max_steps": 20,
213
  "num_input_tokens_seen": 0,
214
- "num_train_epochs": 2,
215
  "save_steps": 25,
216
  "stateful_callbacks": {
217
  "TrainerControl": {
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 4.0,
5
  "eval_steps": 500,
6
  "global_step": 20,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "completion_length": 537.6875,
13
+ "epoch": 0.4,
14
+ "grad_norm": 1.4502513408660889,
15
+ "kl": 0.0045772524463245645,
16
  "learning_rate": 4.965903258506806e-07,
17
  "loss": 0.0,
18
+ "reward": 1.4177430057898164,
19
+ "reward_std": 2.1470614364370704,
20
+ "rewards/concensus_correctness_reward_func": 0.7554374933242798,
21
+ "rewards/consensus_reward_func": 0.125,
22
  "rewards/cumulative_reward_2": 0.0,
23
+ "rewards/final_correctness_reward_func": 0.25,
24
+ "rewards/question_recreation_reward_func": 0.25036793970502913,
25
  "rewards/soft_format_reward_func": 0.0,
26
  "rewards/strict_format_reward_func": 0.0,
27
+ "rewards/xmlcount_reward_func": 0.03693749010562897,
28
  "step": 2
29
  },
30
  {
31
+ "completion_length": 468.53125,
32
+ "epoch": 0.8,
33
+ "grad_norm": 1.9520736932754517,
34
+ "kl": 0.001855591282946989,
35
  "learning_rate": 4.698684378016222e-07,
36
  "loss": 0.0,
37
+ "reward": 0.3959203027188778,
38
+ "reward_std": 1.1887737782672048,
39
+ "rewards/concensus_correctness_reward_func": 0.03125,
40
+ "rewards/consensus_reward_func": 0.0,
41
  "rewards/cumulative_reward_2": 0.0,
42
  "rewards/final_correctness_reward_func": 0.25,
43
+ "rewards/question_recreation_reward_func": 0.2753265555948019,
44
  "rewards/soft_format_reward_func": 0.0,
45
+ "rewards/strict_format_reward_func": 0.0,
46
+ "rewards/xmlcount_reward_func": -0.16065625473856926,
47
  "step": 4
48
  },
49
  {
50
+ "completion_length": 441.78125,
51
+ "epoch": 1.2,
52
+ "grad_norm": 1.735103726387024,
53
+ "kl": 0.0016164953267434612,
54
  "learning_rate": 4.193203929064353e-07,
55
  "loss": 0.0,
56
+ "reward": 0.623163734562695,
57
+ "reward_std": 0.7843756377696991,
58
+ "rewards/concensus_correctness_reward_func": 0.06006250157952309,
59
+ "rewards/consensus_reward_func": 0.0,
60
  "rewards/cumulative_reward_2": 0.0,
61
+ "rewards/final_correctness_reward_func": 0.25,
62
+ "rewards/question_recreation_reward_func": 0.24881997099146247,
63
+ "rewards/soft_format_reward_func": 0.0,
64
  "rewards/strict_format_reward_func": 0.0,
65
+ "rewards/xmlcount_reward_func": 0.06428124755620956,
66
  "step": 6
67
  },
68
  {
69
+ "completion_length": 435.59375,
70
+ "epoch": 1.6,
71
+ "grad_norm": 2.2628138065338135,
72
+ "kl": 0.0013706091995118186,
73
  "learning_rate": 3.5042385616324236e-07,
74
  "loss": 0.0,
75
+ "reward": 0.3420835845172405,
76
+ "reward_std": 0.797295784112066,
77
+ "rewards/concensus_correctness_reward_func": 0.0,
78
+ "rewards/consensus_reward_func": 0.0,
79
  "rewards/cumulative_reward_2": 0.0,
80
+ "rewards/final_correctness_reward_func": 0.0625,
81
+ "rewards/question_recreation_reward_func": 0.3639585729688406,
82
  "rewards/soft_format_reward_func": 0.0,
83
  "rewards/strict_format_reward_func": 0.0,
84
+ "rewards/xmlcount_reward_func": -0.08437500428408384,
85
  "step": 8
86
  },
87
  {
88
+ "completion_length": 319.5625,
89
+ "epoch": 2.0,
90
+ "grad_norm": 2.1416268348693848,
91
+ "kl": 0.0025234216154785827,
92
  "learning_rate": 2.706448363680831e-07,
93
  "loss": 0.0,
94
+ "reward": 1.0948227606713772,
95
+ "reward_std": 1.085303759202361,
96
+ "rewards/concensus_correctness_reward_func": 0.08193749934434891,
97
+ "rewards/consensus_reward_func": 0.0,
98
  "rewards/cumulative_reward_2": 0.0,
99
+ "rewards/final_correctness_reward_func": 0.25,
100
+ "rewards/question_recreation_reward_func": 0.37897900864481926,
101
+ "rewards/soft_format_reward_func": 0.0,
102
  "rewards/strict_format_reward_func": 0.0,
103
+ "rewards/xmlcount_reward_func": 0.38390624336898327,
104
  "step": 10
105
  },
106
  {
107
+ "completion_length": 525.09375,
108
+ "epoch": 2.4,
109
+ "grad_norm": 2.21820068359375,
110
+ "kl": 0.001763832857250236,
111
  "learning_rate": 1.886286282148002e-07,
112
  "loss": 0.0,
113
+ "reward": 0.4163860874250531,
114
+ "reward_std": 1.0169010870158672,
115
+ "rewards/concensus_correctness_reward_func": 0.0,
116
+ "rewards/consensus_reward_func": 0.0,
117
  "rewards/cumulative_reward_2": 0.0,
118
+ "rewards/final_correctness_reward_func": 0.0625,
119
+ "rewards/question_recreation_reward_func": 0.413198578171432,
120
  "rewards/soft_format_reward_func": 0.0,
121
  "rewards/strict_format_reward_func": 0.0,
122
+ "rewards/xmlcount_reward_func": -0.0593124907463789,
123
  "step": 12
124
  },
125
  {
126
+ "completion_length": 311.84375,
127
+ "epoch": 2.8,
128
+ "grad_norm": 2.4503183364868164,
129
+ "kl": 0.0024037769326241687,
130
  "learning_rate": 1.1326296046939333e-07,
131
  "loss": 0.0,
132
+ "reward": 1.1352357491850853,
133
+ "reward_std": 1.1741696512326598,
134
+ "rewards/concensus_correctness_reward_func": 0.15131249651312828,
135
+ "rewards/consensus_reward_func": 0.0625,
136
  "rewards/cumulative_reward_2": 0.0,
137
+ "rewards/final_correctness_reward_func": 0.25,
138
+ "rewards/question_recreation_reward_func": 0.37954823300242424,
139
  "rewards/soft_format_reward_func": 0.015625,
140
  "rewards/strict_format_reward_func": 0.0,
141
+ "rewards/xmlcount_reward_func": 0.2762500010430813,
142
  "step": 14
143
  },
144
  {
145
+ "completion_length": 371.1875,
146
+ "epoch": 3.2,
147
+ "grad_norm": 2.3564932346343994,
148
+ "kl": 0.003548893568222411,
149
  "learning_rate": 5.271487265090163e-08,
150
  "loss": 0.0,
151
+ "reward": 0.6700164806097746,
152
+ "reward_std": 0.8069747863337398,
153
+ "rewards/concensus_correctness_reward_func": 0.0,
154
  "rewards/consensus_reward_func": 0.0,
155
  "rewards/cumulative_reward_2": 0.0,
156
+ "rewards/final_correctness_reward_func": 0.125,
157
+ "rewards/question_recreation_reward_func": 0.3270789897069335,
158
  "rewards/soft_format_reward_func": 0.0,
159
  "rewards/strict_format_reward_func": 0.0,
160
+ "rewards/xmlcount_reward_func": 0.2179374946281314,
161
  "step": 16
162
  },
163
  {
164
+ "completion_length": 364.21875,
165
+ "epoch": 3.6,
166
+ "grad_norm": 3.2016563415527344,
167
+ "kl": 0.0021984638296999037,
168
  "learning_rate": 1.3545689574841341e-08,
169
  "loss": 0.0,
170
+ "reward": 0.9208500199019909,
171
+ "reward_std": 1.1289626825600863,
172
+ "rewards/concensus_correctness_reward_func": 0.21381250023841858,
173
+ "rewards/consensus_reward_func": 0.0625,
174
  "rewards/cumulative_reward_2": 0.0,
175
  "rewards/final_correctness_reward_func": 0.1875,
176
+ "rewards/question_recreation_reward_func": 0.184193748049438,
177
  "rewards/soft_format_reward_func": 0.0,
178
+ "rewards/strict_format_reward_func": 0.0,
179
+ "rewards/xmlcount_reward_func": 0.2728437576442957,
180
  "step": 18
181
  },
182
  {
183
+ "completion_length": 364.34375,
184
+ "epoch": 4.0,
185
+ "grad_norm": 2.2764692306518555,
186
+ "kl": 0.0035686935880221426,
187
  "learning_rate": 0.0,
188
  "loss": 0.0,
189
+ "reward": 1.4730796124786139,
190
+ "reward_std": 1.3868739902973175,
191
+ "rewards/concensus_correctness_reward_func": 0.32718750089406967,
192
  "rewards/consensus_reward_func": 0.125,
193
  "rewards/cumulative_reward_2": 0.0,
194
+ "rewards/final_correctness_reward_func": 0.1875,
195
+ "rewards/question_recreation_reward_func": 0.46451712027192116,
196
  "rewards/soft_format_reward_func": 0.0,
197
+ "rewards/strict_format_reward_func": 0.0,
198
+ "rewards/xmlcount_reward_func": 0.3688750071451068,
199
  "step": 20
200
  },
201
  {
202
+ "epoch": 4.0,
203
  "step": 20,
204
  "total_flos": 0.0,
205
+ "train_loss": 2.546286020788102e-06,
206
+ "train_runtime": 864.6962,
207
+ "train_samples_per_second": 0.37,
208
+ "train_steps_per_second": 0.023
209
  }
210
  ],
211
  "logging_steps": 2,
212
  "max_steps": 20,
213
  "num_input_tokens_seen": 0,
214
+ "num_train_epochs": 4,
215
  "save_steps": 25,
216
  "stateful_callbacks": {
217
  "TrainerControl": {