Files changed (1)
  1. README.md +299 -285
README.md CHANGED
---
library_name: transformers
license: apache-2.0
base_model: Qwen/Qwen2.5-32B-Instruct
tags:
- generated_from_trainer
datasets:
- shisa-ai/shisa-v2-best-of-n-athenev2-tulu70b-llama33-only-no-sysprompt
- shisa-ai/shisa-v2-roleplaying-sft
- shisa-ai/translation_set_april_6
- shisa-ai/rewild-set-deepseek-subset
- shisa-ai/magpie-ultra-set
- shisa-ai/magpie-advanced-questions-set
- shisa-ai/japan-magpie-set
- shisa-ai/shisa-v2-instruction-following-sft
language:
- zho
- eng
- fra
- spa
- por
- deu
- ita
- rus
- jpn
- kor
- vie
- tha
- ara
model-index:
- name: outputs/ablation-194-finalsft2-shisa-v2-qwen2.5-32b
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
<details><summary>See axolotl config</summary>

axolotl version: `0.8.0.dev0`
```yaml
# train w/ shisa-ai/shisa-v1-athenev2-reannotated-filtered

base_model: Qwen/Qwen2.5-32B-Instruct

load_in_8bit: false
load_in_4bit: false
strict: false

# Use Liger
plugins:
  - axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true

datasets:
  - path: shisa-ai/shisa-v2-best-of-n-athenev2-tulu70b-llama33-only-no-sysprompt
    type: chat_template
    field_messages: conversations
    message_field_role: from
    message_field_content: value
  - path: shisa-ai/shisa-v2-roleplaying-sft
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: role
      content: content
    roles:
      system:
        - system
      assistant:
        - gpt
        - model
        - assistant
      user:
        - human
        - user
    roles_to_train: ["assistant"]
  - path: shisa-ai/translation_set_april_6
    split: train[:25%]
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: role
      content: content
    roles:
      system:
        - system
      assistant:
        - gpt
        - model
        - assistant
      user:
        - human
        - user
    roles_to_train: ["assistant"]
  - path: shisa-ai/rewild-set-deepseek-subset
    split: train[:25%]
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: role
      content: content
    roles:
      system:
        - system
      assistant:
        - gpt
        - model
        - assistant
      user:
        - human
        - user
    roles_to_train: ["assistant"]
  - path: shisa-ai/magpie-ultra-set
    split: train[:8%]
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: role
      content: content
    roles:
      system:
        - system
      assistant:
        - gpt
        - model
        - assistant
      user:
        - human
        - user
    roles_to_train: ["assistant"]
  - path: shisa-ai/magpie-advanced-questions-set
    split: train[:8%]
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: role
      content: content
    roles:
      system:
        - system
      assistant:
        - gpt
        - model
        - assistant
      user:
        - human
        - user
    roles_to_train: ["assistant"]
  - path: shisa-ai/japan-magpie-set
    split: train
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: role
      content: content
    roles:
      system:
        - system
      assistant:
        - gpt
        - model
        - assistant
      user:
        - human
        - user
    roles_to_train: ["assistant"]
  - path: shisa-ai/shisa-v2-instruction-following-sft
    split: train[:50%]
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: role
      content: content
    roles:
      system:
        - system
      assistant:
        - gpt
        - model
        - assistant
      user:
        - human
        - user
    roles_to_train: ["assistant"]

dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/ablation-194-finalsft2-shisa-v2-qwen2.5-32b

sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true

# marginal difference
neftune_noise_alpha: 5

use_wandb: true
wandb_project: shisa-v2
wandb_entity: augmxnt
wandb_name: ablation-194-finalsft2-shisa-v2-qwen2.5-32b

gradient_accumulation_steps: 1
micro_batch_size: 4
num_epochs: 3
optimizer: paged_adamw_8bit
lr_scheduler: linear
learning_rate: 5e-6

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 100
evals_per_epoch: 2
eval_table_size:
saves_per_epoch: 0
save_total_limit: 1 # Only store a single checkpoint
debug:
deepspeed: zero3_bf16.json
weight_decay: 1e-4
fsdp:
fsdp_config:
special_tokens:

```
242
+
243
+ </details><br>
244
+
245
+ # outputs/ablation-194-finalsft2-shisa-v2-qwen2.5-32b
246
+
247
+ This model is a fine-tuned version of [Qwen/Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) on the shisa-ai/shisa-v2-best-of-n-athenev2-tulu70b-llama33-only-no-sysprompt, the shisa-ai/shisa-v2-roleplaying-sft, the shisa-ai/translation_set_april_6, the shisa-ai/rewild-set-deepseek-subset, the shisa-ai/magpie-ultra-set, the shisa-ai/magpie-advanced-questions-set, the shisa-ai/japan-magpie-set and the shisa-ai/shisa-v2-instruction-following-sft datasets.
248
+ It achieves the following results on the evaluation set:
249
+ - Loss: 0.6159

## Model description

More information needed

## Intended uses & limitations

More information needed
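
As a starting point, the sketch below shows one way to load and query the model with Transformers. The repository id is a placeholder (adjust it to wherever this checkpoint is actually hosted), and the generation settings are illustrative rather than values taken from this training run.

```python
# Minimal inference sketch. The repo id below is a placeholder, not a confirmed
# upload location for this checkpoint.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "shisa-ai/ablation-194-finalsft2-shisa-v2-qwen2.5-32b"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # the run trained in bf16
    device_map="auto",
)

# The Qwen2.5-Instruct-style chat template ships with the tokenizer.
messages = [{"role": "user", "content": "日本の首都はどこですか？"}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output_ids = model.generate(input_ids, max_new_tokens=256)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```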

## Training and evaluation data

More information needed
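
The axolotl config above gives the exact dataset mixture and splits. Each `chat_template` entry maps source-specific speaker labels (`gpt`, `model`, `human`, and so on) onto the standard system/user/assistant roles and marks only assistant turns for training. As a rough illustration of that mapping, in plain Python rather than axolotl's actual preprocessing code:

```python
# Illustration only: normalize a "conversations" record the way the role
# mapping in the config describes. Not axolotl's implementation.
ROLE_MAP = {
    "system": "system",
    "gpt": "assistant", "model": "assistant", "assistant": "assistant",
    "human": "user", "user": "user",
}

def normalize(conversations: list[dict]) -> list[dict]:
    """Map source role names to chat roles; assistant turns are the trained targets."""
    return [
        {"role": ROLE_MAP[turn["role"]], "content": turn["content"]}
        for turn in conversations
    ]

example = [
    {"role": "human", "content": "こんにちは"},
    {"role": "gpt", "content": "こんにちは！ご用件をお聞かせください。"},
]
print(normalize(example))
# [{'role': 'user', ...}, {'role': 'assistant', ...}]
```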

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 5e-06
- train_batch_size: 4
- eval_batch_size: 4
- seed: 42
- distributed_type: multi-GPU
- num_devices: 32
- total_train_batch_size: 128
- total_eval_batch_size: 128
- optimizer: PAGED_ADAMW_8BIT with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
- lr_scheduler_type: linear
- lr_scheduler_warmup_steps: 100
- num_epochs: 3.0
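
The total train batch size follows directly from the config: micro_batch_size × num_devices × gradient_accumulation_steps = 4 × 32 × 1 = 128 sequences per optimizer step.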

### Training results

| Training Loss | Epoch  | Step | Validation Loss |
|:-------------:|:------:|:----:|:---------------:|
| 0.9086        | 0.0026 | 1    | 0.9309          |
| 0.6467        | 0.5    | 189  | 0.6664          |
| 0.6343        | 1.0    | 378  | 0.6380          |
| 0.6019        | 1.5    | 567  | 0.6267          |
| 0.5968        | 2.0    | 756  | 0.6189          |
| 0.5726        | 2.5    | 945  | 0.6175          |
| 0.5617        | 3.0    | 1134 | 0.6159          |

### Framework versions

- Transformers 4.50.0
- Pytorch 2.6.0+cu124
- Datasets 3.4.1
- Tokenizers 0.21.1
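
To sanity-check a local environment against the versions listed above, something like the following works (illustrative only):

```python
# Print installed versions for comparison with the card's "Framework versions".
import transformers, torch, datasets, tokenizers

print("transformers:", transformers.__version__)  # card lists 4.50.0
print("torch:", torch.__version__)                # card lists 2.6.0+cu124
print("datasets:", datasets.__version__)          # card lists 3.4.1
print("tokenizers:", tokenizers.__version__)      # card lists 0.21.1
```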