|
--- |
|
library_name: transformers |
|
license: apache-2.0 |
|
base_model: Qwen/Qwen3-0.6B-Base |
|
tags: |
|
- axolotl |
|
- generated_from_trainer |
|
datasets: |
|
- open-thoughts/OpenThoughts2-1M |
|
model-index: |
|
- name: base |
|
results: [] |
|
--- |
|
|
|
<!-- This model card has been generated automatically according to the information the Trainer had access to. You |
|
should probably proofread and complete it, then remove this comment. --> |
|
|
|
[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl) |
|
<details><summary>See axolotl config</summary> |
|
|
|
axolotl version: `0.10.0.dev0` |
|
```yaml |
|
base_model: Qwen/Qwen3-0.6B-Base |
|
hub_model_id: cyberbabooshka/base |
|
wandb_name: base |
|
|
|
tokenizer_type: AutoTokenizer |
|
load_in_8bit: false |
|
load_in_4bit: false |
|
|
|
num_processes: 64 |
|
dataset_processes: 64 |
|
dataset_prepared_path: last_run_prepared |
|
|
|
chat_template: jinja |
|
chat_template_jinja: >- |
|
{%- if tools %} |
|
{{- '<|im_start|>system\n' }} |
|
{%- if messages[0].role == 'system' %} |
|
{{- messages[0].content + '\n\n' }} |
|
{%- endif %} |
|
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }} |
|
{%- for tool in tools %} |
|
{{- "\n" }} |
|
{{- tool | tojson }} |
|
{%- endfor %} |
|
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }} |
|
{%- else %} |
|
{%- if messages[0].role == 'system' %} |
|
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} |
|
{%- endif %} |
|
{%- endif %} |
|
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} |
|
{%- for message in messages[::-1] %} |
|
{%- set index = (messages|length - 1) - loop.index0 %} |
|
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %} |
|
{%- set ns.multi_step_tool = false %} |
|
{%- set ns.last_query_index = index %} |
|
{%- endif %} |
|
{%- endfor %} |
|
{%- for message in messages %} |
|
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %} |
|
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} |
|
{%- elif message.role == "assistant" %} |
|
{%- set content = message.content %} |
|
{%- set reasoning_content = '' %} |
|
{%- if message.reasoning_content is defined and message.reasoning_content is not none %} |
|
{%- set reasoning_content = message.reasoning_content %} |
|
{%- else %} |
|
{%- if '</think>' in message.content %} |
|
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %} |
|
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %} |
|
{%- endif %} |
|
{%- endif %} |
|
{%- if loop.index0 > ns.last_query_index %} |
|
{%- if loop.last or (not loop.last and reasoning_content) %} |
|
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }} |
|
{%- else %} |
|
{{- '<|im_start|>' + message.role + '\n' + content }} |
|
{%- endif %} |
|
{%- else %} |
|
{{- '<|im_start|>' + message.role + '\n' + content }} |
|
{%- endif %} |
|
{%- if message.tool_calls %} |
|
{%- for tool_call in message.tool_calls %} |
|
{%- if (loop.first and content) or (not loop.first) %} |
|
{{- '\n' }} |
|
{%- endif %} |
|
{%- if tool_call.function %} |
|
{%- set tool_call = tool_call.function %} |
|
{%- endif %} |
|
{{- '<tool_call>\n{"name": "' }} |
|
{{- tool_call.name }} |
|
{{- '", "arguments": ' }} |
|
{%- if tool_call.arguments is string %} |
|
{{- tool_call.arguments }} |
|
{%- else %} |
|
{{- tool_call.arguments | tojson }} |
|
{%- endif %} |
|
{{- '}\n</tool_call>' }} |
|
{%- endfor %} |
|
{%- endif %} |
|
{{- '<|im_end|>\n' }} |
|
{%- elif message.role == "tool" %} |
|
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} |
|
{{- '<|im_start|>user' }} |
|
{%- endif %} |
|
{{- '\n<tool_response>\n' }} |
|
{{- message.content }} |
|
{{- '\n</tool_response>' }} |
|
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} |
|
{{- '<|im_end|>\n' }} |
|
{%- endif %} |
|
{%- endif %} |
|
{%- endfor %} |
|
{%- if add_generation_prompt %} |
|
{{- '<|im_start|>assistant\n' }} |
|
{%- if enable_thinking is defined and enable_thinking is false %} |
|
{{- '<think>\n\n</think>\n\n' }} |
|
{%- else %} |
|
{{- '<think>\n' }} |
|
{%- endif %} |
|
{%- endif %} |
|
|
|
datasets: |
|
- path: open-thoughts/OpenThoughts2-1M |
|
split: train[1%:] |
|
type: chat_template |
|
field_messages: conversations |
|
train_on_eos: turn |
|
train_on_eot: turn |
|
message_property_mappings: |
|
role: from |
|
content: value |
|
roles: |
|
user: |
|
- user |
|
assistant: |
|
- assistant |
|
|
|
test_datasets: |
|
- path: open-thoughts/OpenThoughts2-1M |
|
split: train[:1%] |
|
type: chat_template |
|
field_messages: conversations |
|
train_on_eos: turn |
|
train_on_eot: turn |
|
message_property_mappings: |
|
role: from |
|
content: value |
|
roles: |
|
user: |
|
- user |
|
assistant: |
|
- assistant |
|
|
|
output_dir: ./outputs |
|
|
|
sequence_len: 9096 |
|
batch_flattening: true |
|
sample_packing: false |
|
|
|
# adapter: lora |
|
lora_model_dir: |
|
lora_r: 64 |
|
lora_alpha: 32 |
|
lora_dropout: 0.0 |
|
lora_target_modules: |
|
- embed_tokens |
|
lora_target_linear: true |
|
lora_on_cpu: false |
|
|
|
wandb_project: mnlp |
|
wandb_entity: aleksandr-dremov-epfl |
|
wandb_watch: |
|
wandb_log_model: |
|
|
|
gradient_accumulation_steps: 2 |
|
eval_batch_size: 16 |
|
micro_batch_size: 4 |
|
|
|
optimizer: ademamix_8bit |
|
weight_decay: 0.01 |
|
|
|
learning_rate: 0.00001 |
|
warmup_steps: 500 |
|
|
|
wsd_final_lr_factor: 0.0 |
|
wsd_init_div_factor: 100 |
|
wsd_fract_decay: 0.2 |
|
wsd_decay_type: "sqrt" |
|
wsd_sqrt_power: 0.5 |
|
wsd_cooldown_start_lr_factor: 1.0 |
|
|
|
bf16: auto |
|
tf32: false |
|
|
|
torch_compile: true |
|
flash_attention: true |
|
gradient_checkpointing: false |
|
|
|
resume_from_checkpoint: |
|
auto_resume_from_checkpoints: true |
|
|
|
logging_steps: 16 |
|
eval_steps: 2000 |
|
save_steps: 1000 |
|
max_steps: 40000 |
|
num_epochs: 20000000 |
|
save_total_limit: 2 |
|
|
|
special_tokens: |
|
eos_token: "<|im_end|>" |
|
pad_token: "<|endoftext|>" |
|
|
|
eot_tokens: |
|
- <|im_end|> |
|
|
|
plugins: |
|
- axolotl_wsd.WSDSchedulerPlugin |
|
|
|
``` |
|
|
|
</details><br> |
|
|
|
# base |
|
|
|
This model is a fine-tuned version of [Qwen/Qwen3-0.6B-Base](https://huggingface.co/Qwen/Qwen3-0.6B-Base) on the open-thoughts/OpenThoughts2-1M dataset. |
|
It achieves the following results on the evaluation set: |
|
- Loss: 0.5060 |
|
|
|
## Model description |
|
|
|
More information needed |
|
|
|
## Intended uses & limitations |
|
|
|
More information needed |
|
|
|
## Training and evaluation data |
|
|
|
More information needed |
|
|
|
## Training procedure |
|
|
|
### Training hyperparameters |
|
|
|
The following hyperparameters were used during training: |
|
- learning_rate: 1e-05 |
|
- train_batch_size: 4 |
|
- eval_batch_size: 16 |
|
- seed: 42 |
|
- distributed_type: multi-GPU |
|
- num_devices: 4 |
|
- gradient_accumulation_steps: 2 |
|
- total_train_batch_size: 32 |
|
- total_eval_batch_size: 64 |
|
- optimizer: ADEMAMIX_8BIT (no additional optimizer arguments)
|
- lr_scheduler_type: cosine |
|
- lr_scheduler_warmup_steps: 500 |
|
- training_steps: 40000 |
|
|
|
### Training results |
|
|
|
| Training Loss | Epoch | Step | Validation Loss | |
|
|:-------------:|:------:|:-----:|:---------------:| |
|
| No log | 0.0000 | 1 | 0.8524 | |
|
| 0.5816 | 0.0671 | 2000 | 0.6038 | |
|
| 0.554 | 0.1342 | 4000 | 0.5775 | |
|
| 0.5746 | 0.2013 | 6000 | 0.5623 | |
|
| 0.5304 | 0.2684 | 8000 | 0.5516 | |
|
| 0.5334 | 0.3355 | 10000 | 0.5434 | |
|
| 0.5378 | 0.4026 | 12000 | 0.5372 | |
|
| 0.5205 | 0.4697 | 14000 | 0.5322 | |
|
| 0.5301 | 0.5368 | 16000 | 0.5284 | |
|
| 0.4979 | 0.6039 | 18000 | 0.5253 | |
|
| 0.514 | 0.6710 | 20000 | 0.5225 | |
|
| 0.5022 | 0.7381 | 22000 | 0.5202 | |
|
| 0.5183 | 0.8052 | 24000 | 0.5187 | |
|
| 0.4987 | 0.8724 | 26000 | 0.5175 | |
|
| 0.5041 | 0.9395 | 28000 | 0.5161 | |
|
| 0.4961 | 1.0066 | 30000 | 0.5159 | |
|
| 0.4882 | 1.0737 | 32000 | 0.5161 | |
|
| 0.5021 | 1.1408 | 34000 | 0.5117 | |
|
| 0.4793 | 1.2079 | 36000 | 0.5093 | |
|
| 0.4854 | 1.2750 | 38000 | 0.5071 | |
|
| 0.4947 | 1.3421 | 40000 | 0.5060 | |
|
|
|
|
|
### Framework versions |
|
|
|
- Transformers 4.51.3 |
|
- Pytorch 2.6.0+cu124 |
|
- Datasets 3.5.0 |
|
- Tokenizers 0.21.1 |
|
|