[INFO|2025-05-18 16:53:56] tokenization_utils_base.py:2060 >> loading file vocab.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/vocab.json
[INFO|2025-05-18 16:53:56] tokenization_utils_base.py:2060 >> loading file merges.txt from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/merges.txt
[INFO|2025-05-18 16:53:56] tokenization_utils_base.py:2060 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/tokenizer.json
[INFO|2025-05-18 16:53:56] tokenization_utils_base.py:2060 >> loading file added_tokens.json from cache at None
[INFO|2025-05-18 16:53:56] tokenization_utils_base.py:2060 >> loading file special_tokens_map.json from cache at None
[INFO|2025-05-18 16:53:56] tokenization_utils_base.py:2060 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/tokenizer_config.json
[INFO|2025-05-18 16:53:56] tokenization_utils_base.py:2060 >> loading file chat_template.jinja from cache at None
[INFO|2025-05-18 16:53:57] tokenization_utils_base.py:2323 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|2025-05-18 16:53:57] configuration_utils.py:693 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/config.json
[INFO|2025-05-18 16:53:57] configuration_utils.py:765 >> Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 9728,
  "max_position_embeddings": 40960,
  "max_window_layers": 36,
  "model_type": "qwen3",
  "num_attention_heads": 32,
  "num_hidden_layers": 36,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}
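The lines above are the standard tokenizer/config resolution from the Hugging Face cache. A minimal sketch of the equivalent calls, assuming only the public transformers API (the model ID is taken from the cache paths in the log):

from transformers import AutoConfig, AutoTokenizer

# Both calls resolve to the cached snapshot shown in the log paths above.
config = AutoConfig.from_pretrained("Qwen/Qwen3-4B")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")

print(config.hidden_size, config.num_hidden_layers, config.vocab_size)  # 2560 36 151936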
[INFO|2025-05-18 16:53:57] tokenization_utils_base.py:2060 >> loading file vocab.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/vocab.json
[INFO|2025-05-18 16:53:57] tokenization_utils_base.py:2060 >> loading file merges.txt from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/merges.txt
[INFO|2025-05-18 16:53:57] tokenization_utils_base.py:2060 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/tokenizer.json
[INFO|2025-05-18 16:53:57] tokenization_utils_base.py:2060 >> loading file added_tokens.json from cache at None
[INFO|2025-05-18 16:53:57] tokenization_utils_base.py:2060 >> loading file special_tokens_map.json from cache at None
[INFO|2025-05-18 16:53:57] tokenization_utils_base.py:2060 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/tokenizer_config.json
[INFO|2025-05-18 16:53:57] tokenization_utils_base.py:2060 >> loading file chat_template.jinja from cache at None
[INFO|2025-05-18 16:53:58] tokenization_utils_base.py:2323 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|2025-05-18 16:53:58] logging.py:143 >> Loading dataset alpaca_zh_demo.json...
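The dataset alpaca_zh_demo.json follows the Alpaca instruction-tuning layout with instruction / input / output fields; the record below is an illustrative assumption, not copied from the file (the real file contains Chinese instruction-response pairs with the same keys):

# One hypothetical Alpaca-format record
example = {
    "instruction": "Identify and correct the grammatical error in this sentence.",
    "input": "He go to school every day.",
    "output": "He goes to school every day.",
}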
[INFO|2025-05-18 16:53:59] configuration_utils.py:693 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/config.json
[INFO|2025-05-18 16:53:59] configuration_utils.py:765 >> Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 9728,
  "max_position_embeddings": 40960,
  "max_window_layers": 36,
  "model_type": "qwen3",
  "num_attention_heads": 32,
  "num_hidden_layers": 36,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}
[INFO|2025-05-18 16:53:59] logging.py:143 >> KV cache is disabled during training.
[INFO|2025-05-18 16:53:59] modeling_utils.py:1124 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/model.safetensors.index.json
[INFO|2025-05-18 16:53:59] modeling_utils.py:2167 >> Instantiating Qwen3ForCausalLM model under default dtype torch.bfloat16.
[INFO|2025-05-18 16:53:59] configuration_utils.py:1142 >> Generate config GenerationConfig {
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "use_cache": false
}
[INFO|2025-05-18 16:54:01] modeling_utils.py:4930 >> All model checkpoint weights were used when initializing Qwen3ForCausalLM.
[INFO|2025-05-18 16:54:01] modeling_utils.py:4938 >> All the weights of Qwen3ForCausalLM were initialized from the model checkpoint at Qwen/Qwen3-4B.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen3ForCausalLM for predictions without further training.
[INFO|2025-05-18 16:54:01] configuration_utils.py:1097 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/generation_config.json
[INFO|2025-05-18 16:54:01] configuration_utils.py:1142 >> Generate config GenerationConfig {
  "bos_token_id": 151643,
  "do_sample": true,
  "eos_token_id": [
    151645,
    151643
  ],
  "pad_token_id": 151643,
  "temperature": 0.6,
  "top_k": 20,
  "top_p": 0.95
}
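The GenerationConfig above sets the model's default sampling behaviour. A minimal sketch of exercising those defaults with the public transformers API (the prompt and max_new_tokens are placeholders, not values from this run):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B", torch_dtype=torch.bfloat16)

inputs = tokenizer("Give three tips for staying healthy.", return_tensors="pt")
# Sampling parameters mirror the generation config logged above.
outputs = model.generate(
    **inputs,
    do_sample=True,
    temperature=0.6,
    top_k=20,
    top_p=0.95,
    max_new_tokens=128,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))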
[INFO|2025-05-18 16:54:01] logging.py:143 >> Gradient checkpointing enabled.
[INFO|2025-05-18 16:54:01] logging.py:143 >> Using vanilla attention implementation.
[INFO|2025-05-18 16:54:01] logging.py:143 >> Upcasting trainable params to float32.
[INFO|2025-05-18 16:54:01] logging.py:143 >> Fine-tuning method: LoRA
[INFO|2025-05-18 16:54:01] logging.py:143 >> Found linear modules: k_proj,o_proj,q_proj,up_proj,v_proj,gate_proj,down_proj
[INFO|2025-05-18 16:54:02] logging.py:143 >> trainable params: 16,515,072 || all params: 4,038,983,168 || trainable%: 0.4089
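The trainable-parameter count is consistent with LoRA rank 8 on all seven projection layers: each adapter adds r * (in_features + out_features) weights, which for this architecture (hidden size 2560, 32 query / 8 KV heads of dim 128, MLP width 9728) sums to 57,344 * r per layer, and 8 * 57,344 * 36 layers = 16,515,072. A minimal peft sketch under that assumption (the rank and alpha are inferred, not read from the log):

import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B", torch_dtype=torch.bfloat16)

lora_config = LoraConfig(
    r=8,                       # assumed rank; matches the logged 16,515,072 trainable params
    lora_alpha=16,             # assumed scaling (2 * r), not shown in the log
    lora_dropout=0.0,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()   # trainable params: 16,515,072 || ...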
[INFO|2025-05-18 16:54:02] trainer.py:748 >> Using auto half precision backend
[INFO|2025-05-18 16:54:02] trainer.py:2414 >> ***** Running training *****
[INFO|2025-05-18 16:54:02] trainer.py:2415 >> Num examples = 1,000
[INFO|2025-05-18 16:54:02] trainer.py:2416 >> Num Epochs = 3
[INFO|2025-05-18 16:54:02] trainer.py:2417 >> Instantaneous batch size per device = 2
[INFO|2025-05-18 16:54:02] trainer.py:2420 >> Total train batch size (w. parallel, distributed & accumulation) = 32
[INFO|2025-05-18 16:54:02] trainer.py:2421 >> Gradient Accumulation steps = 8
[INFO|2025-05-18 16:54:02] trainer.py:2422 >> Total optimization steps = 93
[INFO|2025-05-18 16:54:02] trainer.py:2423 >> Number of trainable parameters = 16,515,072
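These numbers hang together: a per-device batch of 2 with 8 gradient-accumulation steps is an effective batch of 16 per device, so the logged total of 32 implies 2 devices (inferred, not logged). 1,000 examples at an effective batch of 32 comes to roughly 31 optimizer steps per epoch, about 93 over 3 epochs, matching the log; the exact rounding depends on the dataloader, so treat this as a sanity check rather than the trainer's formula:

per_device_batch = 2
grad_accum = 8
total_batch = 32
num_devices = total_batch // (per_device_batch * grad_accum)   # -> 2 (inferred)

num_examples, num_epochs = 1000, 3
steps_per_epoch = num_examples // total_batch                   # -> 31 (approximate)
total_steps = steps_per_epoch * num_epochs                      # -> 93, as logged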
[INFO|2025-05-18 16:54:38] logging.py:143 >> {'loss': 2.5834, 'learning_rate': 4.9772e-05, 'epoch': 0.16, 'throughput': 843.60}
[INFO|2025-05-18 16:54:52] logging.py:143 >> {'loss': 2.0249, 'learning_rate': 4.8853e-05, 'epoch': 0.32, 'throughput': 1214.74}
[INFO|2025-05-18 16:55:08] logging.py:143 >> {'loss': 1.6448, 'learning_rate': 4.7256e-05, 'epoch': 0.48, 'throughput': 1395.41}
[INFO|2025-05-18 16:55:22] logging.py:143 >> {'loss': 1.5754, 'learning_rate': 4.5025e-05, 'epoch': 0.64, 'throughput': 1525.57}
[INFO|2025-05-18 16:55:36] logging.py:143 >> {'loss': 1.5325, 'learning_rate': 4.2224e-05, 'epoch': 0.80, 'throughput': 1608.49}
[INFO|2025-05-18 16:55:49] logging.py:143 >> {'loss': 1.5399, 'learning_rate': 3.8933e-05, 'epoch': 0.96, 'throughput': 1681.17}
[INFO|2025-05-18 16:56:01] logging.py:143 >> {'loss': 1.4496, 'learning_rate': 3.5246e-05, 'epoch': 1.10, 'throughput': 1726.10}
[INFO|2025-05-18 16:56:15] logging.py:143 >> {'loss': 1.4453, 'learning_rate': 3.1266e-05, 'epoch': 1.26, 'throughput': 1763.43}
[INFO|2025-05-18 16:56:30] logging.py:143 >> {'loss': 1.5063, 'learning_rate': 2.7109e-05, 'epoch': 1.42, 'throughput': 1802.17}
[INFO|2025-05-18 16:56:44] logging.py:143 >> {'loss': 1.5230, 'learning_rate': 2.2891e-05, 'epoch': 1.58, 'throughput': 1835.87}
[INFO|2025-05-18 16:56:58] logging.py:143 >> {'loss': 1.4390, 'learning_rate': 1.8734e-05, 'epoch': 1.74, 'throughput': 1861.70}
[INFO|2025-05-18 16:57:12] logging.py:143 >> {'loss': 1.4884, 'learning_rate': 1.4754e-05, 'epoch': 1.90, 'throughput': 1880.88}
[INFO|2025-05-18 16:57:23] logging.py:143 >> {'loss': 1.5448, 'learning_rate': 1.1067e-05, 'epoch': 2.03, 'throughput': 1886.27}
[INFO|2025-05-18 16:57:39] logging.py:143 >> {'loss': 1.4931, 'learning_rate': 7.7758e-06, 'epoch': 2.19, 'throughput': 1903.44}
[INFO|2025-05-18 16:57:52] logging.py:143 >> {'loss': 1.4863, 'learning_rate': 4.9750e-06, 'epoch': 2.35, 'throughput': 1913.31}
[INFO|2025-05-18 16:58:06] logging.py:143 >> {'loss': 1.4528, 'learning_rate': 2.7440e-06, 'epoch': 2.51, 'throughput': 1926.75}
[INFO|2025-05-18 16:58:21] logging.py:143 >> {'loss': 1.3916, 'learning_rate': 1.1465e-06, 'epoch': 2.67, 'throughput': 1933.94}
[INFO|2025-05-18 16:58:35] logging.py:143 >> {'loss': 1.4927, 'learning_rate': 2.2788e-07, 'epoch': 2.83, 'throughput': 1947.73}
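The logged learning rates fall from just under 5e-5 toward zero, which matches a cosine decay with a peak of 5e-5, no warmup, and 93 total steps; the peak value and the absence of warmup are inferred from the curve, not stated in the log. A small sketch of that schedule:

import math

peak_lr, total_steps = 5e-5, 93   # peak is an assumption consistent with the logged values

def cosine_lr(step: int) -> float:
    """Cosine decay from peak_lr to 0 over total_steps, no warmup."""
    return 0.5 * peak_lr * (1.0 + math.cos(math.pi * step / total_steps))

print(cosine_lr(4))    # ~4.98e-05, close to the first logged value
print(cosine_lr(89))   # ~2.3e-07, close to the last logged value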
[INFO|2025-05-18 16:58:43] trainer.py:3984 >> Saving model checkpoint to saves/Qwen3-4B-Instruct/lora/train_2025-05-18-16-33-56/checkpoint-93
[INFO|2025-05-18 16:58:43] configuration_utils.py:693 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/config.json
[INFO|2025-05-18 16:58:43] configuration_utils.py:765 >> Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 9728,
  "max_position_embeddings": 40960,
  "max_window_layers": 36,
  "model_type": "qwen3",
  "num_attention_heads": 32,
  "num_hidden_layers": 36,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}
[INFO|2025-05-18 16:58:44] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Qwen3-4B-Instruct/lora/train_2025-05-18-16-33-56/checkpoint-93/tokenizer_config.json
[INFO|2025-05-18 16:58:44] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Qwen3-4B-Instruct/lora/train_2025-05-18-16-33-56/checkpoint-93/special_tokens_map.json
[INFO|2025-05-18 16:58:45] trainer.py:2681 >>

Training completed. Do not forget to share your model on huggingface.co/models =)
[INFO|2025-05-18 16:58:45] trainer.py:3984 >> Saving model checkpoint to saves/Qwen3-4B-Instruct/lora/train_2025-05-18-16-33-56
[INFO|2025-05-18 16:58:45] configuration_utils.py:693 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/config.json
[INFO|2025-05-18 16:58:45] configuration_utils.py:765 >> Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 9728,
  "max_position_embeddings": 40960,
  "max_window_layers": 36,
  "model_type": "qwen3",
  "num_attention_heads": 32,
  "num_hidden_layers": 36,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}
[INFO|2025-05-18 16:58:46] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Qwen3-4B-Instruct/lora/train_2025-05-18-16-33-56/tokenizer_config.json
[INFO|2025-05-18 16:58:46] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Qwen3-4B-Instruct/lora/train_2025-05-18-16-33-56/special_tokens_map.json
[WARNING|2025-05-18 16:58:46] logging.py:148 >> No metric eval_loss to plot.
[WARNING|2025-05-18 16:58:46] logging.py:148 >> No metric eval_accuracy to plot.
[INFO|2025-05-18 16:58:46] modelcard.py:450 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
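The run leaves a LoRA adapter (not a merged full model) in saves/Qwen3-4B-Instruct/lora/train_2025-05-18-16-33-56. A minimal sketch of loading it for inference with peft, assuming the adapter files written by this run are available at that path:

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")

# Attach the trained LoRA weights on top of the frozen base model.
model = PeftModel.from_pretrained(base, "saves/Qwen3-4B-Instruct/lora/train_2025-05-18-16-33-56")
model = model.merge_and_unload()   # optional: fold the adapter into the base weights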