[INFO|2025-05-18 16:53:56] tokenization_utils_base.py:2060 >> loading file vocab.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/vocab.json
[INFO|2025-05-18 16:53:56] tokenization_utils_base.py:2060 >> loading file merges.txt from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/merges.txt
[INFO|2025-05-18 16:53:56] tokenization_utils_base.py:2060 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/tokenizer.json
[INFO|2025-05-18 16:53:56] tokenization_utils_base.py:2060 >> loading file added_tokens.json from cache at None
[INFO|2025-05-18 16:53:56] tokenization_utils_base.py:2060 >> loading file special_tokens_map.json from cache at None
[INFO|2025-05-18 16:53:56] tokenization_utils_base.py:2060 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/tokenizer_config.json
[INFO|2025-05-18 16:53:56] tokenization_utils_base.py:2060 >> loading file chat_template.jinja from cache at None
[INFO|2025-05-18 16:53:57] tokenization_utils_base.py:2323 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|2025-05-18 16:53:57] configuration_utils.py:693 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/config.json
[INFO|2025-05-18 16:53:57] configuration_utils.py:765 >> Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 9728,
  "max_position_embeddings": 40960,
  "max_window_layers": 36,
  "model_type": "qwen3",
  "num_attention_heads": 32,
  "num_hidden_layers": 36,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}
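The lines above are the standard tokenizer/config resolution from the Hugging Face cache. A minimal sketch of the equivalent calls, assuming only the public transformers API (the model ID is taken from the cache paths in the log):

from transformers import AutoConfig, AutoTokenizer

# Both calls resolve to the cached snapshot shown in the log paths above.
config = AutoConfig.from_pretrained("Qwen/Qwen3-4B")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")

print(config.hidden_size, config.num_hidden_layers, config.vocab_size)  # 2560 36 151936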
[INFO|2025-05-18 16:53:57] tokenization_utils_base.py:2060 >> loading file vocab.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/vocab.json
[INFO|2025-05-18 16:53:57] tokenization_utils_base.py:2060 >> loading file merges.txt from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/merges.txt
[INFO|2025-05-18 16:53:57] tokenization_utils_base.py:2060 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/tokenizer.json
[INFO|2025-05-18 16:53:57] tokenization_utils_base.py:2060 >> loading file added_tokens.json from cache at None
[INFO|2025-05-18 16:53:57] tokenization_utils_base.py:2060 >> loading file special_tokens_map.json from cache at None
[INFO|2025-05-18 16:53:57] tokenization_utils_base.py:2060 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/tokenizer_config.json
[INFO|2025-05-18 16:53:57] tokenization_utils_base.py:2060 >> loading file chat_template.jinja from cache at None
[INFO|2025-05-18 16:53:58] tokenization_utils_base.py:2323 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|2025-05-18 16:53:58] logging.py:143 >> Loading dataset alpaca_zh_demo.json...
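The dataset alpaca_zh_demo.json follows the Alpaca instruction-tuning layout with instruction / input / output fields; the record below is an illustrative assumption, not copied from the file (the real file contains Chinese instruction-response pairs with the same keys):

# One hypothetical Alpaca-format record
example = {
    "instruction": "Identify and correct the grammatical error in this sentence.",
    "input": "He go to school every day.",
    "output": "He goes to school every day.",
}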
[INFO|2025-05-18 16:53:59] configuration_utils.py:693 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/config.json
[INFO|2025-05-18 16:53:59] configuration_utils.py:765 >> Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 9728,
  "max_position_embeddings": 40960,
  "max_window_layers": 36,
  "model_type": "qwen3",
  "num_attention_heads": 32,
  "num_hidden_layers": 36,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}
[INFO|2025-05-18 16:53:59] logging.py:143 >> KV cache is disabled during training.
[INFO|2025-05-18 16:53:59] modeling_utils.py:1124 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/model.safetensors.index.json
[INFO|2025-05-18 16:53:59] modeling_utils.py:2167 >> Instantiating Qwen3ForCausalLM model under default dtype torch.bfloat16.
[INFO|2025-05-18 16:53:59] configuration_utils.py:1142 >> Generate config GenerationConfig {
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "use_cache": false
}
[INFO|2025-05-18 16:54:01] modeling_utils.py:4930 >> All model checkpoint weights were used when initializing Qwen3ForCausalLM.
[INFO|2025-05-18 16:54:01] modeling_utils.py:4938 >> All the weights of Qwen3ForCausalLM were initialized from the model checkpoint at Qwen/Qwen3-4B.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen3ForCausalLM for predictions without further training.
[INFO|2025-05-18 16:54:01] configuration_utils.py:1097 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/generation_config.json
[INFO|2025-05-18 16:54:01] configuration_utils.py:1142 >> Generate config GenerationConfig {
  "bos_token_id": 151643,
  "do_sample": true,
  "eos_token_id": [
    151645,
    151643
  ],
  "pad_token_id": 151643,
  "temperature": 0.6,
  "top_k": 20,
  "top_p": 0.95
}
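The GenerationConfig above sets the model's default sampling behaviour. A minimal sketch of exercising those defaults with the public transformers API (the prompt and max_new_tokens are placeholders, not values from this run):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B", torch_dtype=torch.bfloat16)

inputs = tokenizer("Give three tips for staying healthy.", return_tensors="pt")
# Sampling parameters mirror the generation config logged above.
outputs = model.generate(
    **inputs,
    do_sample=True,
    temperature=0.6,
    top_k=20,
    top_p=0.95,
    max_new_tokens=128,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))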
[INFO|2025-05-18 16:54:01] logging.py:143 >> Gradient checkpointing enabled.
[INFO|2025-05-18 16:54:01] logging.py:143 >> Using vanilla attention implementation.
[INFO|2025-05-18 16:54:01] logging.py:143 >> Upcasting trainable params to float32.
[INFO|2025-05-18 16:54:01] logging.py:143 >> Fine-tuning method: LoRA
[INFO|2025-05-18 16:54:01] logging.py:143 >> Found linear modules: k_proj,o_proj,q_proj,up_proj,v_proj,gate_proj,down_proj
[INFO|2025-05-18 16:54:02] logging.py:143 >> trainable params: 16,515,072 || all params: 4,038,983,168 || trainable%: 0.4089
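The trainable-parameter count is consistent with LoRA rank 8 on all seven projection layers: each adapter adds r * (in_features + out_features) weights, which for this architecture (hidden size 2560, 32 query / 8 KV heads of dim 128, MLP width 9728) sums to 57,344 * r per layer, and 8 * 57,344 * 36 layers = 16,515,072. A minimal peft sketch under that assumption (the rank and alpha are inferred, not read from the log):

import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B", torch_dtype=torch.bfloat16)

lora_config = LoraConfig(
    r=8,                       # assumed rank; matches the logged 16,515,072 trainable params
    lora_alpha=16,             # assumed scaling (2 * r), not shown in the log
    lora_dropout=0.0,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()   # trainable params: 16,515,072 || ...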
[INFO|2025-05-18 16:54:02] trainer.py:748 >> Using auto half precision backend
[INFO|2025-05-18 16:54:02] trainer.py:2414 >> ***** Running training *****
[INFO|2025-05-18 16:54:02] trainer.py:2415 >> Num examples = 1,000
[INFO|2025-05-18 16:54:02] trainer.py:2416 >> Num Epochs = 3
[INFO|2025-05-18 16:54:02] trainer.py:2417 >> Instantaneous batch size per device = 2
[INFO|2025-05-18 16:54:02] trainer.py:2420 >> Total train batch size (w. parallel, distributed & accumulation) = 32
[INFO|2025-05-18 16:54:02] trainer.py:2421 >> Gradient Accumulation steps = 8
[INFO|2025-05-18 16:54:02] trainer.py:2422 >> Total optimization steps = 93
[INFO|2025-05-18 16:54:02] trainer.py:2423 >> Number of trainable parameters = 16,515,072
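These numbers hang together: a per-device batch of 2 with 8 gradient-accumulation steps is an effective batch of 16 per device, so the logged total of 32 implies 2 devices (inferred, not logged). 1,000 examples at an effective batch of 32 comes to roughly 31 optimizer steps per epoch, about 93 over 3 epochs, matching the log; the exact rounding depends on the dataloader, so treat this as a sanity check rather than the trainer's formula:

per_device_batch = 2
grad_accum = 8
total_batch = 32
num_devices = total_batch // (per_device_batch * grad_accum)   # -> 2 (inferred)

num_examples, num_epochs = 1000, 3
steps_per_epoch = num_examples // total_batch                   # -> 31 (approximate)
total_steps = steps_per_epoch * num_epochs                      # -> 93, as logged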
[INFO|2025-05-18 16:54:38] logging.py:143 >> {'loss': 2.5834, 'learning_rate': 4.9772e-05, 'epoch': 0.16, 'throughput': 843.60}
[INFO|2025-05-18 16:54:52] logging.py:143 >> {'loss': 2.0249, 'learning_rate': 4.8853e-05, 'epoch': 0.32, 'throughput': 1214.74}
[INFO|2025-05-18 16:55:08] logging.py:143 >> {'loss': 1.6448, 'learning_rate': 4.7256e-05, 'epoch': 0.48, 'throughput': 1395.41}
[INFO|2025-05-18 16:55:22] logging.py:143 >> {'loss': 1.5754, 'learning_rate': 4.5025e-05, 'epoch': 0.64, 'throughput': 1525.57}
[INFO|2025-05-18 16:55:36] logging.py:143 >> {'loss': 1.5325, 'learning_rate': 4.2224e-05, 'epoch': 0.80, 'throughput': 1608.49}
[INFO|2025-05-18 16:55:49] logging.py:143 >> {'loss': 1.5399, 'learning_rate': 3.8933e-05, 'epoch': 0.96, 'throughput': 1681.17}
[INFO|2025-05-18 16:56:01] logging.py:143 >> {'loss': 1.4496, 'learning_rate': 3.5246e-05, 'epoch': 1.10, 'throughput': 1726.10}
[INFO|2025-05-18 16:56:15] logging.py:143 >> {'loss': 1.4453, 'learning_rate': 3.1266e-05, 'epoch': 1.26, 'throughput': 1763.43}
[INFO|2025-05-18 16:56:30] logging.py:143 >> {'loss': 1.5063, 'learning_rate': 2.7109e-05, 'epoch': 1.42, 'throughput': 1802.17}
[INFO|2025-05-18 16:56:44] logging.py:143 >> {'loss': 1.5230, 'learning_rate': 2.2891e-05, 'epoch': 1.58, 'throughput': 1835.87}
[INFO|2025-05-18 16:56:58] logging.py:143 >> {'loss': 1.4390, 'learning_rate': 1.8734e-05, 'epoch': 1.74, 'throughput': 1861.70}
[INFO|2025-05-18 16:57:12] logging.py:143 >> {'loss': 1.4884, 'learning_rate': 1.4754e-05, 'epoch': 1.90, 'throughput': 1880.88}
[INFO|2025-05-18 16:57:23] logging.py:143 >> {'loss': 1.5448, 'learning_rate': 1.1067e-05, 'epoch': 2.03, 'throughput': 1886.27}
[INFO|2025-05-18 16:57:39] logging.py:143 >> {'loss': 1.4931, 'learning_rate': 7.7758e-06, 'epoch': 2.19, 'throughput': 1903.44}
[INFO|2025-05-18 16:57:52] logging.py:143 >> {'loss': 1.4863, 'learning_rate': 4.9750e-06, 'epoch': 2.35, 'throughput': 1913.31}
[INFO|2025-05-18 16:58:06] logging.py:143 >> {'loss': 1.4528, 'learning_rate': 2.7440e-06, 'epoch': 2.51, 'throughput': 1926.75}
[INFO|2025-05-18 16:58:21] logging.py:143 >> {'loss': 1.3916, 'learning_rate': 1.1465e-06, 'epoch': 2.67, 'throughput': 1933.94}
[INFO|2025-05-18 16:58:35] logging.py:143 >> {'loss': 1.4927, 'learning_rate': 2.2788e-07, 'epoch': 2.83, 'throughput': 1947.73}
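The logged learning rates fall from just under 5e-5 toward zero, which matches a cosine decay with a peak of 5e-5, no warmup, and 93 total steps; the peak value and the absence of warmup are inferred from the curve, not stated in the log. A small sketch of that schedule:

import math

peak_lr, total_steps = 5e-5, 93   # peak is an assumption consistent with the logged values

def cosine_lr(step: int) -> float:
    """Cosine decay from peak_lr to 0 over total_steps, no warmup."""
    return 0.5 * peak_lr * (1.0 + math.cos(math.pi * step / total_steps))

print(cosine_lr(4))    # ~4.98e-05, close to the first logged value
print(cosine_lr(89))   # ~2.3e-07, close to the last logged value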
[INFO|2025-05-18 16:58:43] trainer.py:3984 >> Saving model checkpoint to saves/Qwen3-4B-Instruct/lora/train_2025-05-18-16-33-56/checkpoint-93
[INFO|2025-05-18 16:58:43] configuration_utils.py:693 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/config.json
[INFO|2025-05-18 16:58:43] configuration_utils.py:765 >> Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 9728,
  "max_position_embeddings": 40960,
  "max_window_layers": 36,
  "model_type": "qwen3",
  "num_attention_heads": 32,
  "num_hidden_layers": 36,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}
[INFO|2025-05-18 16:58:44] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Qwen3-4B-Instruct/lora/train_2025-05-18-16-33-56/checkpoint-93/tokenizer_config.json
[INFO|2025-05-18 16:58:44] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Qwen3-4B-Instruct/lora/train_2025-05-18-16-33-56/checkpoint-93/special_tokens_map.json
[INFO|2025-05-18 16:58:45] trainer.py:2681 >>

Training completed. Do not forget to share your model on huggingface.co/models =)
[INFO|2025-05-18 16:58:45] trainer.py:3984 >> Saving model checkpoint to saves/Qwen3-4B-Instruct/lora/train_2025-05-18-16-33-56
[INFO|2025-05-18 16:58:45] configuration_utils.py:693 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-4B/snapshots/82d62bb073771e7a1ea59435f548908540217d1f/config.json
[INFO|2025-05-18 16:58:45] configuration_utils.py:765 >> Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 9728,
  "max_position_embeddings": 40960,
  "max_window_layers": 36,
  "model_type": "qwen3",
  "num_attention_heads": 32,
  "num_hidden_layers": 36,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}
[INFO|2025-05-18 16:58:46] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Qwen3-4B-Instruct/lora/train_2025-05-18-16-33-56/tokenizer_config.json
[INFO|2025-05-18 16:58:46] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Qwen3-4B-Instruct/lora/train_2025-05-18-16-33-56/special_tokens_map.json
[WARNING|2025-05-18 16:58:46] logging.py:148 >> No metric eval_loss to plot.
[WARNING|2025-05-18 16:58:46] logging.py:148 >> No metric eval_accuracy to plot.
[INFO|2025-05-18 16:58:46] modelcard.py:450 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
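The run leaves a LoRA adapter (not a merged full model) in saves/Qwen3-4B-Instruct/lora/train_2025-05-18-16-33-56. A minimal sketch of loading it for inference with peft, assuming the adapter files written by this run are available at that path:

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")

# Attach the trained LoRA weights on top of the frozen base model.
model = PeftModel.from_pretrained(base, "saves/Qwen3-4B-Instruct/lora/train_2025-05-18-16-33-56")
model = model.merge_and_unload()   # optional: fold the adapter into the base weights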