[INFO|2024-12-06 19:39:30] parser.py:355 >> Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, compute dtype: torch.bfloat16

[INFO|2024-12-06 19:39:30] configuration_utils.py:733 >> loading configuration file config.json from cache at /home/dj475/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
[INFO|2024-12-06 19:39:30] configuration_utils.py:800 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.4",
  "use_cache": true,
  "vocab_size": 128256
}
[INFO|2024-12-06 19:39:30] tokenization_utils_base.py:2289 >> loading file tokenizer.json from cache at /home/dj475/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer.json

[INFO|2024-12-06 19:39:30] tokenization_utils_base.py:2289 >> loading file added_tokens.json from cache at None

[INFO|2024-12-06 19:39:30] tokenization_utils_base.py:2289 >> loading file special_tokens_map.json from cache at /home/dj475/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/special_tokens_map.json

[INFO|2024-12-06 19:39:30] tokenization_utils_base.py:2289 >> loading file tokenizer_config.json from cache at /home/dj475/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer_config.json

[INFO|2024-12-06 19:39:30] tokenization_utils_base.py:2533 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
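For orientation, a minimal sketch of pulling the same base checkpoint and tokenizer that the log shows being read from the Hub cache; this is illustrative only and not part of the logged run.

```python
# Minimal sketch (not from this run): load the base model and tokenizer
# referenced in the log, in the same compute dtype the run reports.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.2-1B-Instruct"  # model id from the log above
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # matches "compute dtype: torch.bfloat16"
)
```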
[INFO|2024-12-06 19:39:30] logging.py:157 >> Replace eos token: <|eot_id|>

[INFO|2024-12-06 19:39:30] logging.py:157 >> Add pad token: <|eot_id|>

[INFO|2024-12-06 19:39:30] logging.py:157 >> Loading dataset radiology_sft_instruct.json...
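The dataset file itself is not shown in the log. A small sanity-check sketch like the one below, assuming radiology_sft_instruct.json is a JSON list of instruction/response records as is typical for SFT data in this toolkit, can confirm its size and fields before a run.

```python
# Hypothetical pre-flight check (not from the log): inspect the SFT dataset file.
# Assumes the file holds a JSON list of record dicts; adjust if the schema differs.
import json

with open("radiology_sft_instruct.json") as f:
    records = json.load(f)

print(f"{len(records)} records")   # the log later reports 2,831 train and 500 eval examples
print(sorted(records[0].keys()))   # e.g. instruction/input/output-style fields
```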
[INFO|2024-12-06 19:39:31] modeling_utils.py:3644 >> loading weights file model.safetensors from cache at /home/dj475/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/model.safetensors

[INFO|2024-12-06 19:39:31] modeling_utils.py:1572 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.

[INFO|2024-12-06 19:39:31] configuration_utils.py:1038 >> Generate config GenerationConfig {
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ]
}
[INFO|2024-12-06 19:39:32] modeling_utils.py:4473 >> All model checkpoint weights were used when initializing LlamaForCausalLM.

[INFO|2024-12-06 19:39:32] modeling_utils.py:4481 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.

[INFO|2024-12-06 19:39:32] configuration_utils.py:993 >> loading configuration file generation_config.json from cache at /home/dj475/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/generation_config.json

[INFO|2024-12-06 19:39:32] configuration_utils.py:1038 >> Generate config GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "temperature": 0.6,
  "top_p": 0.9
}
[INFO|2024-12-06 19:39:32] logging.py:157 >> Gradient checkpointing enabled.

[INFO|2024-12-06 19:39:32] logging.py:157 >> Using torch SDPA for faster training and inference.

[INFO|2024-12-06 19:39:32] logging.py:157 >> Upcasting trainable params to float32.

[INFO|2024-12-06 19:39:32] logging.py:157 >> Fine-tuning method: Full

[INFO|2024-12-06 19:39:32] logging.py:157 >> trainable params: 1,235,814,400 || all params: 1,235,814,400 || trainable%: 100.0000
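The reported parameter count follows directly from the LlamaConfig above (tied input/output embeddings, no attention or MLP biases). A quick arithmetic check:

```python
# Recompute the 1,235,814,400 parameters from the config values in this log.
vocab, hidden, layers = 128256, 2048, 16
kv_heads, head_dim, inter = 8, 64, 8192

embed = vocab * hidden                                          # input embedding, tied with the LM head
attn = hidden * hidden * 2 + hidden * kv_heads * head_dim * 2   # q/o projections + k/v projections
mlp = hidden * inter * 3                                        # gate, up, down projections
norms = 2 * hidden                                              # two RMSNorm weights per layer
per_layer = attn + mlp + norms

total = embed + layers * per_layer + hidden                     # plus the final RMSNorm
print(f"{total:,}")                                             # -> 1,235,814,400
```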
[INFO|2024-12-06 19:39:32] trainer.py:648 >> Using auto half precision backend

[INFO|2024-12-06 19:39:33] trainer.py:2134 >> ***** Running training *****

[INFO|2024-12-06 19:39:33] trainer.py:2135 >> Num examples = 2,831

[INFO|2024-12-06 19:39:33] trainer.py:2136 >> Num Epochs = 3

[INFO|2024-12-06 19:39:33] trainer.py:2137 >> Instantaneous batch size per device = 16

[INFO|2024-12-06 19:39:33] trainer.py:2140 >> Total train batch size (w. parallel, distributed & accumulation) = 16

[INFO|2024-12-06 19:39:33] trainer.py:2141 >> Gradient Accumulation steps = 1

[INFO|2024-12-06 19:39:33] trainer.py:2142 >> Total optimization steps = 531

[INFO|2024-12-06 19:39:33] trainer.py:2143 >> Number of trainable parameters = 1,235,814,400
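The step count is consistent with the numbers above: 2,831 examples at a per-device batch size of 16 with no gradient accumulation gives ceil(2831 / 16) = 177 updates per epoch, and 177 x 3 epochs = 531 optimization steps.

```python
# Check the total optimization steps implied by the training header above.
import math

steps_per_epoch = math.ceil(2831 / 16)   # 177 updates per epoch (last batch is partial)
print(steps_per_epoch * 3)               # -> 531 total optimization steps
```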
[INFO|2024-12-06 19:39:36] logging.py:157 >> {'loss': 3.3945, 'learning_rate': 4.9989e-05, 'epoch': 0.03, 'throughput': 4100.16}
[INFO|2024-12-06 19:39:39] logging.py:157 >> {'loss': 2.0329, 'learning_rate': 4.9956e-05, 'epoch': 0.06, 'throughput': 4303.91}
[INFO|2024-12-06 19:39:42] logging.py:157 >> {'loss': 1.9958, 'learning_rate': 4.9902e-05, 'epoch': 0.08, 'throughput': 4427.32}
[INFO|2024-12-06 19:39:44] logging.py:157 >> {'loss': 1.6684, 'learning_rate': 4.9825e-05, 'epoch': 0.11, 'throughput': 4474.24}
[INFO|2024-12-06 19:39:47] logging.py:157 >> {'loss': 1.5506, 'learning_rate': 4.9727e-05, 'epoch': 0.14, 'throughput': 4485.18}
[INFO|2024-12-06 19:39:50] logging.py:157 >> {'loss': 1.7502, 'learning_rate': 4.9607e-05, 'epoch': 0.17, 'throughput': 4536.25}
[INFO|2024-12-06 19:39:53] logging.py:157 >> {'loss': 1.6902, 'learning_rate': 4.9466e-05, 'epoch': 0.20, 'throughput': 4614.65}
[INFO|2024-12-06 19:39:55] logging.py:157 >> {'loss': 1.6861, 'learning_rate': 4.9303e-05, 'epoch': 0.23, 'throughput': 4651.17}
[INFO|2024-12-06 19:39:59] logging.py:157 >> {'loss': 1.4512, 'learning_rate': 4.9119e-05, 'epoch': 0.25, 'throughput': 4640.00}
[INFO|2024-12-06 19:40:02] logging.py:157 >> {'loss': 1.3701, 'learning_rate': 4.8914e-05, 'epoch': 0.28, 'throughput': 4663.91}
[INFO|2024-12-06 19:40:05] logging.py:157 >> {'loss': 1.3091, 'learning_rate': 4.8688e-05, 'epoch': 0.31, 'throughput': 4659.05}
[INFO|2024-12-06 19:40:07] logging.py:157 >> {'loss': 1.3839, 'learning_rate': 4.8441e-05, 'epoch': 0.34, 'throughput': 4680.20}
[INFO|2024-12-06 19:40:10] logging.py:157 >> {'loss': 1.3058, 'learning_rate': 4.8174e-05, 'epoch': 0.37, 'throughput': 4671.35}
[INFO|2024-12-06 19:40:13] logging.py:157 >> {'loss': 1.5883, 'learning_rate': 4.7887e-05, 'epoch': 0.40, 'throughput': 4679.82}
[INFO|2024-12-06 19:40:16] logging.py:157 >> {'loss': 1.4788, 'learning_rate': 4.7579e-05, 'epoch': 0.42, 'throughput': 4714.12}
[INFO|2024-12-06 19:40:19] logging.py:157 >> {'loss': 0.9295, 'learning_rate': 4.7252e-05, 'epoch': 0.45, 'throughput': 4701.46}
[INFO|2024-12-06 19:40:22] logging.py:157 >> {'loss': 1.4156, 'learning_rate': 4.6905e-05, 'epoch': 0.48, 'throughput': 4714.29}
[INFO|2024-12-06 19:40:24] logging.py:157 >> {'loss': 1.2105, 'learning_rate': 4.6539e-05, 'epoch': 0.51, 'throughput': 4725.01}
[INFO|2024-12-06 19:40:27] logging.py:157 >> {'loss': 1.5074, 'learning_rate': 4.6154e-05, 'epoch': 0.54, 'throughput': 4727.45}
[INFO|2024-12-06 19:40:30] logging.py:157 >> {'loss': 1.1506, 'learning_rate': 4.5751e-05, 'epoch': 0.56, 'throughput': 4723.05}
[INFO|2024-12-06 19:40:33] logging.py:157 >> {'loss': 1.6196, 'learning_rate': 4.5329e-05, 'epoch': 0.59, 'throughput': 4734.76}
[INFO|2024-12-06 19:40:36] logging.py:157 >> {'loss': 1.4498, 'learning_rate': 4.4890e-05, 'epoch': 0.62, 'throughput': 4739.72}
[INFO|2024-12-06 19:40:38] logging.py:157 >> {'loss': 1.3215, 'learning_rate': 4.4433e-05, 'epoch': 0.65, 'throughput': 4734.64}
[INFO|2024-12-06 19:40:41] logging.py:157 >> {'loss': 1.1256, 'learning_rate': 4.3960e-05, 'epoch': 0.68, 'throughput': 4728.86}
[INFO|2024-12-06 19:40:44] logging.py:157 >> {'loss': 1.0795, 'learning_rate': 4.3469e-05, 'epoch': 0.71, 'throughput': 4727.11}
[INFO|2024-12-06 19:40:47] logging.py:157 >> {'loss': 1.5119, 'learning_rate': 4.2963e-05, 'epoch': 0.73, 'throughput': 4733.60}
[INFO|2024-12-06 19:40:50] logging.py:157 >> {'loss': 1.2050, 'learning_rate': 4.2441e-05, 'epoch': 0.76, 'throughput': 4740.20}
[INFO|2024-12-06 19:40:53] logging.py:157 >> {'loss': 1.1215, 'learning_rate': 4.1903e-05, 'epoch': 0.79, 'throughput': 4744.81}
[INFO|2024-12-06 19:40:55] logging.py:157 >> {'loss': 0.9398, 'learning_rate': 4.1351e-05, 'epoch': 0.82, 'throughput': 4736.75}
[INFO|2024-12-06 19:40:58] logging.py:157 >> {'loss': 1.0699, 'learning_rate': 4.0785e-05, 'epoch': 0.85, 'throughput': 4725.37}
[INFO|2024-12-06 19:40:58] trainer.py:3819 >>
***** Running Evaluation *****

[INFO|2024-12-06 19:40:58] trainer.py:3821 >> Num examples = 500

[INFO|2024-12-06 19:40:58] trainer.py:3824 >> Batch size = 16

[INFO|2024-12-06 19:41:01] trainer.py:3503 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-150

[INFO|2024-12-06 19:41:01] configuration_utils.py:472 >> Configuration saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-150/config.json

[INFO|2024-12-06 19:41:01] configuration_utils.py:807 >> Configuration saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-150/generation_config.json

[INFO|2024-12-06 19:41:48] modeling_utils.py:2765 >> Model weights saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-150/model.safetensors

[INFO|2024-12-06 19:41:48] tokenization_utils_base.py:2702 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-150/tokenizer_config.json

[INFO|2024-12-06 19:41:48] tokenization_utils_base.py:2711 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-150/special_tokens_map.json
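Each checkpoint-* directory written here is a complete Hugging Face model folder (config, generation config, safetensors weights, tokenizer files), so an intermediate state such as checkpoint-150 can be reloaded directly. A sketch, assuming the run directory is accessible:

```python
# Illustrative only: reload the intermediate checkpoint saved above for inspection or evaluation.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

ckpt = "saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-150"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.bfloat16)
```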
[INFO|2024-12-06 19:43:23] logging.py:157 >> {'loss': 1.6687, 'learning_rate': 4.0204e-05, 'epoch': 0.88, 'throughput': 1800.34}
[INFO|2024-12-06 19:43:26] logging.py:157 >> {'loss': 1.0751, 'learning_rate': 3.9611e-05, 'epoch': 0.90, 'throughput': 1835.70}
[INFO|2024-12-06 19:43:28] logging.py:157 >> {'loss': 1.2790, 'learning_rate': 3.9004e-05, 'epoch': 0.93, 'throughput': 1872.03}
[INFO|2024-12-06 19:43:31] logging.py:157 >> {'loss': 1.2999, 'learning_rate': 3.8386e-05, 'epoch': 0.96, 'throughput': 1905.13}
[INFO|2024-12-06 19:43:34] logging.py:157 >> {'loss': 1.0235, 'learning_rate': 3.7755e-05, 'epoch': 0.99, 'throughput': 1937.16}
[INFO|2024-12-06 19:43:37] logging.py:157 >> {'loss': 1.0089, 'learning_rate': 3.7114e-05, 'epoch': 1.02, 'throughput': 1972.85}
[INFO|2024-12-06 19:43:40] logging.py:157 >> {'loss': 0.7540, 'learning_rate': 3.6462e-05, 'epoch': 1.05, 'throughput': 2007.23}
[INFO|2024-12-06 19:43:43] logging.py:157 >> {'loss': 0.8852, 'learning_rate': 3.5799e-05, 'epoch': 1.07, 'throughput': 2045.69}
[INFO|2024-12-06 19:43:46] logging.py:157 >> {'loss': 0.8851, 'learning_rate': 3.5128e-05, 'epoch': 1.10, 'throughput': 2075.94}
[INFO|2024-12-06 19:43:48] logging.py:157 >> {'loss': 0.7020, 'learning_rate': 3.4447e-05, 'epoch': 1.13, 'throughput': 2106.65}
[INFO|2024-12-06 19:43:51] logging.py:157 >> {'loss': 0.7118, 'learning_rate': 3.3759e-05, 'epoch': 1.16, 'throughput': 2139.19}
[INFO|2024-12-06 19:43:54] logging.py:157 >> {'loss': 0.6800, 'learning_rate': 3.3062e-05, 'epoch': 1.19, 'throughput': 2168.32}
[INFO|2024-12-06 19:43:57] logging.py:157 >> {'loss': 0.6594, 'learning_rate': 3.2359e-05, 'epoch': 1.21, 'throughput': 2192.39}
[INFO|2024-12-06 19:44:00] logging.py:157 >> {'loss': 0.7456, 'learning_rate': 3.1649e-05, 'epoch': 1.24, 'throughput': 2225.18}
[INFO|2024-12-06 19:44:02] logging.py:157 >> {'loss': 0.7569, 'learning_rate': 3.0933e-05, 'epoch': 1.27, 'throughput': 2246.34}
[INFO|2024-12-06 19:44:05] logging.py:157 >> {'loss': 0.7629, 'learning_rate': 3.0212e-05, 'epoch': 1.30, 'throughput': 2275.84}
[INFO|2024-12-06 19:44:08] logging.py:157 >> {'loss': 0.7041, 'learning_rate': 2.9487e-05, 'epoch': 1.33, 'throughput': 2295.71}
[INFO|2024-12-06 19:44:11] logging.py:157 >> {'loss': 0.8789, 'learning_rate': 2.8757e-05, 'epoch': 1.36, 'throughput': 2317.35}
[INFO|2024-12-06 19:44:13] logging.py:157 >> {'loss': 0.6578, 'learning_rate': 2.8025e-05, 'epoch': 1.38, 'throughput': 2340.72}
[INFO|2024-12-06 19:44:16] logging.py:157 >> {'loss': 0.5401, 'learning_rate': 2.7289e-05, 'epoch': 1.41, 'throughput': 2360.13}
[INFO|2024-12-06 19:44:19] logging.py:157 >> {'loss': 0.8656, 'learning_rate': 2.6552e-05, 'epoch': 1.44, 'throughput': 2382.13}
[INFO|2024-12-06 19:44:21] logging.py:157 >> {'loss': 0.7928, 'learning_rate': 2.5813e-05, 'epoch': 1.47, 'throughput': 2401.29}
[INFO|2024-12-06 19:44:24] logging.py:157 >> {'loss': 0.7717, 'learning_rate': 2.5074e-05, 'epoch': 1.50, 'throughput': 2421.66}
[INFO|2024-12-06 19:44:27] logging.py:157 >> {'loss': 0.6618, 'learning_rate': 2.4334e-05, 'epoch': 1.53, 'throughput': 2444.57}
[INFO|2024-12-06 19:44:29] logging.py:157 >> {'loss': 0.6259, 'learning_rate': 2.3596e-05, 'epoch': 1.55, 'throughput': 2466.41}
[INFO|2024-12-06 19:44:32] logging.py:157 >> {'loss': 0.7245, 'learning_rate': 2.2858e-05, 'epoch': 1.58, 'throughput': 2485.60}
[INFO|2024-12-06 19:44:35] logging.py:157 >> {'loss': 0.6821, 'learning_rate': 2.2122e-05, 'epoch': 1.61, 'throughput': 2508.09}
[INFO|2024-12-06 19:44:38] logging.py:157 >> {'loss': 0.7570, 'learning_rate': 2.1389e-05, 'epoch': 1.64, 'throughput': 2528.56}
[INFO|2024-12-06 19:44:41] logging.py:157 >> {'loss': 0.7308, 'learning_rate': 2.0659e-05, 'epoch': 1.67, 'throughput': 2550.84}
[INFO|2024-12-06 19:44:44] logging.py:157 >> {'loss': 0.7075, 'learning_rate': 1.9932e-05, 'epoch': 1.69, 'throughput': 2573.93}
[INFO|2024-12-06 19:44:44] trainer.py:3819 >>
***** Running Evaluation *****

[INFO|2024-12-06 19:44:44] trainer.py:3821 >> Num examples = 500

[INFO|2024-12-06 19:44:44] trainer.py:3824 >> Batch size = 16

[INFO|2024-12-06 19:44:47] trainer.py:3503 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-300

[INFO|2024-12-06 19:44:47] configuration_utils.py:472 >> Configuration saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-300/config.json

[INFO|2024-12-06 19:44:47] configuration_utils.py:807 >> Configuration saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-300/generation_config.json

[INFO|2024-12-06 19:45:34] modeling_utils.py:2765 >> Model weights saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-300/model.safetensors

[INFO|2024-12-06 19:45:34] tokenization_utils_base.py:2702 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-300/tokenizer_config.json

[INFO|2024-12-06 19:45:34] tokenization_utils_base.py:2711 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-300/special_tokens_map.json
[INFO|2024-12-06 19:47:04] logging.py:157 >> {'loss': 0.7277, 'learning_rate': 1.9211e-05, 'epoch': 1.72, 'throughput': 1800.85}
[INFO|2024-12-06 19:47:07] logging.py:157 >> {'loss': 0.6651, 'learning_rate': 1.8494e-05, 'epoch': 1.75, 'throughput': 1819.52}
[INFO|2024-12-06 19:47:09] logging.py:157 >> {'loss': 0.6795, 'learning_rate': 1.7783e-05, 'epoch': 1.78, 'throughput': 1835.81}
[INFO|2024-12-06 19:47:12] logging.py:157 >> {'loss': 0.5096, 'learning_rate': 1.7078e-05, 'epoch': 1.81, 'throughput': 1850.11}
[INFO|2024-12-06 19:47:15] logging.py:157 >> {'loss': 0.7516, 'learning_rate': 1.6380e-05, 'epoch': 1.84, 'throughput': 1872.39}
[INFO|2024-12-06 19:47:18] logging.py:157 >> {'loss': 0.7133, 'learning_rate': 1.5690e-05, 'epoch': 1.86, 'throughput': 1888.86}
[INFO|2024-12-06 19:47:21] logging.py:157 >> {'loss': 0.6808, 'learning_rate': 1.5008e-05, 'epoch': 1.89, 'throughput': 1909.54}
[INFO|2024-12-06 19:47:23] logging.py:157 >> {'loss': 0.6812, 'learning_rate': 1.4334e-05, 'epoch': 1.92, 'throughput': 1923.80}
[INFO|2024-12-06 19:47:26] logging.py:157 >> {'loss': 0.7640, 'learning_rate': 1.3670e-05, 'epoch': 1.95, 'throughput': 1945.55}
[INFO|2024-12-06 19:47:30] logging.py:157 >> {'loss': 0.5694, 'learning_rate': 1.3016e-05, 'epoch': 1.98, 'throughput': 1967.33}
[INFO|2024-12-06 19:47:32] logging.py:157 >> {'loss': 0.4464, 'learning_rate': 1.2372e-05, 'epoch': 2.01, 'throughput': 1981.07}
[INFO|2024-12-06 19:47:35] logging.py:157 >> {'loss': 0.2825, 'learning_rate': 1.1739e-05, 'epoch': 2.03, 'throughput': 1995.07}
[INFO|2024-12-06 19:47:38] logging.py:157 >> {'loss': 0.2687, 'learning_rate': 1.1118e-05, 'epoch': 2.06, 'throughput': 2011.80}
[INFO|2024-12-06 19:47:41] logging.py:157 >> {'loss': 0.2449, 'learning_rate': 1.0510e-05, 'epoch': 2.09, 'throughput': 2030.67}
[INFO|2024-12-06 19:47:43] logging.py:157 >> {'loss': 0.2621, 'learning_rate': 9.9133e-06, 'epoch': 2.12, 'throughput': 2043.84}
[INFO|2024-12-06 19:47:46] logging.py:157 >> {'loss': 0.1924, 'learning_rate': 9.3303e-06, 'epoch': 2.15, 'throughput': 2057.82}
[INFO|2024-12-06 19:47:49] logging.py:157 >> {'loss': 0.3127, 'learning_rate': 8.7610e-06, 'epoch': 2.18, 'throughput': 2076.93}
[INFO|2024-12-06 19:47:52] logging.py:157 >> {'loss': 0.2310, 'learning_rate': 8.2059e-06, 'epoch': 2.20, 'throughput': 2089.35}
[INFO|2024-12-06 19:47:54] logging.py:157 >> {'loss': 0.2005, 'learning_rate': 7.6655e-06, 'epoch': 2.23, 'throughput': 2103.48}
[INFO|2024-12-06 19:47:57] logging.py:157 >> {'loss': 0.2251, 'learning_rate': 7.1403e-06, 'epoch': 2.26, 'throughput': 2118.49}
[INFO|2024-12-06 19:48:00] logging.py:157 >> {'loss': 0.2761, 'learning_rate': 6.6306e-06, 'epoch': 2.29, 'throughput': 2134.56}
[INFO|2024-12-06 19:48:03] logging.py:157 >> {'loss': 0.2018, 'learning_rate': 6.1371e-06, 'epoch': 2.32, 'throughput': 2146.32}
[INFO|2024-12-06 19:48:05] logging.py:157 >> {'loss': 0.2415, 'learning_rate': 5.6601e-06, 'epoch': 2.34, 'throughput': 2160.70}
[INFO|2024-12-06 19:48:08] logging.py:157 >> {'loss': 0.1822, 'learning_rate': 5.2000e-06, 'epoch': 2.37, 'throughput': 2171.98}
[INFO|2024-12-06 19:48:11] logging.py:157 >> {'loss': 0.1718, 'learning_rate': 4.7572e-06, 'epoch': 2.40, 'throughput': 2188.45}
[INFO|2024-12-06 19:48:14] logging.py:157 >> {'loss': 0.2641, 'learning_rate': 4.3321e-06, 'epoch': 2.43, 'throughput': 2202.61}
[INFO|2024-12-06 19:48:16] logging.py:157 >> {'loss': 0.2228, 'learning_rate': 3.9252e-06, 'epoch': 2.46, 'throughput': 2214.73}
[INFO|2024-12-06 19:48:19] logging.py:157 >> {'loss': 0.2269, 'learning_rate': 3.5366e-06, 'epoch': 2.49, 'throughput': 2225.55}
[INFO|2024-12-06 19:48:21] logging.py:157 >> {'loss': 0.2592, 'learning_rate': 3.1669e-06, 'epoch': 2.51, 'throughput': 2235.91}
[INFO|2024-12-06 19:48:24] logging.py:157 >> {'loss': 0.2458, 'learning_rate': 2.8162e-06, 'epoch': 2.54, 'throughput': 2248.32}
[INFO|2024-12-06 19:48:24] trainer.py:3819 >>
***** Running Evaluation *****

[INFO|2024-12-06 19:48:24] trainer.py:3821 >> Num examples = 500

[INFO|2024-12-06 19:48:24] trainer.py:3824 >> Batch size = 16

[INFO|2024-12-06 19:48:28] trainer.py:3503 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-450

[INFO|2024-12-06 19:48:28] configuration_utils.py:472 >> Configuration saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-450/config.json

[INFO|2024-12-06 19:48:28] configuration_utils.py:807 >> Configuration saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-450/generation_config.json

[INFO|2024-12-06 19:49:13] modeling_utils.py:2765 >> Model weights saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-450/model.safetensors

[INFO|2024-12-06 19:49:13] tokenization_utils_base.py:2702 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-450/tokenizer_config.json

[INFO|2024-12-06 19:49:13] tokenization_utils_base.py:2711 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-450/special_tokens_map.json
[INFO|2024-12-06 19:50:43] logging.py:157 >> {'loss': 0.2558, 'learning_rate': 2.4850e-06, 'epoch': 2.57, 'throughput': 1804.47}
[INFO|2024-12-06 19:50:45] logging.py:157 >> {'loss': 0.2414, 'learning_rate': 2.1734e-06, 'epoch': 2.60, 'throughput': 1815.49}
[INFO|2024-12-06 19:50:48] logging.py:157 >> {'loss': 0.1976, 'learning_rate': 1.8818e-06, 'epoch': 2.63, 'throughput': 1829.04}
[INFO|2024-12-06 19:50:51] logging.py:157 >> {'loss': 0.2398, 'learning_rate': 1.6105e-06, 'epoch': 2.66, 'throughput': 1842.24}
[INFO|2024-12-06 19:50:54] logging.py:157 >> {'loss': 0.1771, 'learning_rate': 1.3596e-06, 'epoch': 2.68, 'throughput': 1853.72}
[INFO|2024-12-06 19:50:57] logging.py:157 >> {'loss': 0.2150, 'learning_rate': 1.1294e-06, 'epoch': 2.71, 'throughput': 1865.52}
[INFO|2024-12-06 19:50:59] logging.py:157 >> {'loss': 0.2141, 'learning_rate': 9.2014e-07, 'epoch': 2.74, 'throughput': 1876.07}
[INFO|2024-12-06 19:51:02] logging.py:157 >> {'loss': 0.1785, 'learning_rate': 7.3191e-07, 'epoch': 2.77, 'throughput': 1888.15}
[INFO|2024-12-06 19:51:05] logging.py:157 >> {'loss': 0.1754, 'learning_rate': 5.6492e-07, 'epoch': 2.80, 'throughput': 1899.67}
[INFO|2024-12-06 19:51:07] logging.py:157 >> {'loss': 0.1936, 'learning_rate': 4.1930e-07, 'epoch': 2.82, 'throughput': 1910.28}
[INFO|2024-12-06 19:51:10] logging.py:157 >> {'loss': 0.2127, 'learning_rate': 2.9520e-07, 'epoch': 2.85, 'throughput': 1920.13}
[INFO|2024-12-06 19:51:13] logging.py:157 >> {'loss': 0.1950, 'learning_rate': 1.9271e-07, 'epoch': 2.88, 'throughput': 1931.88}
[INFO|2024-12-06 19:51:16] logging.py:157 >> {'loss': 0.2163, 'learning_rate': 1.1193e-07, 'epoch': 2.91, 'throughput': 1944.26}
[INFO|2024-12-06 19:51:18] logging.py:157 >> {'loss': 0.1973, 'learning_rate': 5.2924e-08, 'epoch': 2.94, 'throughput': 1953.53}
[INFO|2024-12-06 19:51:21] logging.py:157 >> {'loss': 0.2167, 'learning_rate': 1.5750e-08, 'epoch': 2.97, 'throughput': 1965.55}
[INFO|2024-12-06 19:51:24] logging.py:157 >> {'loss': 0.2383, 'learning_rate': 4.3754e-10, 'epoch': 2.99, 'throughput': 1978.38}
[INFO|2024-12-06 19:51:25] trainer.py:3503 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-531

[INFO|2024-12-06 19:51:25] configuration_utils.py:472 >> Configuration saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-531/config.json

[INFO|2024-12-06 19:51:25] configuration_utils.py:807 >> Configuration saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-531/generation_config.json

[INFO|2024-12-06 19:52:11] modeling_utils.py:2765 >> Model weights saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-531/model.safetensors

[INFO|2024-12-06 19:52:11] tokenization_utils_base.py:2702 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-531/tokenizer_config.json

[INFO|2024-12-06 19:52:11] tokenization_utils_base.py:2711 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/checkpoint-531/special_tokens_map.json
[INFO|2024-12-06 19:53:37] trainer.py:2394 >>

Training completed. Do not forget to share your model on huggingface.co/models =)
[INFO|2024-12-06 19:53:37] trainer.py:3503 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft

[INFO|2024-12-06 19:53:37] configuration_utils.py:472 >> Configuration saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/config.json

[INFO|2024-12-06 19:53:37] configuration_utils.py:807 >> Configuration saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/generation_config.json

[INFO|2024-12-06 19:54:24] modeling_utils.py:2765 >> Model weights saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/model.safetensors

[INFO|2024-12-06 19:54:24] tokenization_utils_base.py:2702 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/tokenizer_config.json

[INFO|2024-12-06 19:54:24] tokenization_utils_base.py:2711 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft/special_tokens_map.json
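With the final weights exported, the fine-tuned model can be queried using the sampling defaults recorded in the generation config above (do_sample, temperature 0.6, top_p 0.9). A minimal inference sketch, assuming the chat template shipped with the base tokenizer and a hypothetical prompt:

```python
# Illustrative inference with the exported SFT model; sampling parameters mirror
# the generation_config.json recorded earlier in this log.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "saves/Llama-3.2-1B-Instruct/full/llama3.2_1binst_full_sft"
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16, device_map="auto")

messages = [{"role": "user", "content": "Summarize the key findings of this radiology report: ..."}]
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
output = model.generate(inputs, max_new_tokens=256, do_sample=True, temperature=0.6, top_p=0.9)
print(tokenizer.decode(output[0][inputs.shape[-1]:], skip_special_tokens=True))
```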
[WARNING|2024-12-06 19:54:25] logging.py:162 >> No metric eval_accuracy to plot.

[INFO|2024-12-06 19:54:25] trainer.py:3819 >>
***** Running Evaluation *****

[INFO|2024-12-06 19:54:25] trainer.py:3821 >> Num examples = 500

[INFO|2024-12-06 19:54:25] trainer.py:3824 >> Batch size = 16

[INFO|2024-12-06 19:54:28] modelcard.py:449 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}