Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit unsloth Training Example
Community Article · Published March 24, 2025
System
Ubuntu 22.04
unsloth install
python3 -m venv Llama-3.1-Nemotron-Nano-Train
source Llama-3.1-Nemotron-Nano-Train/bin/activate
pip install unsloth
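Before training, you can optionally confirm from inside the activated environment that PyTorch sees a CUDA GPU and whether bfloat16 is supported (this decides the fp16/bf16 flags used later in the trainer). A minimal sketch:
import torch

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    print("BF16 supported:", torch.cuda.is_bf16_supported())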
unsloth Train
from unsloth import FastLanguageModel
import torch
# Define model parameters
max_seq_length = 2048 # Maximum sequence length for the model
dtype = None # None = auto-detect; float16 for Tesla T4/V100, bfloat16 for Ampere+ GPUs
load_in_4bit = True # Use 4-bit quantization to reduce memory usage
# Load the pre-trained model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="aifeifei798/Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit",  # Hugging Face repo id (or local path) of the 4-bit model
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # Needed for gated models such as meta-llama/Llama-2-7b-hf
)
# Apply PEFT (Parameter-Efficient Fine-Tuning) to the model
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Rank for LoRA (Low-Rank Adaptation); choose any number > 0
    target_modules=[  # Target modules for LoRA
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,  # Scaling factor for LoRA
    lora_dropout=0,  # Dropout rate for LoRA
    bias="none",  # Bias setting for LoRA
    use_gradient_checkpointing="unsloth",  # Use gradient checkpointing for memory efficiency
    random_state=3407,
    use_rslora=False,  # Option to use rank-stabilized LoRA
    loftq_config=None,  # Configuration for LoftQ
)
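# Optional (not in the original script): report how many parameters LoRA actually trains,
# typically well under 1% of the 8B base model
model.print_trainable_parameters()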
from datasets import load_dataset
# Define a function to format prompts
def formatting_prompts_func(examples):
    texts = []
    inputs = examples["input"]
    outputs = examples["output"]
    for input, output in zip(inputs, outputs):
        text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
detailed thinking on<|eot_id|><|start_header_id|>user<|end_header_id|>
{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>{output}<|eot_id|>"""
        texts.append(text)
    return {"text": texts}
# Load the dataset
dataset = load_dataset("aifeifei798/Chinese-DeepSeek-R1-Distill-data-110k-alpaca", split="train")
# Apply the formatting function to the dataset
dataset = dataset.map(formatting_prompts_func, batched=True)
print(dataset[0]) # Print the first example to verify formatting
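# The template above follows the Llama 3.1 chat format with the Nemotron-style
# "detailed thinking on" system prompt, which turns on the model's reasoning mode.
# Optional sanity check (not in the original script): confirm the longest of the first
# 100 formatted examples still fits within max_seq_length.
sample_lengths = [len(tokenizer(t)["input_ids"]) for t in dataset[:100]["text"]]
print("Longest sampled example:", max(sample_lengths), "tokens; limit:", max_seq_length)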
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
# Initialize the trainer for supervised fine-tuning
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=16,
    packing=False,  # Packing can speed up training on short sequences; disabled here
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=30,  # Number of training steps; roughly 15,000 steps are recommended for a full run
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),  # Use FP16 if BF16 is not supported
        bf16=is_bfloat16_supported(),  # Use BF16 if supported
        logging_steps=1,
        optim="adamw_8bit",  # Use 8-bit AdamW optimizer to save memory
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",  # Disable reporting to external services
        save_steps=5,  # Save a checkpoint every 5 steps
        save_total_limit=10,  # Keep only the last 10 checkpoints
        # load_best_model_at_end=False,  # Keep this False so training can resume from the latest checkpoint
    ),
)
# Train the model
trainer_stats = trainer.train()
# To resume an interrupted run instead, point trainer.train at the latest checkpoint directory:
# checkpoint_path = "outputs/checkpoint-XXXX"  # Replace XXXX with the latest checkpoint step
# trainer.train(resume_from_checkpoint=checkpoint_path)
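# Optional (not in the original script): print a few statistics returned by trainer.train()
print(f"Training runtime: {trainer_stats.metrics['train_runtime']:.1f} seconds")
print(f"Final training loss: {trainer_stats.training_loss:.4f}")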
# Save the fine-tuned model and tokenizer
model.save_pretrained("Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit-lora")
tokenizer.save_pretrained("Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit-lora")
# Merge the LoRA adapters into the base weights and save the full model with its tokenizer
model.save_pretrained_merged("Llama-3.1-Nemotron-Nano-8B-v1-bnb-Chinese", tokenizer)
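After training, the saved LoRA adapters (or the merged model directory) can be loaded back for a quick test. The sketch below is an assumption-based example: it points model_name at the adapter directory saved above, switches on Unsloth's faster inference mode, and reuses the same "detailed thinking on" prompt format used during training.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit-lora",  # LoRA directory saved above
    max_seq_length=2048,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # enable Unsloth's faster inference mode

prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
detailed thinking on<|eot_id|><|start_header_id|>user<|end_header_id|>
请简要介绍一下勾股定理。<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""  # "Briefly explain the Pythagorean theorem."

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=512, use_cache=True)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
To test the merged model instead, point model_name at "Llama-3.1-Nemotron-Nano-8B-v1-bnb-Chinese".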