Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit unsloth training example

Community Article · Published March 24, 2025

System

Ubuntu 22.04

unsloth install

python3 -m venv Llama-3.1-Nemotron-Nano-Train
source Llama-3.1-Nemotron-Nano-Train/bin/activate
pip install unsloth
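
Before training, it is worth confirming that the environment can see a CUDA GPU. A minimal check (torch is pulled in as a dependency of unsloth):

import torch

# Sanity check: unsloth requires a CUDA-capable GPU.
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))
    print("BF16 supported:", torch.cuda.is_bf16_supported())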

unsloth Train

from unsloth import FastLanguageModel
import torch

# Define model parameters
max_seq_length = 2048  # Maximum sequence length for the model
dtype = None  # Auto-detect data type; use float16 on Tesla T4/V100, bfloat16 on Ampere or newer
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage

# Load the pre-trained model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="aifeifei798/Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit",  # Local path to the model
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # Use a token if using gated models like meta-llama/Llama-2-7b-hf
)
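
# Optional sanity check: confirm that the special tokens used in the prompt format
# below are known to the tokenizer (assumes the Llama-3.1 tokenizer ships with this checkpoint).
print("EOS token:", tokenizer.eos_token)
print("<|eot_id|> id:", tokenizer.convert_tokens_to_ids("<|eot_id|>"))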

# Apply PEFT (Parameter-Efficient Fine-Tuning) to the model
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Rank for LoRA (Low-Rank Adaptation); choose any number > 0
    target_modules=[  # Target modules for LoRA
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,  # Scaling factor for LoRA
    lora_dropout=0,  # Dropout rate for LoRA
    bias="none",  # Bias setting for LoRA
    use_gradient_checkpointing="unsloth",  # Use gradient checkpointing for memory efficiency
    random_state=3407,
    use_rslora=False,  # Option to use rank-stabilized LoRA
    loftq_config=None,  # Configuration for LoftQ
)
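
# Optional: report how many parameters LoRA actually trains
# (assumes the returned model exposes PEFT's print_trainable_parameters helper).
model.print_trainable_parameters()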

from datasets import load_dataset

# Define a function to format prompts
def formatting_prompts_func(examples):
    texts = []
    inputs = examples["input"]
    outputs = examples["output"]
    for input_text, output_text in zip(inputs, outputs):
        text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

detailed thinking on<|eot_id|><|start_header_id|>user<|end_header_id|>

{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{output_text}<|eot_id|>"""
        texts.append(text)
    return {"text": texts}

# Load the dataset
dataset = load_dataset("aifeifei798/Chinese-DeepSeek-R1-Distill-data-110k-alpaca", split="train")

# Apply the formatting function to the dataset
dataset = dataset.map(formatting_prompts_func, batched=True)
print(dataset[0])  # Print the first example to verify formatting
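
# Optional check (illustrative sketch): verify that the formatted prompts fit within
# max_seq_length, since longer examples would be truncated during training.
sample_lengths = [len(tokenizer(t)["input_ids"]) for t in dataset[:100]["text"]]
print(f"Longest of first 100 examples: {max(sample_lengths)} tokens (limit: {max_seq_length})")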

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Initialize the trainer for supervised fine-tuning
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=16,
    packing=False,  # Sequence packing can speed up training on short sequences; disabled here for simplicity
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=30,  # Number of training steps for this demo; roughly 15,000 steps are recommended for a full run on this dataset
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),  # Use FP16 if BF16 is not supported
        bf16=is_bfloat16_supported(),  # Use BF16 if supported
        logging_steps=1,
        optim="adamw_8bit",  # Use 8-bit AdamW optimizer
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",  # Disable reporting to external services
        save_steps=5,  # Save the model every 5 steps
        save_total_limit=10  # Keep only the last 10 checkpoints
        # load_best_model_at_end=False,  # Ensure this is False to resume training
    ),
)

# Train the model
trainer_stats = trainer.train()
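
# Optional: summarize the run; trainer_stats is a transformers TrainOutput
# whose .metrics dict holds loss, runtime, and throughput numbers.
print("Train loss:", trainer_stats.metrics.get("train_loss"))
print("Runtime (s):", trainer_stats.metrics.get("train_runtime"))
print("Peak GPU memory (GB):", round(torch.cuda.max_memory_reserved() / 1024**3, 2))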

# Define the path to the latest checkpoint directory
# checkpoint_path = "outputs/checkpoint-XXXX"  # Replace XXXX with the latest checkpoint step
# trainer.train(resume_from_checkpoint=checkpoint_path)

# Save the fine-tuned model and tokenizer
model.save_pretrained("Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit-lora")
tokenizer.save_pretrained("Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit-lora")

# Save the merged model and tokenizer
model.save_pretrained_merged("Llama-3.1-Nemotron-Nano-8B-v1-bnb-Chinese", tokenizer)
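
unsloth inference test

A quick way to sanity-check the fine-tuned LoRA is to generate one reply with unsloth's inference mode. This is a minimal sketch: the prompt reuses the training format above (the "detailed thinking on" system message and the Llama-3.1 special tokens), and the example question is only an illustration.

from unsloth import FastLanguageModel

# Switch the model into unsloth's faster inference mode.
FastLanguageModel.for_inference(model)

# Example question in Chinese ("Please introduce yourself"), matching the Chinese fine-tuning data.
prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

detailed thinking on<|eot_id|><|start_header_id|>user<|end_header_id|>

请用中文介绍一下你自己。<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=256, use_cache=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))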

4-bit model

Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit
