Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit unsloth Training Example
Community Article · Published March 24, 2025
System
Ubuntu 22.04
unsloth install
python3 -m venv Llama-3.1-Nemotron-Nano-Train
source Llama-3.1-Nemotron-Nano-Train/bin/activate
pip install unsloth
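Before training, you can optionally confirm from inside the activated environment that PyTorch sees a CUDA GPU and whether bfloat16 is supported (this decides the fp16/bf16 flags used later in the trainer). A minimal sketch:
import torch

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    print("BF16 supported:", torch.cuda.is_bf16_supported())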
unsloth Train
from unsloth import FastLanguageModel
import torch
# Define model parameters
max_seq_length = 2048 # Maximum sequence length for the model
dtype = None # None = auto-detect; float16 for Tesla T4/V100, bfloat16 for Ampere+ GPUs
load_in_4bit = True # Use 4-bit quantization to reduce memory usage
# Load the pre-trained model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="aifeifei798/Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit",  # Hugging Face repo id (or local path) of the 4-bit model
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # Needed for gated models such as meta-llama/Llama-2-7b-hf
)
# Apply PEFT (Parameter-Efficient Fine-Tuning) to the model
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Rank for LoRA (Low-Rank Adaptation); choose any number > 0
    target_modules=[  # Target modules for LoRA
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,  # Scaling factor for LoRA
    lora_dropout=0,  # Dropout rate for LoRA
    bias="none",  # Bias setting for LoRA
    use_gradient_checkpointing="unsloth",  # Use gradient checkpointing for memory efficiency
    random_state=3407,
    use_rslora=False,  # Option to use rank-stabilized LoRA
    loftq_config=None,  # Configuration for LoftQ
)
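# Optional (not in the original script): report how many parameters LoRA actually trains,
# typically well under 1% of the 8B base model
model.print_trainable_parameters()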
from datasets import load_dataset
# Define a function to format prompts
def formatting_prompts_func(examples):
    texts = []
    inputs = examples["input"]
    outputs = examples["output"]
    for input, output in zip(inputs, outputs):
        text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
detailed thinking on<|eot_id|><|start_header_id|>user<|end_header_id|>
{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>{output}<|eot_id|>"""
        texts.append(text)
    return {"text": texts}
# Load the dataset
dataset = load_dataset("aifeifei798/Chinese-DeepSeek-R1-Distill-data-110k-alpaca", split="train")
# Apply the formatting function to the dataset
dataset = dataset.map(formatting_prompts_func, batched=True)
print(dataset[0]) # Print the first example to verify formatting
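# The template above follows the Llama 3.1 chat format with the Nemotron-style
# "detailed thinking on" system prompt, which turns on the model's reasoning mode.
# Optional sanity check (not in the original script): confirm the longest of the first
# 100 formatted examples still fits within max_seq_length.
sample_lengths = [len(tokenizer(t)["input_ids"]) for t in dataset[:100]["text"]]
print("Longest sampled example:", max(sample_lengths), "tokens; limit:", max_seq_length)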
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
# Initialize the trainer for supervised fine-tuning
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=16,
    packing=False,  # Packing can speed up training on short sequences; disabled here
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=30,  # Number of training steps; roughly 15,000 steps are recommended for a full run
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),  # Use FP16 if BF16 is not supported
        bf16=is_bfloat16_supported(),  # Use BF16 if supported
        logging_steps=1,
        optim="adamw_8bit",  # Use 8-bit AdamW optimizer to save memory
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",  # Disable reporting to external services
        save_steps=5,  # Save a checkpoint every 5 steps
        save_total_limit=10,  # Keep only the last 10 checkpoints
        # load_best_model_at_end=False,  # Keep this False so training can resume from the latest checkpoint
    ),
)
# Train the model
trainer_stats = trainer.train()
# To resume an interrupted run instead, point trainer.train at the latest checkpoint directory:
# checkpoint_path = "outputs/checkpoint-XXXX"  # Replace XXXX with the latest checkpoint step
# trainer.train(resume_from_checkpoint=checkpoint_path)
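# Optional (not in the original script): print a few statistics returned by trainer.train()
print(f"Training runtime: {trainer_stats.metrics['train_runtime']:.1f} seconds")
print(f"Final training loss: {trainer_stats.training_loss:.4f}")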
# Save the fine-tuned model and tokenizer
model.save_pretrained("Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit-lora")
tokenizer.save_pretrained("Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit-lora")
# Merge the LoRA adapters into the base weights and save the full model with its tokenizer
model.save_pretrained_merged("Llama-3.1-Nemotron-Nano-8B-v1-bnb-Chinese", tokenizer)
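After training, the saved LoRA adapters (or the merged model directory) can be loaded back for a quick test. The sketch below is an assumption-based example: it points model_name at the adapter directory saved above, switches on Unsloth's faster inference mode, and reuses the same "detailed thinking on" prompt format used during training.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit-lora",  # LoRA directory saved above
    max_seq_length=2048,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # enable Unsloth's faster inference mode

prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
detailed thinking on<|eot_id|><|start_header_id|>user<|end_header_id|>
请简要介绍一下勾股定理。<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""  # "Briefly explain the Pythagorean theorem."

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=512, use_cache=True)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
To test the merged model instead, point model_name at "Llama-3.1-Nemotron-Nano-8B-v1-bnb-Chinese".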