Spaces:
Runtime error
Runtime error
Upload folder using huggingface_hub
Browse files- DEPLOY_CHECKLIST.md +42 -97
- app.py +3 -12
- run_transformers_training.py +244 -110
- transformers_config.json +3 -0
DEPLOY_CHECKLIST.md
CHANGED
@@ -1,107 +1,52 @@
|
|
1 |
-
# Phi-4 Training
|
2 |
-
|
3 |
-
##
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
- [ ]
|
10 |
-
|
11 |
-
|
12 |
-
- [ ]
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
- [ ]
|
21 |
-
- [ ]
|
22 |
-
- [ ]
|
23 |
-
- [ ]
|
24 |
-
- [ ]
|
25 |
-
|
26 |
-
|
27 |
-
- [ ]
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
- [ ]
|
33 |
-
- [ ]
|
34 |
-
- [ ]
|
35 |
-
- [ ] Sequential sampler used in dataloader (no shuffling)
|
36 |
-
- [ ] Max sequence length of 2048 applied
|
37 |
-
- [ ] Format validation for first few examples enabled
|
38 |
-
|
39 |
-
### 4. Dependency Management ✓
|
40 |
-
|
41 |
-
- [ ] requirements.txt includes all necessary packages:
|
42 |
-
- [ ] unsloth
|
43 |
-
- [ ] peft
|
44 |
-
- [ ] bitsandbytes
|
45 |
-
- [ ] einops
|
46 |
-
- [ ] sentencepiece
|
47 |
-
- [ ] datasets
|
48 |
-
- [ ] transformers
|
49 |
-
- [ ] Optional packages marked as such (e.g., flash-attn)
|
50 |
-
- [ ] Dependency version constraints avoid known conflicts
|
51 |
-
|
52 |
-
### 5. Error Handling & Logging ✓
|
53 |
-
|
54 |
-
- [ ] Proper error catching for dataset loading
|
55 |
-
- [ ] Fallback mechanisms for chat template application
|
56 |
-
- [ ] Clear, concise log messages that work with HF Space interface
|
57 |
-
- [ ] Memory usage tracking at key points (start, end, periodic)
|
58 |
-
- [ ] Third-party loggers set to WARNING to reduce noise
|
59 |
-
- [ ] Low-verbosity log format for better HF Space compatibility
|
60 |
-
|
61 |
-
### 6. Training Setup ✓
|
62 |
-
|
63 |
-
- [ ] Number of epochs properly configured (default: 3)
|
64 |
-
- [ ] Learning rate appropriate (default: 2e-5)
|
65 |
-
- [ ] Warmup ratio set (default: 0.05)
|
66 |
-
- [ ] Checkpointing frequency set to reasonable value (default: 100 steps)
|
67 |
-
- [ ] Output directory correctly configured
|
68 |
-
- [ ] HuggingFace Hub parameters set correctly if pushing models
|
69 |
-
|
70 |
-
### 7. Pre-Flight Verification ✓
|
71 |
-
|
72 |
-
- [ ] No linting errors or indentation issues
|
73 |
-
- [ ] Updated config values are consistent across files
|
74 |
-
- [ ] Batch size × gradient accumulation × GPUs gives reasonable total batch
|
75 |
-
- [ ] Verified that requirements.txt matches actual imports in code
|
76 |
-
- [ ] Confirmed tokenizer settings match the model requirements
|
77 |
|
78 |
---
|
79 |
|
80 |
-
##
|
81 |
-
|
82 |
-
If you've made any configuration changes, record them here before deployment:
|
83 |
|
84 |
-
|
|
85 |
-
|
86 |
-
|
|
87 |
-
|
|
|
|
88 |
|
89 |
---
|
90 |
|
91 |
-
|
92 |
|
93 |
-
**
|
94 |
-
|
95 |
-
**Expected Training Speed**: ~XXX examples/second with current configuration
|
96 |
-
|
97 |
-
**Memory Requirements**: Peak usage expected to be ~20GB per GPU
|
98 |
-
|
99 |
-
**Common Issues to Watch For**:
|
100 |
-
- OOM errors on GPU 0: If seen, reduce batch size by 2 and increase grad accumulation by 1
|
101 |
-
- Imbalanced GPU usage: Check device mapping and FSDP configuration
|
102 |
-
- Slow training: Verify that all GPUs are being utilized efficiently
|
103 |
-
- Log flooding: Reduce verbosity of component logs (transformers, datasets, etc.)
|
104 |
-
|
105 |
-
---
|
106 |
|
107 |
*Last Updated: 2025-03-09*
|
|
|
1 |
+
# Phi-4 Training Critical Deployment Checklist
|
2 |
+
|
3 |
+
## Essential Configuration Requirements
|
4 |
+
|
5 |
+
### 1. Model Configuration
|
6 |
+
- [ ] Model name: `unsloth/phi-4-unsloth-bnb-4bit`
|
7 |
+
- [ ] BF16 precision enabled, FP16 disabled
|
8 |
+
- [ ] Appropriate sequence length (2048)
|
9 |
+
- [ ] LoRA parameters correctly configured (r: 32, alpha: 16)
|
10 |
+
|
11 |
+
### 2. Hardware & Resource Management
|
12 |
+
- [ ] Per-device batch size ≤ 16
|
13 |
+
- [ ] Gradient accumulation steps ≥ 3
|
14 |
+
- [ ] Gradient checkpointing enabled
|
15 |
+
- [ ] Memory usage limits properly set (85% of GPU capacity)
|
16 |
+
|
17 |
+
### 3. Critical Dataset Handling Rules
|
18 |
+
- [ ] **NO REORDERING of dataset entries** - original order must be preserved
|
19 |
+
- [ ] **NO COMBINING of separate entries** - each entry must remain distinct
|
20 |
+
- [ ] **SEQUENTIAL PROCESSING required** - entries must be processed one after another
|
21 |
+
- [ ] `sort_by_id` and `maintain_paper_order` flags properly set to preserve data sequence
|
22 |
+
- [ ] Sequential sampler used with no shuffling (`"shuffle": false`)
|
23 |
+
- [ ] Dataset sequential integrity verified with validation samples
|
24 |
+
- [ ] Conversation structure preserved (original format maintained)
|
25 |
+
|
26 |
+
### 4. Essential Error Handling
|
27 |
+
- [ ] Clear error catching for dataset loading issues
|
28 |
+
- [ ] Memory tracking at key training points
|
29 |
+
- [ ] Low-verbosity logging for HF Space compatibility
|
30 |
+
|
31 |
+
### 5. Training Core Requirements
|
32 |
+
- [ ] Appropriate learning rate (2e-5)
|
33 |
+
- [ ] Proper checkpointing frequency
|
34 |
+
- [ ] Hub settings correctly configured for model saving
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
---
|
37 |
|
38 |
+
## Pre-Deployment Verification
|
|
|
|
|
39 |
|
40 |
+
| Requirement | Status | Notes |
|
41 |
+
|-------------|--------|-------|
|
42 |
+
| Data sequential integrity | | Confirm entries processed in order |
|
43 |
+
| GPU memory within limits | | Check peak memory doesn't exceed 20GB per GPU |
|
44 |
+
| Training batch verification | | Verify first few batches maintain proper order |
|
45 |
|
46 |
---
|
47 |
|
48 |
+
**Current Hardware**: 4× NVIDIA L4 GPUs (24GB VRAM each)
|
49 |
|
50 |
+
**CRITICAL REMINDER**: Data sequence preservation is the highest priority - any shuffling, reordering, or combining of entries will compromise model quality.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
*Last Updated: 2025-03-09*
|
app.py
CHANGED
@@ -109,18 +109,9 @@ def display_config():
|
|
109 |
def start_training():
|
110 |
"""Start the training process."""
|
111 |
try:
|
112 |
-
#
|
113 |
-
log_info("
|
114 |
-
|
115 |
-
try:
|
116 |
-
result = subprocess.run(verify_cmd, shell=True, check=True, capture_output=True, text=True)
|
117 |
-
if "All critical checks passed!" not in result.stdout:
|
118 |
-
log_info("Verification found issues. Please review:")
|
119 |
-
log_info(result.stdout)
|
120 |
-
return "Verification detected potential issues. Please review the logs before proceeding."
|
121 |
-
except subprocess.CalledProcessError as e:
|
122 |
-
log_info(f"Verification failed: {e.stderr}")
|
123 |
-
return "Verification failed. Please check the logs for details."
|
124 |
|
125 |
# Start training
|
126 |
log_info("Starting training process...")
|
|
|
109 |
def start_training():
|
110 |
"""Start the training process."""
|
111 |
try:
|
112 |
+
# Log configuration check
|
113 |
+
log_info("Preparing to start training process...")
|
114 |
+
log_info("Using consolidated configuration from transformers_config.json")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
|
116 |
# Start training
|
117 |
log_info("Starting training process...")
|
run_transformers_training.py
CHANGED
@@ -8,6 +8,14 @@ import argparse
|
|
8 |
import logging
|
9 |
from datetime import datetime
|
10 |
import time
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
# Import Unsloth first, before other ML imports
|
13 |
try:
|
@@ -19,7 +27,6 @@ except ImportError:
|
|
19 |
logger = logging.getLogger(__name__)
|
20 |
logger.warning("Unsloth not available. Please install with: pip install unsloth")
|
21 |
|
22 |
-
import torch
|
23 |
from datasets import load_dataset
|
24 |
from transformers import (
|
25 |
AutoModelForCausalLM,
|
@@ -46,6 +53,9 @@ logging.getLogger("accelerate").setLevel(logging.WARNING)
|
|
46 |
logging.getLogger("torch").setLevel(logging.WARNING)
|
47 |
logging.getLogger("bitsandbytes").setLevel(logging.WARNING)
|
48 |
|
|
|
|
|
|
|
49 |
# Define a clean logging function for HF Space compatibility
|
50 |
def log_info(message):
|
51 |
"""Log information in a format compatible with Hugging Face Spaces"""
|
@@ -336,6 +346,45 @@ def load_dataset_with_mapping(dataset_config):
|
|
336 |
# Note: Explicitly NOT sorting the dataset to preserve original order
|
337 |
logger.info("Preserving original dataset order (no sorting)")
|
338 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
339 |
# Log examples without printing full content
|
340 |
if "conversations" in dataset.column_names:
|
341 |
sample_ids = [example['id'] for example in dataset.select(range(min(5, len(dataset))))]
|
@@ -532,37 +581,107 @@ class SimpleDataCollator:
|
|
532 |
|
533 |
class LoggingCallback(TrainerCallback):
|
534 |
def __init__(self):
|
|
|
|
|
535 |
self.last_log_time = time.time()
|
536 |
-
self.
|
|
|
|
|
|
|
537 |
|
538 |
def on_step_end(self, args, state, control, **kwargs):
|
539 |
# Log every 50 steps or every 5 minutes, whichever comes first
|
540 |
current_time = time.time()
|
541 |
|
542 |
-
#
|
543 |
-
if
|
544 |
-
|
545 |
-
|
546 |
-
|
547 |
-
|
548 |
-
|
549 |
-
|
550 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
551 |
|
552 |
-
|
553 |
-
|
554 |
-
|
555 |
-
|
556 |
-
|
557 |
-
|
558 |
-
|
559 |
-
|
560 |
-
|
561 |
-
|
562 |
-
|
563 |
-
|
564 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
565 |
def on_train_begin(self, args, state, control, **kwargs):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
566 |
log_info("=== Training is starting ===")
|
567 |
|
568 |
# Log important training parameters for visibility
|
@@ -571,9 +690,9 @@ class LoggingCallback(TrainerCallback):
|
|
571 |
log_info(f"Epochs: {args.num_train_epochs}")
|
572 |
|
573 |
# Log memory information in compact format
|
574 |
-
if
|
575 |
memory_info = []
|
576 |
-
for i in range(
|
577 |
allocated = torch.cuda.memory_allocated(i) / 1024**2
|
578 |
max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
|
579 |
memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
|
@@ -581,15 +700,18 @@ class LoggingCallback(TrainerCallback):
|
|
581 |
log_info(f"Initial memory usage - {', '.join(memory_info)}")
|
582 |
|
583 |
def on_train_end(self, args, state, control, **kwargs):
|
584 |
-
|
585 |
-
|
586 |
-
|
587 |
-
|
588 |
-
|
589 |
-
|
590 |
-
|
|
|
591 |
|
592 |
-
|
|
|
|
|
593 |
|
594 |
log_info(f"Total steps: {state.global_step}")
|
595 |
log_info(f"Final loss: {state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'}")
|
@@ -627,6 +749,15 @@ def main():
|
|
627 |
# Set up logging
|
628 |
log_info("Starting Phi-4 fine-tuning process")
|
629 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
630 |
# Parse arguments
|
631 |
args = parse_args()
|
632 |
|
@@ -645,64 +776,66 @@ def main():
|
|
645 |
else:
|
646 |
log_info("Running in non-distributed mode (single process)")
|
647 |
|
648 |
-
# Load all configurations
|
649 |
try:
|
650 |
configs = load_configs(args.config_dir)
|
651 |
|
652 |
-
# Extract specific configs
|
653 |
if not configs:
|
654 |
logger.error("Failed to load configuration")
|
655 |
return 1
|
|
|
|
|
|
|
|
|
|
|
656 |
|
657 |
# Verify configuration sections exist
|
658 |
-
if
|
659 |
logger.error("transformers_config.json not found or invalid")
|
660 |
return 1
|
661 |
|
662 |
-
if
|
663 |
logger.warning("Hardware configuration section not found in transformers_config.json. Using default hardware configuration.")
|
664 |
|
665 |
-
if
|
666 |
logger.error("Dataset configuration section not found in transformers_config.json")
|
667 |
return 1
|
668 |
|
669 |
# Validate model configuration
|
670 |
-
|
671 |
-
|
|
|
|
|
|
|
672 |
logger.error("Model name not specified in configuration")
|
673 |
logger.error("Please ensure 'name' is specified under 'model' in transformers_config.json")
|
674 |
return 1
|
675 |
|
676 |
-
model_name = model_config.get("model", {}).get("name") or model_config.get("model_name_or_path") or model_config.get("model_name")
|
677 |
log_info(f"Using model: {model_name}")
|
678 |
log_info("All configurations loaded successfully")
|
679 |
|
680 |
-
# Extract specific configs
|
681 |
-
model_config = configs["transformers"]
|
682 |
-
hardware_config = configs.get("hardware", {})
|
683 |
-
dataset_config = configs["dataset"]
|
684 |
-
|
685 |
# Apply hardware-specific settings if available
|
686 |
if hardware_config:
|
687 |
# Get training optimizations from hardware config
|
688 |
training_opts = hardware_config.get("training_optimizations", {})
|
689 |
|
690 |
# Apply batch size and gradient accumulation settings
|
691 |
-
if training_opts.get("per_device_batch_size") and
|
692 |
batch_size = training_opts.get("per_device_batch_size")
|
693 |
-
|
694 |
log_info(f"Applied hardware-optimized batch size: {batch_size}")
|
695 |
|
696 |
-
if training_opts.get("gradient_accumulation_steps") and
|
697 |
grad_steps = training_opts.get("gradient_accumulation_steps")
|
698 |
-
|
699 |
log_info(f"Applied hardware-optimized gradient accumulation: {grad_steps}")
|
700 |
|
701 |
# Apply memory optimizations
|
702 |
memory_opts = training_opts.get("memory_optimizations", {})
|
703 |
-
if memory_opts.get("use_gradient_checkpointing") is not None and
|
704 |
grad_ckpt = memory_opts.get("use_gradient_checkpointing")
|
705 |
-
|
706 |
log_info(f"Applied hardware-optimized gradient checkpointing: {grad_ckpt}")
|
707 |
|
708 |
# Apply system settings
|
@@ -720,38 +853,27 @@ def main():
|
|
720 |
return 1
|
721 |
|
722 |
# Set random seed for reproducibility
|
723 |
-
seed =
|
724 |
set_seed(seed)
|
725 |
log_info(f"Set random seed to {seed} for reproducibility")
|
726 |
|
727 |
-
#
|
728 |
-
if
|
729 |
-
# Empty CUDA cache
|
730 |
torch.cuda.empty_cache()
|
|
|
|
|
|
|
|
|
|
|
|
|
731 |
|
732 |
-
|
733 |
-
|
734 |
-
|
735 |
-
# Get memory fraction from hardware config
|
736 |
-
cuda_memory_fraction = hardware_config.get("system_settings", {}).get("cuda_memory_fraction", 0.85)
|
737 |
-
|
738 |
-
# Log initial memory information in a compact form
|
739 |
-
gpu_info = []
|
740 |
-
for i in range(torch.cuda.device_count()):
|
741 |
-
name = torch.cuda.get_device_name(i)
|
742 |
-
allocated = torch.cuda.memory_allocated(i) / 1024**3
|
743 |
-
total = torch.cuda.get_device_properties(i).total_memory / 1024**3
|
744 |
-
reserved_memory = total * cuda_memory_fraction
|
745 |
-
gpu_info.append(f"GPU {i}: {name} ({allocated:.1f}GB/{reserved_memory:.1f}GB)")
|
746 |
-
|
747 |
-
log_info(f"Hardware: {torch.cuda.device_count()} GPUs detected")
|
748 |
-
log_info(f"GPU details: {', '.join(gpu_info)}")
|
749 |
-
else:
|
750 |
-
log_info("No GPU detected, using CPU (training will be very slow)")
|
751 |
|
752 |
try:
|
753 |
log_info("Loading model and tokenizer...")
|
754 |
-
model, tokenizer = load_model_and_tokenizer(
|
755 |
log_info("Model and tokenizer loaded successfully")
|
756 |
|
757 |
# Load dataset with proper mapping
|
@@ -781,25 +903,21 @@ def main():
|
|
781 |
log_info("Using FP16 precision from hardware config")
|
782 |
else:
|
783 |
# Fall back to transformers config
|
784 |
-
use_bf16 =
|
785 |
-
use_fp16 =
|
786 |
log_info(f"Using precision: {'bf16' if use_bf16 else 'fp16' if use_fp16 else 'full precision'}")
|
787 |
|
788 |
# Get per device batch size - from transformers config, but possibly overridden by hardware config
|
789 |
-
per_device_batch_size =
|
790 |
-
gradient_accumulation_steps =
|
791 |
|
792 |
# For multi-GPU setup, adjust for better balance
|
793 |
-
if
|
794 |
-
log_info(f"Multi-GPU setup
|
795 |
-
log_info(f"Training config: {per_device_batch_size} samples/GPU × {gradient_accumulation_steps} accumulation steps")
|
796 |
-
|
797 |
-
# Determine multi-GPU strategy from hardware config
|
798 |
-
multi_gpu_strategy = hardware_config.get("training_optimizations", {}).get("multi_gpu_strategy", "data_parallel")
|
799 |
|
800 |
# Set up FSDP for multi-GPU training if specified and in distributed mode
|
801 |
fsdp_config = None
|
802 |
-
if multi_gpu_strategy == "fsdp" and is_distributed and
|
803 |
try:
|
804 |
from torch.distributed.fsdp import (
|
805 |
FullyShardedDataParallel as FSDP,
|
@@ -845,33 +963,33 @@ def main():
|
|
845 |
# Set up training arguments
|
846 |
log_info("Setting up training arguments")
|
847 |
training_args = TrainingArguments(
|
848 |
-
output_dir=
|
849 |
-
num_train_epochs=
|
850 |
per_device_train_batch_size=per_device_batch_size,
|
851 |
gradient_accumulation_steps=gradient_accumulation_steps,
|
852 |
-
learning_rate=
|
853 |
-
weight_decay=
|
854 |
-
warmup_ratio=
|
855 |
-
lr_scheduler_type=
|
856 |
-
logging_steps=
|
857 |
-
save_strategy=
|
858 |
-
save_steps=
|
859 |
-
save_total_limit=
|
860 |
fp16=use_fp16,
|
861 |
bf16=use_bf16,
|
862 |
-
max_grad_norm=
|
863 |
-
push_to_hub=
|
864 |
-
hub_model_id=
|
865 |
hub_token=os.environ.get("HF_TOKEN", None),
|
866 |
report_to="tensorboard",
|
867 |
remove_unused_columns=False, # Keep all columns
|
868 |
-
gradient_checkpointing=
|
869 |
dataloader_pin_memory=pin_memory,
|
870 |
-
optim=
|
871 |
ddp_find_unused_parameters=False, # Improve distributed training efficiency
|
872 |
dataloader_drop_last=False, # Process all examples
|
873 |
dataloader_num_workers=dataloader_workers,
|
874 |
-
no_cuda=False if
|
875 |
# Only add FSDP if we're in distributed mode with FSDP strategy
|
876 |
fsdp=fsdp_config if is_distributed and multi_gpu_strategy == "fsdp" else None,
|
877 |
)
|
@@ -894,11 +1012,27 @@ def main():
|
|
894 |
"""Custom dataloader that preserves original dataset order"""
|
895 |
log_info("Creating sequential dataloader to maintain original dataset order")
|
896 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
897 |
# Calculate batch size based on device availability
|
898 |
if getattr(training_args, "no_cuda", False):
|
899 |
batch_size = training_args.per_device_train_batch_size
|
900 |
else:
|
901 |
-
batch_size = max(training_args.per_device_train_batch_size * max(1,
|
902 |
|
903 |
log_info(f"Using sequential sampler with batch size {batch_size}")
|
904 |
|
@@ -920,12 +1054,12 @@ def main():
|
|
920 |
log_info("=== Starting Training ===")
|
921 |
try:
|
922 |
# Empty cache again right before training
|
923 |
-
if
|
924 |
torch.cuda.empty_cache()
|
925 |
log_info("Cleared CUDA cache before training")
|
926 |
|
927 |
# Display compact training info
|
928 |
-
total_steps = int(len(dataset) / (per_device_batch_size *
|
929 |
log_info(f"Training plan: {len(dataset)} examples over {training_args.num_train_epochs} epochs ≈ {total_steps} steps")
|
930 |
|
931 |
trainer.train()
|
@@ -937,8 +1071,8 @@ def main():
|
|
937 |
log_info(f"Model saved to {training_args.output_dir}")
|
938 |
|
939 |
# Push to hub if enabled
|
940 |
-
if
|
941 |
-
hub_id =
|
942 |
log_info(f"Pushing model to Hugging Face Hub as {hub_id}...")
|
943 |
trainer.push_to_hub()
|
944 |
log_info("Model successfully pushed to Hub")
|
@@ -947,9 +1081,9 @@ def main():
|
|
947 |
except Exception as e:
|
948 |
logger.error(f"Training failed with error: {str(e)}")
|
949 |
# Log CUDA memory info if available in compact format
|
950 |
-
if
|
951 |
memory_info = []
|
952 |
-
for i in range(
|
953 |
allocated = torch.cuda.memory_allocated(i) / 1024**2
|
954 |
reserved = torch.cuda.memory_reserved(i) / 1024**2
|
955 |
max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
|
|
|
8 |
import logging
|
9 |
from datetime import datetime
|
10 |
import time
|
11 |
+
import warnings
|
12 |
+
import torch
|
13 |
+
from importlib.util import find_spec
|
14 |
+
|
15 |
+
# Global variables for hardware detection
|
16 |
+
CUDA_AVAILABLE = torch.cuda.is_available()
|
17 |
+
NUM_GPUS = torch.cuda.device_count() if CUDA_AVAILABLE else 0
|
18 |
+
DEVICE_TYPE = "cuda" if CUDA_AVAILABLE else "cpu"
|
19 |
|
20 |
# Import Unsloth first, before other ML imports
|
21 |
try:
|
|
|
27 |
logger = logging.getLogger(__name__)
|
28 |
logger.warning("Unsloth not available. Please install with: pip install unsloth")
|
29 |
|
|
|
30 |
from datasets import load_dataset
|
31 |
from transformers import (
|
32 |
AutoModelForCausalLM,
|
|
|
53 |
logging.getLogger("torch").setLevel(logging.WARNING)
|
54 |
logging.getLogger("bitsandbytes").setLevel(logging.WARNING)
|
55 |
|
56 |
+
# Check availability of libraries
|
57 |
+
peft_available = find_spec("peft") is not None
|
58 |
+
|
59 |
# Define a clean logging function for HF Space compatibility
|
60 |
def log_info(message):
|
61 |
"""Log information in a format compatible with Hugging Face Spaces"""
|
|
|
346 |
# Note: Explicitly NOT sorting the dataset to preserve original order
|
347 |
logger.info("Preserving original dataset order (no sorting)")
|
348 |
|
349 |
+
# Check data ordering requirements
|
350 |
+
processing_config = dataset_config.get("dataset", {}).get("processing", {})
|
351 |
+
data_loading_config = dataset_config.get("data_loading", {})
|
352 |
+
|
353 |
+
# Flag consolidation - we only need one flag to control sequence preservation
|
354 |
+
# Default to True to ensure safety
|
355 |
+
preserve_sequence = processing_config.get("preserve_entry_sequence", True)
|
356 |
+
shuffle_disabled = not data_loading_config.get("shuffle", False)
|
357 |
+
|
358 |
+
if not preserve_sequence:
|
359 |
+
logger.warning("CRITICAL: preserve_entry_sequence is set to False. This is NOT RECOMMENDED!")
|
360 |
+
logger.warning("Data sequence integrity is essential for proper model training.")
|
361 |
+
|
362 |
+
if not shuffle_disabled:
|
363 |
+
logger.error("CRITICAL: shuffle is enabled in the dataset config!")
|
364 |
+
logger.error("This will RANDOMIZE your dataset and break sequential order.")
|
365 |
+
logger.error("Please set shuffle: false in your data_loading configuration.")
|
366 |
+
# Actually enforce sequence preservation by raising an error
|
367 |
+
raise ValueError("Dataset shuffling is enabled but preserve_entry_sequence is required. " +
|
368 |
+
"Please disable shuffling in your configuration.")
|
369 |
+
|
370 |
+
# Verify the IDs are in sequential order if they're numeric
|
371 |
+
try:
|
372 |
+
if len(dataset) > 1 and all(isinstance(example.get('id', ''), (int, str)) for example in dataset.select(range(min(10, len(dataset))))):
|
373 |
+
sample_ids = [example['id'] for example in dataset.select(range(min(10, len(dataset))))]
|
374 |
+
logger.info(f"Verifying sequential integrity with first few IDs: {sample_ids}")
|
375 |
+
|
376 |
+
# Check if IDs are numeric and ordered
|
377 |
+
if all(isinstance(id, int) or id.isdigit() for id in sample_ids):
|
378 |
+
numeric_ids = [int(id) if isinstance(id, str) else id for id in sample_ids]
|
379 |
+
is_ordered = all(numeric_ids[i] <= numeric_ids[i+1] for i in range(len(numeric_ids)-1))
|
380 |
+
if not is_ordered:
|
381 |
+
logger.warning("WARNING: Sample IDs are not in sequential order.")
|
382 |
+
logger.warning("This may indicate that data sequence is not preserved.")
|
383 |
+
else:
|
384 |
+
logger.info("Sample IDs appear to be in sequential order.")
|
385 |
+
except Exception as e:
|
386 |
+
logger.warning(f"Could not verify sequential integrity: {e}")
|
387 |
+
|
388 |
# Log examples without printing full content
|
389 |
if "conversations" in dataset.column_names:
|
390 |
sample_ids = [example['id'] for example in dataset.select(range(min(5, len(dataset))))]
|
|
|
581 |
|
582 |
class LoggingCallback(TrainerCallback):
|
583 |
def __init__(self):
|
584 |
+
super().__init__()
|
585 |
+
self.training_started = time.time()
|
586 |
self.last_log_time = time.time()
|
587 |
+
self.last_step = 0
|
588 |
+
self.verify_sequence = None
|
589 |
+
self.sequence_samples = None
|
590 |
+
self.sample_indices = None
|
591 |
|
592 |
def on_step_end(self, args, state, control, **kwargs):
    """Periodically verify dataset order and log training progress.

    Every 100 steps, when sequence verification is enabled and reference
    samples were captured, the reference samples are compared against the
    dataset attached to the training dataloader. Independently, a progress
    line is logged every 50 steps or every 5 minutes, whichever comes first.

    Args:
        args: Training arguments; ``per_device_train_batch_size`` and
            ``gradient_accumulation_steps`` are read for the rate estimate.
        state: Trainer state; ``global_step`` and ``log_history`` are read.
        control: Trainer control object (not used here).
        **kwargs: Extra objects supplied by the caller. ``train_dataloader``
            is used, when present, to reach the training dataset.
    """
    current_time = time.time()

    # Perform actual sequence integrity verification if enabled
    if self.verify_sequence is True and state.global_step % 100 == 0 and self.sequence_samples:
        try:
            # NOTE(review): assumes the trainer passes `train_dataloader` to
            # callbacks and that it exposes `.dataset` - confirm for the
            # custom sequential dataloader used by this script.
            dataloader = kwargs.get("train_dataloader")
            dataset = getattr(dataloader, "dataset", None)

            if dataset is None:
                log_info("Warning: Couldn't verify sequence integrity: training dataset not available")
            else:
                log_info("Verifying data sequence integrity...")

                # Compare current samples with our reference samples from training start
                is_sequence_maintained = True
                reference = zip(self.sample_indices or [], self.sequence_samples)
                for i, (orig_idx, orig_sample) in enumerate(reference):
                    if orig_idx >= len(dataset):
                        continue

                    # Look the sample up by its recorded index so the
                    # comparison stays aligned with the reference.
                    current_sample = dataset[orig_idx]

                    # Compare IDs if available
                    if 'id' in orig_sample and 'id' in current_sample:
                        if orig_sample['id'] != current_sample['id']:
                            log_info(f"WARNING: Sequence integrity compromised! Sample {i} ID changed from {orig_sample['id']} to {current_sample['id']}")
                            is_sequence_maintained = False

                    # Compare input fingerprints (conversation length only)
                    if 'conversations' in orig_sample and 'conversations' in current_sample:
                        orig_len = len(orig_sample['conversations'])
                        curr_len = len(current_sample['conversations'])
                        if orig_len != curr_len:
                            log_info(f"WARNING: Sequence integrity compromised! Sample {i} conversation length changed from {orig_len} to {curr_len}")
                            is_sequence_maintained = False

                if is_sequence_maintained:
                    log_info("Data sequence integrity check: OK")
                else:
                    log_info("CRITICAL WARNING: Data sequence integrity check FAILED!")
        except Exception as e:
            # Best-effort check: a failure here must never stop training.
            log_info(f"Warning: Couldn't verify sequence integrity: {e}")

    time_interval = current_time - self.last_log_time
    step_interval = state.global_step - self.last_step

    if step_interval >= 50 or time_interval >= 300:  # 5 minutes = 300 seconds
        # Calculate throughput (per-device estimate; GPU count is not factored in)
        examples_per_second = (
            step_interval
            * args.per_device_train_batch_size
            * args.gradient_accumulation_steps
            / max(time_interval, 1e-6)
        )

        elapsed_total = time.strftime("%H:%M:%S", time.gmtime(current_time - self.training_started))

        # The newest history entry may be missing or may not carry a
        # training loss, so search backwards and fall back to "N/A"
        # instead of raising inside the callback.
        latest_loss = next(
            (entry['loss'] for entry in reversed(state.log_history or []) if 'loss' in entry),
            None,
        )
        loss_text = f"{latest_loss:.4f}" if isinstance(latest_loss, (int, float)) else "N/A"

        # Log progress
        log_info(f"Step: {state.global_step}, Loss: {loss_text}, "
                 f"Rate: {examples_per_second:.2f} examples/sec, Elapsed: {elapsed_total}")

        # Report memory usage if CUDA is available
        if CUDA_AVAILABLE:
            log_info(f"GPU Memory: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB allocated, "
                     f"{torch.cuda.max_memory_reserved() / 1024**3:.2f} GB reserved")

        # Reset for next interval
        self.last_log_time = current_time
        self.last_step = state.global_step
|
657 |
+
|
658 |
def on_train_begin(self, args, state, control, **kwargs):
|
659 |
+
log_info(f"=== Training started at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")
|
660 |
+
log_info(f"Model parameters: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")
|
661 |
+
|
662 |
+
# Set up sequence verification with actual sample capturing
|
663 |
+
try:
|
664 |
+
self.verify_sequence = dataset_config.get("validation", {}).get("verify_sequence_integrity", False)
|
665 |
+
if self.verify_sequence:
|
666 |
+
log_info("Sequence integrity verification enabled during training")
|
667 |
+
|
668 |
+
# Save actual samples for later verification
|
669 |
+
if trainer and trainer.train_dataset:
|
670 |
+
# Get some reference samples from the beginning of the dataset
|
671 |
+
self.sample_indices = list(range(min(5, len(trainer.train_dataset))))
|
672 |
+
self.sequence_samples = [trainer.train_dataset[i] for i in self.sample_indices]
|
673 |
+
log_info(f"Captured {len(self.sequence_samples)} reference samples for sequence integrity verification")
|
674 |
+
|
675 |
+
# Log sample IDs for debugging
|
676 |
+
if len(self.sequence_samples) > 0 and 'id' in self.sequence_samples[0]:
|
677 |
+
sample_ids = [s.get('id') for s in self.sequence_samples if 'id' in s]
|
678 |
+
log_info(f"Reference sample IDs: {sample_ids}")
|
679 |
+
else:
|
680 |
+
log_info("Warning: Could not capture reference samples - verification will be limited")
|
681 |
+
except Exception as e:
|
682 |
+
log_info(f"Warning: Could not set up sequence integrity verification: {e}")
|
683 |
+
self.verify_sequence = False
|
684 |
+
|
685 |
log_info("=== Training is starting ===")
|
686 |
|
687 |
# Log important training parameters for visibility
|
|
|
690 |
log_info(f"Epochs: {args.num_train_epochs}")
|
691 |
|
692 |
# Log memory information in compact format
|
693 |
+
if CUDA_AVAILABLE:
|
694 |
memory_info = []
|
695 |
+
for i in range(NUM_GPUS):
|
696 |
allocated = torch.cuda.memory_allocated(i) / 1024**2
|
697 |
max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
|
698 |
memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
|
|
|
700 |
log_info(f"Initial memory usage - {', '.join(memory_info)}")
|
701 |
|
702 |
def on_train_end(self, args, state, control, **kwargs):
|
703 |
+
training_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - self.training_started))
|
704 |
+
log_info(f"=== Training completed in {training_time} ===")
|
705 |
+
|
706 |
+
# Log final memory usage
|
707 |
+
if CUDA_AVAILABLE:
|
708 |
+
for i in range(NUM_GPUS):
|
709 |
+
max_mem = torch.cuda.max_memory_allocated(i) / 1024**3 # GB
|
710 |
+
log_info(f"GPU {i} max memory: {max_mem:.2f} GB")
|
711 |
|
712 |
+
# Clear GPU memory
|
713 |
+
torch.cuda.empty_cache()
|
714 |
+
log_info("GPU memory cleared")
|
715 |
|
716 |
log_info(f"Total steps: {state.global_step}")
|
717 |
log_info(f"Final loss: {state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'}")
|
|
|
749 |
# Set up logging
|
750 |
log_info("Starting Phi-4 fine-tuning process")
|
751 |
|
752 |
+
# Log hardware information
|
753 |
+
log_info(f"Hardware detection: CUDA {'available' if CUDA_AVAILABLE else 'not available'}")
|
754 |
+
if CUDA_AVAILABLE:
|
755 |
+
log_info(f"Found {NUM_GPUS} GPUs")
|
756 |
+
for i in range(NUM_GPUS):
|
757 |
+
log_info(f" GPU {i}: {torch.cuda.get_device_name(i)}")
|
758 |
+
else:
|
759 |
+
log_info("Running on CPU (training will be very slow)")
|
760 |
+
|
761 |
# Parse arguments
|
762 |
args = parse_args()
|
763 |
|
|
|
776 |
else:
|
777 |
log_info("Running in non-distributed mode (single process)")
|
778 |
|
779 |
+
# Load all configurations - do this once
|
780 |
try:
|
781 |
configs = load_configs(args.config_dir)
|
782 |
|
783 |
+
# Abort early if no configuration could be loaded
|
784 |
if not configs:
|
785 |
logger.error("Failed to load configuration")
|
786 |
return 1
|
787 |
+
|
788 |
+
# Store configurations in clear variables
|
789 |
+
transformers_config = configs.get("transformers", {})
|
790 |
+
hardware_config = configs.get("hardware", {})
|
791 |
+
dataset_config = configs.get("dataset", {})
|
792 |
|
793 |
# Verify configuration sections exist
|
794 |
+
if not transformers_config:
|
795 |
logger.error("transformers_config.json not found or invalid")
|
796 |
return 1
|
797 |
|
798 |
+
if not hardware_config:
|
799 |
logger.warning("Hardware configuration section not found in transformers_config.json. Using default hardware configuration.")
|
800 |
|
801 |
+
if not dataset_config:
|
802 |
logger.error("Dataset configuration section not found in transformers_config.json")
|
803 |
return 1
|
804 |
|
805 |
# Validate model configuration
|
806 |
+
model_name = (transformers_config.get("model", {}).get("name") or
|
807 |
+
transformers_config.get("model_name_or_path") or
|
808 |
+
transformers_config.get("model_name"))
|
809 |
+
|
810 |
+
if not model_name:
|
811 |
logger.error("Model name not specified in configuration")
|
812 |
logger.error("Please ensure 'name' is specified under 'model' in transformers_config.json")
|
813 |
return 1
|
814 |
|
|
|
815 |
log_info(f"Using model: {model_name}")
|
816 |
log_info("All configurations loaded successfully")
|
817 |
|
|
|
|
|
|
|
|
|
|
|
818 |
# Apply hardware-specific settings if available
|
819 |
if hardware_config:
|
820 |
# Get training optimizations from hardware config
|
821 |
training_opts = hardware_config.get("training_optimizations", {})
|
822 |
|
823 |
# Apply batch size and gradient accumulation settings
|
824 |
+
if training_opts.get("per_device_batch_size") and transformers_config.get("training"):
|
825 |
batch_size = training_opts.get("per_device_batch_size")
|
826 |
+
transformers_config["training"]["per_device_train_batch_size"] = batch_size
|
827 |
log_info(f"Applied hardware-optimized batch size: {batch_size}")
|
828 |
|
829 |
+
if training_opts.get("gradient_accumulation_steps") and transformers_config.get("training"):
|
830 |
grad_steps = training_opts.get("gradient_accumulation_steps")
|
831 |
+
transformers_config["training"]["gradient_accumulation_steps"] = grad_steps
|
832 |
log_info(f"Applied hardware-optimized gradient accumulation: {grad_steps}")
|
833 |
|
834 |
# Apply memory optimizations
|
835 |
memory_opts = training_opts.get("memory_optimizations", {})
|
836 |
+
if memory_opts.get("use_gradient_checkpointing") is not None and transformers_config.get("training"):
|
837 |
grad_ckpt = memory_opts.get("use_gradient_checkpointing")
|
838 |
+
transformers_config["training"]["gradient_checkpointing"] = grad_ckpt
|
839 |
log_info(f"Applied hardware-optimized gradient checkpointing: {grad_ckpt}")
|
840 |
|
841 |
# Apply system settings
|
|
|
853 |
return 1
|
854 |
|
855 |
# Set random seed for reproducibility
|
856 |
+
seed = transformers_config.get("seed", 42)
|
857 |
set_seed(seed)
|
858 |
log_info(f"Set random seed to {seed} for reproducibility")
|
859 |
|
860 |
+
# Empty CUDA cache to ensure clean state
|
861 |
+
if CUDA_AVAILABLE:
|
|
|
862 |
torch.cuda.empty_cache()
|
863 |
+
log_info("Cleared CUDA cache")
|
864 |
+
|
865 |
+
# Set up the PYTORCH_CUDA_ALLOC_CONF environment variable for CUDA memory allocation
|
866 |
+
if CUDA_AVAILABLE:
|
867 |
+
system_settings = hardware_config.get("system_settings", {})
|
868 |
+
cuda_memory_fraction = system_settings.get("cuda_memory_fraction", 0.85)
|
869 |
|
870 |
+
if cuda_memory_fraction < 1.0:
|
871 |
+
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = f"max_split_size_mb:128,expandable_segments:True"
|
872 |
+
log_info(f"Set CUDA memory allocation limit to expandable with max_split_size_mb:128")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
873 |
|
874 |
try:
|
875 |
log_info("Loading model and tokenizer...")
|
876 |
+
model, tokenizer = load_model_and_tokenizer(transformers_config)
|
877 |
log_info("Model and tokenizer loaded successfully")
|
878 |
|
879 |
# Load dataset with proper mapping
|
|
|
903 |
log_info("Using FP16 precision from hardware config")
|
904 |
else:
|
905 |
# Fall back to transformers config
|
906 |
+
use_bf16 = transformers_config.get("bf16", False) or transformers_config.get("torch_dtype", "") == "bfloat16"
|
907 |
+
use_fp16 = transformers_config.get("fp16", False) and not use_bf16 # Only use fp16 if bf16 is not set
|
908 |
log_info(f"Using precision: {'bf16' if use_bf16 else 'fp16' if use_fp16 else 'full precision'}")
|
909 |
|
910 |
# Get per device batch size - from transformers config, but possibly overridden by hardware config
|
911 |
+
per_device_batch_size = transformers_config.get("training", {}).get("per_device_train_batch_size", 16)
|
912 |
+
gradient_accumulation_steps = transformers_config.get("training", {}).get("gradient_accumulation_steps", 3)
|
913 |
|
914 |
# For multi-GPU setup, adjust for better balance
|
915 |
+
if CUDA_AVAILABLE and NUM_GPUS > 1:
|
916 |
+
log_info(f"Multi-GPU setup: Adjusting for {NUM_GPUS} GPUs")
|
|
|
|
|
|
|
|
|
917 |
|
918 |
# Set up FSDP for multi-GPU training if specified and in distributed mode
|
919 |
fsdp_config = None
|
920 |
+
if multi_gpu_strategy == "fsdp" and is_distributed and NUM_GPUS > 1:
|
921 |
try:
|
922 |
from torch.distributed.fsdp import (
|
923 |
FullyShardedDataParallel as FSDP,
|
|
|
963 |
# Set up training arguments
|
964 |
log_info("Setting up training arguments")
|
965 |
training_args = TrainingArguments(
|
966 |
+
output_dir=transformers_config.get("output_dir", "./results") or transformers_config.get("checkpointing", {}).get("output_dir", "./results"),
|
967 |
+
num_train_epochs=transformers_config.get("training", {}).get("num_train_epochs", 3),
|
968 |
per_device_train_batch_size=per_device_batch_size,
|
969 |
gradient_accumulation_steps=gradient_accumulation_steps,
|
970 |
+
learning_rate=transformers_config.get("training", {}).get("learning_rate", 2e-5),
|
971 |
+
weight_decay=transformers_config.get("training", {}).get("weight_decay", 0.01),
|
972 |
+
warmup_ratio=transformers_config.get("training", {}).get("warmup_ratio", 0.05),
|
973 |
+
lr_scheduler_type=transformers_config.get("training", {}).get("lr_scheduler_type", "cosine"),
|
974 |
+
logging_steps=transformers_config.get("training", {}).get("logging_steps", 10),
|
975 |
+
save_strategy=transformers_config.get("checkpointing", {}).get("save_strategy", "steps"),
|
976 |
+
save_steps=transformers_config.get("checkpointing", {}).get("save_steps", 100),
|
977 |
+
save_total_limit=transformers_config.get("checkpointing", {}).get("save_total_limit", 3),
|
978 |
fp16=use_fp16,
|
979 |
bf16=use_bf16,
|
980 |
+
max_grad_norm=transformers_config.get("training", {}).get("max_grad_norm", 1.0),
|
981 |
+
push_to_hub=transformers_config.get("huggingface_hub", {}).get("push_to_hub", False),
|
982 |
+
hub_model_id=transformers_config.get("huggingface_hub", {}).get("hub_model_id", None),
|
983 |
hub_token=os.environ.get("HF_TOKEN", None),
|
984 |
report_to="tensorboard",
|
985 |
remove_unused_columns=False, # Keep all columns
|
986 |
+
gradient_checkpointing=transformers_config.get("training", {}).get("gradient_checkpointing", True),
|
987 |
dataloader_pin_memory=pin_memory,
|
988 |
+
optim=transformers_config.get("training", {}).get("optim", "adamw_torch"),
|
989 |
ddp_find_unused_parameters=False, # Improve distributed training efficiency
|
990 |
dataloader_drop_last=False, # Process all examples
|
991 |
dataloader_num_workers=dataloader_workers,
|
992 |
+
no_cuda=False if CUDA_AVAILABLE else True, # Use CUDA if available
|
993 |
# Only add FSDP if we're in distributed mode with FSDP strategy
|
994 |
fsdp=fsdp_config if is_distributed and multi_gpu_strategy == "fsdp" else None,
|
995 |
)
|
|
|
1012 |
"""Custom dataloader that preserves original dataset order"""
|
1013 |
log_info("Creating sequential dataloader to maintain original dataset order")
|
1014 |
|
1015 |
+
# Check the sequence-preservation flags (sequential_processing, shuffle) in one place
|
1016 |
+
data_loading_config = dataset_config.get("data_loading", {})
|
1017 |
+
sequential_processing = data_loading_config.get("sequential_processing", True)
|
1018 |
+
shuffle_disabled = not data_loading_config.get("shuffle", False)
|
1019 |
+
|
1020 |
+
if not sequential_processing:
|
1021 |
+
log_info("CRITICAL WARNING: sequential_processing flag is disabled! This may affect data order.")
|
1022 |
+
log_info("Data sequence integrity is essential - using sequential sampler regardless of flag.")
|
1023 |
+
# Force sequential processing regardless of flag
|
1024 |
+
|
1025 |
+
if not shuffle_disabled:
|
1026 |
+
log_info("CRITICAL ERROR: Shuffle is not disabled! This will randomize data entry order!")
|
1027 |
+
# Actually handle the error rather than just logging it
|
1028 |
+
raise ValueError("Dataset shuffling is enabled but sequential processing is required. " +
|
1029 |
+
"Please disable shuffling in your configuration.")
|
1030 |
+
|
1031 |
# Calculate batch size based on device availability
|
1032 |
if getattr(training_args, "no_cuda", False):
|
1033 |
batch_size = training_args.per_device_train_batch_size
|
1034 |
else:
|
1035 |
+
batch_size = max(training_args.per_device_train_batch_size * max(1, NUM_GPUS), 1)
|
1036 |
|
1037 |
log_info(f"Using sequential sampler with batch size {batch_size}")
|
1038 |
|
|
|
1054 |
log_info("=== Starting Training ===")
|
1055 |
try:
|
1056 |
# Empty cache again right before training
|
1057 |
+
if CUDA_AVAILABLE:
|
1058 |
torch.cuda.empty_cache()
|
1059 |
log_info("Cleared CUDA cache before training")
|
1060 |
|
1061 |
# Display compact training info
|
1062 |
+
total_steps = int(len(dataset) / (per_device_batch_size * NUM_GPUS * gradient_accumulation_steps) * training_args.num_train_epochs)
|
1063 |
log_info(f"Training plan: {len(dataset)} examples over {training_args.num_train_epochs} epochs ≈ {total_steps} steps")
|
1064 |
|
1065 |
trainer.train()
|
|
|
1071 |
log_info(f"Model saved to {training_args.output_dir}")
|
1072 |
|
1073 |
# Push to hub if enabled
|
1074 |
+
if transformers_config.get("huggingface_hub", {}).get("push_to_hub", False):
|
1075 |
+
hub_id = transformers_config.get("huggingface_hub", {}).get("hub_model_id", "model")
|
1076 |
log_info(f"Pushing model to Hugging Face Hub as {hub_id}...")
|
1077 |
trainer.push_to_hub()
|
1078 |
log_info("Model successfully pushed to Hub")
|
|
|
1081 |
except Exception as e:
|
1082 |
logger.error(f"Training failed with error: {str(e)}")
|
1083 |
# Log CUDA memory info if available in compact format
|
1084 |
+
if CUDA_AVAILABLE:
|
1085 |
memory_info = []
|
1086 |
+
for i in range(NUM_GPUS):
|
1087 |
allocated = torch.cuda.memory_allocated(i) / 1024**2
|
1088 |
reserved = torch.cuda.memory_reserved(i) / 1024**2
|
1089 |
max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
|
transformers_config.json
CHANGED
@@ -139,6 +139,7 @@
|
|
139 |
"processing": {
|
140 |
"sort_by_id": true,
|
141 |
"maintain_paper_order": true,
|
|
|
142 |
"max_seq_length": 2048
|
143 |
}
|
144 |
},
|
@@ -159,6 +160,7 @@
|
|
159 |
"data_loading": {
|
160 |
"batch_size": 24,
|
161 |
"shuffle": false,
|
|
|
162 |
"drop_last": false,
|
163 |
"num_workers": 4,
|
164 |
"pin_memory": true,
|
@@ -167,6 +169,7 @@
|
|
167 |
"validation": {
|
168 |
"log_samples": 3,
|
169 |
"log_interval": 50,
|
|
|
170 |
"metrics": ["processed", "skipped", "avg_tokens", "unique_papers"]
|
171 |
}
|
172 |
}
|
|
|
139 |
"processing": {
|
140 |
"sort_by_id": true,
|
141 |
"maintain_paper_order": true,
|
142 |
+
"preserve_entry_sequence": true,
|
143 |
"max_seq_length": 2048
|
144 |
}
|
145 |
},
|
|
|
160 |
"data_loading": {
|
161 |
"batch_size": 24,
|
162 |
"shuffle": false,
|
163 |
+
"sequential_processing": true,
|
164 |
"drop_last": false,
|
165 |
"num_workers": 4,
|
166 |
"pin_memory": true,
|
|
|
169 |
"validation": {
|
170 |
"log_samples": 3,
|
171 |
"log_interval": 50,
|
172 |
+
"verify_sequence_integrity": true,
|
173 |
"metrics": ["processed", "skipped", "avg_tokens", "unique_papers"]
|
174 |
}
|
175 |
}
|