George-API committed
Commit 5a7635c · verified · 1 Parent(s): a7d1f2a

Upload folder using huggingface_hub
fixed_run_transformers_training.py CHANGED
@@ -201,10 +201,29 @@ class LoggingCallback(TrainerCallback):
         log_info("=== Training is starting ===")
 
         # Log important training parameters for visibility
-        log_info(f"Batch size: {args.per_device_train_batch_size} × {args.gradient_accumulation_steps} steps × {max(1, torch.cuda.device_count())} GPUs")
+        effective_batch_size = args.per_device_train_batch_size * args.gradient_accumulation_steps * max(1, torch.cuda.device_count())
+        log_info(f"Per device batch size: {args.per_device_train_batch_size}")
+        log_info(f"Gradient accumulation steps: {args.gradient_accumulation_steps}")
+        log_info(f"Number of GPUs: {max(1, torch.cuda.device_count())}")
+        log_info(f"Total effective batch size: {effective_batch_size}")
         log_info(f"Learning rate: {args.learning_rate}")
         log_info(f"Epochs: {args.num_train_epochs}")
 
+        # Log dataset information
+        if hasattr(trainer, 'train_dataset') and trainer.train_dataset is not None:
+            log_info(f"Dataset size: {len(trainer.train_dataset)} examples")
+            if len(trainer.train_dataset) > 0:
+                try:
+                    # Log first few prompt numbers to verify sequence
+                    prompt_numbers = []
+                    for i in range(min(5, len(trainer.train_dataset))):
+                        if 'prompt_number' in trainer.train_dataset[i]:
+                            prompt_numbers.append(trainer.train_dataset[i]['prompt_number'])
+                    if prompt_numbers:
+                        log_info(f"First few prompt numbers: {prompt_numbers}")
+                except Exception as e:
+                    log_info(f"Error accessing dataset samples: {e}")
+
         # Log memory information in compact format
         if torch.cuda.is_available():
             memory_info = []
@@ -227,4 +246,138 @@ class LoggingCallback(TrainerCallback):
             log_info(f"Final memory usage - {', '.join(memory_info)}")
 
         log_info(f"Total steps: {state.global_step}")
-        log_info(f"Final loss: {state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'}")
+        log_info(f"Final loss: {state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'}")
+
+def custom_get_train_dataloader():
+    """Custom dataloader that preserves original dataset order"""
+    log_info("Creating sequential dataloader to maintain original dataset order")
+
+    # Create a simple sequential sampler
+    sequential_sampler = torch.utils.data.SequentialSampler(dataset)
+
+    # Verify shuffle is disabled
+    data_loading_config = dataset_config.get("data_loading", {})
+    shuffle_enabled = data_loading_config.get("shuffle", False)
+
+    if shuffle_enabled:
+        log_info("CRITICAL ERROR: Shuffle is enabled! This will randomize data entry order!")
+        raise ValueError("Dataset shuffling is enabled but sequential processing is required. " +
+                         "Please disable shuffling in your configuration.")
+
+    # Log our sequential processing approach
+    log_info("Using SequentialSampler to guarantee original dataset order is preserved")
+    log_info("Data order preservation is critical for proper training sequence")
+
+    # Calculate batch size based on device availability
+    if getattr(training_args, "no_cuda", False):
+        batch_size = training_args.per_device_train_batch_size
+    else:
+        batch_size = max(training_args.per_device_train_batch_size * max(1, NUM_GPUS), 1)
+
+    log_info(f"Using sequential sampler with batch size {batch_size}")
+
+    # Return DataLoader with sequential sampler
+    return torch.utils.data.DataLoader(
+        dataset,
+        batch_size=batch_size,
+        sampler=sequential_sampler,
+        collate_fn=data_collator,
+        drop_last=training_args.dataloader_drop_last,
+        num_workers=training_args.dataloader_num_workers,
+        pin_memory=training_args.dataloader_pin_memory,
+    )
+
+def check_dependencies():
+    """Check for critical dependencies and provide useful warnings."""
+    # Check for flash attention without attempting import
+    flash_attention_available = False
+    try:
+        import importlib.util
+        if importlib.util.find_spec("flash_attn") is not None:
+            flash_attention_available = True
+            log_info("flash-attn found! Using Flash Attention for faster training.")
+        else:
+            log_info("flash-attn not found. Training will continue but may be slower.")
+            log_info("To use flash attention, install: pip install flash-attn==2.5.2 --no-build-isolation")
+            # Still continue as this is optional
+    except Exception as e:
+        log_info(f"Error checking for flash-attn: {e}")
+
+    # Check for torch CUDA
+    if not torch.cuda.is_available():
+        log_info("WARNING: CUDA not available. Training will be extremely slow on CPU!")
+    else:
+        log_info(f"Found {torch.cuda.device_count()} CUDA devices")
+
+    # Check for unsloth
+    unsloth_available = False
+    try:
+        import importlib.util
+        if importlib.util.find_spec("unsloth") is not None:
+            unsloth_available = True
+            log_info("Unsloth found! Using Unsloth for optimized training.")
+        else:
+            log_info("CRITICAL: Unsloth not found. This pipeline requires Unsloth.")
+            log_info("Install with: pip install unsloth>=2024.3")
+            return False
+    except Exception as e:
+        log_info(f"Error checking for unsloth: {e}")
+        return False
+
+    return True
+
+def main():
+    """Main training function with error handling."""
+    try:
+        # Initialize logging
+        log_info("Starting Phi-4 training process")
+
+        # Parse arguments
+        args = parse_args()
+
+        # Load environment variables
+        load_env_variables()
+
+        # Load config from file
+        config = load_configs(args.config)
+
+        # Extract specific configurations
+        hardware_config = config.get("hardware", {})
+        dataset_config = config.get("dataset", {})
+
+        # Define multi_gpu_strategy early to prevent undefined errors
+        multi_gpu_strategy = hardware_config.get("training_optimizations", {}).get("multi_gpu_strategy", "data_parallel")
+        log_info(f"Multi-GPU strategy: {multi_gpu_strategy}")
+
+        # Check dependencies
+        if not check_dependencies():
+            log_info("Aborting due to missing critical dependencies")
+            return 1
+
+        # Log hardware info
+        cuda_available = torch.cuda.is_available()
+        num_gpus = torch.cuda.device_count() if cuda_available else 0
+        log_info(f"Hardware: {num_gpus} GPUs detected" if cuda_available else "Hardware: CPU only")
+
+        # Rest of training code would go here
+        # ...
+
+        return 0
+    except Exception as e:
+        log_info(f"Error in main training loop: {str(e)}")
+        # Log CUDA memory if available
+        if torch.cuda.is_available():
+            try:
+                memory_info = []
+                for i in range(torch.cuda.device_count()):
+                    allocated = torch.cuda.memory_allocated(i) / 1024**2
+                    reserved = torch.cuda.memory_reserved(i) / 1024**2
+                    memory_info.append(f"GPU {i}: {allocated:.1f}MB/{reserved:.1f}MB")
+                log_info(f"GPU memory at failure: {', '.join(memory_info)}")
+            except:
+                pass
+        return 1
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(main())
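Note: the ordering guarantee that custom_get_train_dataloader() relies on above is simply that torch.utils.data.SequentialSampler yields indices 0..len(dataset)-1 in order, so batches follow the stored prompt_number sequence exactly. A minimal self-contained sketch of that behavior, using a toy list dataset and a pass-through collate function (both hypothetical, not from this repo):

    from torch.utils.data import DataLoader, SequentialSampler

    # Toy stand-in for the pre-processed dataset (1-indexed prompt numbers)
    data = [{"prompt_number": n} for n in range(1, 11)]

    # Pass-through collate keeps each batch as a list of dicts for inspection
    loader = DataLoader(data, batch_size=4,
                        sampler=SequentialSampler(data),
                        collate_fn=lambda batch: batch)

    for batch in loader:
        print([ex["prompt_number"] for ex in batch])
    # Prints [1, 2, 3, 4], [5, 6, 7, 8], [9, 10] - original order preserved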
requirements.txt CHANGED
@@ -1,9 +1,11 @@
+# Use pre-built wheels for flash-attn instead of building from source
+--find-links https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.2/
 accelerate>=0.27.0
 bitsandbytes>=0.41.0
 datasets>=2.15.0
 einops>=0.7.0
 filelock>=3.13.1
-flash-attn>=2.5.1
+flash-attn==2.5.2
 gradio>=5.17.0
 huggingface-hub>=0.19.0
 matplotlib>=3.7.0
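Note on the wheel pin: pip honors option lines such as --find-links inside a requirements file, so the URL above is searched as an extra location when resolving flash-attn==2.5.2, letting pip pick up a pre-built wheel from the GitHub release instead of compiling the CUDA extension from the PyPI sdist. Whether a wheel exists for the running Python/CUDA combination is an assumption; pip falls back to a source build if no matching wheel is found. Either form below resolves against the release page first:

    pip install -r requirements.txt
    pip install flash-attn==2.5.2 --find-links https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.2/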
run_transformers_training.py CHANGED
@@ -284,83 +284,61 @@ def load_dataset_with_mapping(dataset_config):
         if not dataset_name:
             raise ValueError("Dataset name not provided in configuration")
 
-        logger.info(f"Loading dataset {dataset_name}, split {dataset_split}")
+        logger.info(f"Loading pre-processed dataset {dataset_name}, split {dataset_split}")
         dataset = load_dataset(dataset_name, split=dataset_split)
 
-        # Map columns if specified - with checks to avoid conflicts
-        column_mapping = dataset_config.get("dataset", {}).get("column_mapping", {})
-        if column_mapping:
-            logger.info(f"Checking column mapping: {column_mapping}")
-
-            # Only apply mappings for columns that need renaming and don't already exist
-            safe_mappings = {}
-            for target, source in column_mapping.items():
-                if source in dataset.column_names:
-                    # Skip if target already exists and is not the same as source
-                    if target in dataset.column_names and target != source:
-                        logger.warning(f"Cannot rename '{source}' to '{target}' - target column already exists")
-                    else:
-                        safe_mappings[source] = target
-
-            # Apply safe renames
-            if safe_mappings:
-                logger.info(f"Applying safe column mapping: {safe_mappings}")
-                for source, target in safe_mappings.items():
-                    if source != target:  # Only rename if names are different
-                        dataset = dataset.rename_column(source, target)
-
-        # Add prompt_number field that increments based on original order - simple approach
-        logger.info("Adding prompt_number based on original dataset order (starting at 1)")
-
-        # Simple approach 1: Add index as a column during dataset creation
-        # Create a list of dicts with indices
-        examples_with_idx = []
-        for i, example in enumerate(dataset):
-            example = dict(example)  # Make a copy to avoid modifying the original
-            example['prompt_number'] = i + 1  # 1-indexed
-            examples_with_idx.append(example)
+        # Apply minimal processing since the dataset has already been properly structured
+        # Just perform validation to ensure required fields exist
 
-        # Recreate dataset with prompt_number included
-        from datasets import Dataset
-        dataset = Dataset.from_list(examples_with_idx)
-        logger.info("Successfully added prompt_number to dataset")
+        # Check for required fields
+        required_fields = ["prompt_number", "article_id", "conversations"]
+        missing_fields = [field for field in required_fields if field not in dataset.column_names]
 
-        # If conversations is missing but text exists, attempt conversion
-        if "conversations" not in dataset.column_names and "text" in dataset.column_names:
-            logger.info("Converting 'text' field to 'conversations' format")
+        if missing_fields:
+            logger.warning(f"Dataset is missing required fields: {missing_fields}")
+            logger.warning("This may cause issues with sequence integrity and metadata management")
+        else:
+            logger.info(f"Dataset has all required fields: {required_fields}")
 
-            def convert_text_to_conversations(example):
-                # Check if text is already a list of conversation turns
-                if isinstance(example.get("text"), list):
-                    example["conversations"] = example["text"]
-                # Otherwise, create a simple conversation with the text as user message
-                else:
-                    example["conversations"] = [
-                        {"role": "user", "content": str(example.get("text", ""))}
-                    ]
-                return example
+        # Log a few samples for verification
+        if len(dataset) > 0:
+            sample_indices = range(min(5, len(dataset)))
+            sample_records = []
 
-            dataset = dataset.map(convert_text_to_conversations)
-            logger.info("Successfully converted 'text' to 'conversations'")
-
-        # Verify we have the required columns
-        if "conversations" not in dataset.column_names:
-            logger.error("Required 'conversations' column not found in dataset!")
-            raise ValueError("Required 'conversations' column missing from dataset")
+            for i in sample_indices:
+                record = {}
+                record["prompt_number"] = dataset[i].get("prompt_number", "N/A")
+                record["article_id"] = dataset[i].get("article_id", "N/A")
+                if "conversations" in dataset[i]:
+                    record["conversations_length"] = len(dataset[i]["conversations"])
+                sample_records.append(record)
+
+            logger.info(f"Sample records: {sample_records}")
+
+        # Verify sequential integrity
+        if "prompt_number" in dataset.column_names and len(dataset) > 1:
+            first_prompt_numbers = [dataset[i]["prompt_number"] for i in range(min(10, len(dataset)))]
+            is_sequential = all(first_prompt_numbers[i] == i + 1 for i in range(len(first_prompt_numbers)))
+
+            if is_sequential:
+                logger.info("Dataset prompt numbers are sequential (1-indexed) - sequence integrity preserved")
+            else:
+                logger.warning("Dataset prompt numbers are not sequential - sequence integrity may be compromised")
+                logger.info(f"First few prompt numbers: {first_prompt_numbers}")
 
-        # Log column names and a sample
         logger.info(f"Dataset loaded successfully with {len(dataset)} examples")
         logger.info(f"Dataset columns: {dataset.column_names}")
 
-        # Log a sample for inspection
-        if len(dataset) > 0:
-            sample = dataset[0]
-            prompt_num = sample.get("prompt_number", "N/A")
-            article_id = sample.get("article_id", sample.get("id", "N/A"))
-            logger.info(f"First sample - Prompt number: {prompt_num}, ID: {article_id}")
+        # Data loading configuration - ensure shuffle is disabled
+        data_loading_config = dataset_config.get("data_loading", {})
+        if data_loading_config.get("shuffle", False):
+            logger.error("CRITICAL: shuffle is enabled in the dataset config!")
+            logger.error("This will RANDOMIZE your dataset and break sequential order.")
+            logger.error("Setting shuffle to False to preserve order")
+            data_loading_config["shuffle"] = False
 
         return dataset
-
+
     except Exception as e:
         logger.error(f"Error loading dataset: {str(e)}")
         raise
@@ -542,6 +520,72 @@ class LoggingCallback(TrainerCallback):
         self.sequence_samples = None
         self.sample_indices = None
 
+    def on_train_begin(self, args, state, control, **kwargs):
+        log_info(f"=== Training started at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")
+        log_info(f"Model parameters: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")
+
+        # Set up sequence verification with actual sample capturing
+        try:
+            self.verify_sequence = dataset_config.get("validation", {}).get("verify_sequence_integrity", False)
+            if self.verify_sequence:
+                log_info("Sequence integrity verification enabled during training")
+
+                # Save actual samples for later verification
+                if trainer and hasattr(trainer, 'train_dataset') and trainer.train_dataset is not None:
+                    # Get some reference samples from the beginning of the dataset defensively
+                    self.sample_indices = []
+                    self.sequence_samples = []
+
+                    max_samples = min(5, len(trainer.train_dataset))
+                    for i in range(max_samples):
+                        try:
+                            if i < len(trainer.train_dataset):
+                                self.sample_indices.append(i)
+                                self.sequence_samples.append(trainer.train_dataset[i])
+                        except Exception as e:
+                            log_info(f"Warning: Error capturing reference sample at index {i}: {e}")
+
+                    if self.sequence_samples:
+                        log_info(f"Captured {len(self.sequence_samples)} reference samples for sequence integrity verification")
+
+                        # Log sample prompt numbers for debugging
+                        sample_prompt_numbers = []
+                        for s in self.sequence_samples:
+                            if isinstance(s, dict) and 'prompt_number' in s and s['prompt_number'] is not None:
+                                sample_prompt_numbers.append(s.get('prompt_number'))
+
+                        if sample_prompt_numbers:
+                            log_info(f"Reference sample prompt numbers: {sample_prompt_numbers}")
+                            if sample_prompt_numbers == list(range(1, len(sample_prompt_numbers) + 1)):
+                                log_info("Prompt numbers are sequential (1-indexed) - sequence integrity confirmed")
+                            else:
+                                log_info("Prompt numbers are not in expected sequence - will verify during training")
+                    else:
+                        log_info("Warning: No reference samples were captured")
+                else:
+                    log_info("Warning: Could not capture reference samples - verification will be limited")
+        except Exception as e:
+            log_info(f"Warning: Could not set up sequence integrity verification: {e}")
+            self.verify_sequence = False
+
+        log_info("=== Training is starting ===")
+
+        # Log important training parameters for visibility
+        total_batch_size = args.per_device_train_batch_size * args.gradient_accumulation_steps * NUM_GPUS
+        log_info(f"Batch size: {args.per_device_train_batch_size} × {args.gradient_accumulation_steps} steps × {NUM_GPUS} GPUs = {total_batch_size} total")
+        log_info(f"Learning rate: {args.learning_rate}")
+        log_info(f"Epochs: {args.num_train_epochs}")
+
+        # Log memory information in compact format
+        if CUDA_AVAILABLE:
+            memory_info = []
+            for i in range(NUM_GPUS):
+                allocated = torch.cuda.memory_allocated(i) / 1024**2
+                max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
+                memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
+
+            log_info(f"Initial memory usage - {', '.join(memory_info)}")
+
     def on_step_end(self, args, state, control, **kwargs):
         # Log every 50 steps or every 5 minutes, whichever comes first
         current_time = time.time()
@@ -590,7 +634,7 @@ class LoggingCallback(TrainerCallback):
                     if i < len(current_samples):
                         current_sample = current_samples[i]
 
-                        # Compare prompt numbers if available
+                        # Compare prompt numbers if available - this is our primary check now
                         if ('prompt_number' in orig_sample and
                             'prompt_number' in current_sample and
                             orig_sample['prompt_number'] is not None and
@@ -599,8 +643,11 @@ class LoggingCallback(TrainerCallback):
                             if orig_sample['prompt_number'] != current_sample['prompt_number']:
                                 log_info(f"WARNING: Sequence integrity compromised! Sample {i} prompt number changed from {orig_sample['prompt_number']} to {current_sample['prompt_number']}")
                                 is_sequence_maintained = False
+                            else:
+                                # This is now our primary verification
+                                log_info(f"Prompt number match confirmed for sample {i}: {orig_sample['prompt_number']}")
 
-                        # Also compare IDs as a backup check
+                        # Also compare article_id as a backup check
                         elif ('article_id' in orig_sample and
                             'article_id' in current_sample and
                             orig_sample['article_id'] is not None and
@@ -609,21 +656,9 @@ class LoggingCallback(TrainerCallback):
                             if orig_sample['article_id'] != current_sample['article_id']:
                                 log_info(f"WARNING: Sequence integrity compromised! Sample {i} article_id changed from {orig_sample['article_id']} to {current_sample['article_id']}")
                                 is_sequence_maintained = False
-
-                        # Compare input fingerprints
-                        if ('conversations' in orig_sample and
-                            'conversations' in current_sample and
-                            orig_sample['conversations'] is not None and
-                            current_sample['conversations'] is not None):
-
-                            orig_len = len(orig_sample['conversations'])
-                            curr_len = len(current_sample['conversations'])
-                            if orig_len != curr_len:
-                                log_info(f"WARNING: Sequence integrity compromised! Sample {i} conversation length changed from {orig_len} to {curr_len}")
-                                is_sequence_maintained = False
 
                     if is_sequence_maintained:
-                        log_info("Data sequence integrity check: OK")
+                        log_info("Data sequence integrity check: OK - prompt numbers preserved")
                     else:
                         log_info("CRITICAL WARNING: Data sequence integrity check FAILED!")
                 else:
@@ -635,90 +670,16 @@ class LoggingCallback(TrainerCallback):
             except Exception as e:
                 log_info(f"Warning: Couldn't verify sequence integrity: {e}")
 
-        time_interval = current_time - self.last_log_time
-        step_interval = state.global_step - self.last_step
-
-        if step_interval >= 50 or time_interval >= 300:  # 5 minutes = 300 seconds
-            # Calculate throughput
-            examples_per_second = step_interval * args.per_device_train_batch_size * args.gradient_accumulation_steps / max(time_interval, 1e-6)
-
-            elapsed_total = time.strftime("%H:%M:%S", time.gmtime(current_time - self.training_started))
-
-            # Log progress
-            log_info(f"Step: {state.global_step}, Loss: {state.log_history[-1]['loss']:.4f}, "
-                     f"Rate: {examples_per_second:.2f} examples/sec, Elapsed: {elapsed_total}")
-
-            # Report memory usage if CUDA is available
-            if CUDA_AVAILABLE:
-                log_info(f"GPU Memory: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB allocated, "
-                         f"{torch.cuda.max_memory_reserved() / 1024**3:.2f} GB reserved")
-
-            # Reset for next interval
+        # Log progress at regular intervals
+        if (state.global_step % 50 == 0) or (current_time - self.last_log_time > 300):
+            if state.log_history:
+                loss = state.log_history[-1].get('loss', 'N/A')
+                # Use simple formatting for better Space log compatibility
+                log_info(f"Step {state.global_step}: Loss {loss}")
+            else:
+                log_info(f"Step {state.global_step}: No loss data available")
             self.last_log_time = current_time
-            self.last_step = state.global_step
 
-    def on_train_begin(self, args, state, control, **kwargs):
-        log_info(f"=== Training started at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")
-        log_info(f"Model parameters: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")
-
-        # Set up sequence verification with actual sample capturing
-        try:
-            self.verify_sequence = dataset_config.get("validation", {}).get("verify_sequence_integrity", False)
-            if self.verify_sequence:
-                log_info("Sequence integrity verification enabled during training")
-
-                # Save actual samples for later verification
-                if trainer and hasattr(trainer, 'train_dataset') and trainer.train_dataset is not None:
-                    # Get some reference samples from the beginning of the dataset defensively
-                    self.sample_indices = []
-                    self.sequence_samples = []
-
-                    max_samples = min(5, len(trainer.train_dataset))
-                    for i in range(max_samples):
-                        try:
-                            if i < len(trainer.train_dataset):
-                                self.sample_indices.append(i)
-                                self.sequence_samples.append(trainer.train_dataset[i])
-                        except Exception as e:
-                            log_info(f"Warning: Error capturing reference sample at index {i}: {e}")
-
-                    if self.sequence_samples:
-                        log_info(f"Captured {len(self.sequence_samples)} reference samples for sequence integrity verification")
-
-                        # Log sample prompt numbers for debugging
-                        sample_prompt_numbers = []
-                        for s in self.sequence_samples:
-                            if isinstance(s, dict) and 'prompt_number' in s and s['prompt_number'] is not None:
-                                sample_prompt_numbers.append(s.get('prompt_number'))
-
-                        if sample_prompt_numbers:
-                            log_info(f"Reference sample prompt numbers: {sample_prompt_numbers}")
-                    else:
-                        log_info("Warning: No reference samples were captured")
-                else:
-                    log_info("Warning: Could not capture reference samples - verification will be limited")
-        except Exception as e:
-            log_info(f"Warning: Could not set up sequence integrity verification: {e}")
-            self.verify_sequence = False
-
-        log_info("=== Training is starting ===")
-
-        # Log important training parameters for visibility
-        total_batch_size = args.per_device_train_batch_size * args.gradient_accumulation_steps * NUM_GPUS
-        log_info(f"Batch size: {args.per_device_train_batch_size} × {args.gradient_accumulation_steps} steps × {NUM_GPUS} GPUs = {total_batch_size} total")
-        log_info(f"Learning rate: {args.learning_rate}")
-        log_info(f"Epochs: {args.num_train_epochs}")
-
-        # Log memory information in compact format
-        if CUDA_AVAILABLE:
-            memory_info = []
-            for i in range(NUM_GPUS):
-                allocated = torch.cuda.memory_allocated(i) / 1024**2
-                max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
-                memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
-
-            log_info(f"Initial memory usage - {', '.join(memory_info)}")
-
     def on_train_end(self, args, state, control, **kwargs):
         training_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - self.training_started))
         log_info(f"=== Training completed in {training_time} ===")
@@ -968,9 +929,12 @@ def main():
         shuffle_enabled = data_loading_config.get("shuffle", False)
 
         if shuffle_enabled:
-            log_info("CRITICAL ERROR: Shuffle is enabled! This will randomize data entry order!")
-            raise ValueError("Dataset shuffling is enabled but sequential processing is required. " +
-                             "Please disable shuffling in your configuration.")
+            log_info("WARNING: Shuffle is enabled in configuration! This will be overridden to preserve order.")
+            # We enforce sequential processing regardless of config
+
+        # Log our approach clearly
+        log_info("Using SequentialSampler to guarantee dataset order is preserved based on prompt_number")
+        log_info("Dataset is pre-processed with prompt_number field indicating the correct sequence")
 
         # Calculate batch size based on device availability
         if getattr(training_args, "no_cuda", False):
@@ -984,7 +948,7 @@ def main():
         return torch.utils.data.DataLoader(
             dataset,
            batch_size=batch_size,
-            sampler=sequential_sampler,
+            sampler=sequential_sampler,  # Always use sequential sampler
            collate_fn=data_collator,
            drop_last=training_args.dataloader_drop_last,
            num_workers=training_args.dataloader_num_workers,
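Note: the sequence-integrity validation added in load_dataset_with_mapping() reduces to a single invariant: the row at index i must carry prompt_number == i + 1. A self-contained sketch of that check, with a toy records list standing in for the loaded HF dataset (the list is hypothetical, not from this repo):

    # Toy records standing in for the loaded dataset
    records = [{"prompt_number": n, "article_id": f"a{n}"} for n in range(1, 11)]

    first_prompt_numbers = [r["prompt_number"] for r in records[:10]]
    is_sequential = all(first_prompt_numbers[i] == i + 1
                        for i in range(len(first_prompt_numbers)))

    print("sequence integrity preserved" if is_sequential else "sequence integrity compromised",
          first_prompt_numbers)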
transformers_config.json CHANGED
@@ -77,7 +77,7 @@
 
   "huggingface_hub": {
     "push_to_hub": true,
-    "hub_model_id": "phi-4-research-assistant",
+    "hub_model_id": "phi-4-cognitive-assistant",
     "hub_private_repo": true
   },
 
@@ -131,18 +131,9 @@
 
   "dataset": {
     "dataset": {
-      "name": "George-API/cognitive-data",
+      "name": "George-API/phi4-cognitive-dataset",
       "split": "train",
-      "column_mapping": {
-        "conversations": "text",
-        "article_id": "id"
-      },
-      "processing": {
-        "sort_by_article_id": true,
-        "maintain_paper_order": true,
-        "preserve_entry_sequence": true,
-        "max_seq_length": 2048
-      }
+      "column_mapping": {}
     },
     "data_formatting": {
      "chat_template": "phi",
@@ -171,7 +162,7 @@
     "log_samples": 3,
     "log_interval": 50,
     "verify_sequence_integrity": true,
-    "metrics": ["processed", "skipped", "avg_tokens", "unique_papers"]
+    "metrics": ["processed", "skipped", "avg_tokens", "unique_articles"]
   }
 }
- }
+}
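Note the nested layout: the outer "dataset" section holds loading and validation settings, while the inner "dataset" object names the hub dataset, which is why the training script reads dataset_config.get("dataset", {}) before reaching "name" or "column_mapping". A short sketch of walking that structure (the file path and plain json.load are assumptions; the repo goes through load_configs()):

    import json

    with open("transformers_config.json") as f:  # path is an assumption
        config = json.load(f)

    dataset_section = config.get("dataset", {})   # outer "dataset" section
    inner = dataset_section.get("dataset", {})    # nested "dataset" object
    print(inner.get("name"))                      # George-API/phi4-cognitive-dataset
    print(inner.get("split"))                     # train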