George-API committed (verified)
Commit 4a1fd53 · 1 Parent(s): adb15f9

Upload folder using huggingface_hub
DEPLOY_CHECKLIST.md ADDED
@@ -0,0 +1,107 @@
# Phi-4 Training Space Deployment Checklist

## Critical Configuration Review

Before updating the Hugging Face Space, verify each of these items to prevent deployment issues (each section below is followed by a short verification sketch):

### 1. Model Configuration ✓

- [ ] Confirmed model name in transformers_config.json: `unsloth/phi-4-unsloth-bnb-4bit`
- [ ] BF16 precision enabled, FP16 disabled (`"bf16": true, "fp16": false`)
- [ ] Chat template correctly set to `"phi"` in config
- [ ] LoRA parameters properly configured:
  - [ ] `r`: 32
  - [ ] `lora_alpha`: 16
  - [ ] `target_modules`: all required attention modules included
- [ ] Max sequence length matches dataset needs (default: 2048)
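
A quick way to spot-check the items above is to load the config and assert the critical values. This is a minimal sketch; the key paths follow how the scripts in this commit read `transformers_config.json` and may need adjusting (the LoRA block living under `"unsloth"` is an assumption):

```python
import json

with open("transformers_config.json") as f:
    cfg = json.load(f)

# Model name and precision flags from the checklist above
assert cfg.get("model", {}).get("name") == "unsloth/phi-4-unsloth-bnb-4bit"
assert cfg.get("bf16") is True and cfg.get("fp16") is False

# LoRA settings (assumed to sit under the "unsloth" section used by the training script)
lora = cfg.get("unsloth", {})
print("r =", lora.get("r"), "| lora_alpha =", lora.get("lora_alpha"))
print("max_seq_length =", cfg.get("tokenizer", {}).get("max_seq_length", 2048))
```
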
### 2. GPU & Memory Management ✓

- [ ] Per-device batch size set to 16 or lower
- [ ] Gradient accumulation steps set to 3 or higher
- [ ] Device mapping set to "auto" for multi-GPU
- [ ] Max memory limit set to 85% of each GPU's capacity
- [ ] `PYTORCH_CUDA_ALLOC_CONF` includes `"expandable_segments:True"`
- [ ] Gradient checkpointing enabled (`"gradient_checkpointing": true`)
- [ ] Dataloader workers reduced to 2 (from 4)
- [ ] FSDP configuration enabled for multi-GPU setups
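
The allocator and memory-cap items above can be exercised directly. This sketch only illustrates the values: the training script sets `PYTORCH_CUDA_ALLOC_CONF` itself and uses the 85% figure (`cuda_memory_fraction`) for headroom calculations, so calling `set_per_process_memory_fraction` here is one assumed way to enforce the cap, not necessarily what the script does:

```python
import os
import torch

# Allocator settings the checklist expects (set before CUDA is initialized)
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True"

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        # Cap each GPU at roughly 85% of its capacity
        torch.cuda.set_per_process_memory_fraction(0.85, device=i)
        total_gb = torch.cuda.get_device_properties(i).total_memory / 1024**3
        print(f"GPU {i}: {total_gb:.0f} GB total, cap ~{0.85 * total_gb:.0f} GB")
```
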
### 3. Dataset Handling ✓

- [ ] Dataset configuration correctly specified (the `dataset` section of transformers_config.json)
- [ ] Conversation structure preserved (`id` + `conversations` fields)
- [ ] SimpleDataCollator configured to use `apply_chat_template`
- [ ] No re-ordering or sorting of the dataset (preserves original order)
- [ ] Sequential sampler used in the dataloader (no shuffling)
- [ ] Max sequence length of 2048 applied
- [ ] Format validation for the first few examples enabled
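
For reference, this is roughly how the collator in this commit turns one example's `conversations` list into token ids while preserving order; the record below is hypothetical and the roles follow the dataset's formatting config (if the template rejects them, the collator falls back to plain concatenation):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("unsloth/phi-4-unsloth-bnb-4bit")

example = {
    "id": "paper_0001",  # hypothetical record
    "conversations": [
        {"role": "system", "content": "[RESEARCH INTRODUCTION] ..."},
        {"role": "user", "content": "Summarize the key claim of this chunk."},
        {"role": "assistant", "content": "The paper argues that ..."},
    ],
}

# The collator passes the conversations straight to the model's chat template
input_ids = tokenizer.apply_chat_template(
    example["conversations"], return_tensors=None, add_generation_prompt=False
)
print(f"{len(input_ids)} tokens (capped at max_seq_length=2048)")
```
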
### 4. Dependency Management ✓

- [ ] requirements.txt includes all necessary packages:
  - [ ] unsloth
  - [ ] peft
  - [ ] bitsandbytes
  - [ ] einops
  - [ ] sentencepiece
  - [ ] datasets
  - [ ] transformers
- [ ] Optional packages marked as such (e.g., flash-attn)
- [ ] Dependency version constraints avoid known conflicts
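
A small import check in the Space environment catches missing packages before a run; a sketch:

```python
import importlib.util

required = ["unsloth", "peft", "bitsandbytes", "einops",
            "sentencepiece", "datasets", "transformers"]

missing = [name for name in required if importlib.util.find_spec(name) is None]
print("missing required packages:", missing or "none")
# flash-attn is optional; only warn if it is absent
print("flash_attn available:", importlib.util.find_spec("flash_attn") is not None)
```
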
### 5. Error Handling & Logging ✓

- [ ] Proper error catching for dataset loading
- [ ] Fallback mechanisms for chat template application
- [ ] Clear, concise log messages that work with the HF Space interface
- [ ] Memory usage tracking at key points (start, end, periodic)
- [ ] Third-party loggers set to WARNING to reduce noise
- [ ] Low-verbosity log format for better HF Space compatibility
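
The logging pattern these items refer to mirrors the setup in `app.py` from this commit: keep project messages at INFO, silence noisy libraries, and flush so output streams in the Space UI:

```python
import logging
import sys

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Third-party loggers at WARNING to reduce noise
for noisy in ("transformers", "datasets", "accelerate", "torch", "bitsandbytes"):
    logging.getLogger(noisy).setLevel(logging.WARNING)

def log_info(message):
    """Log in a format compatible with Hugging Face Spaces."""
    logger.info(message)
    sys.stdout.flush()  # ensure output is streamed immediately
```
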
### 6. Training Setup ✓

- [ ] Number of epochs properly configured (default: 3)
- [ ] Learning rate appropriate (default: 2e-5)
- [ ] Warmup ratio set (default: 0.05)
- [ ] Checkpointing frequency set to a reasonable value (default: 100 steps)
- [ ] Output directory correctly configured
- [ ] Hugging Face Hub parameters set correctly if pushing models
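
The defaults above translate into `TrainingArguments` roughly like this; in this repo the actual values come from `transformers_config.json`, so treat the snippet as an illustrative sketch (the output directory and Hub flags are assumptions):

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./results",            # checkpoint directory (assumed)
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_ratio=0.05,
    save_steps=100,                    # checkpoint frequency
    per_device_train_batch_size=16,
    gradient_accumulation_steps=3,
    bf16=True,
    push_to_hub=False,                 # set True (plus hub_model_id) if pushing checkpoints
)
```
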
### 7. Pre-Flight Verification ✓

- [ ] No linting errors or indentation issues
- [ ] Updated config values are consistent across files
- [ ] Batch size × gradient accumulation × GPUs gives a reasonable total batch
- [ ] Verified that requirements.txt matches the actual imports in the code
- [ ] Confirmed tokenizer settings match the model requirements
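
For the batch-size check, the arithmetic with the current defaults is:

```python
per_device_batch_size = 16
gradient_accumulation_steps = 3
gpu_count = 4

effective_batch = per_device_batch_size * gradient_accumulation_steps * gpu_count
print(f"Effective batch size: {effective_batch}")  # 16 × 3 × 4 = 192
```
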
---

## Last-Minute Configuration Changes

If you've made any configuration changes, record them here before deployment:

| Date | Parameter Changed | Old Value | New Value | Reason | Reviewer |
|------|-------------------|-----------|-----------|--------|----------|
|      |                   |           |           |        |          |
|      |                   |           |           |        |          |

---

## Deployment Notes

**Current Space Hardware**: 4× NVIDIA L4 GPUs (24 GB VRAM each)

**Expected Training Speed**: ~XXX examples/second with the current configuration

**Memory Requirements**: Peak usage expected to be ~20 GB per GPU

**Common Issues to Watch For**:
- OOM errors on GPU 0: if seen, reduce the batch size by 2 and increase gradient accumulation by 1
- Imbalanced GPU usage: check the device mapping and FSDP configuration
- Slow training: verify that all GPUs are being utilized efficiently
- Log flooding: reduce the verbosity of component logs (transformers, datasets, etc.)
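
When investigating OOM or imbalanced usage, a quick per-GPU snapshot (the same counters the `LoggingCallback` in this commit reports during training) helps locate the problem device:

```python
import torch

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        allocated = torch.cuda.memory_allocated(i) / 1024**2
        reserved = torch.cuda.memory_reserved(i) / 1024**2
        print(f"GPU {i}: {allocated:.1f} MB allocated / {reserved:.1f} MB reserved")
```
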
---

*Last Updated: 2025-03-09*
app.py CHANGED
@@ -1,14 +1,15 @@
1
  import os
2
  import sys
3
  import json
4
  import logging
5
- import gradio as gr
6
- from pathlib import Path
7
  import subprocess
8
  import time
9
  from datetime import datetime
10
 
11
- # Configure logging
12
  logging.basicConfig(
13
  level=logging.INFO,
14
  format="%(asctime)s - %(levelname)s - %(message)s",
@@ -16,11 +17,23 @@ logging.basicConfig(
16
  )
17
  logger = logging.getLogger(__name__)
18
 
19
  # Configuration paths
20
  CONFIG_DIR = "."
21
  TRANSFORMERS_CONFIG = os.path.join(CONFIG_DIR, "transformers_config.json")
22
- HARDWARE_CONFIG = os.path.join(CONFIG_DIR, "hardware_config.json")
23
- DATASET_CONFIG = os.path.join(CONFIG_DIR, "dataset_config.json")
24
 
25
  def load_config(config_path):
26
  """Load configuration from JSON file."""
@@ -29,207 +42,145 @@ def load_config(config_path):
29
  with open(config_path, 'r') as f:
30
  return json.load(f)
31
  else:
32
- logger.error(f"Config file not found: {config_path}")
33
  return None
34
  except Exception as e:
35
- logger.error(f"Error loading config: {str(e)}")
36
  return None
37
 
38
  def display_config():
39
  """Display current training configuration."""
40
- transformers_config = load_config(TRANSFORMERS_CONFIG)
41
- hardware_config = load_config(HARDWARE_CONFIG)
42
- dataset_config = load_config(DATASET_CONFIG)
43
 
44
- if not all([transformers_config, hardware_config, dataset_config]):
45
- return "Error loading configuration files."
46
 
47
- # Extract key parameters
48
- model_name = transformers_config.get("model", {}).get("name", "")
49
- dataset_name = dataset_config.get("dataset", {}).get("name", "")
50
- batch_size = transformers_config.get("training", {}).get("per_device_train_batch_size", 0)
51
- gradient_accum = transformers_config.get("training", {}).get("gradient_accumulation_steps", 0)
52
- lr = transformers_config.get("training", {}).get("learning_rate", 0)
53
- epochs = transformers_config.get("training", {}).get("num_train_epochs", 0)
54
- gpu_count = hardware_config.get("specs", {}).get("gpu_count", 0)
55
- gpu_type = hardware_config.get("specs", {}).get("gpu_type", "")
56
 
57
- config_info = f"""
58
- ## Current Training Configuration
 
 
 
 
59
 
60
- **Model**: {model_name}
61
- **Dataset**: {dataset_name}
 
 
62
 
63
- **Training Parameters**:
64
- - Learning Rate: {lr}
65
- - Epochs: {epochs}
66
- - Batch Size/GPU: {batch_size}
67
- - Gradient Accumulation: {gradient_accum}
68
- - Effective Batch Size: {batch_size * gradient_accum * gpu_count}
69
 
70
- **Hardware**:
71
- - GPUs: {gpu_count}x {gpu_type}
72
- - Flash Attention: {hardware_config.get("memory_optimization", {}).get("use_flash_attention", False)}
73
- - Gradient Checkpointing: {hardware_config.get("memory_optimization", {}).get("use_gradient_checkpointing", False)}
 
 
74
 
75
- **Pre-quantized 4-bit Training**: Enabled
 
 
 
 
76
  """
77
 
78
- return config_info
79
 
80
  def start_training():
81
  """Start the training process."""
82
  try:
83
- # Check if already running
84
- if os.path.exists("training.pid"):
85
- with open("training.pid", "r") as f:
86
- pid = f.read().strip()
87
- try:
88
- # Check if process is still running
89
- os.kill(int(pid), 0)
90
- return f"Training is already running with PID {pid}"
91
- except OSError:
92
- # Process not running, remove stale PID file
93
- os.remove("training.pid")
 
 
 
 
94
 
95
- # Start training in background
96
  cmd = "python run_transformers_training.py"
97
- process = subprocess.Popen(
98
- cmd,
99
- shell=True,
100
- stdout=open('training.log', 'a'),
101
- stderr=subprocess.STDOUT
102
- )
103
 
104
- # Save PID
105
- with open("training.pid", "w") as f:
106
- f.write(str(process.pid))
107
 
108
- # Log start time
109
- with open("training_history.log", "a") as f:
110
- f.write(f"{datetime.now().isoformat()}: Training started (PID: {process.pid})\n")
111
 
112
- return f"Training started with PID {process.pid}. Check status for updates."
113
-
114
- except Exception as e:
115
- return f"Error starting training: {str(e)}"
116
-
117
- def check_training_status():
118
- """Check the status of training."""
119
- try:
120
- # Check if training is running
121
- if os.path.exists("training.pid"):
122
- with open("training.pid", "r") as f:
123
- pid = f.read().strip()
124
- try:
125
- # Check if process is still running
126
- os.kill(int(pid), 0)
127
- status = f"Training is running with PID {pid}"
128
- except OSError:
129
- status = "Training process has stopped"
130
- os.remove("training.pid")
131
- else:
132
- status = "No training process is currently running"
133
-
134
- # Get last lines from training log
135
- log_content = "No training log available"
136
- if os.path.exists("training.log"):
137
- with open("training.log", "r") as f:
138
- lines = f.readlines()
139
- log_content = "".join(lines[-20:]) if lines else "Log file is empty"
140
 
141
- return f"{status}\n\n**Recent Log:**\n```\n{log_content}\n```"
142
-
143
  except Exception as e:
144
- return f"Error checking status: {str(e)}"
 
 
145
 
146
- # Create the Gradio interface
147
- with gr.Blocks(title="Phi-4 Unsloth Training", theme=gr.themes.Soft(primary_hue="blue")) as app:
148
- gr.Markdown("# Phi-4 Unsloth 4-bit Training Interface")
149
 
150
- with gr.Tabs():
151
- with gr.TabItem("Configuration"):
152
- config_output = gr.Markdown(display_config())
153
- refresh_btn = gr.Button("Refresh Configuration")
154
- refresh_btn.click(fn=display_config, outputs=config_output)
155
-
156
- with gr.TabItem("Training Control"):
157
- gr.Markdown("## Training Management")
158
-
159
- with gr.Row():
160
- start_btn = gr.Button("Start Training", variant="primary")
161
- check_btn = gr.Button("Check Status")
162
-
163
- status_output = gr.Markdown("Click 'Check Status' to see training progress")
164
-
165
- start_btn.click(fn=start_training, outputs=status_output)
166
- check_btn.click(fn=check_training_status, outputs=status_output)
167
-
168
- # Auto-refresh status
169
- gr.HTML('''
170
- <script>
171
- let intervalId;
172
-
173
- document.addEventListener('DOMContentLoaded', function() {
174
- // Find the "Check Status" button
175
- const buttons = Array.from(document.querySelectorAll('button'));
176
- const checkBtn = buttons.find(btn => btn.textContent.includes('Check Status'));
177
 
178
- // Set up interval to click the button every 30 seconds
179
- if (checkBtn) {
180
- intervalId = setInterval(() => {
181
- checkBtn.click();
182
- }, 30000);
183
- }
184
- });
185
-
186
- // Clean up on tab/window close
187
- window.addEventListener('beforeunload', function() {
188
- clearInterval(intervalId);
189
- });
190
- </script>
191
- ''')
192
 
193
- with gr.TabItem("Help"):
194
- gr.Markdown("""
195
- ## Phi-4 Unsloth Training Help
196
-
197
- This interface allows you to manage training of the Phi-4 model with Unsloth 4-bit optimizations.
198
-
199
- ### Installation
200
-
201
- Before starting training, ensure all dependencies are installed:
202
-
203
- ```bash
204
- pip install -r requirements.txt
205
- ```
206
-
207
- Critical packages:
208
- - unsloth (>=2024.3)
209
- - peft (>=0.9.0)
210
- - transformers (>=4.36.0)
211
-
212
- ### Quick Start
213
-
214
- 1. Review the configuration in the Configuration tab
215
- 2. Click "Start Training" to begin the process
216
- 3. Use "Check Status" to monitor progress
217
-
218
- ### Notes
219
-
220
- - Training uses the pre-quantized model `unsloth/phi-4-unsloth-bnb-4bit`
221
- - The process maintains paper order and handles metadata appropriately
222
- - Training progress will be regularly saved to HuggingFace Hub
223
-
224
- ### Troubleshooting
225
-
226
- If training stops unexpectedly:
227
- - Check the logs for out-of-memory errors
228
- - Verify the VRAM usage on each GPU
229
- - Check for CUDA version compatibility
230
- - If you see "Unsloth not available" error, run: `pip install unsloth>=2024.3 peft>=0.9.0`
231
- """)
232
 
233
- # Launch the app
234
  if __name__ == "__main__":
235
- app.launch()
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+
4
  import os
5
  import sys
6
  import json
7
  import logging
 
 
8
  import subprocess
9
  import time
10
  from datetime import datetime
11
 
12
+ # Configure logging to match HF Space logs
13
  logging.basicConfig(
14
  level=logging.INFO,
15
  format="%(asctime)s - %(levelname)s - %(message)s",
 
17
  )
18
  logger = logging.getLogger(__name__)
19
 
20
+ # Set other loggers to WARNING to reduce noise and ensure our logs are visible
21
+ logging.getLogger("transformers").setLevel(logging.WARNING)
22
+ logging.getLogger("datasets").setLevel(logging.WARNING)
23
+ logging.getLogger("accelerate").setLevel(logging.WARNING)
24
+ logging.getLogger("torch").setLevel(logging.WARNING)
25
+ logging.getLogger("bitsandbytes").setLevel(logging.WARNING)
26
+
27
+ # Define a clean logging function for HF Space compatibility
28
+ def log_info(message):
29
+ """Log information in a format compatible with Hugging Face Spaces"""
30
+ logger.info(message)
31
+ # Ensure output is flushed immediately for streaming
32
+ sys.stdout.flush()
33
+
34
  # Configuration paths
35
  CONFIG_DIR = "."
36
  TRANSFORMERS_CONFIG = os.path.join(CONFIG_DIR, "transformers_config.json")
 
 
37
 
38
  def load_config(config_path):
39
  """Load configuration from JSON file."""
 
42
  with open(config_path, 'r') as f:
43
  return json.load(f)
44
  else:
45
+ log_info(f"Config file not found: {config_path}")
46
  return None
47
  except Exception as e:
48
+ log_info(f"Error loading config: {str(e)}")
49
  return None
50
 
51
  def display_config():
52
  """Display current training configuration."""
53
+ config = load_config(TRANSFORMERS_CONFIG)
 
 
54
 
55
+ if not config:
56
+ return "Error loading configuration file."
57
 
58
+ # Extract sub-configurations
59
+ transformers_config = config
60
+ hardware_config = config.get("hardware", {})
61
+ dataset_config = config.get("dataset", {})
62
+
63
+ model_name = transformers_config.get("model", {}).get("name") or transformers_config.get("model_name_or_path", "")
 
 
 
64
 
65
+ # Training parameters
66
+ training_config = transformers_config.get("training", {})
67
+ batch_size = training_config.get("per_device_train_batch_size", 16)
68
+ grad_accum = training_config.get("gradient_accumulation_steps", 3)
69
+ epochs = training_config.get("num_train_epochs", 3)
70
+ learning_rate = training_config.get("learning_rate", 2e-5)
71
 
72
+ # Hardware settings
73
+ gpu_count = hardware_config.get("specs", {}).get("gpu_count", 4)
74
+ gpu_type = hardware_config.get("specs", {}).get("gpu_type", "L4")
75
+ vram = hardware_config.get("specs", {}).get("vram_per_gpu", 24)
76
 
77
+ # Dataset info
78
+ dataset_name = dataset_config.get("dataset", {}).get("name", "")
79
+
80
+ # Format response as HTML for better display
81
+ html = f"""
82
+ <h2>Training Configuration</h2>
83
+ <h3>Model</h3>
84
+ <ul>
85
+ <li><b>Model:</b> {model_name}</li>
86
+ <li><b>Learning Rate:</b> {training_config.get('learning_rate', '2e-5')}</li>
87
+ <li><b>Batch Size:</b> {training_config.get('per_device_train_batch_size', 4)} × {training_config.get('gradient_accumulation_steps', 4)} = {training_config.get('per_device_train_batch_size', 4) * training_config.get('gradient_accumulation_steps', 4)}</li>
88
+ <li><b>Epochs:</b> {training_config.get('num_train_epochs', 3)}</li>
89
+ <li><b>Precision:</b> {'BF16' if transformers_config.get('bf16', True) else 'FP16' if transformers_config.get('fp16', False) else 'FP32'}</li>
90
+ <li><b>Max Sequence Length:</b> {transformers_config.get('tokenizer', {}).get('max_seq_length', 2048)}</li>
91
+ </ul>
92
 
93
+ <h3>Hardware</h3>
94
+ <ul>
95
+ <li><b>GPU:</b> {gpu_count}× {gpu_type} ({vram} GB)</li>
96
+ <li><b>Multi-GPU Strategy:</b> {hardware_config.get('training_optimizations', {}).get('multi_gpu_strategy', 'data_parallel')}</li>
97
+ <li><b>Memory Optimizations:</b> {'Gradient Checkpointing' if hardware_config.get('training_optimizations', {}).get('memory_optimizations', {}).get('use_gradient_checkpointing', True) else 'None'}</li>
98
+ </ul>
99
 
100
+ <h3>Dataset</h3>
101
+ <ul>
102
+ <li><b>Dataset:</b> {dataset_name}</li>
103
+ <li><b>Dataset Split:</b> {dataset_config.get('dataset', {}).get('split', 'train')}</li>
104
+ </ul>
105
  """
106
 
107
+ return html
108
 
109
  def start_training():
110
  """Start the training process."""
111
  try:
112
+ # Run verification script first
113
+ log_info("Running pre-training verification...")
114
+ verify_cmd = "python verify_deployment.py"
115
+ try:
116
+ result = subprocess.run(verify_cmd, shell=True, check=True, capture_output=True, text=True)
117
+ if "All critical checks passed!" not in result.stdout:
118
+ log_info("Verification found issues. Please review:")
119
+ log_info(result.stdout)
120
+ return "Verification detected potential issues. Please review the logs before proceeding."
121
+ except subprocess.CalledProcessError as e:
122
+ log_info(f"Verification failed: {e.stderr}")
123
+ return "Verification failed. Please check the logs for details."
124
+
125
+ # Start training
126
+ log_info("Starting training process...")
127
 
128
+ # Run in a background process for HF Space
129
  cmd = "python run_transformers_training.py"
130
 
131
+ # In HF Spaces, we don't need to handle process management ourselves
132
+ subprocess.Popen(cmd, shell=True, stdout=sys.stdout, stderr=sys.stderr)
 
133
 
134
+ log_info("Training process has been started. You can monitor progress in the logs.")
 
 
135
 
136
+ return "Training started successfully. Monitor progress in the Hugging Face Space logs."
137
 
 
 
138
  except Exception as e:
139
+ error_msg = f"Error starting training: {str(e)}"
140
+ log_info(error_msg)
141
+ return error_msg
142
 
143
+ # Interface setup for gradio
144
+ def create_interface():
145
+ import gradio as gr
146
 
147
+ with gr.Blocks(title="Phi-4 Training Center") as demo:
148
+ gr.Markdown("# Phi-4 Research Assistant Training")
149
+
150
+ with gr.Row():
151
+ with gr.Column():
152
+ gr.Markdown("## Control Panel")
153
 
154
+ # Display current config
155
+ config_html = gr.HTML(display_config())
156
+ refresh_btn = gr.Button("Refresh Configuration")
157
+
158
+ # Training controls
159
+ train_btn = gr.Button("Start Training", variant="primary")
160
+ train_output = gr.Textbox(label="Status", interactive=False)
161
+
162
+ with gr.Column():
163
+ gr.Markdown("## Training Information")
164
+ gr.Markdown("""
165
+ ### Hardware:
166
+ - 4× NVIDIA L4 GPUs (24GB VRAM each)
167
+ - Training with BF16 precision
168
+ - Using Data Parallel for multi-GPU
169
+
170
+ ### Notes:
171
+ - Training may take several hours depending on dataset size
172
+ - Check the Space logs for real-time progress
173
+ - Model checkpoints will be saved to ./results directory
174
+ """)
175
+
176
+ # Connect buttons to functions
177
+ refresh_btn.click(lambda: gr.update(value=display_config()), outputs=config_html)
178
+ train_btn.click(start_training, outputs=train_output)
179
 
180
+ return demo
181
 
 
182
  if __name__ == "__main__":
183
+ # If run directly, create and launch the Gradio interface
184
+ demo = create_interface()
185
+ demo.queue()
186
+ demo.launch()
fixed_run_transformers_training.py ADDED
@@ -0,0 +1,230 @@
1
+ def format_phi_chat(messages, dataset_config):
2
+ """Format messages according to phi-4's chat template and dataset config."""
3
+ formatted_chat = ""
4
+
5
+ # Get role templates from config
6
+ roles = dataset_config.get("data_formatting", {}).get("roles", {
7
+ "system": "System: {content}\n\n",
8
+ "human": "Human: {content}\n\n",
9
+ "user": "Human: {content}\n\n",
10
+ "assistant": "Assistant: {content}\n\n"
11
+ })
12
+
13
+ # Handle research introduction metadata first
14
+ metadata = next((msg for msg in messages if isinstance(msg, dict) and
15
+ "[RESEARCH INTRODUCTION]" in msg.get("content", "")), None)
16
+ if metadata:
17
+ system_template = roles.get("system", "System: {content}\n\n")
18
+ formatted_chat = system_template.format(content=metadata['content'])
19
+ messages = [msg for msg in messages if msg != metadata]
20
+
21
+ # Process remaining messages
22
+ for message in messages:
23
+ if not isinstance(message, dict) or "content" not in message:
24
+ logger.warning(f"Skipping invalid message format: {message}")
25
+ continue
26
+
27
+ role = message.get("role", "").lower()
28
+ content = message.get("content", "")
29
+
30
+ # Format based on role
31
+ if role == "human" or role == "user":
32
+ template = roles.get("user", roles.get("human", "Human: {content}\n\n"))
33
+ formatted_chat += template.format(content=content)
34
+ elif role == "assistant" or role == "bot":
35
+ template = roles.get("assistant", "Assistant: {content}\n\n")
36
+ formatted_chat += template.format(content=content)
37
+ elif role == "system":
38
+ # For system messages, prepend them
39
+ template = roles.get("system", "System: {content}\n\n")
40
+ formatted_chat = template.format(content=content) + formatted_chat
41
+ else:
42
+ # Default to system for unknown roles
43
+ logger.warning(f"Unknown role '{role}' - treating as system message")
44
+ template = roles.get("system", "System: {content}\n\n")
45
+ formatted_chat += template.format(content=content)
46
+
47
+ return formatted_chat.strip()
48
+
49
+ class SimpleDataCollator:
50
+ def __init__(self, tokenizer, dataset_config):
51
+ self.tokenizer = tokenizer
52
+ self.dataset_config = dataset_config
53
+ self.stats = {"processed": 0, "skipped": 0, "total_tokens": 0}
54
+ self.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
55
+ self.max_seq_length = dataset_config.get("dataset", {}).get("processing", {}).get("max_seq_length", 2048)
56
+ logger.info(f"SimpleDataCollator initialized - using pre-audited dataset with max_seq_length={self.max_seq_length}")
57
+ logger.info("Using exact dataset structure without reformatting")
58
+
59
+ # Check if we're on GPU
60
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
61
+ logger.info(f"SimpleDataCollator using device: {self.device}")
62
+
63
+ def __call__(self, features):
64
+ """Process examples preserving exact JSONL structure"""
65
+ batch = {"input_ids": [], "attention_mask": [], "labels": []}
66
+
67
+ for example in features:
68
+ try:
69
+ # Get ID
70
+ paper_id = example.get("id", "")
71
+
72
+ # Get conversations - these should already contain role and content
73
+ conversations = example.get("conversations", [])
74
+ if not conversations:
75
+ self.stats["skipped"] += 1
76
+ continue
77
+
78
+ # Directly use the conversations array as input to the model's chat template
79
+ # This preserves the exact structure with roles and content as they are
80
+ try:
81
+ # Let tokenizer handle the content with the model's chat template
82
+ inputs = self.tokenizer.apply_chat_template(
83
+ conversations,
84
+ return_tensors=None,
85
+ add_generation_prompt=False
86
+ )
87
+ except Exception as chat_error:
88
+ # Fallback if apply_chat_template fails
89
+ logger.warning(f"Chat template application failed for example {paper_id}: {str(chat_error)[:100]}")
90
+
91
+ # Create a basic representation of the conversation
92
+ conversation_text = ""
93
+ for msg in conversations:
94
+ if isinstance(msg, dict) and 'content' in msg:
95
+ conversation_text += msg.get('content', '') + "\n\n"
96
+
97
+ # Basic tokenization
98
+ inputs = self.tokenizer(
99
+ conversation_text,
100
+ add_special_tokens=True,
101
+ return_tensors=None
102
+ )
103
+
104
+ # Apply length cap if needed (shouldn't be necessary for pre-audited data)
105
+ if self.max_seq_length > 0 and len(inputs) > self.max_seq_length:
106
+ logger.warning(f"Example {paper_id} exceeds max_seq_length ({len(inputs)} > {self.max_seq_length})")
107
+ inputs = inputs[:self.max_seq_length]
108
+
109
+ # Create attention mask (1 for all tokens)
110
+ attention_mask = [1] * len(inputs)
111
+
112
+ if len(inputs) > 0:
113
+ # For causal language modeling, labels are the same as inputs
114
+ labels = inputs.copy()
115
+
116
+ batch["input_ids"].append(inputs)
117
+ batch["attention_mask"].append(attention_mask)
118
+ batch["labels"].append(labels)
119
+
120
+ self.stats["processed"] += 1
121
+ self.stats["total_tokens"] += len(inputs)
122
+
123
+ # Debug logging for first few examples
124
+ log_samples = self.dataset_config.get("validation", {}).get("log_samples", 3)
125
+ if self.stats["processed"] <= log_samples:
126
+ logger.info(f"Example {self.stats['processed']}:")
127
+ logger.info(f"Paper ID: {paper_id}")
128
+ logger.info(f"Token count: {len(inputs)}")
129
+ logger.info(f"Conversation entries: {len(conversations)}")
130
+ else:
131
+ self.stats["skipped"] += 1
132
+ except Exception as e:
133
+ logger.warning(f"Error processing example: {str(e)[:100]}...")
134
+ logger.warning(f"Problematic example ID: {example.get('id', 'unknown')}")
135
+ self.stats["skipped"] += 1
136
+ continue
137
+
138
+ if not batch["input_ids"]:
139
+ logger.warning("Empty batch, returning dummy tensors")
140
+ return {
141
+ "input_ids": torch.zeros((1, 1), dtype=torch.long),
142
+ "attention_mask": torch.zeros((1, 1), dtype=torch.long),
143
+ "labels": torch.zeros((1, 1), dtype=torch.long)
144
+ }
145
+
146
+ # Pad the batch
147
+ max_length = max(len(ids) for ids in batch["input_ids"])
148
+
149
+ for i in range(len(batch["input_ids"])):
150
+ padding_length = max_length - len(batch["input_ids"][i])
151
+ if padding_length > 0:
152
+ batch["input_ids"][i].extend([self.pad_token_id] * padding_length)
153
+ batch["attention_mask"][i].extend([0] * padding_length)
154
+ batch["labels"][i].extend([-100] * padding_length)
155
+
156
+ # Convert to tensors
157
+ batch = {k: torch.tensor(v, dtype=torch.long) for k, v in batch.items()}
158
+
159
+ # Log stats periodically
160
+ log_interval = self.dataset_config.get("validation", {}).get("log_interval", 100)
161
+ if self.stats["processed"] % log_interval == 0 and self.stats["processed"] > 0:
162
+ logger.info(f"Data collator stats: processed={self.stats['processed']}, "
163
+ f"skipped={self.stats['skipped']}, "
164
+ f"avg_tokens={self.stats['total_tokens']/self.stats['processed']:.1f}")
165
+
166
+ return batch
167
+
168
+ class LoggingCallback(TrainerCallback):
169
+ def __init__(self):
170
+ self.last_log_time = time.time()
171
+ self.last_memory_log_time = time.time()
172
+
173
+ def on_step_end(self, args, state, control, **kwargs):
174
+ # Log every 50 steps or every 5 minutes, whichever comes first
175
+ current_time = time.time()
176
+
177
+ # Log loss every 50 steps or 5 minutes
178
+ if (state.global_step % 50 == 0) or (current_time - self.last_log_time > 300):
179
+ if state.log_history:
180
+ loss = state.log_history[-1].get('loss', 'N/A')
181
+ # Use simple formatting for better HF Space log compatibility
182
+ log_info(f"Step {state.global_step}: Loss {loss}")
183
+ else:
184
+ log_info(f"Step {state.global_step}: No loss data available")
185
+ self.last_log_time = current_time
186
+
187
+ # Log memory usage every 15 minutes
188
+ if current_time - self.last_memory_log_time > 900: # 15 minutes
189
+ if torch.cuda.is_available():
190
+ memory_info = []
191
+ for i in range(torch.cuda.device_count()):
192
+ allocated = torch.cuda.memory_allocated(i) / 1024**2
193
+ reserved = torch.cuda.memory_reserved(i) / 1024**2
194
+ memory_info.append(f"GPU {i}: {allocated:.1f}MB/{reserved:.1f}MB")
195
+
196
+ # Log in compact format for better visibility
197
+ log_info(f"Memory usage - {', '.join(memory_info)}")
198
+ self.last_memory_log_time = current_time
199
+
200
+ def on_train_begin(self, args, state, control, **kwargs):
201
+ log_info("=== Training is starting ===")
202
+
203
+ # Log important training parameters for visibility
204
+ log_info(f"Batch size: {args.per_device_train_batch_size} × {args.gradient_accumulation_steps} steps × {max(1, torch.cuda.device_count())} GPUs")
205
+ log_info(f"Learning rate: {args.learning_rate}")
206
+ log_info(f"Epochs: {args.num_train_epochs}")
207
+
208
+ # Log memory information in compact format
209
+ if torch.cuda.is_available():
210
+ memory_info = []
211
+ for i in range(torch.cuda.device_count()):
212
+ allocated = torch.cuda.memory_allocated(i) / 1024**2
213
+ max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
214
+ memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
215
+
216
+ log_info(f"Initial memory usage - {', '.join(memory_info)}")
217
+
218
+ def on_train_end(self, args, state, control, **kwargs):
219
+ log_info("=== Training completed ===")
220
+ if torch.cuda.is_available():
221
+ memory_info = []
222
+ for i in range(torch.cuda.device_count()):
223
+ allocated = torch.cuda.memory_allocated(i) / 1024**2
224
+ max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
225
+ memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
226
+
227
+ log_info(f"Final memory usage - {', '.join(memory_info)}")
228
+
229
+ log_info(f"Total steps: {state.global_step}")
230
+ log_info(f"Final loss: {state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'}")
run_transformers_training.py CHANGED
@@ -113,26 +113,24 @@ def load_env_variables():
113
  os.environ["HUGGING_FACE_HUB_TOKEN"] = os.environ.get("HF_TOKEN")
114
 
115
  def load_configs(base_path):
116
- """Load all configuration files."""
117
  configs = {}
118
 
119
- # List of config files to load
120
- config_files = [
121
- "transformers_config.json",
122
- "hardware_config.json",
123
- "dataset_config.json"
124
- ]
125
 
126
- for config_file in config_files:
127
- file_path = os.path.join(base_path, config_file)
128
- try:
129
- with open(file_path, "r") as f:
130
- config_name = config_file.replace("_config.json", "")
131
- configs[config_name] = json.load(f)
132
- logger.info(f"Loaded {config_name} configuration from {file_path}")
133
- except Exception as e:
134
- logger.error(f"Error loading {config_file}: {e}")
135
- raise
 
 
136
 
137
  return configs
138
 
@@ -238,7 +236,7 @@ def load_model_and_tokenizer(config):
238
 
239
  # Ensure model and optimizer init is on the same device
240
  logger.info(f"Model device map: {model.hf_device_map if hasattr(model, 'hf_device_map') else 'Not available'}")
241
-
242
  # Apply Unsloth's training optimizations with config parameters
243
  unsloth_config = config.get("unsloth", {})
244
  model = FastLanguageModel.get_peft_model(
@@ -640,25 +638,32 @@ def main():
640
  # Load environment variables
641
  load_env_variables()
642
 
643
  # Load all configurations
644
  try:
645
  configs = load_configs(args.config_dir)
646
 
647
  # Extract specific configs
648
  if not configs:
649
- logger.error("Failed to load configurations")
650
  return 1
651
 
652
- # Verify configurations exist
653
  if "transformers" not in configs:
654
  logger.error("transformers_config.json not found or invalid")
655
  return 1
656
 
657
- if "hardware" not in configs:
658
- logger.warning("hardware_config.json not found. Using default hardware configuration.")
659
 
660
- if "dataset" not in configs:
661
- logger.error("dataset_config.json not found or invalid")
662
  return 1
663
 
664
  # Validate model configuration
@@ -679,22 +684,36 @@ def main():
679
 
680
  # Apply hardware-specific settings if available
681
  if hardware_config:
 
682
  training_opts = hardware_config.get("training_optimizations", {})
683
- per_device_batch_size = training_opts.get("per_device_batch_size")
684
- gradient_accumulation = training_opts.get("gradient_accumulation_steps")
685
 
686
- if per_device_batch_size and model_config.get("training"):
687
- model_config["training"]["per_device_train_batch_size"] = per_device_batch_size
688
- log_info(f"Applied hardware-specific batch size: {per_device_batch_size}")
689
-
690
- if gradient_accumulation and model_config.get("training"):
691
- model_config["training"]["gradient_accumulation_steps"] = gradient_accumulation
692
- log_info(f"Applied hardware-specific gradient accumulation: {gradient_accumulation}")
693
694
  # Apply memory optimizations
695
  memory_opts = training_opts.get("memory_optimizations", {})
696
  if memory_opts.get("use_gradient_checkpointing") is not None and model_config.get("training"):
697
- model_config["training"]["gradient_checkpointing"] = memory_opts["use_gradient_checkpointing"]
698
 
699
  except Exception as e:
700
  logger.error(f"Error loading configurations: {e}")
@@ -713,13 +732,17 @@ def main():
713
  # Set memory management env vars for better fragmentation handling
714
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True"
715
 
 
 
 
716
  # Log initial memory information in a compact form
717
  gpu_info = []
718
  for i in range(torch.cuda.device_count()):
719
  name = torch.cuda.get_device_name(i)
720
  allocated = torch.cuda.memory_allocated(i) / 1024**3
721
  total = torch.cuda.get_device_properties(i).total_memory / 1024**3
722
- gpu_info.append(f"GPU {i}: {name} ({allocated:.1f}GB/{total:.1f}GB)")
 
723
 
724
  log_info(f"Hardware: {torch.cuda.device_count()} GPUs detected")
725
  log_info(f"GPU details: {', '.join(gpu_info)}")
@@ -739,28 +762,44 @@ def main():
739
  except Exception as e:
740
  logger.error(f"Error loading dataset: {e}")
741
  return 1
742
-
743
  # Create data collator
744
  data_collator = SimpleDataCollator(tokenizer, dataset_config)
745
 
746
  # Verify precision settings - ensure only one of bf16/fp16 is set, with bf16 taking precedence
747
- use_bf16 = model_config.get("bf16", False) or model_config.get("torch_dtype", "") == "bfloat16"
748
- use_fp16 = model_config.get("fp16", False) and not use_bf16 # Only use fp16 if bf16 is not set
749
-
750
- log_info(f"Using precision: {'bf16' if use_bf16 else 'fp16' if use_fp16 else 'full precision'}")
751
 
752
- # Get per device batch size - temporarily reduce if necessary for multi-GPU setup
753
- per_device_batch_size = model_config.get("training", {}).get("per_device_train_batch_size", 24)
754
- gradient_accumulation_steps = model_config.get("training", {}).get("gradient_accumulation_steps", 2)
755
 
756
  # For multi-GPU setup, adjust for better balance
757
  if torch.cuda.device_count() > 1:
758
  log_info(f"Multi-GPU setup with {torch.cuda.device_count()} GPUs")
759
  log_info(f"Training config: {per_device_batch_size} samples/GPU × {gradient_accumulation_steps} accumulation steps")
760
 
761
- # Set up FSDP for multi-GPU training if available
 
 
 
762
  fsdp_config = None
763
- if torch.cuda.device_count() > 1:
764
  try:
765
  from torch.distributed.fsdp import (
766
  FullyShardedDataParallel as FSDP,
@@ -793,6 +832,15 @@ def main():
793
  except ImportError:
794
  log_info("FSDP imports failed, falling back to standard DDP")
795
  fsdp_config = None
796
 
797
  # Set up training arguments
798
  log_info("Setting up training arguments")
@@ -818,13 +866,14 @@ def main():
818
  report_to="tensorboard",
819
  remove_unused_columns=False, # Keep all columns
820
  gradient_checkpointing=model_config.get("training", {}).get("gradient_checkpointing", True),
821
- dataloader_pin_memory=True, # Keep data in pinned memory for faster transfer
822
  optim=model_config.get("training", {}).get("optim", "adamw_torch"),
823
  ddp_find_unused_parameters=False, # Improve distributed training efficiency
824
  dataloader_drop_last=False, # Process all examples
825
- dataloader_num_workers=2, # Reduced worker count
826
  no_cuda=False if torch.cuda.is_available() else True, # Use CUDA if available
827
- fsdp=fsdp_config, # Add FSDP configuration if available
 
828
  )
829
 
830
  # Create sequential sampler to maintain original dataset order
@@ -907,7 +956,7 @@ def main():
907
  memory_info.append(f"GPU {i}: {allocated:.1f}MB/{reserved:.1f}MB (max: {max_mem:.1f}MB)")
908
  logger.error(f"GPU memory at failure: {', '.join(memory_info)}")
909
  raise
910
-
911
  except Exception as e:
912
  logger.error(f"Error in main training loop: {str(e)}")
913
  return 1
 
113
  os.environ["HUGGING_FACE_HUB_TOKEN"] = os.environ.get("HF_TOKEN")
114
 
115
  def load_configs(base_path):
116
+ """Load all configuration from a single consolidated file."""
117
  configs = {}
118
 
119
+ # Using a single consolidated config file
120
+ config_file = "transformers_config.json"
 
 
 
 
121
 
122
+ file_path = os.path.join(base_path, config_file)
123
+ try:
124
+ with open(file_path, "r") as f:
125
+ config = json.load(f)
126
+ # Extract sections into separate config dictionaries for compatibility
127
+ configs["transformers"] = config
128
+ configs["hardware"] = config.get("hardware", {})
129
+ configs["dataset"] = config.get("dataset", {})
130
+ logger.info(f"Loaded consolidated configuration from {file_path}")
131
+ except Exception as e:
132
+ logger.error(f"Error loading {config_file}: {e}")
133
+ raise
134
 
135
  return configs
136
 
 
236
 
237
  # Ensure model and optimizer init is on the same device
238
  logger.info(f"Model device map: {model.hf_device_map if hasattr(model, 'hf_device_map') else 'Not available'}")
239
+
240
  # Apply Unsloth's training optimizations with config parameters
241
  unsloth_config = config.get("unsloth", {})
242
  model = FastLanguageModel.get_peft_model(
 
638
  # Load environment variables
639
  load_env_variables()
640
 
641
+ # Check if we're in distributed mode
642
+ is_distributed = "WORLD_SIZE" in os.environ and int(os.environ.get("WORLD_SIZE", "1")) > 1
643
+ if is_distributed:
644
+ log_info(f"Running in distributed mode with world size: {os.environ.get('WORLD_SIZE')}")
645
+ else:
646
+ log_info("Running in non-distributed mode (single process)")
647
+
648
  # Load all configurations
649
  try:
650
  configs = load_configs(args.config_dir)
651
 
652
  # Extract specific configs
653
  if not configs:
654
+ logger.error("Failed to load configuration")
655
  return 1
656
 
657
+ # Verify configuration sections exist
658
  if "transformers" not in configs:
659
  logger.error("transformers_config.json not found or invalid")
660
  return 1
661
 
662
+ if "hardware" not in configs or not configs["hardware"]:
663
+ logger.warning("Hardware configuration section not found in transformers_config.json. Using default hardware configuration.")
664
 
665
+ if "dataset" not in configs or not configs["dataset"]:
666
+ logger.error("Dataset configuration section not found in transformers_config.json")
667
  return 1
668
 
669
  # Validate model configuration
 
684
 
685
  # Apply hardware-specific settings if available
686
  if hardware_config:
687
+ # Get training optimizations from hardware config
688
  training_opts = hardware_config.get("training_optimizations", {})
 
 
689
 
690
+ # Apply batch size and gradient accumulation settings
691
+ if training_opts.get("per_device_batch_size") and model_config.get("training"):
692
+ batch_size = training_opts.get("per_device_batch_size")
693
+ model_config["training"]["per_device_train_batch_size"] = batch_size
694
+ log_info(f"Applied hardware-optimized batch size: {batch_size}")
 
 
695
 
696
+ if training_opts.get("gradient_accumulation_steps") and model_config.get("training"):
697
+ grad_steps = training_opts.get("gradient_accumulation_steps")
698
+ model_config["training"]["gradient_accumulation_steps"] = grad_steps
699
+ log_info(f"Applied hardware-optimized gradient accumulation: {grad_steps}")
700
+
701
  # Apply memory optimizations
702
  memory_opts = training_opts.get("memory_optimizations", {})
703
  if memory_opts.get("use_gradient_checkpointing") is not None and model_config.get("training"):
704
+ grad_ckpt = memory_opts.get("use_gradient_checkpointing")
705
+ model_config["training"]["gradient_checkpointing"] = grad_ckpt
706
+ log_info(f"Applied hardware-optimized gradient checkpointing: {grad_ckpt}")
707
+
708
+ # Apply system settings
709
+ system_settings = hardware_config.get("system_settings", {})
710
+ if system_settings.get("dataloader_num_workers") is not None:
711
+ workers = system_settings.get("dataloader_num_workers")
712
+ log_info(f"Using {workers} dataloader workers from hardware config")
713
+
714
+ # Get distribution strategy
715
+ multi_gpu_strategy = training_opts.get("multi_gpu_strategy", "data_parallel")
716
+ log_info(f"Hardware config specifies {multi_gpu_strategy} for multi-GPU training")
717
 
718
  except Exception as e:
719
  logger.error(f"Error loading configurations: {e}")
 
732
  # Set memory management env vars for better fragmentation handling
733
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True"
734
 
735
+ # Get memory fraction from hardware config
736
+ cuda_memory_fraction = hardware_config.get("system_settings", {}).get("cuda_memory_fraction", 0.85)
737
+
738
  # Log initial memory information in a compact form
739
  gpu_info = []
740
  for i in range(torch.cuda.device_count()):
741
  name = torch.cuda.get_device_name(i)
742
  allocated = torch.cuda.memory_allocated(i) / 1024**3
743
  total = torch.cuda.get_device_properties(i).total_memory / 1024**3
744
+ reserved_memory = total * cuda_memory_fraction
745
+ gpu_info.append(f"GPU {i}: {name} ({allocated:.1f}GB/{reserved_memory:.1f}GB)")
746
 
747
  log_info(f"Hardware: {torch.cuda.device_count()} GPUs detected")
748
  log_info(f"GPU details: {', '.join(gpu_info)}")
 
762
  except Exception as e:
763
  logger.error(f"Error loading dataset: {e}")
764
  return 1
765
+
766
  # Create data collator
767
  data_collator = SimpleDataCollator(tokenizer, dataset_config)
768
 
769
  # Verify precision settings - ensure only one of bf16/fp16 is set, with bf16 taking precedence
770
+ # First check hardware config, then transformers config
771
+ use_bf16 = False
772
+ use_fp16 = False
773
+
774
+ # Check hardware config first
775
+ hardware_precision = hardware_config.get("training_optimizations", {}).get("mixed_precision", "")
776
+ if hardware_precision.lower() == "bf16":
777
+ use_bf16 = True
778
+ log_info("Using BF16 precision from hardware config")
779
+ elif hardware_precision.lower() == "fp16":
780
+ use_fp16 = True
781
+ log_info("Using FP16 precision from hardware config")
782
+ else:
783
+ # Fall back to transformers config
784
+ use_bf16 = model_config.get("bf16", False) or model_config.get("torch_dtype", "") == "bfloat16"
785
+ use_fp16 = model_config.get("fp16", False) and not use_bf16 # Only use fp16 if bf16 is not set
786
+ log_info(f"Using precision: {'bf16' if use_bf16 else 'fp16' if use_fp16 else 'full precision'}")
787
 
788
+ # Get per device batch size - from transformers config, but possibly overridden by hardware config
789
+ per_device_batch_size = model_config.get("training", {}).get("per_device_train_batch_size", 16)
790
+ gradient_accumulation_steps = model_config.get("training", {}).get("gradient_accumulation_steps", 3)
791
 
792
  # For multi-GPU setup, adjust for better balance
793
  if torch.cuda.device_count() > 1:
794
  log_info(f"Multi-GPU setup with {torch.cuda.device_count()} GPUs")
795
  log_info(f"Training config: {per_device_batch_size} samples/GPU × {gradient_accumulation_steps} accumulation steps")
796
 
797
+ # Determine multi-GPU strategy from hardware config
798
+ multi_gpu_strategy = hardware_config.get("training_optimizations", {}).get("multi_gpu_strategy", "data_parallel")
799
+
800
+ # Set up FSDP for multi-GPU training if specified and in distributed mode
801
  fsdp_config = None
802
+ if multi_gpu_strategy == "fsdp" and is_distributed and torch.cuda.device_count() > 1:
803
  try:
804
  from torch.distributed.fsdp import (
805
  FullyShardedDataParallel as FSDP,
 
832
  except ImportError:
833
  log_info("FSDP imports failed, falling back to standard DDP")
834
  fsdp_config = None
835
+ elif multi_gpu_strategy == "fsdp" and not is_distributed:
836
+ log_info("FSDP disabled: requires distributed environment (use torchrun or accelerate)")
837
+ log_info("Using DataParallel for multi-GPU training instead")
838
+ else:
839
+ log_info(f"Using {multi_gpu_strategy} for multi-GPU training")
840
+
841
+ # Get system settings from hardware config
842
+ dataloader_workers = hardware_config.get("system_settings", {}).get("dataloader_num_workers", 2)
843
+ pin_memory = hardware_config.get("system_settings", {}).get("dataloader_pin_memory", True)
844
 
845
  # Set up training arguments
846
  log_info("Setting up training arguments")
 
866
  report_to="tensorboard",
867
  remove_unused_columns=False, # Keep all columns
868
  gradient_checkpointing=model_config.get("training", {}).get("gradient_checkpointing", True),
869
+ dataloader_pin_memory=pin_memory,
870
  optim=model_config.get("training", {}).get("optim", "adamw_torch"),
871
  ddp_find_unused_parameters=False, # Improve distributed training efficiency
872
  dataloader_drop_last=False, # Process all examples
873
+ dataloader_num_workers=dataloader_workers,
874
  no_cuda=False if torch.cuda.is_available() else True, # Use CUDA if available
875
+ # Only add FSDP if we're in distributed mode with FSDP strategy
876
+ fsdp=fsdp_config if is_distributed and multi_gpu_strategy == "fsdp" else None,
877
  )
878
 
879
  # Create sequential sampler to maintain original dataset order
 
956
  memory_info.append(f"GPU {i}: {allocated:.1f}MB/{reserved:.1f}MB (max: {max_mem:.1f}MB)")
957
  logger.error(f"GPU memory at failure: {', '.join(memory_info)}")
958
  raise
959
+
960
  except Exception as e:
961
  logger.error(f"Error in main training loop: {str(e)}")
962
  return 1
transformers_config.json CHANGED
@@ -60,7 +60,7 @@
60
 
61
  "distributed_training": {
62
  "fsdp_config": {
63
- "enabled": true,
64
  "sharding_strategy": "FULL_SHARD",
65
  "mixed_precision": "BF16",
66
  "activation_checkpointing": true,
@@ -86,5 +86,88 @@
86
  "use_flash_attention": true,
87
  "torch_dtype": "bfloat16",
88
  "bf16": true,
89
- "fp16": false
90
  }
 
60
 
61
  "distributed_training": {
62
  "fsdp_config": {
63
+ "enabled": false,
64
  "sharding_strategy": "FULL_SHARD",
65
  "mixed_precision": "BF16",
66
  "activation_checkpointing": true,
 
86
  "use_flash_attention": true,
87
  "torch_dtype": "bfloat16",
88
  "bf16": true,
89
+ "fp16": false,
90
+
91
+ "hardware": {
92
+ "hardware_name": "4xL4",
93
+ "specs": {
94
+ "gpu_count": 4,
95
+ "gpu_type": "L4",
96
+ "vram_per_gpu": 24,
97
+ "total_vram": 96,
98
+ "vcpu_count": 48,
99
+ "ram": 186
100
+ },
101
+ "hardware_setup": {
102
+ "use_cpu": false,
103
+ "num_gpus": 4,
104
+ "device_map": "auto"
105
+ },
106
+ "training_optimizations": {
107
+ "per_device_batch_size": 16,
108
+ "gradient_accumulation_steps": 3,
109
+ "mixed_precision": "bf16",
110
+ "torch_compile": false,
111
+ "memory_optimizations": {
112
+ "use_gradient_checkpointing": true,
113
+ "use_flash_attention": true
114
+ },
115
+ "multi_gpu_strategy": "data_parallel"
116
+ },
117
+ "system_settings": {
118
+ "cuda_memory_fraction": 0.85,
119
+ "dataloader_num_workers": 2,
120
+ "dataloader_pin_memory": true
121
+ },
122
+ "memory_breakdown": {
123
+ "model_size": "~3.5GB (pre-quantized 4-bit)",
124
+ "optimizer_states": "~1GB",
125
+ "batch_memory_per_gpu": "~3GB",
126
+ "peak_memory_estimate": "~18GB",
127
+ "safe_headroom": "~6GB"
128
+ },
129
+ "compute_environment": "L4_CLOUD"
130
+ },
131
+
132
+ "dataset": {
133
+ "dataset": {
134
+ "name": "George-API/cognitive-data",
135
+ "split": "train",
136
+ "column_mapping": {
137
+ "conversations": "text"
138
+ },
139
+ "processing": {
140
+ "sort_by_id": true,
141
+ "maintain_paper_order": true,
142
+ "max_seq_length": 2048
143
+ }
144
+ },
145
+ "data_formatting": {
146
+ "chat_template": "phi",
147
+ "roles": {
148
+ "system": "System: {content}\n\n",
149
+ "human": "Human: {content}\n\n",
150
+ "assistant": "Assistant: {content}\n\n",
151
+ "user": "Human: {content}\n\n"
152
+ },
153
+ "metadata_handling": {
154
+ "include_paper_id": true,
155
+ "include_chunk_number": true,
156
+ "metadata_format": "Paper ID: {paper_id} | Chunk: {chunk_number}"
157
+ }
158
+ },
159
+ "data_loading": {
160
+ "batch_size": 24,
161
+ "shuffle": false,
162
+ "drop_last": false,
163
+ "num_workers": 4,
164
+ "pin_memory": true,
165
+ "prefetch_factor": 4
166
+ },
167
+ "validation": {
168
+ "log_samples": 3,
169
+ "log_interval": 50,
170
+ "metrics": ["processed", "skipped", "avg_tokens", "unique_papers"]
171
+ }
172
+ }
173
  }
update_space.py CHANGED
@@ -74,8 +74,6 @@ def verify_configs():
74
  current_dir = Path(__file__).parent
75
  required_files = [
76
  "transformers_config.json",
77
- "hardware_config.json",
78
- "dataset_config.json",
79
  "requirements.txt",
80
  "run_transformers_training.py"
81
  ]
 
74
  current_dir = Path(__file__).parent
75
  required_files = [
76
  "transformers_config.json",
 
 
77
  "requirements.txt",
78
  "run_transformers_training.py"
79
  ]