George-API committed · verified
Commit 20852a7 · 1 Parent(s): 3da7418

Upload folder using huggingface_hub

app.py CHANGED
@@ -1,162 +1,221 @@
- import gradio as gr
  import os
- import subprocess
  import sys
  import json
- import re
- from threading import Thread
- import datetime
- import torch
- import threading
+ import logging
+ import gradio as gr
+ from pathlib import Path
+ import subprocess
+ import time
+ from datetime import datetime

- def load_env_variables():
-     """Load environment variables from system or .env file."""
-     if os.environ.get("SPACE_ID"):
-         print("Running in Hugging Face Space")
-         if "/" in os.environ.get("SPACE_ID", ""):
-             username = os.environ.get("SPACE_ID").split("/")[0]
-             os.environ["HF_USERNAME"] = username
-             print(f"Set HF_USERNAME from SPACE_ID: {username}")
-     else:
-         try:
-             from dotenv import load_dotenv
-             env_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".env")
-             if os.path.exists(env_path):
-                 load_dotenv(env_path)
-                 print(f"Loaded environment variables from {env_path}")
-         except ImportError:
-             print("python-dotenv not installed, skipping .env loading")
-
- def check_environment():
-     """Check the environment for GPU availability and other requirements."""
-     env_info = {
-         "System": {
-             "Platform": sys.platform,
-             "Python Version": sys.version.split()[0]
-         },
-         "GPU": {
-             "CUDA Available": torch.cuda.is_available(),
-             "Device Count": torch.cuda.device_count() if torch.cuda.is_available() else 0
-         },
-         "Environment Variables": {
-             "HF_TOKEN": bool(os.environ.get("HF_TOKEN")),
-             "HF_USERNAME": bool(os.environ.get("HF_USERNAME")),
-             "HF_SPACE_NAME": bool(os.environ.get("HF_SPACE_NAME"))
-         }
-     }
-
-     if torch.cuda.is_available():
-         env_info["GPU"]["Device Name"] = torch.cuda.get_device_name(0)
-         env_info["GPU"]["Memory (GB)"] = round(torch.cuda.get_device_properties(0).total_memory / (1024**3), 2)
-
-     return env_info
-
- def run_training_process():
-     """Run the training process using the configuration files."""
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - %(levelname)s - %(message)s",
+     handlers=[logging.StreamHandler(sys.stdout)]
+ )
+ logger = logging.getLogger(__name__)
+
+ # Configuration paths
+ CONFIG_DIR = "."
+ TRANSFORMERS_CONFIG = os.path.join(CONFIG_DIR, "transformers_config.json")
+ HARDWARE_CONFIG = os.path.join(CONFIG_DIR, "hardware_config.json")
+ DATASET_CONFIG = os.path.join(CONFIG_DIR, "dataset_config.json")
+
+ def load_config(config_path):
+     """Load configuration from JSON file."""
+     try:
+         if os.path.exists(config_path):
+             with open(config_path, 'r') as f:
+                 return json.load(f)
+         else:
+             logger.error(f"Config file not found: {config_path}")
+             return None
+     except Exception as e:
+         logger.error(f"Error loading config: {str(e)}")
+         return None
+
+ def display_config():
+     """Display current training configuration."""
+     transformers_config = load_config(TRANSFORMERS_CONFIG)
+     hardware_config = load_config(HARDWARE_CONFIG)
+     dataset_config = load_config(DATASET_CONFIG)
+
+     if not all([transformers_config, hardware_config, dataset_config]):
+         return "Error loading configuration files."
+
+     # Extract key parameters
+     model_name = transformers_config.get("model", {}).get("name", "")
+     dataset_name = dataset_config.get("dataset", {}).get("name", "")
+     batch_size = transformers_config.get("training", {}).get("per_device_train_batch_size", 0)
+     gradient_accum = transformers_config.get("training", {}).get("gradient_accumulation_steps", 0)
+     lr = transformers_config.get("training", {}).get("learning_rate", 0)
+     epochs = transformers_config.get("training", {}).get("num_train_epochs", 0)
+     gpu_count = hardware_config.get("specs", {}).get("gpu_count", 0)
+     gpu_type = hardware_config.get("specs", {}).get("gpu_type", "")
+
+     config_info = f"""
+ ## Current Training Configuration
+
+ **Model**: {model_name}
+ **Dataset**: {dataset_name}
+
+ **Training Parameters**:
+ - Learning Rate: {lr}
+ - Epochs: {epochs}
+ - Batch Size/GPU: {batch_size}
+ - Gradient Accumulation: {gradient_accum}
+ - Effective Batch Size: {batch_size * gradient_accum * gpu_count}
+
+ **Hardware**:
+ - GPUs: {gpu_count}x {gpu_type}
+ - Flash Attention: {hardware_config.get("memory_optimization", {}).get("use_flash_attention", False)}
+ - Gradient Checkpointing: {hardware_config.get("memory_optimization", {}).get("use_gradient_checkpointing", False)}
+
+ **Pre-quantized 4-bit Training**: Enabled
+ """
+
+     return config_info
+
+ def start_training():
+     """Start the training process."""
      try:
-         current_dir = os.path.dirname(os.path.abspath(__file__))
-         training_script = os.path.join(current_dir, "run_transformers_training.py")
-
-         # Start the training process
+         # Check if already running
+         if os.path.exists("training.pid"):
+             with open("training.pid", "r") as f:
+                 pid = f.read().strip()
+             try:
+                 # Check if process is still running
+                 os.kill(int(pid), 0)
+                 return f"Training is already running with PID {pid}"
+             except OSError:
+                 # Process not running, remove stale PID file
+                 os.remove("training.pid")
+
+         # Start training in background
+         cmd = "python run_transformers_training.py"
          process = subprocess.Popen(
-             [sys.executable, training_script],
-             stdout=subprocess.PIPE,
-             stderr=subprocess.STDOUT,
-             text=True,
-             bufsize=1
+             cmd,
+             shell=True,
+             stdout=open('training.log', 'a'),
+             stderr=subprocess.STDOUT
          )

-         # Process the output line by line
-         for line in process.stdout:
-             print(line.strip())
-
-         process.wait()
-         return process.returncode
+         # Save PID
+         with open("training.pid", "w") as f:
+             f.write(str(process.pid))
+
+         # Log start time
+         with open("training_history.log", "a") as f:
+             f.write(f"{datetime.now().isoformat()}: Training started (PID: {process.pid})\n")
+
+         return f"Training started with PID {process.pid}. Check status for updates."
+
      except Exception as e:
-         print(f"Error in training process: {e}")
-         return 1
-
- def start_training(learning_rate, num_train_epochs, per_device_train_batch_size,
-                    gradient_accumulation_steps):
-     """Start the training process with the specified parameters."""
+         return f"Error starting training: {str(e)}"
+
+ def check_training_status():
+     """Check the status of training."""
      try:
-         load_env_variables()
-         current_dir = os.path.dirname(os.path.abspath(__file__))
-
-         # Load and update transformers config
-         with open(os.path.join(current_dir, "transformers_config.json"), "r") as f:
-             config = json.load(f)
-
-         # Update training parameters
-         config["training"].update({
-             "num_train_epochs": num_train_epochs,
-             "learning_rate": learning_rate,
-             "per_device_train_batch_size": per_device_train_batch_size,
-             "gradient_accumulation_steps": gradient_accumulation_steps
-         })
-
-         # Update hub settings if username is available
-         if os.environ.get("HF_USERNAME"):
-             config["huggingface_hub"].update({
-                 "hub_model_id": f"{os.environ['HF_USERNAME']}/Phi4-Cognitive-Science"
-             })
-
-         # Save updated config
-         with open(os.path.join(current_dir, "transformers_config.json"), "w") as f:
-             json.dump(config, f, indent=4)
-
-         # Start training in a separate thread
-         thread = threading.Thread(target=run_training_process)
-         thread.daemon = True
-         thread.start()
-
-         return "Training started! Check the Hugging Face Space logs for progress."
+         # Check if training is running
+         if os.path.exists("training.pid"):
+             with open("training.pid", "r") as f:
+                 pid = f.read().strip()
+             try:
+                 # Check if process is still running
+                 os.kill(int(pid), 0)
+                 status = f"Training is running with PID {pid}"
+             except OSError:
+                 status = "Training process has stopped"
+                 os.remove("training.pid")
+         else:
+             status = "No training process is currently running"
+
+         # Get last lines from training log
+         log_content = "No training log available"
+         if os.path.exists("training.log"):
+             with open("training.log", "r") as f:
+                 lines = f.readlines()
+                 log_content = "".join(lines[-20:]) if lines else "Log file is empty"
+
+         return f"{status}\n\n**Recent Log:**\n```\n{log_content}\n```"
+
      except Exception as e:
-         return f"Error starting training: {str(e)}"
-
- with gr.Blocks(title="Phi-4 Training Interface") as demo:
-     gr.Markdown("# Phi-4 Unsupervised Training for Cognitive Science")
-
-     with gr.Tab("Training"):
-         with gr.Row():
-             with gr.Column():
-                 gr.Markdown("## Model Configuration")
-                 gr.Markdown("**Model**: unsloth/phi-4-unsloth-bnb-4bit")
-                 gr.Markdown("**Dataset**: George-API/cognitive-data")
-
-                 gr.Markdown("## Training Parameters")
-                 learning_rate = gr.Slider(minimum=1e-6, maximum=1e-4, value=2e-5, step=1e-6,
-                                           label="Learning Rate")
-                 num_train_epochs = gr.Slider(minimum=1, maximum=5, value=3, step=1,
-                                              label="Number of Epochs")
-                 per_device_train_batch_size = gr.Slider(minimum=4, maximum=24, value=12, step=4,
-                                                         label="Per Device Train Batch Size (Unsloth Optimized)")
-                 gradient_accumulation_steps = gr.Slider(minimum=1, maximum=8, value=4, step=1,
-                                                         label="Gradient Accumulation Steps")
-
+         return f"Error checking status: {str(e)}"
+
+ # Create the Gradio interface
+ with gr.Blocks(title="Phi-4 Unsloth Training", theme=gr.themes.Soft(primary_hue="blue")) as app:
+     gr.Markdown("# Phi-4 Unsloth 4-bit Training Interface")
+
+     with gr.Tabs():
+         with gr.TabItem("Configuration"):
+             config_output = gr.Markdown(display_config())
+             refresh_btn = gr.Button("Refresh Configuration")
+             refresh_btn.click(fn=display_config, outputs=config_output)
+
+         with gr.TabItem("Training Control"):
+             gr.Markdown("## Training Management")
+
+             with gr.Row():
                  start_btn = gr.Button("Start Training", variant="primary")
-                 training_output = gr.Textbox(label="Training Output", interactive=False)
-
-     with gr.Tab("Environment"):
-         with gr.Row():
-             with gr.Column():
-                 gr.Markdown("## Environment Information")
-                 env_info = gr.JSON(label="Environment Info")
-                 check_env_btn = gr.Button("Check Environment")
-
-     # Set up event handlers
-     start_btn.click(
-         fn=start_training,
-         inputs=[learning_rate, num_train_epochs, per_device_train_batch_size, gradient_accumulation_steps],
-         outputs=training_output
-     )
-
-     check_env_btn.click(
-         fn=check_environment,
-         inputs=[],
-         outputs=env_info
-     )
+                 check_btn = gr.Button("Check Status")
+
+             status_output = gr.Markdown("Click 'Check Status' to see training progress")
+
+             start_btn.click(fn=start_training, outputs=status_output)
+             check_btn.click(fn=check_training_status, outputs=status_output)
+
+             # Auto-refresh status
+             gr.HTML('''
+             <script>
+                 let intervalId;
+
+                 document.addEventListener('DOMContentLoaded', function() {
+                     // Find the "Check Status" button
+                     const buttons = Array.from(document.querySelectorAll('button'));
+                     const checkBtn = buttons.find(btn => btn.textContent.includes('Check Status'));
+
+                     // Set up interval to click the button every 30 seconds
+                     if (checkBtn) {
+                         intervalId = setInterval(() => {
+                             checkBtn.click();
+                         }, 30000);
+                     }
+                 });
+
+                 // Clean up on tab/window close
+                 window.addEventListener('beforeunload', function() {
+                     clearInterval(intervalId);
+                 });
+             </script>
+             ''')
+
+         with gr.TabItem("Help"):
+             gr.Markdown("""
+             ## Phi-4 Unsloth Training Help
+
+             This interface allows you to manage training of the Phi-4 model with Unsloth 4-bit optimizations.
+
+             ### Quick Start
+
+             1. Review the configuration in the Configuration tab
+             2. Click "Start Training" to begin the process
+             3. Use "Check Status" to monitor progress
+
+             ### Notes
+
+             - Training uses the pre-quantized model `unsloth/phi-4-unsloth-bnb-4bit`
+             - The process maintains paper order and handles metadata appropriately
+             - Training progress will be regularly saved to HuggingFace Hub
+
+             ### Troubleshooting
+
+             If training stops unexpectedly:
+             - Check the logs for out-of-memory errors
+             - Verify the VRAM usage on each GPU
+             - Check for CUDA version compatibility
+             """)
+
+ # Launch the app
  if __name__ == "__main__":
-     load_env_variables()
-     demo.launch()
+     app.launch()
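Two implementation notes on the rewritten app.py. Both `start_training` and `check_training_status` rest on the POSIX idiom of sending signal 0 with `os.kill`, which delivers nothing but raises if the process no longer exists. A minimal sketch of that check in isolation (the `is_pid_alive` helper is our illustration, not part of the commit):

```python
import os

def is_pid_alive(pid: int) -> bool:
    """Return True if a process with the given PID exists (POSIX only)."""
    try:
        os.kill(pid, 0)  # signal 0 sends nothing; it only checks existence/permissions
    except ProcessLookupError:
        return False     # no process with this PID
    except PermissionError:
        return True      # process exists but is owned by another user
    return True
```

The app catches the broader `OSError`, which classifies a permission error as "not running" and deletes the PID file; distinguishing the two exceptions as above avoids that edge case. Separately, the injected `<script>` that clicks "Check Status" every 30 seconds is a DOM workaround; recent Gradio releases expose polling directly (for example, the `every=` argument on `Blocks.load`), which may be less brittle.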
 
hardware_config.json CHANGED
@@ -9,13 +9,13 @@
     "ram": 186
   },
   "training_optimizations": {
-    "per_device_batch_size": 32,
+    "per_device_batch_size": 24,
     "gradient_accumulation_steps": 2,
-    "effective_batch_size": 256,
+    "effective_batch_size": 192,
     "memory_optimizations": {
       "use_gradient_checkpointing": true,
       "pin_memory": true,
-      "num_workers": 8,
+      "num_workers": 4,
       "use_flash_attention": true
     },
     "distributed_settings": {
@@ -41,9 +41,9 @@
     "mixed_precision": "bf16",
     "num_gpus": 4,
     "training_parameters": {
-      "per_device_train_batch_size": 32,
+      "per_device_train_batch_size": 24,
       "gradient_accumulation_steps": 2,
-      "dataloader_num_workers": 8,
+      "dataloader_num_workers": 4,
       "dataloader_pin_memory": true,
       "gradient_checkpointing": true,
       "max_grad_norm": 1.0
run_transformers_training.py CHANGED
@@ -127,13 +127,12 @@ def parse_args():
  def load_model_and_tokenizer(config):
      """Load model and tokenizer with proper error handling and optimizations."""
      try:
-         if config.get("use_unsloth", False) and unsloth_available:
-             logger.info("Using Unsloth optimizations")
+         if unsloth_available:
+             logger.info("Using Unsloth optimizations with pre-quantized model")
              model, tokenizer = FastLanguageModel.from_pretrained(
                  model_name=config.get("model_name"),
                  max_seq_length=config.get("max_seq_length", 2048),
                  dtype=None,  # Let Unsloth choose optimal dtype
-                 load_in_4bit=config.get("load_in_4bit", True),
                  device_map="auto",
              )
@@ -151,49 +150,14 @@ def load_model_and_tokenizer(config):
              )
              logger.info("Unsloth optimizations applied successfully")
          else:
-             if config.get("use_unsloth", False):
-                 logger.warning("Unsloth requested but not available. Falling back to standard training.")
-
-             # Standard quantization setup
-             quantization_config = None
-             if config.get("load_in_4bit", False) and bitsandbytes_available:
-                 logger.info("Using 4-bit quantization")
-                 quantization_config = BitsAndBytesConfig(
-                     load_in_4bit=True,
-                     bnb_4bit_quant_type="nf4",
-                     bnb_4bit_compute_dtype=torch.float16,
-                     bnb_4bit_use_double_quant=True
-                 )
-
-             # Load model with standard settings
-             model = AutoModelForCausalLM.from_pretrained(
-                 config.get("model_name"),
-                 quantization_config=quantization_config,
-                 device_map="auto",
-                 trust_remote_code=config.get("trust_remote_code", True),
-                 use_cache=not config.get("gradient_checkpointing", True)
-             )
-
-             # Load tokenizer
-             tokenizer = AutoTokenizer.from_pretrained(
-                 config.get("model_name"),
-                 use_fast=config.get("use_fast_tokenizer", True),
-                 trust_remote_code=config.get("trust_remote_code", True)
-             )
-
-             # Enable gradient checkpointing if requested
-             if config.get("gradient_checkpointing", True) and hasattr(model, "gradient_checkpointing_enable"):
-                 model.gradient_checkpointing_enable(use_reentrant=False)
-                 logger.info("Gradient checkpointing enabled")
+             logger.error("Unsloth is required for training with pre-quantized model")
+             raise ImportError("Unsloth is required for this training setup")

          # Set up tokenizer settings
          if config.get("chat_template"):
-             if unsloth_available and config.get("use_unsloth", False):
-                 chat_template = get_chat_template("phi")
-                 tokenizer.chat_template = chat_template
-             else:
-                 tokenizer.chat_template = config.get("chat_template")
-             logger.info(f"Set chat template to {config.get('chat_template')}")
+             chat_template = get_chat_template("phi")
+             tokenizer.chat_template = chat_template
+             logger.info("Set phi chat template")

          # Ensure proper token settings
          if tokenizer.pad_token_id is None:
@@ -210,33 +174,191 @@ def load_dataset_with_mapping(dataset_config):
      """Load and prepare dataset with proper column mapping."""
      try:
          # Load dataset
-         dataset = load_dataset(
-             dataset_config["dataset"]["name"],
-             split=dataset_config["dataset"]["split"]
-         )
-         logger.info(f"Dataset loaded successfully with {len(dataset)} examples")
+         dataset_name = dataset_config.get("dataset", {}).get("name", "")
+         dataset_split = dataset_config.get("dataset", {}).get("split", "train")
+
+         if not dataset_name:
+             raise ValueError("Dataset name not provided in configuration")

-         # Apply column mapping if specified
-         if "column_mapping" in dataset_config["dataset"]:
-             mapping = dataset_config["dataset"]["column_mapping"]
-             dataset = dataset.rename_columns({v: k for k, v in mapping.items()})
-             logger.info(f"Applied column mapping: {mapping}")
+         logger.info(f"Loading dataset {dataset_name}, split {dataset_split}")
+         dataset = load_dataset(dataset_name, split=dataset_split)
+
+         # Map columns if specified
+         column_mapping = dataset_config.get("dataset", {}).get("column_mapping", {})
+         if column_mapping:
+             logger.info(f"Applying column mapping: {column_mapping}")
+
+             # Rename columns according to mapping
+             for target, source in column_mapping.items():
+                 if source in dataset.column_names:
+                     dataset = dataset.rename_column(source, target)

          # Sort dataset if required
-         if dataset_config["dataset"]["processing"]["sort_by_id"]:
-             logger.info("Sorting dataset by ID to maintain paper chunk order")
+         sort_by_id = dataset_config.get("dataset", {}).get("processing", {}).get("sort_by_id", False)
+         if sort_by_id and "id" in dataset.column_names:
+             logger.info("Sorting dataset by ID")
              dataset = dataset.sort("id")

-             # Log first few IDs to verify sorting
-             sample_ids = [example["id"] for example in dataset.select(range(min(5, len(dataset))))]
+             # Log the first few IDs to verify sorting
+             sample_ids = [example['id'] for example in dataset.select(range(min(5, len(dataset))))]
              logger.info(f"First few IDs after sorting: {sample_ids}")

+         logger.info(f"Dataset loaded successfully with {len(dataset)} examples")
          return dataset
-
+
      except Exception as e:
          logger.error(f"Error loading dataset: {str(e)}")
          raise

+ def format_phi_chat(messages, dataset_config):
+     """Format messages according to phi-4's chat template and dataset config."""
+     formatted_chat = ""
+
+     # Get role templates from config
+     roles = dataset_config.get("data_formatting", {}).get("roles", {
+         "system": "System: {content}\n\n",
+         "human": "Human: {content}\n\n",
+         "assistant": "Assistant: {content}\n\n"
+     })
+
+     # Handle research introduction metadata first
+     metadata = next((msg for msg in messages if "[RESEARCH INTRODUCTION]" in msg.get("content", "")), None)
+     if metadata:
+         system_template = roles.get("system", "System: {content}\n\n")
+         formatted_chat = system_template.format(content=metadata['content'])
+         messages = [msg for msg in messages if msg != metadata]
+
+     # Process remaining messages
+     for message in messages:
+         role = message.get("role", "").lower()
+         content = message.get("content", "")
+
+         # Format based on role
+         if role == "human" or role == "user":
+             template = roles.get("human", "Human: {content}\n\n")
+             formatted_chat += template.format(content=content)
+         elif role == "assistant":
+             template = roles.get("assistant", "Assistant: {content}\n\n")
+             formatted_chat += template.format(content=content)
+         elif role == "system":
+             # For system messages, prepend them
+             template = roles.get("system", "System: {content}\n\n")
+             formatted_chat = template.format(content=content) + formatted_chat
+
+     return formatted_chat.strip()
+
+ class SimpleDataCollator:
+     def __init__(self, tokenizer, dataset_config):
+         self.tokenizer = tokenizer
+         self.dataset_config = dataset_config
+         self.stats = {"processed": 0, "skipped": 0, "total_tokens": 0}
+         self.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
+         self.prompt_counter = 0
+         self.paper_counters = {}
+         self.max_seq_length = dataset_config.get("dataset", {}).get("processing", {}).get("max_seq_length", 2048)
+         self.include_metadata = dataset_config.get("data_formatting", {}).get("metadata_handling", {}).get("include_paper_id", True)
+         self.include_chunk = dataset_config.get("data_formatting", {}).get("metadata_handling", {}).get("include_chunk_number", True)
+         self.metadata_format = dataset_config.get("data_formatting", {}).get("metadata_handling", {}).get("metadata_format", "Paper ID: {paper_id} | Chunk: {chunk_number}")
+         logger.info(f"SimpleDataCollator initialized - using phi-4 chat format with max_seq_length={self.max_seq_length}")
+
+     def __call__(self, features):
+         batch = {"input_ids": [], "attention_mask": [], "labels": []}
+
+         for example in features:
+             try:
+                 # Get ID and conversation fields
+                 paper_id = example.get("id", "")
+                 conversation = example.get("conversations", [])
+
+                 if not conversation:
+                     self.stats["skipped"] += 1
+                     continue
+
+                 # Track paper chunks
+                 if paper_id not in self.paper_counters:
+                     self.paper_counters[paper_id] = 0
+                 self.paper_counters[paper_id] += 1
+
+                 # Add metadata if configured
+                 if self.include_metadata:
+                     # Format metadata according to configured format
+                     metadata_content = self.metadata_format.format(
+                         paper_id=paper_id,
+                         chunk_number=self.paper_counters[paper_id]
+                     )
+
+                     # Add as system message if not already in conversation
+                     if not any(msg.get("role") == "system" for msg in conversation):
+                         conversation = [{"role": "system", "content": metadata_content}] + conversation
+
+                 # Format conversation with research introduction and chunk info
+                 formatted_content = format_phi_chat(conversation, self.dataset_config)
+
+                 # Tokenize with the model's chat template
+                 inputs = self.tokenizer(
+                     formatted_content,
+                     add_special_tokens=True,
+                     truncation=True,
+                     max_length=self.max_seq_length,
+                     return_tensors=None,
+                 )
+
+                 if len(inputs["input_ids"]) > 0:
+                     # For causal language modeling, labels are the same as inputs
+                     labels = inputs["input_ids"].copy()
+
+                     batch["input_ids"].append(inputs["input_ids"])
+                     batch["attention_mask"].append(inputs["attention_mask"])
+                     batch["labels"].append(labels)
+
+                     self.stats["processed"] += 1
+                     self.stats["total_tokens"] += len(inputs["input_ids"])
+
+                     # Debug logging for first few examples
+                     log_samples = self.dataset_config.get("validation", {}).get("log_samples", 3)
+                     if self.stats["processed"] <= log_samples:
+                         logger.info(f"Example {self.stats['processed']} format:")
+                         logger.info(f"Paper ID: {paper_id} | Chunk: {self.paper_counters[paper_id]}")
+                         logger.info(f"Token count: {len(inputs['input_ids'])}")
+                         logger.info(f"Content preview:\n{formatted_content[:500]}...")
+                 else:
+                     self.stats["skipped"] += 1
+             except Exception as e:
+                 logger.warning(f"Error processing example: {str(e)[:100]}...")
+                 self.stats["skipped"] += 1
+                 continue
+
+         if not batch["input_ids"]:
+             logger.warning("Empty batch, returning dummy tensors")
+             return {
+                 "input_ids": torch.zeros((1, 1), dtype=torch.long),
+                 "attention_mask": torch.zeros((1, 1), dtype=torch.long),
+                 "labels": torch.zeros((1, 1), dtype=torch.long)
+             }
+
+         # Pad the batch
+         max_length = max(len(ids) for ids in batch["input_ids"])
+
+         for i in range(len(batch["input_ids"])):
+             padding_length = max_length - len(batch["input_ids"][i])
+             if padding_length > 0:
+                 batch["input_ids"][i].extend([self.pad_token_id] * padding_length)
+                 batch["attention_mask"][i].extend([0] * padding_length)
+                 batch["labels"][i].extend([-100] * padding_length)
+
+         # Convert to tensors
+         batch = {k: torch.tensor(v) for k, v in batch.items()}
+
+         # Log stats periodically
+         log_interval = self.dataset_config.get("validation", {}).get("log_interval", 100)
+         if self.stats["processed"] % log_interval == 0 and self.stats["processed"] > 0:
+             logger.info(f"Data collator stats: processed={self.stats['processed']}, "
+                         f"skipped={self.stats['skipped']}, "
+                         f"avg_tokens={self.stats['total_tokens']/self.stats['processed']:.1f}, "
+                         f"unique_papers={len(self.paper_counters)}")
+
+         return batch
+
  def main():
      # Set up logging
      logger.info("Starting training process")
@@ -322,148 +444,34 @@ def main():
          logger.error(f"Error setting up PEFT: {e}")
          return 1

-     # Load dataset with proper mapping
+     # Load dataset
+     logger.info(f"Loading dataset: {dataset_config.get('dataset_name')}")
      try:
-         dataset = load_dataset_with_mapping(dataset_config)
-         logger.info("Dataset loaded and prepared successfully")
+         dataset = load_dataset(dataset_config.get("dataset_name"))
+         logger.info(f"Dataset loaded successfully with {len(dataset['train'])} training examples")
+
+         # Sort dataset by ID to ensure chunks from the same paper are processed together
+         logger.info("Sorting dataset by ID to maintain paper chunk order")
+         def sort_by_id(example):
+             # Extract ID as integer if possible, otherwise keep as string
+             try:
+                 return int(example['id'])
+             except (ValueError, TypeError):
+                 return example['id']
+
+         # Apply sorting to the dataset
+         dataset['train'] = dataset['train'].sort('id')
+         logger.info("Dataset sorted by ID")
+
+         # Log the first few IDs to verify sorting
+         sample_ids = [example['id'] for example in dataset['train'].select(range(min(5, len(dataset['train']))))]
+         logger.info(f"First few IDs after sorting: {sample_ids}")
      except Exception as e:
-         logger.error(f"Error loading dataset: {e}")
+         logger.error(f"Error loading or sorting dataset: {e}")
          return 1

-     # Simple data collator that processes each entry independently
-     class SimpleDataCollator:
-         def __init__(self, tokenizer):
-             self.tokenizer = tokenizer
-             self.stats = {"processed": 0, "skipped": 0, "total_tokens": 0}
-             self.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
-             self.prompt_counter = 0
-             self.paper_counters = {}
-             logger.info("SimpleDataCollator initialized - using phi-4 chat format")
-
-         def format_phi_chat(self, messages):
-             """Format messages according to phi-4's chat template."""
-             formatted_chat = ""
-             for message in messages:
-                 # Extract role and content
-                 if isinstance(message, dict):
-                     role = message.get("role", "").lower()
-                     content = message.get("content", "")
-                 else:
-                     role = getattr(message, "role", "").lower()
-                     content = getattr(message, "content", "")
-
-                 # Format based on role
-                 if role == "human" or role == "user":
-                     formatted_chat += f"Human: {content}\n\n"
-                 elif role == "assistant":
-                     formatted_chat += f"Assistant: {content}\n\n"
-                 elif role == "system":
-                     # For system messages, we prepend them with a special format
-                     formatted_chat = f"System: {content}\n\n" + formatted_chat
-                 else:
-                     logger.warning(f"Unknown role '{role}' - treating as system message")
-                     formatted_chat += f"System: {content}\n\n"
-
-             return formatted_chat.strip()
-
-         def __call__(self, features):
-             batch = {"input_ids": [], "attention_mask": [], "labels": []}
-
-             for example in features:
-                 try:
-                     # Get ID and conversation fields
-                     paper_id = example.get("id", "") if isinstance(example, dict) else getattr(example, "id", "")
-                     conversation = example.get("conversations", []) if isinstance(example, dict) else getattr(example, "conversations", [])
-
-                     if not conversation:
-                         self.stats["skipped"] += 1
-                         continue
-
-                     # Increment counters
-                     self.prompt_counter += 1
-                     if paper_id not in self.paper_counters:
-                         self.paper_counters[paper_id] = 0
-                     self.paper_counters[paper_id] += 1
-
-                     # Add metadata as system message
-                     metadata = {
-                         "role": "system",
-                         "content": f"Paper ID: {paper_id} | Chunk: {self.paper_counters[paper_id]}"
-                     }
-
-                     # Format the conversation using phi-4's chat template
-                     formatted_content = self.format_phi_chat([metadata] + conversation)
-
-                     # Tokenize with the model's chat template
-                     inputs = self.tokenizer(
-                         formatted_content,
-                         add_special_tokens=True,
-                         truncation=True,
-                         max_length=model_config.get("max_seq_length", 2048),
-                         return_tensors=None,  # Return list instead of tensors
-                     )
-
-                     input_ids = inputs["input_ids"]
-                     attention_mask = inputs["attention_mask"]
-
-                     if len(input_ids) > 0:
-                         # For causal language modeling, labels are the same as inputs
-                         labels = input_ids.copy()
-
-                         batch["input_ids"].append(input_ids)
-                         batch["attention_mask"].append(attention_mask)
-                         batch["labels"].append(labels)
-
-                         self.stats["processed"] += 1
-                         self.stats["total_tokens"] += len(input_ids)
-
-                         # Debug logging for first few examples
-                         if self.stats["processed"] <= 3:
-                             logger.info(f"Example {self.stats['processed']} format:")
-                             logger.info(f"Paper ID: {paper_id} | Chunk: {self.paper_counters[paper_id]}")
-                             logger.info(f"Token count: {len(input_ids)}")
-                             logger.info(f"Content preview:\n{formatted_content[:500]}...")
-                     else:
-                         self.stats["skipped"] += 1
-
-                 except Exception as e:
-                     logger.warning(f"Error processing example: {str(e)[:100]}...")
-                     self.stats["skipped"] += 1
-                     continue
-
-             # Handle empty batches
-             if not batch["input_ids"]:
-                 logger.warning("Empty batch, returning dummy tensors")
-                 return {
-                     "input_ids": torch.zeros((1, 1), dtype=torch.long),
-                     "attention_mask": torch.zeros((1, 1), dtype=torch.long),
-                     "labels": torch.zeros((1, 1), dtype=torch.long)
-                 }
-
-             # Pad the batch
-             max_length = max(len(ids) for ids in batch["input_ids"])
-
-             for i in range(len(batch["input_ids"])):
-                 padding_length = max_length - len(batch["input_ids"][i])
-                 if padding_length > 0:
-                     batch["input_ids"][i].extend([self.pad_token_id] * padding_length)
-                     batch["attention_mask"][i].extend([0] * padding_length)
-                     batch["labels"][i].extend([-100] * padding_length)  # Don't compute loss on padding
-
-             # Convert to tensors
-             batch = {k: torch.tensor(v) for k, v in batch.items()}
-
-             # Log stats periodically
-             if self.stats["processed"] % 100 == 0 and self.stats["processed"] > 0:
-                 logger.info(f"Data collator stats: processed={self.stats['processed']}, "
-                             f"skipped={self.stats['skipped']}, "
-                             f"avg_tokens={self.stats['total_tokens']/self.stats['processed']:.1f}, "
-                             f"unique_papers={len(self.paper_counters)}")
-
-             return batch
-
      # Create data collator
-     data_collator = SimpleDataCollator(tokenizer)
+     data_collator = SimpleDataCollator(tokenizer, dataset_config)

      # Simple logging callback
      class LoggingCallback(TrainerCallback):
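One detail the collator preserves across both versions: padded positions are labeled -100, the default `ignore_index` of PyTorch's cross-entropy loss, so padding never contributes to the training loss while real tokens do. A self-contained illustration:

```python
import torch
import torch.nn.functional as F

logits = torch.randn(1, 4, 10)               # (batch, seq_len, vocab_size)
labels = torch.tensor([[2, 7, -100, -100]])  # last two positions are padding

# Positions labeled -100 are skipped entirely; only the two real tokens are scored.
loss = F.cross_entropy(logits.view(-1, 10), labels.view(-1), ignore_index=-100)
```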
transformers_config.json CHANGED
@@ -15,7 +15,7 @@
   "training": {
     "per_device_train_batch_size": 24,
     "gradient_accumulation_steps": 2,
-    "learning_rate": 3e-5,
+    "learning_rate": 2e-5,
     "num_train_epochs": 3,
     "max_steps": -1,
     "logging_steps": 10,
@@ -65,7 +65,7 @@
     "offload_params": false
   },
   "ddp_find_unused_parameters": false,
-  "dataloader_num_workers": 8
+  "dataloader_num_workers": 4
 },

 "logging": {