#!/usr/bin/env python
# coding=utf-8
"""Gradio control panel that shows the training configuration and launches
``run_transformers_training.py`` as a background process.
"""

# NOTE(review): unsloth is kept as the very first import, as in the original
# file; presumably so it can patch other libraries before they are loaded --
# confirm before reordering.
import unsloth

import os
import sys
import json
import logging
import subprocess
import time
import traceback
from datetime import datetime
from html import escape
from pathlib import Path

# Configure logging to match HF Space logs
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

# Set other loggers to WARNING to reduce noise and ensure our logs are visible
for _noisy_logger in ("transformers", "datasets", "accelerate", "torch", "bitsandbytes"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)


def log_info(message):
    """Log ``message`` at INFO level and flush stdout immediately.

    The explicit flush keeps the output streaming in the Hugging Face Space
    log viewer instead of sitting in a buffer.
    """
    logger.info(message)
    # Ensure output is flushed immediately for streaming
    sys.stdout.flush()


# Configuration paths
CONFIG_DIR = "."
TRANSFORMERS_CONFIG = os.path.join(CONFIG_DIR, "transformers_config.json")

# Script launched by start_training(); resolved relative to the current
# working directory, exactly like the original shell command did.
TRAINING_SCRIPT = "run_transformers_training.py"

# Handle of the most recently launched training process (None until the
# first launch). Used to refuse starting a second job while one is running.
_training_process = None


def load_config(config_path):
    """Load configuration from a JSON file.

    Args:
        config_path: Path of the JSON file to read.

    Returns:
        The parsed JSON document, or an empty dict if the file could not be
        read or parsed (the error is logged, never raised).
    """
    try:
        with open(config_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except Exception as e:
        # Deliberate best-effort boundary: the UI must keep working (and show
        # an error message) even when the configuration is missing or broken.
        log_info(f"Error loading config: {str(e)}")
        return {}


def _html_text(value):
    """Return ``value`` as text that is safe to embed in HTML."""
    return escape(str(value))


def display_config():
    """Render the current training configuration as an HTML fragment.

    Reads ``TRANSFORMERS_CONFIG`` on every call so that a refresh picks up
    edits to the file.

    Returns:
        An HTML string with "Model", "Hardware" and "Dataset" sections, or
        the plain message "Error loading configuration file." when the
        configuration is missing, empty, or not a JSON object.
    """
    config = load_config(TRANSFORMERS_CONFIG)
    # A valid JSON file may still hold a list/number; .get() below needs a dict.
    if not config or not isinstance(config, dict):
        return "Error loading configuration file."

    # Extract sub-configurations
    transformers_config = config
    hardware_config = config.get("hardware", {})
    dataset_config = config.get("dataset", {})

    model_name = (
        transformers_config.get("model", {}).get("name")
        or transformers_config.get("model_name_or_path", "")
    )

    # Training parameters
    training_config = transformers_config.get("training", {})
    batch_size = training_config.get("per_device_train_batch_size", 16)
    grad_accum = training_config.get("gradient_accumulation_steps", 3)
    epochs = training_config.get("num_train_epochs", 3)
    learning_rate = training_config.get("learning_rate", 2e-5)

    # Hardware settings
    hardware_specs = hardware_config.get("specs", {})
    gpu_count = hardware_specs.get("gpu_count", 4)
    gpu_type = hardware_specs.get("gpu_type", "L4")
    vram = hardware_specs.get("vram_per_gpu", 24)

    # Dataset info
    dataset_name = dataset_config.get("dataset", {}).get("name", "")

    # Format response as HTML for better display.
    # NOTE(review): the original markup was lost (only the four headings
    # survived), leaving every value above unused. The layout below is a
    # reconstruction that renders them; adjust if the original differed.
    # All values come from a user-editable file, so they are escaped.
    return f"""
<h3>Training Configuration</h3>

<h4>Model</h4>
<ul>
  <li><b>Model:</b> {_html_text(model_name)}</li>
  <li><b>Learning rate:</b> {_html_text(learning_rate)}</li>
  <li><b>Per-device batch size:</b> {_html_text(batch_size)}</li>
  <li><b>Gradient accumulation steps:</b> {_html_text(grad_accum)}</li>
  <li><b>Epochs:</b> {_html_text(epochs)}</li>
</ul>

<h4>Hardware</h4>
<ul>
  <li><b>GPUs:</b> {_html_text(gpu_count)} &times; {_html_text(gpu_type)}</li>
  <li><b>VRAM per GPU:</b> {_html_text(vram)} GB</li>
</ul>

<h4>Dataset</h4>
<ul>
  <li><b>Dataset:</b> {_html_text(dataset_name)}</li>
</ul>
"""


def start_training():
    """Start the training process in the background.

    Launches ``TRAINING_SCRIPT`` with the interpreter running this UI and
    wires its stdout/stderr to this process so the output shows up in the
    Space logs. A second launch is refused while a previously started
    process is still alive.

    Returns:
        A human-readable status message; errors are logged and reported in
        the returned string rather than raised.
    """
    global _training_process
    try:
        # Do not stack several jobs on the same GPUs when the button is
        # clicked repeatedly; poll() is None while the child is running.
        if _training_process is not None and _training_process.poll() is None:
            busy_msg = (
                "Training is already running. "
                "Monitor progress in the Hugging Face Space logs."
            )
            log_info(busy_msg)
            return busy_msg

        # Log configuration check
        log_info("Preparing to start training process...")
        log_info("Using consolidated configuration from transformers_config.json")

        # Without this check the launch would be reported as successful and
        # the failure would only be visible in the child's stderr.
        if not os.path.isfile(TRAINING_SCRIPT):
            missing_msg = (
                f"Error starting training: {TRAINING_SCRIPT} not found in {os.getcwd()}"
            )
            log_info(missing_msg)
            return missing_msg

        # Start training
        log_info("Starting training process...")

        # Run in a background process for HF Space. sys.executable keeps the
        # child in the same environment as this UI, and an argument list
        # avoids going through a shell.
        _training_process = subprocess.Popen(
            [sys.executable, TRAINING_SCRIPT],
            stdout=sys.stdout,
            stderr=sys.stderr,
        )

        log_info("Training process has been started. You can monitor progress in the logs.")
        return "Training started successfully. Monitor progress in the Hugging Face Space logs."
    except Exception as e:
        error_msg = f"Error starting training: {str(e)}"
        log_info(error_msg)
        return error_msg


# Interface setup for gradio
def create_interface():
    """Build the Gradio Blocks UI.

    The left column holds the rendered configuration, a refresh button and
    the "Start Training" control; the right column holds static notes.

    Returns:
        The ``gr.Blocks`` application (not yet launched).
    """
    import gradio as gr

    with gr.Blocks(title="Phi-4 Training Center") as demo:
        gr.Markdown("# Phi-4 Research Assistant Training")

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Control Panel")

                # Display current config
                config_html = gr.HTML(display_config())
                refresh_btn = gr.Button("Refresh Configuration")

                # Training controls
                train_btn = gr.Button("Start Training", variant="primary")
                train_output = gr.Textbox(label="Status", interactive=False)

            with gr.Column():
                gr.Markdown("## Training Information")
                # NOTE(review): these figures are static text and are not
                # derived from transformers_config.json.
                gr.Markdown("""
                ### Hardware:
                - 4× NVIDIA L4 GPUs (24GB VRAM per GPU, 96GB total)
                - Training with BF16 precision
                - Using Data Parallel for multi-GPU
                - Effective batch size: 16 (per device) × 4 (GPUs) × 3 (gradient accumulation) = 192

                ### Notes:
                - Training may take several hours depending on dataset size
                - Check the Space logs for real-time progress
                - Model checkpoints will be saved to ./results directory
                """)

        # Connect buttons to functions
        refresh_btn.click(display_config, outputs=config_html)
        train_btn.click(start_training, outputs=train_output)

    return demo


def _log_system_info():
    """Log interpreter and library versions to help with debugging.

    Any failure while collecting the information is logged as a warning and
    otherwise ignored.
    """
    try:
        import torch

        logger.info(f"Python: {sys.version.split()[0]}")
        logger.info(f"PyTorch: {torch.__version__}")
        logger.info(f"CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            logger.info(f"CUDA device: {torch.cuda.get_device_name(0)}")

        from transformers import __version__ as tf_version
        logger.info(f"Transformers: {tf_version}")

        from unsloth import __version__ as un_version
        logger.info(f"Unsloth: {un_version}")
    except Exception as e:
        logger.warning(f"Error printing system info: {e}")


if __name__ == "__main__":
    # Print basic system information to help with debugging
    _log_system_info()

    # Create and launch the Gradio interface
    demo = create_interface()
    demo.queue()
    # NOTE(review): share=True asks Gradio for a public share link, which
    # would expose the "Start Training" button to anyone holding the URL --
    # confirm this is intended.
    demo.launch(share=True)