George-API committed · Commit 32e9f89 · verified · 1 Parent(s): 3edc673

Upload folder using huggingface_hub

README.md CHANGED
@@ -18,6 +18,24 @@ This space is dedicated to training Microsoft's Phi-4 model using Unsloth optimi
  
  This Hugging Face Space automatically installs dependencies from requirements.txt. The following packages are included:
  
+ ### Installation Process
+ 
+ For clearer dependency management, the installation is split into multiple files:
+ 
+ 1. **Base Dependencies (requirements-base.txt)**:
+    - Core packages like torch, transformers, accelerate, etc.
+    - Install with: `pip install -r requirements-base.txt`
+ 
+ 2. **Standard Dependencies (requirements.txt)**:
+    - References base requirements and adds additional packages
+    - Install with: `pip install -r requirements.txt`
+ 
+ 3. **Flash Attention (requirements-flash.txt)** (Optional):
+    - For faster attention computation
+    - Install with: `pip install -r requirements-flash.txt --no-build-isolation`
+ 
+ Using this staged approach helps prevent dependency conflicts and installation issues.
+ 
  ### Essential Dependencies
  
  - **unsloth** (>=2024.3): Required for optimized 4-bit training
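Note: the three stages described in the README above amount to three ordinary `pip` invocations run in order. A minimal Python sketch of that sequence (the file names come from the README; the subprocess wrapper itself is illustrative, not part of this commit):

```python
# Illustrative sketch of the staged install described in the README.
# Assumes requirements-base.txt, requirements.txt and requirements-flash.txt
# sit in the current working directory.
import subprocess
import sys

def staged_install(include_flash: bool = False) -> None:
    pip = [sys.executable, "-m", "pip", "install"]
    # 1. Base dependencies (torch, transformers, accelerate, ...)
    subprocess.run(pip + ["-r", "requirements-base.txt"], check=True)
    # 2. Standard dependencies (pull in the base file via the -r include)
    subprocess.run(pip + ["-r", "requirements.txt"], check=True)
    # 3. Optional: flash-attn, built without build isolation so it sees torch
    if include_flash:
        subprocess.run(pip + ["-r", "requirements-flash.txt", "--no-build-isolation"],
                       check=True)

if __name__ == "__main__":
    staged_install(include_flash=False)
```

The `install_requirements.py` script added below automates the same sequence.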
install_requirements.py ADDED
@@ -0,0 +1,83 @@
+ #!/usr/bin/env python
+ # coding=utf-8
+ 
+ """
+ Script to install requirements in the correct order for the Phi-4 training project.
+ This ensures base requirements are installed first, followed by additional requirements.
+ """
+ 
+ import os
+ import sys
+ import subprocess
+ import argparse
+ import logging
+ from pathlib import Path
+ 
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - %(levelname)s - %(message)s",
+     handlers=[logging.StreamHandler(sys.stdout)]
+ )
+ logger = logging.getLogger(__name__)
+ 
+ def install_requirements(include_flash=False):
+     """Install requirements in the correct order."""
+     current_dir = Path(__file__).parent
+     base_req_path = current_dir / "requirements-base.txt"
+     main_req_path = current_dir / "requirements.txt"
+     flash_req_path = current_dir / "requirements-flash.txt"
+ 
+     if not base_req_path.exists():
+         logger.error(f"Base requirements file not found: {base_req_path}")
+         return False
+ 
+     if not main_req_path.exists():
+         logger.error(f"Main requirements file not found: {main_req_path}")
+         return False
+ 
+     logger.info("Installing dependencies in sequential order...")
+ 
+     try:
+         # Step 1: Install base requirements
+         logger.info(f"Step 1: Installing base requirements from {base_req_path}")
+         subprocess.run([sys.executable, "-m", "pip", "install", "-r", str(base_req_path)],
+                        check=True)
+         logger.info("Base requirements installed successfully")
+ 
+         # Step 2: Install main requirements
+         logger.info(f"Step 2: Installing additional requirements from {main_req_path}")
+         subprocess.run([sys.executable, "-m", "pip", "install", "-r", str(main_req_path)],
+                        check=True)
+         logger.info("Additional requirements installed successfully")
+ 
+         # Step 3: Optionally install flash-attention
+         if include_flash and flash_req_path.exists():
+             logger.info(f"Step 3: Installing flash-attention from {flash_req_path}")
+             subprocess.run([sys.executable, "-m", "pip", "install", "-r", str(flash_req_path), "--no-build-isolation"],
+                            check=True)
+             logger.info("Flash-attention installed successfully")
+         elif include_flash:
+             logger.warning(f"Flash requirements file not found: {flash_req_path}")
+ 
+         logger.info("All required packages installed successfully!")
+         return True
+ 
+     except subprocess.CalledProcessError as e:
+         logger.error(f"Error installing dependencies: {str(e)}")
+         return False
+ 
+ def main():
+     parser = argparse.ArgumentParser(description="Install requirements for Phi-4 training")
+     parser.add_argument("--flash", action="store_true", help="Also install flash-attention (optional)")
+     args = parser.parse_args()
+ 
+     success = install_requirements(include_flash=args.flash)
+     if success:
+         logger.info("Installation completed successfully!")
+     else:
+         logger.error("Installation failed. Please check the logs for details.")
+         sys.exit(1)
+ 
+ if __name__ == "__main__":
+     main()
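Besides the command line (`python install_requirements.py`, optionally with `--flash`), the helper can be reused programmatically. A small usage sketch, assuming it is importable from the same directory:

```python
# Hypothetical usage sketch for the new install_requirements.py helper.
from install_requirements import install_requirements

if __name__ == "__main__":
    # Installs the base requirements, then the main requirements;
    # pass include_flash=True to also build flash-attn with --no-build-isolation.
    ok = install_requirements(include_flash=False)
    raise SystemExit(0 if ok else 1)
```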
requirements.txt CHANGED
@@ -1,11 +1,6 @@
- torch>=2.0.0
- accelerate>=0.27.0
- bitsandbytes>=0.41.0
- datasets>=2.15.0
+ -r requirements-base.txt
  einops>=0.7.0
  filelock>=3.13.1
- gradio>=5.17.0
- huggingface-hub>=0.19.0
  matplotlib>=3.7.0
  numpy>=1.24.0
  packaging>=23.0
@@ -17,8 +12,6 @@ regex>=2023.0.0
  requests>=2.31.0
  safetensors>=0.4.1
  sentencepiece>=0.1.99
- tensorboard>=2.15.0
  tqdm>=4.65.0
- transformers>=4.36.0
  typing-extensions>=4.8.0
  unsloth>=2024.3
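requirements.txt now starts with a `-r requirements-base.txt` include, which pip expands before installing the remaining packages. An illustrative helper (not part of the commit) that performs the same expansion, handy for checking what the split files ultimately resolve to:

```python
# Illustrative helper: expand -r includes in a requirements file.
# Shown only to clarify how the split requirements files compose.
from pathlib import Path

def expand_requirements(path: str) -> list[str]:
    """Return the flat list of requirement lines, following -r includes."""
    out: list[str] = []
    base = Path(path).parent
    for raw in Path(path).read_text().splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        if line.startswith("-r "):
            out.extend(expand_requirements(str(base / line[3:].strip())))
        else:
            out.append(line)
    return out

if __name__ == "__main__":
    print("\n".join(expand_requirements("requirements.txt")))
```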
run_transformers_training.py CHANGED
@@ -1,6 +1,7 @@
  #!/usr/bin/env python
  # coding=utf-8
  
+ # Basic Python imports
  import os
  import sys
  import json
@@ -9,36 +10,15 @@ import logging
  from datetime import datetime
  import time
  import warnings
- import torch
  from importlib.util import find_spec
  
- # Global variables for hardware detection
+ # Check hardware capabilities first
+ import torch
  CUDA_AVAILABLE = torch.cuda.is_available()
  NUM_GPUS = torch.cuda.device_count() if CUDA_AVAILABLE else 0
  DEVICE_TYPE = "cuda" if CUDA_AVAILABLE else "cpu"
  
- # Import Unsloth first, before other ML imports
- try:
-     from unsloth import FastLanguageModel
-     from unsloth.chat_templates import get_chat_template
-     unsloth_available = True
- except ImportError:
-     unsloth_available = False
-     logger = logging.getLogger(__name__)
-     logger.warning("Unsloth not available. Please install with: pip install unsloth")
- 
- from datasets import load_dataset
- from transformers import (
-     AutoModelForCausalLM,
-     AutoTokenizer,
-     TrainingArguments,
-     Trainer,
-     TrainerCallback,
-     set_seed,
-     BitsAndBytesConfig
- )
- 
- # Configure logging
+ # Configure logging early
  logging.basicConfig(
      level=logging.INFO,
      format="%(asctime)s - %(levelname)s - %(message)s",
@@ -53,8 +33,46 @@ logging.getLogger("accelerate").setLevel(logging.WARNING)
  logging.getLogger("torch").setLevel(logging.WARNING)
  logging.getLogger("bitsandbytes").setLevel(logging.WARNING)
  
+ # Import Unsloth first, before other ML imports
+ try:
+     from unsloth import FastLanguageModel
+     from unsloth.chat_templates import get_chat_template
+     unsloth_available = True
+     logger.info("Unsloth successfully imported")
+ except ImportError:
+     unsloth_available = False
+     logger.warning("Unsloth not available. Please install with: pip install unsloth")
+ 
+ # Now import other ML libraries
+ try:
+     import transformers
+     from transformers import (
+         AutoModelForCausalLM,
+         AutoTokenizer,
+         TrainingArguments,
+         Trainer,
+         TrainerCallback,
+         set_seed,
+         BitsAndBytesConfig
+     )
+     logger.info(f"Transformers version: {transformers.__version__}")
+ except ImportError:
+     logger.error("Transformers not available. This is a critical dependency.")
+ 
  # Check availability of libraries
  peft_available = find_spec("peft") is not None
+ if peft_available:
+     import peft
+     logger.info(f"PEFT version: {peft.__version__}")
+ else:
+     logger.warning("PEFT not available. Parameter-efficient fine-tuning will not be used.")
+ 
+ # Import datasets library after the main ML libraries
+ try:
+     from datasets import load_dataset
+     logger.info("Datasets library successfully imported")
+ except ImportError:
+     logger.error("Datasets library not available. This is required for loading training data.")
  
  # Define a clean logging function for HF Space compatibility
  def log_info(message):
@@ -99,8 +117,9 @@ def load_env_variables():
      # Try to load from .env file if not in a Space
      try:
          from dotenv import load_dotenv
-         # Updated path to .env file in the new directory structure
-         env_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "shared", ".env")
+         # First check the current directory
+         env_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".env")
+ 
          if os.path.exists(env_path):
              load_dotenv(env_path)
              logging.info(f"Loaded environment variables from {env_path}")
@@ -108,10 +127,22 @@ def load_env_variables():
              logging.info(f"HF_USERNAME loaded from .env file: {bool(os.environ.get('HF_USERNAME'))}")
              logging.info(f"HF_SPACE_NAME loaded from .env file: {bool(os.environ.get('HF_SPACE_NAME'))}")
          else:
-             logging.warning(f"No .env file found at {env_path}")
+             # Try the shared directory as fallback
+             shared_env_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "shared", ".env")
+             if os.path.exists(shared_env_path):
+                 load_dotenv(shared_env_path)
+                 logging.info(f"Loaded environment variables from {shared_env_path}")
+                 logging.info(f"HF_TOKEN loaded from shared .env file: {bool(os.environ.get('HF_TOKEN'))}")
+                 logging.info(f"HF_USERNAME loaded from shared .env file: {bool(os.environ.get('HF_USERNAME'))}")
+                 logging.info(f"HF_SPACE_NAME loaded from shared .env file: {bool(os.environ.get('HF_SPACE_NAME'))}")
+             else:
+                 logging.warning(f"No .env file found in current or shared directory")
      except ImportError:
          logging.warning("python-dotenv not installed, not loading from .env file")
  
+     if not os.environ.get("HF_TOKEN"):
+         logger.warning("HF_TOKEN is not set. Pushing to Hugging Face Hub will not work.")
+ 
      if not os.environ.get("HF_USERNAME"):
          logger.warning("HF_USERNAME is not set. Using default username.")
  
@@ -187,6 +218,16 @@ def load_model_and_tokenizer(config):
          logger.warning("To use flash attention, install with: pip install flash-attn --no-build-isolation")
          use_flash_attention = False
  
+     # Set device map based on config or default to "auto"
+     device_map = config.get("hardware", {}).get("hardware_setup", {}).get("device_map", "auto")
+ 
+     # Calculate max memory settings if multiple GPUs are available
+     max_memory = None
+     if gpu_count > 1:
+         memory_per_gpu = config.get("hardware", {}).get("specs", {}).get("vram_per_gpu", 24)
+         max_memory = {i: f"{int(memory_per_gpu * 0.85)}GiB" for i in range(gpu_count)}
+         max_memory["cpu"] = "64GiB"  # Allow CPU offloading if needed
+ 
      # Load model with proper error handling for out-of-memory
      try:
          # Improved memory settings for multi-GPU setup
@@ -573,24 +614,60 @@ class LoggingCallback(TrainerCallback):
          log_info(f"Final loss: {state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'}")
  
  def check_dependencies():
-     """Check if all required dependencies are installed."""
+     """Check if all required dependencies are installed and in the correct order."""
      missing_packages = []
+     order_issues = []
  
-     # Critical packages
+     # Check critical packages in the required order
+ 
+     # 1. First check for unsloth as it should be imported before transformers
      if not unsloth_available:
          missing_packages.append("unsloth>=2024.3")
  
+     # 2. Check transformers (imported at module level)
+     try:
+         import transformers
+         logger.info(f"Using transformers version {transformers.__version__}")
+     except ImportError:
+         missing_packages.append("transformers>=4.38.0")
+ 
+     # 3. Check for peft
      if not peft_available:
          missing_packages.append("peft>=0.9.0")
  
+     # 4. Check for accelerate
+     try:
+         import accelerate
+         logger.info(f"Using accelerate version {accelerate.__version__}")
+     except ImportError:
+         missing_packages.append("accelerate>=0.27.0")
+ 
+     # Check for order-specific issues
+     try:
+         import sys
+         modules = list(sys.modules.keys())
+ 
+         # Unsloth should be imported before transformers for optimal performance
+         if 'transformers' in modules and 'unsloth' in modules:
+             if modules.index('transformers') < modules.index('unsloth'):
+                 order_issues.append("For optimal performance, unsloth should be imported before transformers")
+     except Exception:
+         # If we can't check order, just skip this check
+         pass
+ 
      # If critical packages are missing, exit with instructions
      if missing_packages:
          logger.error("Critical dependencies missing:")
          for pkg in missing_packages:
              logger.error(f" - {pkg}")
-         logger.error("Please ensure the space has these packages in requirements.txt")
+         logger.error("Please install the missing dependencies with:")
+         logger.error(f" pip install {' '.join(missing_packages)}")
          return False
  
+     # Report order issues as warnings
+     for issue in order_issues:
+         logger.warning(issue)
+ 
      # Optional packages - moved to the end
      if find_spec("flash_attn"):
          logger.info("flash-attn found. Flash attention will be used for faster training.")
@@ -598,18 +675,110 @@ def check_dependencies():
          logger.warning("flash-attn not found. Training will work but may be slower.")
          logger.warning("To use flash attention, install with: pip install flash-attn --no-build-isolation")
  
+     # Additional optional packages that improve performance
+     if find_spec("bitsandbytes"):
+         logger.info("bitsandbytes found. Quantization will be available.")
+     else:
+         logger.warning("bitsandbytes not found. Quantization may not be available.")
+         logger.warning("To use quantization, install with: pip install bitsandbytes")
+ 
      return True
  
+ def update_huggingface_space():
+     """Update the Hugging Face Space with the current code."""
+     log_info("Updating Hugging Face Space...")
+     update_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), "update_space.py")
+ 
+     if not os.path.exists(update_script):
+         logger.warning(f"Update space script not found at {update_script}")
+         return False
+ 
+     try:
+         import subprocess
+         # Explicitly set space_name to ensure we're targeting the right Space
+         result = subprocess.run(
+             [sys.executable, update_script, "--force", "--space_name", "phi4training"],
+             capture_output=True, text=True, check=False
+         )
+ 
+         if result.returncode == 0:
+             log_info("Hugging Face Space updated successfully!")
+             log_info(f"Space URL: https://huggingface.co/spaces/George-API/phi4training")
+             return True
+         else:
+             logger.error(f"Failed to update Hugging Face Space: {result.stderr}")
+             return False
+     except Exception as e:
+         logger.error(f"Error updating Hugging Face Space: {str(e)}")
+         return False
+ 
+ def validate_huggingface_credentials():
+     """Validate Hugging Face credentials to ensure they work correctly."""
+     if not os.environ.get("HF_TOKEN"):
+         logger.warning("HF_TOKEN not found. Skipping Hugging Face credentials validation.")
+         return False
+ 
+     try:
+         # Import here to avoid requiring huggingface_hub if not needed
+         from huggingface_hub import HfApi, login
+ 
+         # Try to login with the token
+         login(token=os.environ.get("HF_TOKEN"))
+ 
+         # Check if we can access the API
+         api = HfApi()
+         username = os.environ.get("HF_USERNAME", "George-API")
+         space_name = os.environ.get("HF_SPACE_NAME", "phi4training")
+ 
+         # Try to get whoami info
+         user_info = api.whoami()
+         logger.info(f"Successfully authenticated with Hugging Face as {user_info['name']}")
+ 
+         # Check if we're using the expected Space
+         expected_space_id = "George-API/phi4training"
+         actual_space_id = f"{username}/{space_name}"
+ 
+         if actual_space_id != expected_space_id:
+             logger.warning(f"Using Space '{actual_space_id}' instead of the expected '{expected_space_id}'")
+             logger.warning(f"Make sure this is intentional. To use the correct Space, update your .env file.")
+         else:
+             logger.info(f"Confirmed using Space: {expected_space_id}")
+ 
+         # Check if the space exists
+         try:
+             space_id = f"{username}/{space_name}"
+             space_info = api.space_info(repo_id=space_id)
+             logger.info(f"Space {space_id} is accessible at: https://huggingface.co/spaces/{space_id}")
+             return True
+         except Exception as e:
+             logger.warning(f"Could not access Space {username}/{space_name}: {str(e)}")
+             logger.warning("Space updating may not work correctly")
+             return False
+     except ImportError:
+         logger.warning("huggingface_hub not installed. Cannot validate Hugging Face credentials.")
+         return False
+     except Exception as e:
+         logger.warning(f"Error validating Hugging Face credentials: {str(e)}")
+         return False
+ 
  def main():
      # Set up logging
      logger.info("Starting training process")
  
+     # Check dependencies first, before any other operations
+     if not check_dependencies():
+         logger.error("Aborting due to missing critical dependencies")
+         return 1
+ 
      # Parse arguments
      args = parse_args()
  
      # Load environment variables
      load_env_variables()
  
+     # Validate Hugging Face credentials if we're going to use them
+     validate_huggingface_credentials()
+ 
      # Load configuration
      try:
          transformers_config = load_configs(args.config)
@@ -620,11 +789,6 @@ def main():
          logger.error(f"Error loading configuration: {e}")
          return 1
  
-     # Check dependencies
-     if not check_dependencies():
-         logger.error("Aborting due to missing critical dependencies")
-         return 1
- 
      # Check if we're in distributed mode
      is_distributed = "WORLD_SIZE" in os.environ and int(os.environ.get("WORLD_SIZE", "1")) > 1
      if is_distributed:
@@ -870,6 +1034,10 @@ def main():
              log_info(f"Pushing model to Hugging Face Hub as {hub_id}...")
              trainer.push_to_hub()
              log_info("Model successfully pushed to Hub")
+ 
+             # Update the Hugging Face Space with current code
+             if os.environ.get("HF_TOKEN") and os.environ.get("HF_USERNAME") and os.environ.get("HF_SPACE_NAME"):
+                 update_huggingface_space()
  
          return 0
      except Exception as e:
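The new device_map/max_memory block in load_model_and_tokenizer only computes the settings; the hunk does not show where they are passed on. In the transformers/accelerate API they would typically be forwarded to `from_pretrained`, roughly as in this sketch (the model id and GPU assumptions are illustrative, not taken from the diff):

```python
# Sketch: how the computed device_map / max_memory are typically consumed.
import torch
from transformers import AutoModelForCausalLM

gpu_count = torch.cuda.device_count()
device_map = "auto"
max_memory = None
if gpu_count > 1:
    memory_per_gpu = 24  # GiB per GPU, matching the config default used in the diff
    max_memory = {i: f"{int(memory_per_gpu * 0.85)}GiB" for i in range(gpu_count)}
    max_memory["cpu"] = "64GiB"  # allow CPU offloading if needed

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-4",          # illustrative checkpoint name
    torch_dtype=torch.bfloat16,
    device_map=device_map,
    max_memory=max_memory,
)
```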
update_space.py CHANGED
@@ -26,6 +26,12 @@ logger = logging.getLogger(__name__)
  
  def load_env_variables():
      """Load environment variables from system or .env file."""
+     # Define default values that should be used
+     required_vars = {
+         "HF_USERNAME": os.environ.get("HF_USERNAME", "George-API"),
+         "HF_SPACE_NAME": "phi4training"  # Hardcode the correct space name
+     }
+ 
      # First try to load from local .env file
      try:
          from dotenv import load_dotenv
@@ -51,23 +57,19 @@ def load_env_variables():
          os.environ["HF_USERNAME"] = username
          logger.info(f"Set HF_USERNAME from SPACE_ID: {username}")
  
-     # Verify required variables
-     required_vars = {
-         "HF_TOKEN": os.environ.get("HF_TOKEN"),
-         "HF_USERNAME": os.environ.get("HF_USERNAME"),
-         "HF_SPACE_NAME": os.environ.get("HF_SPACE_NAME", "phi4training")
+     # Always ensure we have the required variables
+     # And override HF_SPACE_NAME to ensure we use phi4training
+     result = {
+         "HF_TOKEN": os.environ.get("HF_TOKEN", ""),
+         "HF_USERNAME": os.environ.get("HF_USERNAME", required_vars["HF_USERNAME"]),
+         "HF_SPACE_NAME": required_vars["HF_SPACE_NAME"]  # Always use phi4training
      }
  
-     # Ensure the space name is set correctly
-     if "HF_SPACE_NAME" not in os.environ:
-         os.environ["HF_SPACE_NAME"] = "phi4training"
+     # Ensure the space name is set correctly in environment
+     os.environ["HF_SPACE_NAME"] = required_vars["HF_SPACE_NAME"]
  
-     missing_vars = [k for k, v in required_vars.items() if not v]
-     if missing_vars:
-         raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")
- 
-     logger.info(f"Using environment variables: USERNAME={required_vars['HF_USERNAME']}, SPACE_NAME={required_vars['HF_SPACE_NAME']}")
-     return required_vars
+     logger.info(f"Using environment variables: USERNAME={result['HF_USERNAME']}, SPACE_NAME={result['HF_SPACE_NAME']}")
+     return result
  
  def verify_configs():
      """Verify that all necessary configuration files exist and are valid."""
@@ -98,12 +100,14 @@
  
  def update_requirements():
      """Update requirements.txt with necessary packages using a two-stage installation process."""
+     logger.info("Setting up requirements files for sequential installation...")
      current_dir = Path(__file__).parent
      base_req_path = current_dir / "requirements-base.txt"
+     main_req_path = current_dir / "requirements.txt"
      flash_req_path = current_dir / "requirements-flash.txt"
  
      # First ensure base requirements exist
-     required_packages = {
+     required_base_packages = {
          "torch>=2.0.0",
         "transformers>=4.36.0",
         "accelerate>=0.27.0",
@@ -114,6 +118,26 @@ def update_requirements():
          "datasets>=2.15.0"
      }
  
+     # Additional packages for main requirements
+     required_additional_packages = {
+         "einops>=0.7.0",
+         "filelock>=3.13.1",
+         "matplotlib>=3.7.0",
+         "numpy>=1.24.0",
+         "packaging>=23.0",
+         "peft>=0.9.0",
+         "psutil>=5.9.0",
+         "python-dotenv>=1.0.0",
+         "pyyaml>=6.0.1",
+         "regex>=2023.0.0",
+         "requests>=2.31.0",
+         "safetensors>=0.4.1",
+         "sentencepiece>=0.1.99",
+         "tqdm>=4.65.0",
+         "typing-extensions>=4.8.0",
+         "unsloth>=2024.3"
+     }
+ 
      # Read existing base requirements
      existing_requirements = set()
      if base_req_path.exists():
@@ -121,9 +145,9 @@ def update_requirements():
              existing_requirements = {line.strip() for line in f if line.strip() and not line.startswith('-r')}
  
      # Add new requirements
-     updated_requirements = existing_requirements.union(required_packages)
+     updated_requirements = existing_requirements.union(required_base_packages)
  
-     # Write updated base requirements
+     # 1. Write updated base requirements
      with open(base_req_path, 'w') as f:
          # Ensure torch is first
          torch_req = next((req for req in updated_requirements if req.startswith("torch")), "torch>=2.0.0")
@@ -133,18 +157,29 @@ def update_requirements():
          for req in sorted(r for r in updated_requirements if not r.startswith("torch")):
              f.write(f"{req}\n")
  
-     # Create or update flash-attn requirements
+     # 2. Create main requirements file (references base)
+     with open(main_req_path, 'w') as f:
+         f.write("-r requirements-base.txt\n")
+         for req in sorted(required_additional_packages):
+             f.write(f"{req}\n")
+ 
+     # 3. Create or update flash-attn requirements
      with open(flash_req_path, 'w') as f:
          f.write("-r requirements-base.txt\n")
          f.write("flash-attn==2.5.2\n")
  
-     logger.info("Updated requirements files for two-stage installation:")
+     logger.info("Updated requirements files for sequential installation:")
      logger.info(f"1. Base requirements in {base_req_path}")
-     logger.info(f"2. Flash-attention requirements in {flash_req_path}")
-     logger.info("This ensures torch is installed before flash-attn")
+     logger.info(f"2. Main requirements in {main_req_path}")
+     logger.info(f"3. Flash-attention requirements in {flash_req_path}")
+     logger.info("This ensures packages are installed in the correct order")
  
  def create_space(username, space_name):
      """Create or get a Hugging Face Space."""
+     # Override with the correct values regardless of what's passed
+     username = "George-API"
+     space_name = "phi4training"
+ 
      try:
          api = HfApi()
          space_id = f"{username}/{space_name}"
@@ -155,11 +190,10 @@ def create_space(username, space_name):
              space_info = api.space_info(repo_id=space_id)
              logger.info(f"Space {space_id} already exists")
              return space_info
-         except Exception as e:
+         except Exception:
              logger.info(f"Space {space_id} does not exist, creating new space...")
- 
-             # Create new space
-             try:
+ 
+             # Create new space
              api.create_repo(
                  repo_id=space_id,
                  private=False,
@@ -168,50 +202,92 @@ def create_space(username, space_name):
              )
              logger.info(f"Created new space: {space_id}")
              return api.space_info(repo_id=space_id)
-         except Exception as e:
-             logger.error(f"Failed to create space: {str(e)}")
-             raise
      except Exception as e:
+         logger.error(f"Failed to create space: {str(e)}")
+ 
+         # Don't proceed if we can't create/access the space
          raise RuntimeError(f"Error with Space {space_id}: {str(e)}")
  
  def main():
-     parser = argparse.ArgumentParser(description='Update Hugging Face Space for Phi-4 training')
-     parser.add_argument('--space_name', type=str, help='Space name (default: from env)')
-     parser.add_argument('--force', action='store_true', help='Skip confirmation')
-     args = parser.parse_args()
- 
-     if not args.force:
-         print("\n" + "!"*80)
-         print("WARNING: Updating the Space will INTERRUPT any ongoing training!")
-         print("Make sure all checkpoints are saved before proceeding.")
-         print("!"*80 + "\n")
- 
-         confirm = input("Type 'update' to confirm: ")
-         if confirm.lower() != 'update':
-             logger.info("Update cancelled")
-             return False
- 
+     """Main function to update the Space."""
      try:
+         # Parse command line arguments
+         parser = argparse.ArgumentParser(description='Update Hugging Face Space for Phi-4 training')
+         parser.add_argument('--space_name', type=str, help='Space name (ignored, always using phi4training)')
+         parser.add_argument('--force', action='store_true', help='Skip confirmation when updating Space')
+         args = parser.parse_args()
+ 
          # Load environment variables
          env_vars = load_env_variables()
+         verify_configs()
+ 
+         # Verify we have the necessary variables
+         if not env_vars["HF_TOKEN"]:
+             logger.error("Missing HF_TOKEN. Please set it in your .env file or environment variables.")
+             return False
+ 
          logger.info(f"Environment variables loaded: USERNAME={env_vars['HF_USERNAME']}, SPACE_NAME={env_vars['HF_SPACE_NAME']}")
  
-         # Verify configurations
-         verify_configs()
-         logger.info("All configuration files verified successfully")
+         # Ask for confirmation unless forced
+         if not args.force:
+             print("\nWARNING: Updating the Space will INTERRUPT any ongoing training!")
+             confirm = input("Are you sure you want to update the Space? Type 'yes' to confirm: ")
+             if confirm.lower() != 'yes':
+                 logger.info("Update cancelled by user")
+                 return False
+ 
+             # Additional password check for safety
+             password = getpass.getpass("Enter your password to confirm update: ")
+             if password.strip() == "":
+                 logger.info("No password entered. Update cancelled.")
+                 return False
+         else:
+             logger.info("Skipping confirmation due to --force flag")
  
          # Update requirements
          update_requirements()
          logger.info("Requirements updated successfully")
  
-         # Get space name from args or env, prioritize args
-         space_name = args.space_name if args.space_name else env_vars["HF_SPACE_NAME"]
+         # Always use phi4training as the space name regardless of arguments
+         space_name = "phi4training"
         logger.info(f"Using space name: {space_name}")
  
+         # Verify we're using the expected Space
+         expected_space = "George-API/phi4training"
+         actual_space = f"{env_vars['HF_USERNAME']}/{space_name}"
+ 
+         if actual_space != expected_space:
+             logger.warning(f"WARNING: Updating Space '{actual_space}' instead of '{expected_space}'")
+             logger.warning("Make sure the HF_USERNAME environment variable is set to 'George-API'")
+ 
+             # Safety check for non-force updates
+             if not args.force:
+                 confirm = input(f"Continue updating '{actual_space}' instead of '{expected_space}'? (yes/no): ")
+                 if confirm.lower() != "yes":
+                     logger.info("Update cancelled by user")
+                     return False
+         else:
+             logger.info(f"Confirmed using the expected Space: {expected_space}")
+ 
          # Login to Hugging Face
          logger.info("Logging in to Hugging Face...")
-         login(token=env_vars["HF_TOKEN"])
-         logger.info("Successfully logged in to Hugging Face")
+         try:
+             login(token=env_vars["HF_TOKEN"])
+             logger.info("Successfully logged in to Hugging Face")
+ 
+             # Verify login with whoami
+             api = HfApi()
+             try:
+                 user_info = api.whoami()
+                 logger.info(f"Authenticated as: {user_info['name']}")
+             except Exception as e:
+                 logger.error(f"Authentication verification failed: {str(e)}")
+                 logger.error("Your HF_TOKEN may be invalid or expired.")
+                 return False
+         except Exception as e:
+             logger.error(f"Login failed: {str(e)}")
+             logger.error("Make sure your HF_TOKEN is valid and not expired.")
+             return False
  
          # Create/get space
          space_info = create_space(env_vars["HF_USERNAME"], space_name)
@@ -219,7 +295,7 @@ def main():
  
          # Upload files
          current_dir = Path(__file__).parent
-         logger.info(f"Uploading files from {current_dir} to Space {env_vars['HF_USERNAME']}/{space_name}...")
+         logger.info(f"Uploading files from {current_dir} to Space George-API/phi4training...")
  
          # Create .gitignore
          with open(current_dir / ".gitignore", "w") as f:
@@ -229,13 +305,13 @@ def main():
          api = HfApi()
          api.upload_folder(
              folder_path=str(current_dir),
-             repo_id=f"{env_vars['HF_USERNAME']}/{space_name}",
+             repo_id="George-API/phi4training",  # Hardcoded repo ID
              repo_type="space",
              ignore_patterns=[".env", "*.pyc", "__pycache__", "TRAINING_IN_PROGRESS.lock"]
          )
  
          logger.info(f"Files uploaded successfully")
-         space_url = f"https://huggingface.co/spaces/{env_vars['HF_USERNAME']}/{space_name}"
+         space_url = "https://huggingface.co/spaces/George-API/phi4training"
          logger.info(f"Space URL: {space_url}")
          print(f"\nSpace created successfully! You can view it at:\n{space_url}")
          return True
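A possible non-interactive driver for the updated update_space.py, shown only as a sketch (the environment values are placeholders; `main()` reads `--force` from `sys.argv` via argparse and returns True/False):

```python
# Hypothetical driver sketch for update_space.py.
import os
import sys

import update_space

if __name__ == "__main__":
    os.environ.setdefault("HF_USERNAME", "George-API")
    os.environ.setdefault("HF_SPACE_NAME", "phi4training")
    # os.environ["HF_TOKEN"] must already hold a valid token for the upload to work.
    sys.argv = ["update_space.py", "--force"]
    ok = update_space.main()
    raise SystemExit(0 if ok else 1)
```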