George-API committed · Commit 93b2fec · verified · 1 Parent(s): decfb95

Upload folder using huggingface_hub
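The commit message refers to huggingface_hub's folder-upload API. For orientation, a minimal sketch of that call follows; the local path, repo id, and repo type are placeholders, not taken from this page:

from huggingface_hub import HfApi

api = HfApi()  # token is read from HF_TOKEN or the cached login
api.upload_folder(
    folder_path=".",                    # local directory to push (placeholder)
    repo_id="George-API/example-repo",  # placeholder target repo
    repo_type="model",                  # assumption; could also be "space" or "dataset"
    commit_message="Upload folder using huggingface_hub",
)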
run_transformers_training.py CHANGED
@@ -262,13 +262,22 @@ def load_model_and_tokenizer(config):
 
     # Apply Unsloth's training optimizations with config parameters
     unsloth_config = config.get("unsloth", {})
+
+    # Get dropout value; if not explicitly zero, warn about performance implications
+    lora_dropout = unsloth_config.get("dropout", 0.05)
+    if lora_dropout > 0:
+        logger.warning(f"Unsloth works best with dropout=0, but config has dropout={lora_dropout}")
+        logger.warning("This will impact performance but training will still work")
+        logger.warning("Consider setting dropout=0 in your config for better performance")
+
+    # Apply optimizations
     model = FastLanguageModel.get_peft_model(
         model,
         r=unsloth_config.get("r", 32),
         target_modules=unsloth_config.get("target_modules",
             ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]),
         lora_alpha=unsloth_config.get("alpha", 16),
-        lora_dropout=unsloth_config.get("dropout", 0.05),
+        lora_dropout=lora_dropout,  # Using the value from config or default
         bias="none",
         use_gradient_checkpointing=config.get("gradient_checkpointing", True) or config.get("training", {}).get("gradient_checkpointing", True),
         random_state=config.get("seed", 42),
@@ -279,11 +288,19 @@ def load_model_and_tokenizer(config):
     chat_template = config.get("chat_template") or config.get("tokenizer", {}).get("chat_template")
     if chat_template:
         try:
+            # Get the correct chat template for phi models
             template = get_chat_template("phi")
-            tokenizer.chat_template = template
-            logger.info("Set phi chat template")
+            # Correctly apply the template to the tokenizer (it's a string)
+            if isinstance(template, str):
+                tokenizer.chat_template = template
+                logger.info("Set phi chat template (string)")
+            else:
+                # If it's not a string, it's likely already a template object
+                tokenizer.chat_template = template
+                logger.info("Set phi chat template (object)")
         except Exception as e:
             logger.warning(f"Failed to set chat template: {str(e)}")
+            logger.warning("Chat formatting may not work correctly, but training can continue")
 
     # Ensure proper token settings
     if tokenizer.pad_token_id is None:
@@ -308,7 +325,23 @@ def load_dataset_with_mapping(dataset_config):
         raise ValueError("Dataset name not provided in configuration")
 
     logger.info(f"Loading pre-processed dataset {dataset_name}, split {dataset_split}")
-    dataset = load_dataset(dataset_name, split=dataset_split)
+
+    try:
+        dataset = load_dataset(dataset_name, split=dataset_split)
+
+        # Verify the dataset was actually loaded and is not None
+        if dataset is None:
+            raise ValueError(f"Dataset {dataset_name} (split {dataset_split}) loaded as None - check dataset exists and is accessible")
+
+        # Check if the dataset is empty
+        if len(dataset) == 0:
+            raise ValueError(f"Dataset {dataset_name} (split {dataset_split}) is empty (contains 0 examples)")
+
+    except Exception as dataset_error:
+        logger.error(f"Failed to load dataset {dataset_name}: {str(dataset_error)}")
+        logger.error("Make sure the dataset exists and you have proper access permissions")
+        logger.error("This could be due to authentication issues with your HF_TOKEN")
+        raise
 
     # Apply minimal processing since the dataset has already been properly structured
     # Just perform validation to ensure required fields exist
@@ -765,296 +798,311 @@ def main():
     # Set up logging
     logger.info("Starting training process")
 
-    # Check dependencies first, before any other operations
-    if not check_dependencies():
-        logger.error("Aborting due to missing critical dependencies")
-        return 1
-
-    # Parse arguments
-    args = parse_args()
-
-    # Load environment variables
-    load_env_variables()
-
-    # Validate Hugging Face credentials if we're going to use them
-    validate_huggingface_credentials()
-
-    # Load configuration
     try:
-        transformers_config = load_configs(args.config)
-        hardware_config = transformers_config.get("hardware", {})
-        dataset_config = transformers_config.get("dataset", {})
-        logger.info("Configuration loaded successfully")
-    except Exception as e:
-        logger.error(f"Error loading configuration: {e}")
-        return 1
-
-    # Check if we're in distributed mode
-    is_distributed = "WORLD_SIZE" in os.environ and int(os.environ.get("WORLD_SIZE", "1")) > 1
-    if is_distributed:
-        local_rank = int(os.environ.get("LOCAL_RANK", "0"))
-        log_info(f"Running in distributed mode with {os.environ.get('WORLD_SIZE')} processes, local_rank: {local_rank}")
-    else:
-        log_info("Running in non-distributed mode (single process)")
-
-    # Set random seed for reproducibility
-    seed = transformers_config.get("seed", 42)
-    set_seed(seed)
-    logger.info(f"Set random seed to {seed}")
-
-    # Load model and tokenizer using the consolidated config
-    model, tokenizer = load_model_and_tokenizer(transformers_config)
-
-    # Empty CUDA cache to ensure clean state
-    if CUDA_AVAILABLE:
-        torch.cuda.empty_cache()
-        log_info("Cleared CUDA cache")
-
-    # Setup environment variable for CUDA memory allocation
-    if CUDA_AVAILABLE:
-        system_settings = hardware_config.get("system_settings", {})
-        cuda_memory_fraction = system_settings.get("cuda_memory_fraction", 0.85)
-
-        if cuda_memory_fraction < 1.0:
-            os.environ["PYTORCH_CUDA_ALLOC_CONF"] = f"max_split_size_mb:128,expandable_segments:True"
-            log_info(f"Set CUDA memory allocation limit to expandable with max_split_size_mb:128")
-
-    try:
-        log_info("Loading dataset...")
-        dataset = load_dataset_with_mapping(dataset_config)
-        log_info(f"Dataset loaded with {len(dataset)} examples")
+        # Check dependencies first, before any other operations
+        if not check_dependencies():
+            logger.error("Aborting due to missing critical dependencies")
+            return 1
+
+        # Parse arguments
+        args = parse_args()
 
-        # Minimal validation before proceeding
-        if dataset is None or len(dataset) == 0:
-            logger.error("Dataset is empty or None! Cannot proceed with training.")
+        # Load environment variables
+        load_env_variables()
+
+        # Validate Hugging Face credentials if we're going to use them
+        validate_huggingface_credentials()
+
+        # Load configuration
+        try:
+            transformers_config = load_configs(args.config)
+            hardware_config = transformers_config.get("hardware", {})
+            dataset_config = transformers_config.get("dataset", {})
+            logger.info("Configuration loaded successfully")
+        except Exception as e:
+            logger.error(f"Error loading configuration: {e}")
             return 1
 
-        # Create data collator
-        data_collator = SimpleDataCollator(tokenizer, dataset_config)
-
-        # Verify precision settings - ensure only one of bf16/fp16 is set, with bf16 taking precedence
-        # First check hardware config, then transformers config
-        use_bf16 = False
-        use_fp16 = False
-
-        # Check hardware config first
-        hardware_precision = hardware_config.get("training_optimizations", {}).get("mixed_precision", "")
-        if hardware_precision.lower() == "bf16":
-            use_bf16 = True
-            log_info("Using BF16 precision from hardware config")
-        elif hardware_precision.lower() == "fp16":
-            use_fp16 = True
-            log_info("Using FP16 precision from hardware config")
-        else:
-            # Fall back to transformers config
-            use_bf16 = transformers_config.get("bf16", False) or transformers_config.get("torch_dtype", "") == "bfloat16"
-            use_fp16 = transformers_config.get("fp16", False) and not use_bf16  # Only use fp16 if bf16 is not set
-        log_info(f"Using precision: {'bf16' if use_bf16 else 'fp16' if use_fp16 else 'full precision'}")
-
-        # Get per device batch size - from transformers config, but possibly overridden by hardware config
-        per_device_batch_size = transformers_config.get("training", {}).get("per_device_train_batch_size", 16)
-        gradient_accumulation_steps = transformers_config.get("training", {}).get("gradient_accumulation_steps", 3)
-
-        # Get multi-GPU strategy from hardware config (default to data_parallel)
-        multi_gpu_strategy = hardware_config.get("training_optimizations", {}).get("multi_gpu_strategy", "data_parallel")
-        logger.info(f"Multi-GPU strategy: {multi_gpu_strategy}")
-
-        # For multi-GPU setup, adjust for better balance
-        if CUDA_AVAILABLE and NUM_GPUS > 1:
-            log_info(f"Multi-GPU setup: Adjusting for {NUM_GPUS} GPUs")
-
-        # Set up FSDP for multi-GPU training if specified and in distributed mode
-        fsdp_config = None
-        if multi_gpu_strategy == "fsdp" and is_distributed and NUM_GPUS > 1:
-            try:
-                from torch.distributed.fsdp import (
-                    FullyShardedDataParallel as FSDP,
-                    MixedPrecision,
-                    BackwardPrefetch,
-                    ShardingStrategy,
-                    CPUOffload,
-                )
-                from torch.distributed.fsdp.wrap import (
-                    transformer_auto_wrap_policy,
-                    enable_wrap,
-                    wrap,
-                )
-
-                log_info("Using FSDP for distributed training")
-
-                # Configure FSDP
-                fsdp_config = {
-                    "fsdp_transformer_layer_cls_to_wrap": ["LlamaDecoderLayer"],
-                    "fsdp_offload_params": False,
-                    "fsdp_backward_prefetch": "BACKWARD_PRE",
-                    "fsdp_min_num_params": 1e6,
-                    "fsdp_sharding_strategy": 1,  # FULL_SHARD
-                }
-
-                if use_bf16 or use_fp16:
-                    precision_type = "bf16" if use_bf16 else "fp16"
-                    fsdp_config["fsdp_state_dict_type"] = "FULL_STATE_DICT"
-                    log_info(f"FSDP using mixed precision: {precision_type}")
-            except ImportError:
-                log_info("FSDP imports failed, falling back to standard DDP")
-                fsdp_config = None
-        elif multi_gpu_strategy == "fsdp" and not is_distributed:
-            log_info("FSDP disabled: requires distributed environment (use torchrun or accelerate)")
-            log_info("Using DataParallel for multi-GPU training instead")
+        # Check if we're in distributed mode
+        is_distributed = "WORLD_SIZE" in os.environ and int(os.environ.get("WORLD_SIZE", "1")) > 1
+        if is_distributed:
+            local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+            log_info(f"Running in distributed mode with {os.environ.get('WORLD_SIZE')} processes, local_rank: {local_rank}")
         else:
-            log_info(f"Using {multi_gpu_strategy} for multi-GPU training")
-
-        # Get system settings from hardware config
-        dataloader_workers = hardware_config.get("system_settings", {}).get("dataloader_num_workers", 2)
-        pin_memory = hardware_config.get("system_settings", {}).get("dataloader_pin_memory", True)
-
-        # Set up training arguments
-        log_info("Setting up training arguments")
-        training_args = TrainingArguments(
-            output_dir=transformers_config.get("output_dir", "./results") or transformers_config.get("checkpointing", {}).get("output_dir", "./results"),
-            num_train_epochs=transformers_config.get("training", {}).get("num_train_epochs", 3),
-            per_device_train_batch_size=per_device_batch_size,
-            gradient_accumulation_steps=gradient_accumulation_steps,
-            learning_rate=transformers_config.get("training", {}).get("learning_rate", 2e-5),
-            weight_decay=transformers_config.get("training", {}).get("weight_decay", 0.01),
-            warmup_ratio=transformers_config.get("training", {}).get("warmup_ratio", 0.05),
-            lr_scheduler_type=transformers_config.get("training", {}).get("lr_scheduler_type", "cosine"),
-            logging_steps=transformers_config.get("training", {}).get("logging_steps", 10),
-            save_strategy=transformers_config.get("checkpointing", {}).get("save_strategy", "steps"),
-            save_steps=transformers_config.get("checkpointing", {}).get("save_steps", 100),
-            save_total_limit=transformers_config.get("checkpointing", {}).get("save_total_limit", 3),
-            fp16=use_fp16,
-            bf16=use_bf16,
-            max_grad_norm=transformers_config.get("training", {}).get("max_grad_norm", 1.0),
-            push_to_hub=transformers_config.get("huggingface_hub", {}).get("push_to_hub", False),
-            hub_model_id=transformers_config.get("huggingface_hub", {}).get("hub_model_id", None),
-            hub_token=os.environ.get("HF_TOKEN", None),
-            report_to="tensorboard",
-            remove_unused_columns=False,  # Keep all columns
-            gradient_checkpointing=transformers_config.get("training", {}).get("gradient_checkpointing", True),
-            dataloader_pin_memory=pin_memory,
-            optim=transformers_config.get("training", {}).get("optim", "adamw_torch"),
-            ddp_find_unused_parameters=False,  # Improve distributed training efficiency
-            dataloader_drop_last=False,  # Process all examples
-            dataloader_num_workers=dataloader_workers,
-            no_cuda=False if CUDA_AVAILABLE else True,  # Use CUDA if available
-            # Only add FSDP if we're in distributed mode with FSDP strategy
-            fsdp=fsdp_config if is_distributed and multi_gpu_strategy == "fsdp" else None,
-        )
+            log_info("Running in non-distributed mode (single process)")
 
-        # Create sequential sampler to maintain original dataset order
-        sequential_sampler = torch.utils.data.SequentialSampler(dataset)
-
-        # Initialize trainer first
-        log_info("Initializing Trainer")
-        trainer = Trainer(
-            model=model,
-            args=training_args,
-            train_dataset=dataset,  # We'll override this with our custom dataloader
-            data_collator=data_collator,
-            callbacks=[LoggingCallback()],
-        )
+        # Set random seed for reproducibility
+        seed = transformers_config.get("seed", 42)
+        set_seed(seed)
+        logger.info(f"Set random seed to {seed}")
+
+        # Load model and tokenizer using the consolidated config
+        model, tokenizer = load_model_and_tokenizer(transformers_config)
 
-        # Then override the get_train_dataloader method
-        def custom_get_train_dataloader():
-            """Custom dataloader that preserves original dataset order"""
-            log_info("Creating sequential dataloader to maintain original dataset order")
+        # Empty CUDA cache to ensure clean state
+        if CUDA_AVAILABLE:
+            torch.cuda.empty_cache()
+            log_info("Cleared CUDA cache")
+
+        # Setup environment variable for CUDA memory allocation
+        if CUDA_AVAILABLE:
+            system_settings = hardware_config.get("system_settings", {})
+            cuda_memory_fraction = system_settings.get("cuda_memory_fraction", 0.85)
 
-            # Create a simple sequential sampler
-            sequential_sampler = torch.utils.data.SequentialSampler(dataset)
+            if cuda_memory_fraction < 1.0:
+                os.environ["PYTORCH_CUDA_ALLOC_CONF"] = f"max_split_size_mb:128,expandable_segments:True"
+                log_info(f"Set CUDA memory allocation limit to expandable with max_split_size_mb:128")
+
+        try:
+            log_info("Loading dataset...")
+            dataset = load_dataset_with_mapping(dataset_config)
 
-            # Verification of sequence preservation flags - simplified
-            data_loading_config = dataset_config.get("data_loading", {})
-            shuffle_enabled = data_loading_config.get("shuffle", False)
+            # Extra validation to catch None/empty dataset issues
+            if dataset is None:
+                logger.error("Dataset is None! Cannot proceed with training.")
+                return 1
+
+            if not hasattr(dataset, '__len__') or len(dataset) == 0:
+                logger.error("Dataset is empty! Cannot proceed with training.")
+                return 1
+
+            log_info(f"Dataset loaded with {len(dataset)} examples")
 
-            if shuffle_enabled:
-                log_info("WARNING: Shuffle is enabled in configuration! This will be overridden to preserve order.")
-                # We enforce sequential processing regardless of config
+            # Minimal validation before proceeding
+            if dataset is None or len(dataset) == 0:
+                logger.error("Dataset is empty or None! Cannot proceed with training.")
+                return 1
 
-            # Log our approach clearly
-            log_info("Using SequentialSampler to guarantee dataset order is preserved based on prompt_number")
+            # Create data collator
+            data_collator = SimpleDataCollator(tokenizer, dataset_config)
 
-            # Verify column order
-            expected_order = ["prompt_number", "article_id", "conversations"]
-            if hasattr(dataset, 'column_names'):
-                actual_order = dataset.column_names
-                if actual_order == expected_order:
-                    log_info(f"Confirmed dataset columns are in expected order: {', '.join(expected_order)}")
-                else:
-                    log_info(f"Note: Dataset columns ({', '.join(actual_order)}) are not in expected order ({', '.join(expected_order)})")
-                    log_info("This is handled correctly by field-based access, but noting for clarity")
+            # Verify precision settings - ensure only one of bf16/fp16 is set, with bf16 taking precedence
+            # First check hardware config, then transformers config
+            use_bf16 = False
+            use_fp16 = False
 
-            log_info("Dataset is pre-processed with prompt_number field indicating the correct sequence")
+            # Check hardware config first
+            hardware_precision = hardware_config.get("training_optimizations", {}).get("mixed_precision", "")
+            if hardware_precision.lower() == "bf16":
+                use_bf16 = True
+                log_info("Using BF16 precision from hardware config")
+            elif hardware_precision.lower() == "fp16":
+                use_fp16 = True
+                log_info("Using FP16 precision from hardware config")
+            else:
+                # Fall back to transformers config
+                use_bf16 = transformers_config.get("bf16", False) or transformers_config.get("torch_dtype", "") == "bfloat16"
+                use_fp16 = transformers_config.get("fp16", False) and not use_bf16  # Only use fp16 if bf16 is not set
+            log_info(f"Using precision: {'bf16' if use_bf16 else 'fp16' if use_fp16 else 'full precision'}")
 
-            # Calculate batch size based on device availability
-            if getattr(training_args, "no_cuda", False):
-                batch_size = training_args.per_device_train_batch_size
+            # Get per device batch size - from transformers config, but possibly overridden by hardware config
+            per_device_batch_size = transformers_config.get("training", {}).get("per_device_train_batch_size", 16)
+            gradient_accumulation_steps = transformers_config.get("training", {}).get("gradient_accumulation_steps", 3)
+
+            # Get multi-GPU strategy from hardware config (default to data_parallel)
+            multi_gpu_strategy = hardware_config.get("training_optimizations", {}).get("multi_gpu_strategy", "data_parallel")
+            logger.info(f"Multi-GPU strategy: {multi_gpu_strategy}")
+
+            # For multi-GPU setup, adjust for better balance
+            if CUDA_AVAILABLE and NUM_GPUS > 1:
+                log_info(f"Multi-GPU setup: Adjusting for {NUM_GPUS} GPUs")
+
+            # Set up FSDP for multi-GPU training if specified and in distributed mode
+            fsdp_config = None
+            if multi_gpu_strategy == "fsdp" and is_distributed and NUM_GPUS > 1:
+                try:
+                    from torch.distributed.fsdp import (
+                        FullyShardedDataParallel as FSDP,
+                        MixedPrecision,
+                        BackwardPrefetch,
+                        ShardingStrategy,
+                        CPUOffload,
+                    )
+                    from torch.distributed.fsdp.wrap import (
+                        transformer_auto_wrap_policy,
+                        enable_wrap,
+                        wrap,
+                    )
+
+                    log_info("Using FSDP for distributed training")
+
+                    # Configure FSDP
+                    fsdp_config = {
+                        "fsdp_transformer_layer_cls_to_wrap": ["LlamaDecoderLayer"],
+                        "fsdp_offload_params": False,
+                        "fsdp_backward_prefetch": "BACKWARD_PRE",
+                        "fsdp_min_num_params": 1e6,
+                        "fsdp_sharding_strategy": 1,  # FULL_SHARD
+                    }
+
+                    if use_bf16 or use_fp16:
+                        precision_type = "bf16" if use_bf16 else "fp16"
+                        fsdp_config["fsdp_state_dict_type"] = "FULL_STATE_DICT"
+                        log_info(f"FSDP using mixed precision: {precision_type}")
+                except ImportError:
+                    log_info("FSDP imports failed, falling back to standard DDP")
+                    fsdp_config = None
+            elif multi_gpu_strategy == "fsdp" and not is_distributed:
+                log_info("FSDP disabled: requires distributed environment (use torchrun or accelerate)")
+                log_info("Using DataParallel for multi-GPU training instead")
            else:
-                batch_size = max(training_args.per_device_train_batch_size * max(1, NUM_GPUS), 1)
-
-            log_info(f"Using sequential sampler with batch size {batch_size}")
+                log_info(f"Using {multi_gpu_strategy} for multi-GPU training")
+
+            # Get system settings from hardware config
+            dataloader_workers = hardware_config.get("system_settings", {}).get("dataloader_num_workers", 2)
+            pin_memory = hardware_config.get("system_settings", {}).get("dataloader_pin_memory", True)
 
-            # Return DataLoader with sequential sampler
-            return torch.utils.data.DataLoader(
-                dataset,
-                batch_size=batch_size,
-                sampler=sequential_sampler,  # Always use sequential sampler
-                collate_fn=data_collator,
-                drop_last=training_args.dataloader_drop_last,
-                num_workers=training_args.dataloader_num_workers,
-                pin_memory=training_args.dataloader_pin_memory,
+            # Set up training arguments
+            log_info("Setting up training arguments")
+            training_args = TrainingArguments(
+                output_dir=transformers_config.get("output_dir", "./results") or transformers_config.get("checkpointing", {}).get("output_dir", "./results"),
+                num_train_epochs=transformers_config.get("training", {}).get("num_train_epochs", 3),
+                per_device_train_batch_size=per_device_batch_size,
+                gradient_accumulation_steps=gradient_accumulation_steps,
+                learning_rate=transformers_config.get("training", {}).get("learning_rate", 2e-5),
+                weight_decay=transformers_config.get("training", {}).get("weight_decay", 0.01),
+                warmup_ratio=transformers_config.get("training", {}).get("warmup_ratio", 0.05),
+                lr_scheduler_type=transformers_config.get("training", {}).get("lr_scheduler_type", "cosine"),
+                logging_steps=transformers_config.get("training", {}).get("logging_steps", 10),
+                save_strategy=transformers_config.get("checkpointing", {}).get("save_strategy", "steps"),
+                save_steps=transformers_config.get("checkpointing", {}).get("save_steps", 100),
+                save_total_limit=transformers_config.get("checkpointing", {}).get("save_total_limit", 3),
+                fp16=use_fp16,
+                bf16=use_bf16,
+                max_grad_norm=transformers_config.get("training", {}).get("max_grad_norm", 1.0),
+                push_to_hub=transformers_config.get("huggingface_hub", {}).get("push_to_hub", False),
+                hub_model_id=transformers_config.get("huggingface_hub", {}).get("hub_model_id", None),
+                hub_token=os.environ.get("HF_TOKEN", None),
+                report_to="tensorboard",
+                remove_unused_columns=False,  # Keep all columns
+                gradient_checkpointing=transformers_config.get("training", {}).get("gradient_checkpointing", True),
+                dataloader_pin_memory=pin_memory,
+                optim=transformers_config.get("training", {}).get("optim", "adamw_torch"),
+                ddp_find_unused_parameters=False,  # Improve distributed training efficiency
+                dataloader_drop_last=False,  # Process all examples
+                dataloader_num_workers=dataloader_workers,
+                no_cuda=False if CUDA_AVAILABLE else True,  # Use CUDA if available
+                # Only add FSDP if we're in distributed mode with FSDP strategy
+                fsdp=fsdp_config if is_distributed and multi_gpu_strategy == "fsdp" else None,
            )
 
-
-        # Override the get_train_dataloader method
-        trainer.get_train_dataloader = custom_get_train_dataloader
-
-        # Start training
-        log_info("=== Starting Training ===")
-        try:
-            # Empty cache again right before training
-            if CUDA_AVAILABLE:
-                torch.cuda.empty_cache()
-                log_info("Cleared CUDA cache before training")
 
-            # Display compact training info
-            total_steps = int(len(dataset) / (per_device_batch_size * NUM_GPUS * gradient_accumulation_steps) * training_args.num_train_epochs)
-            log_info(f"Training plan: {len(dataset)} examples over {training_args.num_train_epochs} epochs ≈ {total_steps} steps")
+            # Create sequential sampler to maintain original dataset order
+            sequential_sampler = torch.utils.data.SequentialSampler(dataset)
 
-            trainer.train()
-            log_info("Training completed successfully!")
+            # Initialize trainer first
+            log_info("Initializing Trainer")
+            trainer = Trainer(
+                model=model,
+                args=training_args,
+                train_dataset=dataset,  # We'll override this with our custom dataloader
+                data_collator=data_collator,
+                callbacks=[LoggingCallback()],
+            )
 
-            # Save the final model
-            log_info("Saving final model...")
-            trainer.save_model()
-            log_info(f"Model saved to {training_args.output_dir}")
+            # Then override the get_train_dataloader method
+            def custom_get_train_dataloader():
+                """Custom dataloader that preserves original dataset order"""
+                log_info("Creating sequential dataloader to maintain original dataset order")
+
+                # Create a simple sequential sampler
+                sequential_sampler = torch.utils.data.SequentialSampler(dataset)
+
+                # Verification of sequence preservation flags - simplified
+                data_loading_config = dataset_config.get("data_loading", {})
+                shuffle_enabled = data_loading_config.get("shuffle", False)
+
+                if shuffle_enabled:
+                    log_info("WARNING: Shuffle is enabled in configuration! This will be overridden to preserve order.")
+                    # We enforce sequential processing regardless of config
+
+                # Log our approach clearly
+                log_info("Using SequentialSampler to guarantee dataset order is preserved based on prompt_number")
+
+                # Verify column order
+                expected_order = ["prompt_number", "article_id", "conversations"]
+                if hasattr(dataset, 'column_names'):
+                    actual_order = dataset.column_names
+                    if actual_order == expected_order:
+                        log_info(f"Confirmed dataset columns are in expected order: {', '.join(expected_order)}")
+                    else:
+                        log_info(f"Note: Dataset columns ({', '.join(actual_order)}) are not in expected order ({', '.join(expected_order)})")
+                        log_info("This is handled correctly by field-based access, but noting for clarity")
+
+                log_info("Dataset is pre-processed with prompt_number field indicating the correct sequence")
+
+                # Calculate batch size based on device availability
+                if getattr(training_args, "no_cuda", False):
+                    batch_size = training_args.per_device_train_batch_size
+                else:
+                    batch_size = max(training_args.per_device_train_batch_size * max(1, NUM_GPUS), 1)
+
+                log_info(f"Using sequential sampler with batch size {batch_size}")
+
+                # Return DataLoader with sequential sampler
+                return torch.utils.data.DataLoader(
+                    dataset,
+                    batch_size=batch_size,
+                    sampler=sequential_sampler,  # Always use sequential sampler
+                    collate_fn=data_collator,
+                    drop_last=training_args.dataloader_drop_last,
+                    num_workers=training_args.dataloader_num_workers,
+                    pin_memory=training_args.dataloader_pin_memory,
+                )
 
-            # Push to hub if enabled
-            if transformers_config.get("huggingface_hub", {}).get("push_to_hub", False):
-                hub_id = transformers_config.get("huggingface_hub", {}).get("hub_model_id", "model")
-                log_info(f"Pushing model to Hugging Face Hub as {hub_id}...")
-                trainer.push_to_hub()
-                log_info("Model successfully pushed to Hub")
+            # Override the get_train_dataloader method
+            trainer.get_train_dataloader = custom_get_train_dataloader
 
-            # Update the Hugging Face Space with current code
-            if os.environ.get("HF_TOKEN") and os.environ.get("HF_USERNAME") and os.environ.get("HF_SPACE_NAME"):
-                update_huggingface_space()
+            # Start training
+            log_info("=== Starting Training ===")
+            try:
+                # Empty cache again right before training
+                if CUDA_AVAILABLE:
+                    torch.cuda.empty_cache()
+                    log_info("Cleared CUDA cache before training")
+
+                # Display compact training info
+                total_steps = int(len(dataset) / (per_device_batch_size * NUM_GPUS * gradient_accumulation_steps) * training_args.num_train_epochs)
+                log_info(f"Training plan: {len(dataset)} examples over {training_args.num_train_epochs} epochs ≈ {total_steps} steps")
+
+                trainer.train()
+                log_info("Training completed successfully!")
 
-            return 0
+                # Save the final model
+                log_info("Saving final model...")
+                trainer.save_model()
+                log_info(f"Model saved to {training_args.output_dir}")
+
+                # Push to hub if enabled
+                if transformers_config.get("huggingface_hub", {}).get("push_to_hub", False):
+                    hub_id = transformers_config.get("huggingface_hub", {}).get("hub_model_id", "model")
+                    log_info(f"Pushing model to Hugging Face Hub as {hub_id}...")
+                    trainer.push_to_hub()
+                    log_info("Model successfully pushed to Hub")
+
+                # Update the Hugging Face Space with current code
+                if os.environ.get("HF_TOKEN") and os.environ.get("HF_USERNAME") and os.environ.get("HF_SPACE_NAME"):
+                    update_huggingface_space()
+
+                return 0
+            except Exception as e:
+                logger.error(f"Training failed with error: {str(e)}")
+                # Log CUDA memory info if available in compact format
+                if CUDA_AVAILABLE:
+                    memory_info = []
+                    for i in range(NUM_GPUS):
+                        allocated = torch.cuda.memory_allocated(i) / 1024**2
+                        reserved = torch.cuda.memory_reserved(i) / 1024**2
+                        max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
+                        memory_info.append(f"GPU {i}: {allocated:.1f}MB/{reserved:.1f}MB (max: {max_mem:.1f}MB)")
+                    logger.error(f"GPU memory at failure: {', '.join(memory_info)}")
+                raise
+
        except Exception as e:
-            logger.error(f"Training failed with error: {str(e)}")
-            # Log CUDA memory info if available in compact format
-            if CUDA_AVAILABLE:
-                memory_info = []
-                for i in range(NUM_GPUS):
-                    allocated = torch.cuda.memory_allocated(i) / 1024**2
-                    reserved = torch.cuda.memory_reserved(i) / 1024**2
-                    max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
-                    memory_info.append(f"GPU {i}: {allocated:.1f}MB/{reserved:.1f}MB (max: {max_mem:.1f}MB)")
-                logger.error(f"GPU memory at failure: {', '.join(memory_info)}")
-            raise
+            logger.error(f"Error in main training loop: {str(e)}")
+            return 1
 
    except Exception as e:
-        logger.error(f"Error in main training loop: {str(e)}")
+        logger.error(f"Error in main function: {str(e)}")
        return 1
 
 if __name__ == "__main__":
 
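The bulk of the change above wraps main() in a single try block and pins dataset order by replacing Trainer's dataloader factory. In isolation, that override pattern looks like the sketch below: a minimal version assuming a map-style dataset, with the script's logging, collator, and config plumbing omitted:

import torch
from torch.utils.data import DataLoader, SequentialSampler

def make_sequential_loader(dataset, batch_size, collate_fn=None):
    # Same idea as the script's custom_get_train_dataloader: SequentialSampler
    # yields indices 0..len(dataset)-1 in order, so stored order (prompt_number
    # order in this repo) is preserved regardless of any shuffle setting.
    return DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=SequentialSampler(dataset),
        collate_fn=collate_fn,
        drop_last=False,
    )

# Usage with a Trainer instance (monkey-patched, as the script does):
# trainer.get_train_dataloader = lambda: make_sequential_loader(dataset, 16)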
transformers_config.json CHANGED
@@ -46,7 +46,7 @@
     "enabled": true,
     "r": 32,
     "alpha": 16,
-    "dropout": 0.05,
+    "dropout": 0,
     "target_modules": [
       "q_proj",
       "k_proj",