George-API committed
Commit 5f730a4 · verified · 1 Parent(s): ae4e1de

Upload folder using huggingface_hub

Files changed (1):
  1. run_transformers_training.py  +32 -3
run_transformers_training.py CHANGED
@@ -980,6 +980,28 @@ def main():
 
     # Set up training arguments
     log_info("Setting up training arguments")
+
+    # Validate FSDP config before using it
+    fsdp_args = None
+    if fsdp_config is not None and is_distributed and multi_gpu_strategy == "fsdp":
+        try:
+            # Convert FSDP config to proper format expected by TrainingArguments
+            fsdp_args = {
+                "fsdp_transformer_layer_cls_to_wrap": fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", []),
+                "fsdp_offload_params": fsdp_config.get("fsdp_offload_params", False),
+                "fsdp_backward_prefetch": fsdp_config.get("fsdp_backward_prefetch", "BACKWARD_PRE"),
+                "fsdp_min_num_params": fsdp_config.get("fsdp_min_num_params", 1e6),
+                "fsdp_sharding_strategy": fsdp_config.get("fsdp_sharding_strategy", 1),
+            }
+            log_info("FSDP config validated and prepared")
+        except Exception as e:
+            log_info(f"Error preparing FSDP config: {str(e)}, disabling FSDP")
+            fsdp_args = None
+
+    # Check if we're running in a Space
+    is_space = bool(os.environ.get("SPACE_ID"))
+
+    # Create training arguments with validated FSDP config
     training_args = TrainingArguments(
         output_dir=transformers_config.get("output_dir", "./results") or transformers_config.get("checkpointing", {}).get("output_dir", "./results"),
         num_train_epochs=transformers_config.get("training", {}).get("num_train_epochs", 3),
@@ -998,7 +1020,8 @@ def main():
         max_grad_norm=transformers_config.get("training", {}).get("max_grad_norm", 1.0),
         push_to_hub=transformers_config.get("huggingface_hub", {}).get("push_to_hub", False),
         hub_model_id=transformers_config.get("huggingface_hub", {}).get("hub_model_id", None),
-        hub_token=os.environ.get("HF_TOKEN", None),
+        # Don't set hub_token when running in a Space - it will use Space secrets automatically
+        hub_token=None if is_space else os.environ.get("HF_TOKEN", None),
         report_to="tensorboard",
         remove_unused_columns=False, # Keep all columns
         gradient_checkpointing=transformers_config.get("training", {}).get("gradient_checkpointing", True),
@@ -1008,12 +1031,18 @@ def main():
         dataloader_drop_last=False, # Process all examples
         dataloader_num_workers=dataloader_workers,
         no_cuda=False if CUDA_AVAILABLE else True, # Use CUDA if available
-        # Only add FSDP if we're in distributed mode with FSDP strategy
-        fsdp=fsdp_config if is_distributed and multi_gpu_strategy == "fsdp" else None,
+        fsdp=fsdp_args, # Use validated FSDP config
     )
+
+    log_info("Training arguments created successfully")
+
+    # Validate dataset before creating sampler
+    if dataset is None:
+        raise ValueError("Dataset is None - cannot create sampler")
 
     # Create sequential sampler to maintain original dataset order
     sequential_sampler = torch.utils.data.SequentialSampler(dataset)
+    log_info("Sequential sampler created")
 
     # Initialize trainer first
     log_info("Initializing Trainer")
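The FSDP handling added above can be read in isolation as a small helper. The sketch below is not part of the commit; `prepare_fsdp_args` and the example config values are hypothetical names introduced for illustration. It only restates the inline logic from the diff: build the kwargs dict when distributed FSDP training is requested, otherwise fall back to `None`.

```python
# Hypothetical standalone re-statement of the FSDP preparation added in this commit.
# prepare_fsdp_args() and example_fsdp_config are illustration-only names; the commit
# itself performs this logic inline in main().

def prepare_fsdp_args(fsdp_config, is_distributed, multi_gpu_strategy):
    """Build the fsdp kwargs dict only when distributed FSDP training is requested."""
    if fsdp_config is None or not is_distributed or multi_gpu_strategy != "fsdp":
        return None
    try:
        return {
            "fsdp_transformer_layer_cls_to_wrap": fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", []),
            "fsdp_offload_params": fsdp_config.get("fsdp_offload_params", False),
            "fsdp_backward_prefetch": fsdp_config.get("fsdp_backward_prefetch", "BACKWARD_PRE"),
            "fsdp_min_num_params": fsdp_config.get("fsdp_min_num_params", 1e6),
            "fsdp_sharding_strategy": fsdp_config.get("fsdp_sharding_strategy", 1),
        }
    except Exception:
        # Mirror the commit's behaviour: disable FSDP instead of failing training setup.
        return None


if __name__ == "__main__":
    example_fsdp_config = {  # hypothetical values, for illustration only
        "fsdp_offload_params": True,
        "fsdp_min_num_params": 1e7,
    }
    print(prepare_fsdp_args(example_fsdp_config, is_distributed=True, multi_gpu_strategy="fsdp"))
```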
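The hub-token change reduces to one rule: inside a Hugging Face Space (signalled by the `SPACE_ID` environment variable) leave `hub_token` as `None` so the Space's own secrets are used, otherwise read `HF_TOKEN` from the environment. A minimal sketch, with the hypothetical helper name `resolve_hub_token`:

```python
import os

def resolve_hub_token():
    """Return None inside a Space (its secrets are used automatically), else HF_TOKEN."""
    is_space = bool(os.environ.get("SPACE_ID"))  # Spaces set SPACE_ID in the environment
    return None if is_space else os.environ.get("HF_TOKEN", None)

# Example: pass the result straight to TrainingArguments(hub_token=...)
hub_token = resolve_hub_token()
```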