Upload folder using huggingface_hub
run_transformers_training.py (CHANGED, +32 -3)
@@ -980,6 +980,28 @@ def main():
 
     # Set up training arguments
     log_info("Setting up training arguments")
+
+    # Validate FSDP config before using it
+    fsdp_args = None
+    if fsdp_config is not None and is_distributed and multi_gpu_strategy == "fsdp":
+        try:
+            # Convert FSDP config to proper format expected by TrainingArguments
+            fsdp_args = {
+                "fsdp_transformer_layer_cls_to_wrap": fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", []),
+                "fsdp_offload_params": fsdp_config.get("fsdp_offload_params", False),
+                "fsdp_backward_prefetch": fsdp_config.get("fsdp_backward_prefetch", "BACKWARD_PRE"),
+                "fsdp_min_num_params": fsdp_config.get("fsdp_min_num_params", 1e6),
+                "fsdp_sharding_strategy": fsdp_config.get("fsdp_sharding_strategy", 1),
+            }
+            log_info("FSDP config validated and prepared")
+        except Exception as e:
+            log_info(f"Error preparing FSDP config: {str(e)}, disabling FSDP")
+            fsdp_args = None
+
+    # Check if we're running in a Space
+    is_space = bool(os.environ.get("SPACE_ID"))
+
+    # Create training arguments with validated FSDP config
     training_args = TrainingArguments(
         output_dir=transformers_config.get("output_dir", "./results") or transformers_config.get("checkpointing", {}).get("output_dir", "./results"),
         num_train_epochs=transformers_config.get("training", {}).get("num_train_epochs", 3),
@@ -998,7 +1020,8 @@ def main():
         max_grad_norm=transformers_config.get("training", {}).get("max_grad_norm", 1.0),
         push_to_hub=transformers_config.get("huggingface_hub", {}).get("push_to_hub", False),
         hub_model_id=transformers_config.get("huggingface_hub", {}).get("hub_model_id", None),
-        hub_token
+        # Don't set hub_token when running in a Space - it will use Space secrets automatically
+        hub_token=None if is_space else os.environ.get("HF_TOKEN", None),
         report_to="tensorboard",
         remove_unused_columns=False, # Keep all columns
         gradient_checkpointing=transformers_config.get("training", {}).get("gradient_checkpointing", True),
@@ -1008,12 +1031,18 @@ def main():
         dataloader_drop_last=False, # Process all examples
         dataloader_num_workers=dataloader_workers,
         no_cuda=False if CUDA_AVAILABLE else True, # Use CUDA if available
-        #
-        fsdp=fsdp_config if is_distributed and multi_gpu_strategy == "fsdp" else None,
+        fsdp=fsdp_args, # Use validated FSDP config
     )
+
+    log_info("Training arguments created successfully")
+
+    # Validate dataset before creating sampler
+    if dataset is None:
+        raise ValueError("Dataset is None - cannot create sampler")
 
     # Create sequential sampler to maintain original dataset order
     sequential_sampler = torch.utils.data.SequentialSampler(dataset)
+    log_info("Sequential sampler created")
 
     # Initialize trainer first
     log_info("Initializing Trainer")
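The added block relies on several names that are defined earlier in main() and are not visible in these hunks: log_info, transformers_config, fsdp_config, is_distributed, multi_gpu_strategy, CUDA_AVAILABLE and dataloader_workers. The sketch below is purely illustrative of the shapes those objects would need for the new code to run; the keys mirror the .get() calls in the diff, while the concrete values and the way the flags are derived here are assumptions, not taken from the script.

# Hypothetical setup sketch: none of these definitions come from the commit itself.
import logging
import os

import torch

logging.basicConfig(level=logging.INFO)

def log_info(message: str) -> None:
    # Assumed helper; the diff only shows it being called.
    logging.getLogger(__name__).info(message)

# Minimal config shape accepted by the .get() chains in the diff (placeholder values).
transformers_config = {
    "output_dir": "./results",
    "training": {
        "num_train_epochs": 3,
        "max_grad_norm": 1.0,
        "gradient_checkpointing": True,
    },
    "huggingface_hub": {
        "push_to_hub": False,
        "hub_model_id": None,
    },
}

# Keys mirror the fsdp_config.get() calls in the validation block.
fsdp_config = {
    "fsdp_transformer_layer_cls_to_wrap": ["LlamaDecoderLayer"],  # placeholder layer class
    "fsdp_offload_params": False,
    "fsdp_backward_prefetch": "BACKWARD_PRE",
    "fsdp_min_num_params": 1e6,
    "fsdp_sharding_strategy": 1,
}

# Flags the diff branches on; how the real script derives them is an assumption here.
CUDA_AVAILABLE = torch.cuda.is_available()
is_distributed = int(os.environ.get("WORLD_SIZE", "1")) > 1
multi_gpu_strategy = "fsdp"
dataloader_workers = 2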
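The sequential sampler is created immediately before the Trainer is initialized, but Trainer does not accept a sampler argument directly; one common way to apply it is to override get_train_dataloader in a subclass. The sketch below is an assumption about how such a sampler could be wired in, not code from this commit; the SequentialTrainer name and the commented-out usage are made up for illustration, and model, training_args, dataset and data_collator are assumed to exist as in the surrounding script.

# Sketch only: preserves original dataset order instead of the default random sampling.
import torch
from torch.utils.data import DataLoader, SequentialSampler
from transformers import Trainer

class SequentialTrainer(Trainer):
    def get_train_dataloader(self) -> DataLoader:
        # Build the training dataloader with a sequential sampler.
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.per_device_train_batch_size,
            sampler=SequentialSampler(self.train_dataset),
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
        )

# Example usage (names assumed from the surrounding script):
# trainer = SequentialTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=dataset,
#     data_collator=data_collator,
# )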