Upload folder using huggingface_hub

run_transformers_training.py  CHANGED  (+83 -95)
@@ -8,6 +8,16 @@ import argparse
 import logging
 from datetime import datetime
 
+# Import Unsloth first, before other ML imports
+try:
+    from unsloth import FastLanguageModel
+    from unsloth.chat_templates import get_chat_template
+    unsloth_available = True
+except ImportError:
+    unsloth_available = False
+    logger = logging.getLogger(__name__)
+    logger.warning("Unsloth not available. Please install with: pip install unsloth")
+
 import torch
 from datasets import load_dataset
 from transformers import (
@@ -44,15 +54,6 @@ except ImportError:
     peft_available = False
     logger.warning("PEFT not available. Parameter-efficient fine-tuning will not be used.")
 
-# Import Unsloth
-try:
-    from unsloth import FastLanguageModel
-    from unsloth.chat_templates import get_chat_template
-    unsloth_available = True
-except ImportError:
-    unsloth_available = False
-    logger.warning("Unsloth not available. Please install with: pip install unsloth")
-
 def load_env_variables():
     """Load environment variables from system, .env file, or Hugging Face Space variables."""
     # Check if we're running in a Hugging Face Space
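Context for the two hunks above: Unsloth patches transformers and PEFT internals at import time, so it generally has to be imported before torch/transformers for its optimizations to take effect; this commit therefore moves the guarded import block to the top of the file. A minimal sketch of the resulting pattern (the require_unsloth helper is illustrative, not from this file):

try:
    from unsloth import FastLanguageModel  # patches transformers/peft on import
    unsloth_available = True
except ImportError:
    unsloth_available = False

import torch  # deliberately imported after Unsloth
from transformers import AutoTokenizer

def require_unsloth():
    # Mirrors the guard used later in load_model_and_tokenizer().
    if not unsloth_available:
        raise ImportError("Unsloth is required for this training setup")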
@@ -131,6 +132,13 @@ def load_model_and_tokenizer(config):
         logger.error("Unsloth is required for training with pre-quantized model")
         logger.error("Please ensure unsloth is in requirements.txt")
         raise ImportError("Unsloth is required for this training setup")
+
+    # Get model name correctly from nested config structure
+    model_name = config.get("model", {}).get("name") or config.get("model_name_or_path") or config.get("model_name")
+    logger.info(f"Loading model: {model_name}")
+
+    if not model_name:
+        raise ValueError("Model name not found in configuration. Please check your transformers_config.json file.")
 
     logger.info("Using Unsloth optimizations with pre-quantized model")
     # Check for flash attention without importing it directly
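The lookup chain added above accepts three config shapes. A small illustration (the model id is a placeholder, not from this repo):

# Any of these shapes resolves to a model name via the chain above.
nested = {"model": {"name": "org/some-model"}}      # preferred nested form
flat_a = {"model_name_or_path": "org/some-model"}   # HF-style flat key
flat_b = {"model_name": "org/some-model"}           # legacy flat key

for cfg in (nested, flat_a, flat_b):
    name = cfg.get("model", {}).get("name") or cfg.get("model_name_or_path") or cfg.get("model_name")
    assert name == "org/some-model"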
@@ -143,32 +151,37 @@ def load_model_and_tokenizer(config):
         logger.warning("Flash attention not available, falling back to standard attention")
 
     model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name=
-        max_seq_length=config.get("max_seq_length", 2048),
+        model_name=model_name,
+        max_seq_length=config.get("max_seq_length", 2048) or config.get("tokenizer", {}).get("max_seq_length", 2048),
         dtype=None,  # Let Unsloth choose optimal dtype
         device_map="auto",
         # Don't explicitly use flash attention config here, let Unsloth handle it
     )
 
     # Apply Unsloth's training optimizations with config parameters
+    unsloth_config = config.get("unsloth", {})
     model = FastLanguageModel.get_peft_model(
         model,
-        r=
-        target_modules=
+        r=unsloth_config.get("r", 32),
+        target_modules=unsloth_config.get("target_modules",
         ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]),
-        lora_alpha=
-        lora_dropout=
+        lora_alpha=unsloth_config.get("alpha", 16),
+        lora_dropout=unsloth_config.get("dropout", 0.05),
         bias="none",
-        use_gradient_checkpointing=config.get("gradient_checkpointing", True),
+        use_gradient_checkpointing=config.get("gradient_checkpointing", True) or config.get("training", {}).get("gradient_checkpointing", True),
         random_state=config.get("seed", 42),
     )
     logger.info("Unsloth optimizations applied successfully")
 
     # Set up tokenizer settings
-
-
-
-
+    chat_template = config.get("chat_template") or config.get("tokenizer", {}).get("chat_template")
+    if chat_template:
+        try:
+            template = get_chat_template("phi")
+            tokenizer.chat_template = template
+            logger.info("Set phi chat template")
+        except Exception as e:
+            logger.warning(f"Failed to set chat template: {str(e)}")
 
     # Ensure proper token settings
     if tokenizer.pad_token_id is None:
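Taken together, the hunks above read these keys from transformers_config.json. An illustrative config that satisfies all of them (key names come from the lookups in the code; the values are placeholders):

import json

config = {
    "model": {"name": "org/some-model"},     # placeholder model id
    "max_seq_length": 2048,
    "tokenizer": {"chat_template": "phi"},   # truthy value triggers the template setup
    "unsloth": {
        "r": 32,
        "alpha": 16,
        "dropout": 0.05,
        "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj",
                           "gate_proj", "up_proj", "down_proj"],
    },
    "gradient_checkpointing": True,
    "seed": 42,
}

with open("transformers_config.json", "w") as f:
    json.dump(config, f, indent=2)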
@@ -418,24 +431,58 @@ def main():
     # Load all configurations
     try:
         configs = load_configs(args.config_dir)
+
+        # Extract specific configs
+        if not configs:
+            logger.error("Failed to load configurations")
+            return 1
+
+        # Verify configurations exist
+        if "transformers" not in configs:
+            logger.error("transformers_config.json not found or invalid")
+            return 1
+
+        if "hardware" not in configs:
+            logger.warning("hardware_config.json not found. Using default hardware configuration.")
+
+        if "dataset" not in configs:
+            logger.error("dataset_config.json not found or invalid")
+            return 1
+
+        # Validate model configuration
+        model_config = configs["transformers"]
+        if not model_config.get("model", {}).get("name") and not model_config.get("model_name_or_path") and not model_config.get("model_name"):
+            logger.error("Model name not specified in configuration")
+            logger.error("Please ensure 'name' is specified under 'model' in transformers_config.json")
+            return 1
+
+        logger.info(f"Model name: {model_config.get('model', {}).get('name') or model_config.get('model_name_or_path') or model_config.get('model_name')}")
         logger.info("All configurations loaded successfully")
 
         # Extract specific configs
         model_config = configs["transformers"]
-        hardware_config = configs
+        hardware_config = configs.get("hardware", {})
        dataset_config = configs["dataset"]
 
-        # Apply hardware-specific settings
-
-
-
-
-
-
-
-
-
-
+        # Apply hardware-specific settings if available
+        if hardware_config:
+            training_opts = hardware_config.get("training_optimizations", {})
+            per_device_batch_size = training_opts.get("per_device_batch_size")
+            gradient_accumulation = training_opts.get("gradient_accumulation_steps")
+
+            if per_device_batch_size and model_config.get("training"):
+                model_config["training"]["per_device_train_batch_size"] = per_device_batch_size
+                logger.info(f"Applied hardware-specific batch size: {per_device_batch_size}")
+
+            if gradient_accumulation and model_config.get("training"):
+                model_config["training"]["gradient_accumulation_steps"] = gradient_accumulation
+                logger.info(f"Applied hardware-specific gradient accumulation: {gradient_accumulation}")
+
+            # Apply memory optimizations
+            memory_opts = training_opts.get("memory_optimizations", {})
+            if memory_opts.get("use_gradient_checkpointing") is not None and model_config.get("training"):
+                model_config["training"]["gradient_checkpointing"] = memory_opts["use_gradient_checkpointing"]
+
     except Exception as e:
         logger.error(f"Error loading configurations: {e}")
         return 1
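For the hardware overrides above, a hardware_config.json would need this nesting (training_optimizations, with memory_optimizations inside it); the key names come from the lookups in the hunk, the values are placeholders:

import json

hardware = {
    "training_optimizations": {
        "per_device_batch_size": 4,
        "gradient_accumulation_steps": 8,
        "memory_optimizations": {
            "use_gradient_checkpointing": True,
        },
    },
}

with open("hardware_config.json", "w") as f:
    json.dump(hardware, f, indent=2)

Note that the overrides only apply when the transformers config has a "training" section, since each branch checks model_config.get("training") first.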
@@ -445,75 +492,16 @@ def main():
     set_seed(seed)
     logger.info(f"Set random seed to {seed}")
 
-    # Check if we're running in a Hugging Face Space
-    if os.environ.get("SPACE_ID") and not os.environ.get("HF_USERNAME"):
-        # Extract username from SPACE_ID
-        username = os.environ.get("SPACE_ID").split("/")[0]
-        logger.info(f"Extracted username from SPACE_ID: {username}")
-
-        # Set hub_model_id if not already set and push_to_hub is enabled
-        if model_config.get("push_to_hub", False) and not model_config.get("hub_model_id"):
-            model_name = model_config.get("model_name", "").split("/")[-1]
-            model_config["hub_model_id"] = f"{username}/finetuned-{model_name}"
-            logger.info(f"Set hub_model_id to {model_config['hub_model_id']}")
-
-    # Load model and tokenizer
-    logger.info(f"Loading model: {model_config.get('model_name')}")
-
     try:
         model, tokenizer = load_model_and_tokenizer(model_config)
         logger.info("Model and tokenizer loaded successfully")
 
-        #
-        if model_config.get("use_peft", False) and peft_available:
-            logger.info("Preparing model for parameter-efficient fine-tuning")
-            try:
-                model = prepare_model_for_kbit_training(model)
-
-                # Get target modules
-                target_modules = model_config.get("target_modules", ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"])
-
-                # Create LoRA config
-                lora_config = LoraConfig(
-                    r=model_config.get("lora_r", 16),
-                    lora_alpha=model_config.get("lora_alpha", 32),
-                    lora_dropout=model_config.get("lora_dropout", 0.05),
-                    bias="none",
-                    task_type="CAUSAL_LM",
-                    target_modules=target_modules
-                )
-
-                # Apply LoRA to model
-                model = get_peft_model(model, lora_config)
-                logger.info(f"Applied LoRA with r={model_config.get('lora_r', 16)}, alpha={model_config.get('lora_alpha', 32)}")
-            except Exception as e:
-                logger.error(f"Error setting up PEFT: {e}")
-                return 1
-
-        # Load dataset
-        logger.info(f"Loading dataset: {dataset_config.get('dataset_name')}")
+        # Load dataset with proper mapping
         try:
-            dataset =
-            logger.info(
-
-            # Sort dataset by ID to ensure chunks from the same paper are processed together
-            logger.info("Sorting dataset by ID to maintain paper chunk order")
-            def sort_by_id(example):
-                # Extract ID as integer if possible, otherwise keep as string
-                try:
-                    return int(example['id'])
-                except (ValueError, TypeError):
-                    return example['id']
-
-            # Apply sorting to the dataset
-            dataset['train'] = dataset['train'].sort('id')
-            logger.info("Dataset sorted by ID")
-
-            # Log the first few IDs to verify sorting
-            sample_ids = [example['id'] for example in dataset['train'].select(range(min(5, len(dataset['train']))))]
-            logger.info(f"First few IDs after sorting: {sample_ids}")
+            dataset = load_dataset_with_mapping(dataset_config)
+            logger.info("Dataset loaded and prepared successfully")
         except Exception as e:
-            logger.error(f"Error loading
+            logger.error(f"Error loading dataset: {e}")
             return 1
 
     # Create data collator
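load_dataset_with_mapping is defined elsewhere in this file and is not shown in this diff. A minimal sketch of what it plausibly does, reconstructed from the inline code this hunk deletes (the sort-by-id behaviour comes from the removed lines; everything else is an assumption):

from datasets import load_dataset

def load_dataset_with_mapping(dataset_config):
    """Sketch only: load the configured dataset and keep paper chunks in order."""
    dataset = load_dataset(dataset_config.get("dataset_name"))

    # The deleted inline code sorted the train split by 'id' so that chunks
    # from the same paper are processed together.
    dataset["train"] = dataset["train"].sort("id")

    # It also logged a few leading IDs to verify the ordering.
    head = dataset["train"].select(range(min(5, len(dataset["train"]))))
    print(f"First few IDs after sorting: {[ex['id'] for ex in head]}")
    return dataset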