George-API committed
Commit b3a8a7a · verified · 1 Parent(s): 578eea8

Upload folder using huggingface_hub

Files changed (1):
  1. run_transformers_training.py +83 -95
run_transformers_training.py CHANGED
@@ -8,6 +8,16 @@ import argparse
 import logging
 from datetime import datetime
 
+# Import Unsloth first, before other ML imports
+try:
+    from unsloth import FastLanguageModel
+    from unsloth.chat_templates import get_chat_template
+    unsloth_available = True
+except ImportError:
+    unsloth_available = False
+    logger = logging.getLogger(__name__)
+    logger.warning("Unsloth not available. Please install with: pip install unsloth")
+
 import torch
 from datasets import load_dataset
 from transformers import (
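Note on the hunk above: moving the Unsloth import ahead of torch/transformers follows Unsloth's guidance that it be imported first so its optimizations can patch those libraries. The snippet below is a minimal, self-contained sketch of the guarded-import pattern the new block uses; the logging setup is illustrative only, since the real file configures its logger elsewhere.

    # Sketch of the optional-import pattern used above (illustrative only).
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    try:
        # Imported before torch/transformers so Unsloth's patches apply (per Unsloth's guidance).
        from unsloth import FastLanguageModel  # noqa: F401
        unsloth_available = True
    except ImportError:
        unsloth_available = False
        logger.warning("Unsloth not available. Please install with: pip install unsloth")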
@@ -44,15 +54,6 @@ except ImportError:
     peft_available = False
     logger.warning("PEFT not available. Parameter-efficient fine-tuning will not be used.")
 
-# Import Unsloth
-try:
-    from unsloth import FastLanguageModel
-    from unsloth.chat_templates import get_chat_template
-    unsloth_available = True
-except ImportError:
-    unsloth_available = False
-    logger.warning("Unsloth not available. Please install with: pip install unsloth")
-
 def load_env_variables():
     """Load environment variables from system, .env file, or Hugging Face Space variables."""
     # Check if we're running in a Hugging Face Space
@@ -131,6 +132,13 @@ def load_model_and_tokenizer(config):
         logger.error("Unsloth is required for training with pre-quantized model")
         logger.error("Please ensure unsloth is in requirements.txt")
         raise ImportError("Unsloth is required for this training setup")
+
+    # Get model name correctly from nested config structure
+    model_name = config.get("model", {}).get("name") or config.get("model_name_or_path") or config.get("model_name")
+    logger.info(f"Loading model: {model_name}")
+
+    if not model_name:
+        raise ValueError("Model name not found in configuration. Please check your transformers_config.json file.")
 
     logger.info("Using Unsloth optimizations with pre-quantized model")
     # Check for flash attention without importing it directly
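For context, the new lookup chain accepts any of the following (hypothetical) transformers_config.json shapes; the repo's actual schema is not shown in this commit, and the model id below is a placeholder:

    # Hypothetical config layouts the new lookup accepts (placeholder model id).
    nested_cfg = {"model": {"name": "org/some-pre-quantized-model"}}       # preferred nested form
    flat_cfg_a = {"model_name_or_path": "org/some-pre-quantized-model"}    # HF-style flat key
    flat_cfg_b = {"model_name": "org/some-pre-quantized-model"}            # legacy flat key

    for cfg in (nested_cfg, flat_cfg_a, flat_cfg_b):
        name = cfg.get("model", {}).get("name") or cfg.get("model_name_or_path") or cfg.get("model_name")
        assert name == "org/some-pre-quantized-model"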
@@ -143,32 +151,37 @@ def load_model_and_tokenizer(config):
         logger.warning("Flash attention not available, falling back to standard attention")
 
     model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name=config.get("model_name"),
-        max_seq_length=config.get("max_seq_length", 2048),
+        model_name=model_name,
+        max_seq_length=config.get("max_seq_length", 2048) or config.get("tokenizer", {}).get("max_seq_length", 2048),
         dtype=None,  # Let Unsloth choose optimal dtype
         device_map="auto",
         # Don't explicitly use flash attention config here, let Unsloth handle it
     )
 
     # Apply Unsloth's training optimizations with config parameters
+    unsloth_config = config.get("unsloth", {})
     model = FastLanguageModel.get_peft_model(
         model,
-        r=config.get("unsloth_r", 32),
-        target_modules=config.get("unsloth_target_modules",
+        r=unsloth_config.get("r", 32),
+        target_modules=unsloth_config.get("target_modules",
             ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]),
-        lora_alpha=config.get("unsloth_alpha", 16),
-        lora_dropout=config.get("unsloth_dropout", 0.05),
+        lora_alpha=unsloth_config.get("alpha", 16),
+        lora_dropout=unsloth_config.get("dropout", 0.05),
         bias="none",
-        use_gradient_checkpointing=config.get("gradient_checkpointing", True),
+        use_gradient_checkpointing=config.get("gradient_checkpointing", True) or config.get("training", {}).get("gradient_checkpointing", True),
         random_state=config.get("seed", 42),
     )
     logger.info("Unsloth optimizations applied successfully")
 
     # Set up tokenizer settings
-    if config.get("chat_template"):
-        chat_template = get_chat_template("phi")
-        tokenizer.chat_template = chat_template
-        logger.info("Set phi chat template")
+    chat_template = config.get("chat_template") or config.get("tokenizer", {}).get("chat_template")
+    if chat_template:
+        try:
+            template = get_chat_template("phi")
+            tokenizer.chat_template = template
+            logger.info("Set phi chat template")
+        except Exception as e:
+            logger.warning(f"Failed to set chat template: {str(e)}")
 
     # Ensure proper token settings
     if tokenizer.pad_token_id is None:
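The LoRA settings are now read from a nested "unsloth" section instead of flat "unsloth_*" keys. A hypothetical section matching the defaults in the code above (the key names are exactly the ones read; the values are just the code's fallbacks):

    # Hypothetical "unsloth" section of transformers_config.json (values mirror the code's defaults).
    unsloth_section = {
        "r": 32,              # LoRA rank
        "alpha": 16,          # passed as lora_alpha
        "dropout": 0.05,      # passed as lora_dropout
        "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj",
                           "gate_proj", "up_proj", "down_proj"],
    }

One behavioural detail of the `x or y` fallbacks above: because both gradient_checkpointing lookups default to True, an explicit False only takes effect if it is set to False in both the top-level key and the "training" section.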
@@ -418,24 +431,58 @@ def main():
     # Load all configurations
     try:
         configs = load_configs(args.config_dir)
+
+        # Extract specific configs
+        if not configs:
+            logger.error("Failed to load configurations")
+            return 1
+
+        # Verify configurations exist
+        if "transformers" not in configs:
+            logger.error("transformers_config.json not found or invalid")
+            return 1
+
+        if "hardware" not in configs:
+            logger.warning("hardware_config.json not found. Using default hardware configuration.")
+
+        if "dataset" not in configs:
+            logger.error("dataset_config.json not found or invalid")
+            return 1
+
+        # Validate model configuration
+        model_config = configs["transformers"]
+        if not model_config.get("model", {}).get("name") and not model_config.get("model_name_or_path") and not model_config.get("model_name"):
+            logger.error("Model name not specified in configuration")
+            logger.error("Please ensure 'name' is specified under 'model' in transformers_config.json")
+            return 1
+
+        logger.info(f"Model name: {model_config.get('model', {}).get('name') or model_config.get('model_name_or_path') or model_config.get('model_name')}")
         logger.info("All configurations loaded successfully")
 
         # Extract specific configs
         model_config = configs["transformers"]
-        hardware_config = configs["hardware"]
+        hardware_config = configs.get("hardware", {})
         dataset_config = configs["dataset"]
 
-        # Apply hardware-specific settings
-        per_device_batch_size = hardware_config["training_optimizations"]["per_device_batch_size"]
-        gradient_accumulation = hardware_config["training_optimizations"]["gradient_accumulation_steps"]
-
-        # Update model config with hardware settings
-        model_config["training"].update({
-            "per_device_train_batch_size": per_device_batch_size,
-            "gradient_accumulation_steps": gradient_accumulation,
-            "gradient_checkpointing": hardware_config["training_optimizations"]["memory_optimizations"]["use_gradient_checkpointing"]
-        })
-
+        # Apply hardware-specific settings if available
+        if hardware_config:
+            training_opts = hardware_config.get("training_optimizations", {})
+            per_device_batch_size = training_opts.get("per_device_batch_size")
+            gradient_accumulation = training_opts.get("gradient_accumulation_steps")
+
+            if per_device_batch_size and model_config.get("training"):
+                model_config["training"]["per_device_train_batch_size"] = per_device_batch_size
+                logger.info(f"Applied hardware-specific batch size: {per_device_batch_size}")
+
+            if gradient_accumulation and model_config.get("training"):
+                model_config["training"]["gradient_accumulation_steps"] = gradient_accumulation
+                logger.info(f"Applied hardware-specific gradient accumulation: {gradient_accumulation}")
+
+            # Apply memory optimizations
+            memory_opts = training_opts.get("memory_optimizations", {})
+            if memory_opts.get("use_gradient_checkpointing") is not None and model_config.get("training"):
+                model_config["training"]["gradient_checkpointing"] = memory_opts["use_gradient_checkpointing"]
+
     except Exception as e:
         logger.error(f"Error loading configurations: {e}")
         return 1
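The hardware overrides are now optional and applied key by key instead of via an unconditional dict update. A hypothetical hardware_config.json that would exercise every branch above (the real file is not part of this commit; the values are placeholders):

    # Hypothetical hardware_config.json contents (placeholder values).
    hardware_config = {
        "training_optimizations": {
            "per_device_batch_size": 4,
            "gradient_accumulation_steps": 8,
            "memory_optimizations": {
                "use_gradient_checkpointing": True
            }
        }
    }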
@@ -445,75 +492,16 @@ def main():
     set_seed(seed)
     logger.info(f"Set random seed to {seed}")
 
-    # Check if we're running in a Hugging Face Space
-    if os.environ.get("SPACE_ID") and not os.environ.get("HF_USERNAME"):
-        # Extract username from SPACE_ID
-        username = os.environ.get("SPACE_ID").split("/")[0]
-        logger.info(f"Extracted username from SPACE_ID: {username}")
-
-        # Set hub_model_id if not already set and push_to_hub is enabled
-        if model_config.get("push_to_hub", False) and not model_config.get("hub_model_id"):
-            model_name = model_config.get("model_name", "").split("/")[-1]
-            model_config["hub_model_id"] = f"{username}/finetuned-{model_name}"
-            logger.info(f"Set hub_model_id to {model_config['hub_model_id']}")
-
-    # Load model and tokenizer
-    logger.info(f"Loading model: {model_config.get('model_name')}")
-
     try:
         model, tokenizer = load_model_and_tokenizer(model_config)
         logger.info("Model and tokenizer loaded successfully")
 
-        # Prepare model for k-bit training if using PEFT
-        if model_config.get("use_peft", False) and peft_available:
-            logger.info("Preparing model for parameter-efficient fine-tuning")
-            try:
-                model = prepare_model_for_kbit_training(model)
-
-                # Get target modules
-                target_modules = model_config.get("target_modules", ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"])
-
-                # Create LoRA config
-                lora_config = LoraConfig(
-                    r=model_config.get("lora_r", 16),
-                    lora_alpha=model_config.get("lora_alpha", 32),
-                    lora_dropout=model_config.get("lora_dropout", 0.05),
-                    bias="none",
-                    task_type="CAUSAL_LM",
-                    target_modules=target_modules
-                )
-
-                # Apply LoRA to model
-                model = get_peft_model(model, lora_config)
-                logger.info(f"Applied LoRA with r={model_config.get('lora_r', 16)}, alpha={model_config.get('lora_alpha', 32)}")
-            except Exception as e:
-                logger.error(f"Error setting up PEFT: {e}")
-                return 1
-
-        # Load dataset
-        logger.info(f"Loading dataset: {dataset_config.get('dataset_name')}")
+        # Load dataset with proper mapping
         try:
-            dataset = load_dataset(dataset_config.get("dataset_name"))
-            logger.info(f"Dataset loaded successfully with {len(dataset['train'])} training examples")
-
-            # Sort dataset by ID to ensure chunks from the same paper are processed together
-            logger.info("Sorting dataset by ID to maintain paper chunk order")
-            def sort_by_id(example):
-                # Extract ID as integer if possible, otherwise keep as string
-                try:
-                    return int(example['id'])
-                except (ValueError, TypeError):
-                    return example['id']
-
-            # Apply sorting to the dataset
-            dataset['train'] = dataset['train'].sort('id')
-            logger.info("Dataset sorted by ID")
-
-            # Log the first few IDs to verify sorting
-            sample_ids = [example['id'] for example in dataset['train'].select(range(min(5, len(dataset['train']))))]
-            logger.info(f"First few IDs after sorting: {sample_ids}")
+            dataset = load_dataset_with_mapping(dataset_config)
+            logger.info("Dataset loaded and prepared successfully")
         except Exception as e:
-            logger.error(f"Error loading or sorting dataset: {e}")
+            logger.error(f"Error loading dataset: {e}")
             return 1
 
         # Create data collator
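The inline PEFT setup, Space username handling, and dataset loading/ID-sorting are replaced by a single call to load_dataset_with_mapping, whose definition lives elsewhere in the file and is not visible in this hunk. A rough sketch of what such a helper could look like, based only on the logic it replaces (the behaviour and column handling here are assumptions, not the actual implementation):

    from datasets import load_dataset

    def load_dataset_with_mapping(dataset_config):
        """Sketch only: load the configured dataset and keep chunks of each paper in ID order."""
        dataset = load_dataset(dataset_config.get("dataset_name"))
        # Preserve the previous behaviour of sorting by 'id' so chunks from one paper stay together.
        dataset["train"] = dataset["train"].sort("id")
        return dataset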
 