Upload folder using huggingface_hub
run_transformers_training.py (+94 -133)
CHANGED
@@ -297,20 +297,27 @@ def load_dataset_with_mapping(dataset_config):
         else:
             logger.warning(f"Expected column '{col}' not found in dataset")
 
-    #
-
-
-
-
-
-    # Log the first few IDs to verify sorting
+    # Note: Explicitly NOT sorting the dataset to preserve original order
+    logger.info("Preserving original dataset order (no sorting)")
+
+    # Log examples without printing full content
+    if "conversations" in dataset.column_names:
         sample_ids = [example['id'] for example in dataset.select(range(min(5, len(dataset))))]
-    logger.info(f"First few IDs: {sample_ids}")
+        logger.info(f"First few IDs: {sample_ids}")
 
-    # Log
-    if
-
-
+    # Log conversation structure without full content
+    if len(dataset) > 0:
+        sample_conv_structure = []
+        for msg in dataset["conversations"][0]:
+            if isinstance(msg, dict):
+                content = msg.get('content', '')
+                preview = content[:50] + "..." if len(content) > 50 else content
+                sample_conv_structure.append({
+                    "role": msg.get('role', ''),
+                    "content_length": len(content),
+                    "preview": preview
+                })
+        logger.info(f"Conversation structure: {sample_conv_structure}")
 
     logger.info(f"Dataset loaded successfully with {len(dataset)} examples")
     logger.info(f"Dataset columns: {dataset.column_names}")
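
For reference, the preview logging added in this hunk can be run standalone. A minimal sketch, using a made-up two-message conversation (the messages below are illustrative, not from the dataset):

# Sketch: preview a conversation without printing its full content.
conversation = [
    {"role": "user", "content": "Summarize the main contribution of the paper in one paragraph."},
    {"role": "assistant", "content": "The paper proposes ..."},
]

sample_conv_structure = []
for msg in conversation:
    content = msg.get('content', '')
    preview = content[:50] + "..." if len(content) > 50 else content
    sample_conv_structure.append({
        "role": msg.get('role', ''),
        "content_length": len(content),
        "preview": preview,
    })
print(sample_conv_structure)  # each entry: role, content_length, 50-char preview
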
@@ -374,142 +381,91 @@ class SimpleDataCollator:
         self.dataset_config = dataset_config
         self.stats = {"processed": 0, "skipped": 0, "total_tokens": 0}
         self.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
-        self.paper_counters = {}
         self.max_seq_length = dataset_config.get("dataset", {}).get("processing", {}).get("max_seq_length", 2048)
-
-        logger.info(f"SimpleDataCollator initialized - using phi-4 chat format with max_seq_length={self.max_seq_length}")
-        logger.info("Metadata handling disabled - using metadata from content field")
+        logger.info(f"SimpleDataCollator initialized - using pre-audited dataset with max_seq_length={self.max_seq_length}")
+        logger.info("Using exact dataset structure without reformatting")
 
         # Check if we're on GPU
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         logger.info(f"SimpleDataCollator using device: {self.device}")
 
-    def normalize_conversation(self, conversation):
-        """Normalize conversation format to ensure consistent structure."""
-        normalized = []
-
-        # Handle non-list or empty inputs
-        if not isinstance(conversation, list):
-            logger.warning(f"Conversation is not a list: {type(conversation)}")
-            if hasattr(conversation, 'items'):  # It's a dict-like object
-                conversation = [conversation]
-            else:
-                return []
-
-        # Get introductory message if present (should be first and without chunk number)
-        intro_msg = None
-        for i, turn in enumerate(conversation):
-            if isinstance(turn, dict) and turn.get('content') and "[RESEARCH INTRODUCTION]" in turn.get('content', ''):
-                intro_msg = turn
-                break
-
-        # Process introduction message first if found
-        if intro_msg:
-            normalized.append({
-                "role": "system",
-                "content": intro_msg.get('content', '')
-            })
-            # Remove intro from further processing
-            conversation = [t for t in conversation if t != intro_msg]
-
-        # Process remaining messages
-        for turn in conversation:
-            # Skip empty or None entries
-            if not turn:
-                continue
-
-            # Handle string entries (convert to user message)
-            if isinstance(turn, str):
-                normalized.append({"role": "user", "content": turn})
-                continue
-
-            # Handle dict-like entries
-            if not isinstance(turn, dict) and hasattr(turn, 'get'):
-                # Convert to dict
-                turn = {k: turn.get(k) for k in ['role', 'content'] if hasattr(turn, 'get') and turn.get(k) is not None}
-
-            # Ensure both role and content exist
-            if not isinstance(turn, dict) or 'role' not in turn or 'content' not in turn:
-                logger.warning(f"Skipping malformatted conversation turn: {turn}")
-                continue
-
-            # Normalize role field
-            role = turn.get('role', '').lower()
-            if role == 'user' or role == 'human':
-                role = 'user'
-            elif role == 'assistant' or role == 'bot':
-                role = 'assistant'
-
-            # Add normalized turn
-            normalized.append({
-                "role": role,
-                "content": str(turn.get('content', ''))
-            })
-
-        return normalized
-
     def __call__(self, features):
+        """Process examples preserving exact JSONL structure"""
         batch = {"input_ids": [], "attention_mask": [], "labels": []}
 
         for example in features:
             try:
                 # Get ID
                 paper_id = example.get("id", "")
 
-                #
-
-                # Normalize conversation format
-                conversation = self.normalize_conversation(conversation)
-
-                if not conversation:
+                # Get conversations - these should already contain role and content
+                conversations = example.get("conversations", [])
+                if not conversations:
                     self.stats["skipped"] += 1
                     continue
 
-                #
+                # Directly use the conversations array as input to the model's chat template
+                # This preserves the exact structure with roles and content as they are
+                try:
+                    # Let tokenizer handle the content with the model's chat template
+                    inputs = self.tokenizer.apply_chat_template(
+                        conversations,
+                        return_tensors=None,
+                        add_generation_prompt=False
+                    )
+                except:
+                    # Fallback if apply_chat_template fails
+                    logger.warning(f"Chat template application failed for example {paper_id}, using basic tokenization")
+
+                    # Create a basic representation of the conversation
+                    conversation_text = ""
+                    for msg in conversations:
+                        if isinstance(msg, dict) and 'content' in msg:
+                            conversation_text += msg.get('content', '') + "\n\n"
+
+                    # Basic tokenization
+                    inputs = self.tokenizer(
+                        conversation_text,
+                        add_special_tokens=True,
+                        return_tensors=None
+                    )
 
-                #
-                inputs
-
-                    padding=False,  # Don't pad here, we'll pad the batch later
-                )
+                # Apply length cap if needed (shouldn't be necessary for pre-audited data)
+                if self.max_seq_length > 0 and len(inputs) > self.max_seq_length:
+                    logger.warning(f"Example {paper_id} exceeds max_seq_length ({len(inputs)} > {self.max_seq_length})")
+                    inputs = inputs[:self.max_seq_length]
+
+                # Create attention mask (1 for all tokens)
+                attention_mask = [1] * len(inputs)
 
-                if len(inputs
+                if len(inputs) > 0:
                     # For causal language modeling, labels are the same as inputs
-                    labels = inputs
+                    labels = inputs.copy()
 
-                    batch["input_ids"].append(inputs
-                    batch["attention_mask"].append(
+                    batch["input_ids"].append(inputs)
+                    batch["attention_mask"].append(attention_mask)
                     batch["labels"].append(labels)
 
                     self.stats["processed"] += 1
-                    self.stats["total_tokens"] += len(inputs
+                    self.stats["total_tokens"] += len(inputs)
 
                     # Debug logging for first few examples
                     log_samples = self.dataset_config.get("validation", {}).get("log_samples", 3)
                     if self.stats["processed"] <= log_samples:
-                        logger.info(f"Example {self.stats['processed']}
+                        logger.info(f"Example {self.stats['processed']}:")
                         logger.info(f"Paper ID: {paper_id}")
-                        logger.info(f"Token count: {len(inputs
-                        logger.info(f"
-                        logger.info(f"Conversation structure: {conversation[:2]}...")
+                        logger.info(f"Token count: {len(inputs)}")
+                        logger.info(f"Conversation entries: {len(conversations)}")
                 else:
                     self.stats["skipped"] += 1
             except Exception as e:
                 logger.warning(f"Error processing example: {str(e)[:100]}...")
-                logger.warning(f"Problematic example: {
+                logger.warning(f"Problematic example ID: {example.get('id', 'unknown')}")
                 self.stats["skipped"] += 1
                 continue
 
         if not batch["input_ids"]:
             logger.warning("Empty batch, returning dummy tensors")
-            # Return tensors on the right device
             return {
                 "input_ids": torch.zeros((1, 1), dtype=torch.long),
                 "attention_mask": torch.zeros((1, 1), dtype=torch.long),
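
The collator now delegates all conversation formatting to the tokenizer's chat template instead of normalizing roles by hand. A minimal standalone sketch of that path, assuming a phi-4-style checkpoint (the model name here is an assumption for illustration; any chat model whose tokenizer ships a chat template behaves the same way):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-4")  # assumed checkpoint

conversations = [
    {"role": "system", "content": "[RESEARCH INTRODUCTION] You are discussing a paper."},
    {"role": "user", "content": "What dataset does the paper use?"},
]

# With tokenize=True (the default) and return_tensors=None, apply_chat_template
# returns a plain Python list of token ids - the same kind of object the
# collator appends to batch["input_ids"] before padding.
input_ids = tokenizer.apply_chat_template(
    conversations,
    add_generation_prompt=False,
    return_tensors=None,
)
print(len(input_ids), input_ids[:10])

One design note on the fallback above: a bare except: also catches KeyboardInterrupt and SystemExit; except Exception: would be the narrower idiom.
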
@@ -526,7 +482,7 @@ class SimpleDataCollator:
         batch["attention_mask"][i].extend([0] * padding_length)
         batch["labels"][i].extend([-100] * padding_length)
 
         # Convert to tensors
         batch = {k: torch.tensor(v, dtype=torch.long) for k, v in batch.items()}
 
         # Log stats periodically
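
For context, these lines implement dynamic per-batch padding: every sequence is right-padded to the longest sequence in the batch, with 0 in the attention mask and -100 in the labels so that PyTorch's CrossEntropyLoss (whose default ignore_index is -100) skips padded positions. A condensed sketch of the same idea; pad_batch is a hypothetical helper, not a function in the repo:

import torch

def pad_batch(batch, pad_token_id=0):
    # Pad input_ids with pad_token_id, attention_mask with 0, labels with -100.
    max_len = max(len(ids) for ids in batch["input_ids"])
    for i in range(len(batch["input_ids"])):
        padding_length = max_len - len(batch["input_ids"][i])
        batch["input_ids"][i].extend([pad_token_id] * padding_length)
        batch["attention_mask"][i].extend([0] * padding_length)
        batch["labels"][i].extend([-100] * padding_length)
    return {k: torch.tensor(v, dtype=torch.long) for k, v in batch.items()}

example = {
    "input_ids": [[5, 6, 7], [5, 6]],
    "attention_mask": [[1, 1, 1], [1, 1]],
    "labels": [[5, 6, 7], [5, 6]],
}
print(pad_batch(example)["labels"])  # tensor([[5, 6, 7], [5, 6, -100]])
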
@@ -534,8 +490,7 @@ class SimpleDataCollator:
         if self.stats["processed"] % log_interval == 0 and self.stats["processed"] > 0:
             logger.info(f"Data collator stats: processed={self.stats['processed']}, "
                         f"skipped={self.stats['skipped']}, "
-                        f"avg_tokens={self.stats['total_tokens']/self.stats['processed']:.1f}, "
-                        f"unique_papers={len(self.paper_counters)}")
+                        f"avg_tokens={self.stats['total_tokens']/self.stats['processed']:.1f}")
 
         return batch
@@ -731,21 +686,35 @@ def main():
         no_cuda=False if torch.cuda.is_available() else True,  # Use CUDA if available
     )
 
-    #
-
+    # Create sequential sampler to maintain original dataset order
+    sequential_sampler = torch.utils.data.SequentialSampler(dataset)
+
+    # Initialize trainer first
+    logger.info("Initializing Trainer")
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=dataset,  # We'll override this with our custom dataloader
+        data_collator=data_collator,
+        callbacks=[LoggingCallback()],
+    )
+
+    # Then override the get_train_dataloader method
+    def custom_get_train_dataloader():
+        """Custom dataloader that preserves original dataset order"""
+        logger.info("Creating sequential dataloader to maintain original dataset order")
+
+        # Calculate batch size based on device availability
         if getattr(training_args, "no_cuda", False):
             batch_size = training_args.per_device_train_batch_size
         else:
-            batch_size = max(training_args.per_device_train_batch_size * torch.cuda.device_count(), 1)
+            batch_size = max(training_args.per_device_train_batch_size * max(1, torch.cuda.device_count()), 1)
 
-        sequential_sampler = torch.utils.data.SequentialSampler(dataset["train"])
-        logger.info(f"Using sequential sampler for batch size {batch_size}")
+        logger.info(f"Using sequential sampler with batch size {batch_size}")
 
+        # Return DataLoader with sequential sampler
         return torch.utils.data.DataLoader(
-            dataset["train"],
+            dataset,
             batch_size=batch_size,
             sampler=sequential_sampler,
             collate_fn=data_collator,
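
The pattern in this hunk (build the Trainer normally, then swap in an order-preserving dataloader) reduces to a small sketch; make_sequential_loader and my_dataset are hypothetical names used only for illustration:

import torch
from torch.utils.data import DataLoader, SequentialSampler

def make_sequential_loader(my_dataset, collator, batch_size):
    # SequentialSampler yields indices 0..len(my_dataset)-1 in order,
    # unlike the random sampler Trainer would otherwise use for training.
    return DataLoader(
        my_dataset,
        batch_size=batch_size,
        sampler=SequentialSampler(my_dataset),
        collate_fn=collator,
    )
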
@@ -754,16 +723,8 @@ def main():
             pin_memory=training_args.dataloader_pin_memory,
         )
 
-    #
-
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        get_train_dataloader=get_train_dataloader_no_shuffle,
-        tokenizer=tokenizer,
-        data_collator=data_collator,
-        callbacks=[LoggingCallback()]
-    )
+    # Override the get_train_dataloader method
+    trainer.get_train_dataloader = custom_get_train_dataloader
 
     # Start training
     logger.info("Starting training process")
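
A note on the override mechanics: assigning a plain function to trainer.get_train_dataloader works because the instance attribute shadows the class method, and Trainer invokes self.get_train_dataloader() with no arguments. The replacement must therefore be a zero-parameter callable, which is why custom_get_train_dataloader is defined without self and instead closes over dataset, training_args, data_collator, and sequential_sampler.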