hf-train-frontend / transformers_config.json
{
"model": {
"name": "unsloth/phi-4-unsloth-bnb-4bit",
"trust_remote_code": true,
"use_fast_tokenizer": true
},
"tokenizer": {
"chat_template": "phi",
"max_seq_length": 2048,
"padding_side": "right",
"add_eos_token": true
},
"training": {
"per_device_train_batch_size": 16,
"gradient_accumulation_steps": 3,
"learning_rate": 2e-5,
"num_train_epochs": 3,
"max_steps": -1,
"logging_steps": 10,
"save_steps": 200,
"save_total_limit": 5,
"push_to_hub": true,
"hub_strategy": "every_save",
"gradient_checkpointing": true,
"optim": "adamw_torch",
"lr_scheduler_type": "cosine",
"warmup_ratio": 0.05,
"weight_decay": 0.01,
"max_grad_norm": 1.0,
"neftune_noise_alpha": 5,
"fp16": false,
"bf16": true
},
"checkpointing": {
"output_dir": "./results",
"save_strategy": "steps",
"save_steps": 100,
"save_total_limit": 3,
"hub_strategy": "every_save"
},
"unsloth": {
"enabled": true,
"r": 32,
"alpha": 16,
"dropout": 0,
"target_modules": [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj"
]
},
"distributed_training": {
"fsdp_config": {
"enabled": false,
"sharding_strategy": "FULL_SHARD",
"mixed_precision": "BF16",
"activation_checkpointing": true,
"offload_params": false
},
"ddp_find_unused_parameters": false,
"dataloader_num_workers": 2,
"ddp_config": {
"enabled": true,
"backend": "nccl",
"find_unused_parameters": false,
"broadcast_buffers": false,
"gradient_as_bucket_view": true
}
},
"logging": {
"logging_steps": 50,
"log_level": "info"
},
"huggingface_hub": {
"push_to_hub": true,
"hub_model_id": "phi-4-cognitive-assistant",
"hub_private_repo": true
},
"model_name_or_path": "unsloth/phi-4-unsloth-bnb-4bit",
"model_revision": "main",
"use_flash_attention": true,
"torch_dtype": "bfloat16",
"bf16": true,
"fp16": false,
"hardware": {
"hardware_name": "4xL4",
"specs": {
"gpu_count": 4,
"gpu_type": "L4",
"vram_per_gpu": 24,
"total_vram": 96,
"vcpu_count": 48,
"ram": 186
},
"hardware_setup": {
"use_cpu": false,
"num_gpus": 4,
"device_map": "auto"
},
"training_optimizations": {
"per_device_batch_size": 16,
"gradient_accumulation_steps": 3,
"mixed_precision": "bf16",
"torch_compile": false,
"memory_optimizations": {
"use_gradient_checkpointing": true,
"use_flash_attention": true
},
"multi_gpu_strategy": "ddp"
},
"system_settings": {
"cuda_memory_fraction": 0.75,
"dataloader_num_workers": 4,
"dataloader_pin_memory": true
},
"memory_breakdown": {
"model_size": "~3.5GB (pre-quantized 4-bit)",
"optimizer_states": "~1GB",
"batch_memory_per_gpu": "~3GB",
"peak_memory_estimate": "~18GB",
"safe_headroom": "~6GB"
},
"compute_environment": "L4_CLOUD"
},
"dataset": {
"dataset": {
"name": "George-API/phi4-cognitive-dataset",
"split": "train"
},
"data_formatting": {
"chat_template": "phi",
"conversation_structure": {
"system_identifier": "[RESEARCH INTRODUCTION]",
"turn_order": ["human", "assistant"]
},
"roles": {
"system": "System: {content}\n\n",
"human": "Human: {content}\n\n",
"assistant": "Assistant: {content}\n\n"
}
},
"data_loading": {
"batch_size": 24,
"shuffle": false,
"sequential_processing": true,
"drop_last": false,
"num_workers": 4,
"pin_memory": true,
"prefetch_factor": 4
},
"validation": {
"log_samples": 3,
"log_interval": 50,
"verify_sequence_integrity": true,
"metrics": ["processed", "skipped", "avg_tokens", "unique_articles"]
}
}
}
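
How a training script might consume this file: the "training", "checkpointing", "logging", and "huggingface_hub" sections map almost one-to-one onto transformers.TrainingArguments. The sketch below is a minimal, hypothetical wiring, not the repo's actual train script; it also assumes the more specific "checkpointing" and "logging" sections win over the duplicated keys in "training" (save_steps 100 vs 200, save_total_limit 3 vs 5, logging_steps 50 vs 10), which the file itself does not state.

import json

from transformers import TrainingArguments

with open("transformers_config.json") as f:
    cfg = json.load(f)

train = cfg["training"]
ckpt = cfg["checkpointing"]
hub = cfg["huggingface_hub"]
dist_cfg = cfg["distributed_training"]

args = TrainingArguments(
    output_dir=ckpt["output_dir"],
    per_device_train_batch_size=train["per_device_train_batch_size"],  # 16
    gradient_accumulation_steps=train["gradient_accumulation_steps"],  # 3
    learning_rate=train["learning_rate"],
    num_train_epochs=train["num_train_epochs"],
    max_steps=train["max_steps"],                    # -1: run by epochs, not steps
    logging_steps=cfg["logging"]["logging_steps"],   # assumed to override training.logging_steps
    save_strategy=ckpt["save_strategy"],
    save_steps=ckpt["save_steps"],                   # assumed to override training.save_steps
    save_total_limit=ckpt["save_total_limit"],
    push_to_hub=hub["push_to_hub"],
    hub_model_id=hub["hub_model_id"],
    hub_private_repo=hub["hub_private_repo"],
    hub_strategy=ckpt["hub_strategy"],
    gradient_checkpointing=train["gradient_checkpointing"],
    optim=train["optim"],
    lr_scheduler_type=train["lr_scheduler_type"],
    warmup_ratio=train["warmup_ratio"],
    weight_decay=train["weight_decay"],
    max_grad_norm=train["max_grad_norm"],
    neftune_noise_alpha=train["neftune_noise_alpha"],  # NEFTune embedding noise
    bf16=train["bf16"],
    fp16=train["fp16"],
    ddp_find_unused_parameters=dist_cfg["ddp_find_unused_parameters"],
    dataloader_num_workers=dist_cfg["dataloader_num_workers"],
)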
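
The "unsloth" section carries the LoRA hyperparameters: rank 32, alpha 16, no dropout, adapters on all attention and MLP projections. A minimal sketch of how Unsloth's FastLanguageModel API would consume it, assuming the standard load-then-wrap flow; the script itself is illustrative:

import json

from unsloth import FastLanguageModel

with open("transformers_config.json") as f:
    cfg = json.load(f)
lora = cfg["unsloth"]

# The checkpoint is pre-quantized (bitsandbytes 4-bit), so load_in_4bit matches it.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=cfg["model"]["name"],                    # unsloth/phi-4-unsloth-bnb-4bit
    max_seq_length=cfg["tokenizer"]["max_seq_length"],  # 2048
    load_in_4bit=True,
)

model = FastLanguageModel.get_peft_model(
    model,
    r=lora["r"],                            # LoRA rank 32
    lora_alpha=lora["alpha"],               # scaling factor 16
    lora_dropout=lora["dropout"],           # 0 disables adapter dropout
    target_modules=lora["target_modules"],  # q/k/v/o + gate/up/down projections
    use_gradient_checkpointing=True,        # mirrors training.gradient_checkpointing
)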
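
"distributed_training" enables plain DDP over the four L4s (the FSDP block is present but disabled), so the effective batch size is 16 per device × 3 accumulation steps × 4 GPUs = 192 sequences per optimizer step. The ddp_config keys are constructor arguments of torch.nn.parallel.DistributedDataParallel; the HF Trainer applies them itself when launched under torchrun, but the mapping looks like this (stand-alone sketch with a placeholder model):

import os

import torch
import torch.distributed as dist
from torch import nn
from torch.nn.parallel import DistributedDataParallel as DDP

# torchrun sets RANK / WORLD_SIZE / LOCAL_RANK; "nccl" comes from ddp_config.backend.
dist.init_process_group(backend="nccl")
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)

model = nn.Linear(8, 8).cuda()  # placeholder for the LoRA-wrapped model above

ddp_model = DDP(
    model,
    device_ids=[local_rank],
    find_unused_parameters=False,  # ddp_config.find_unused_parameters
    broadcast_buffers=False,       # ddp_config.broadcast_buffers
    gradient_as_bucket_view=True,  # ddp_config.gradient_as_bucket_view
)
# Launch: torchrun --nproc_per_node=4 train.py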
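
Finally, "data_formatting" defines a plain-text chat layout: every turn renders as "Role: {content}" followed by a blank line, with the system turn first and human/assistant turns alternating, while "data_loading" reads George-API/phi4-cognitive-dataset sequentially (shuffle false, sequential_processing true). A minimal formatter under that reading; format_conversation and the sample messages are hypothetical:

ROLE_TEMPLATES = {
    "system": "System: {content}\n\n",
    "human": "Human: {content}\n\n",
    "assistant": "Assistant: {content}\n\n",
}

def format_conversation(messages):
    """Render [{"role": ..., "content": ...}, ...] with the config's templates."""
    return "".join(
        ROLE_TEMPLATES[m["role"]].format(content=m["content"]) for m in messages
    )

example = format_conversation([
    {"role": "system", "content": "[RESEARCH INTRODUCTION] ..."},
    {"role": "human", "content": "What does the study conclude?"},
    {"role": "assistant", "content": "..."},
])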