---
license: mit
datasets:
- Vi-VLM/Vista
language:
- vi
---

LLaVA-Qwen1.5-1.8B model trained with LoRA on a subset of the Vista Vi-LLaVA complex reasoning data.

Training loss: ~1.5

Training script:

```bash
deepspeed moellava/train/train_mem.py \
    --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 0.00000125 \
    --lora_path /kaggle/temp/lora-llavaqwen \
    --deepspeed ./scripts/zero3.json \
    --model_name_or_path Qwen/Qwen1.5-1.8B \
    --version qwen \
    --data_path /kaggle/temp/vi_llava_train.json \
    --image_folder /kaggle/input/coco-2017-dataset/coco2017/train2017 \
    --image_tower google/siglip-base-patch16-256-multilingual \
    --image_projector_type mlp2x_gelu \
    --pretrain_mm_mlp_adapter /kaggle/temp/pt-llavaqwen1.5-1.8b/mm_projector.bin \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --image_aspect_ratio pad \
    --group_by_modality_length True \
    --fp16 True \
    --output_dir ./checkpoints/ft-lora-llavaqwen1.5-1.8b-complex_reasoning \
    --num_train_epochs 1 \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 8 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 100 \
    --save_total_limit 1 \
    --learning_rate 1e-5 \
    --weight_decay 0. \
    --warmup_ratio 0 \
    --lr_scheduler_type "cosine" \
    --logging_steps 5 \
    --tf32 False \
    --model_max_length 1024 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb \
    --run_name ft-llava-qwen1.5-1.8b-lora-vista_reasoning-cont \
    --push_to_hub True
```
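
For reference, `--data_path` points to a JSON file in the standard LLaVA conversation format. The record below is a hypothetical illustration (the id, filename, and text are invented, not taken from the Vista data):

```python
# Hypothetical single record from vi_llava_train.json
# (standard LLaVA conversation format; id, filename, and text are invented).
example_record = {
    "id": "000000033471",
    "image": "000000033471.jpg",  # resolved relative to --image_folder
    "conversations": [
        {"from": "human", "value": "<image>\nMô tả chi tiết bức ảnh."},  # "Describe the image in detail."
        {"from": "gpt", "value": "Bức ảnh cho thấy ..."},                # "The image shows ..."
    ],
}
```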

Python code to merge the LoRA adapter into the base model:

```python
from typing import List, Optional


# Plain stand-ins for the argument dataclasses used by moellava's training code;
# only the fields needed to rebuild the vision modules matter here.
class ModelArguments:
    model_name_or_path: Optional[str] = "facebook/opt-125m"
    version: Optional[str] = "v0"
    freeze_backbone: bool = False
    tune_mm_mlp_adapter: bool = False
    mm_vision_select_layer: Optional[int] = -2  # match --mm_vision_select_layer from training
    pretrain_mm_mlp_adapter: Optional[str] = None  # set this if the projector weights must be loaded here
    mm_use_im_start_end: bool = False
    mm_use_im_patch_token: bool = False  # match --mm_use_im_patch_token from training
    mm_vision_select_feature: Optional[str] = "patch"
    # ===================================================================
    image_tower: Optional[str] = "google/siglip-base-patch16-256-multilingual"
    video_tower: Optional[str] = None
    image_projector_type: Optional[str] = "mlp2x_gelu"  # match --image_projector_type from training
    video_projector_type: Optional[str] = "linear"
    video_global_proj: bool = False
    video_temproal_proj: bool = False
    video_spatial_proj: bool = False
    # ===================================================================
    only_lora_ffn: bool = True
    moe_enable: bool = False
    train_modules: Optional[List[str]] = None
    moe_mode: str = "sparse"
    moe_layers_idx: Optional[List[int]] = None
    ep_size: int = 1
    num_experts: Optional[List[int]] = 4
    top_k_experts: int = 2
    capacity_factor: float = 1.0
    eval_capacity_factor: float = 2.0
    min_capacity: int = 0
    use_residual: bool = False
    router_aux_loss_coef: float = 0.01


class DataArguments:
    lazy_preprocess: bool = False
    is_multimodal: bool = False
    image_aspect_ratio: str = "pad"
    # ===================================================================
    data_path: Optional[List[str]] = None
    image_folder: Optional[str] = None
    video_folder: Optional[str] = None
    num_frames: int = 8


model_args = ModelArguments()
data_args = DataArguments()

import torch
import transformers
from peft import PeftModel

from moellava.model import LlavaQwen1_5ForCausalLM

model_name_or_path = "Qwen/Qwen1.5-1.8B"
lora_path = "llavaqwen1.5-lora"

# Load the base LLM in fp16 and attach the LoRA adapter on top of it.
model = LlavaQwen1_5ForCausalLM.from_pretrained(model_name_or_path)
model.to(torch.float16)
model = PeftModel.from_pretrained(model, lora_path)

# Use the same base model for the tokenizer (not the opt-125m default in ModelArguments).
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name_or_path,
    model_max_length=1024,
    padding_side="right",
    use_fast=False,
)
tokenizer.add_special_tokens({"unk_token": "<|extra_0|>"})

# Rebuild the SigLIP image tower and the multimodal projector so the merged
# checkpoint carries the full multimodal configuration.
model.get_model().initialize_vision_modules(model_args=model_args)

image_tower = model.get_image_tower()
image_tower.to(dtype=torch.float16)

data_args.image_processor = image_tower.image_processor
data_args.is_multimodal = True

model.config.image_aspect_ratio = data_args.image_aspect_ratio
model.config.tokenizer_padding_side = tokenizer.padding_side

model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end
model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token
model.initialize_vision_tokenizer(model_args, tokenizer=tokenizer)

# Fold the LoRA weights into the base model and save the merged checkpoint.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("llava-qwen1.5-1.8b-complex_reasoning-merged")
```
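
A minimal sketch of loading the merged checkpoint back, assuming the merge script above was run as-is; the tokenizer is rebuilt from the base Qwen model exactly as during merging:

```python
import torch
from transformers import AutoTokenizer

from moellava.model import LlavaQwen1_5ForCausalLM

merged_dir = "llava-qwen1.5-1.8b-complex_reasoning-merged"

# Reload the merged weights in fp16. Whether the SigLIP tower weights are bundled
# in this folder depends on moellava's save logic; if not, re-attach them with
# initialize_vision_modules as in the merge script above.
model = LlavaQwen1_5ForCausalLM.from_pretrained(merged_dir, torch_dtype=torch.float16)
model.eval()

# Rebuild the tokenizer the same way as in the merge script.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-1.8B", use_fast=False)
tokenizer.add_special_tokens({"unk_token": "<|extra_0|>"})
```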