tuanio committed
Commit f1e87c4 · verified · 1 Parent(s): 028fbff

Update README.md

Files changed (1): README.md (+141 -1)
README.md CHANGED

---

LLaVA-Qwen1.5-1.8b model trained with LoRA on a subset of the Vista Vi LLaVA Complex Reasoning data.
Training loss: ~1.5

Training script:
```bash
deepspeed moellava/train/train_mem.py \
    --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 0.00000125 \
    --lora_path /kaggle/temp/lora-llavaqwen \
    --deepspeed ./scripts/zero3.json \
    --model_name_or_path Qwen/Qwen1.5-1.8B \
    --version qwen \
    --data_path /kaggle/temp/vi_llava_train.json \
    --image_folder /kaggle/input/coco-2017-dataset/coco2017/train2017 \
    --image_tower google/siglip-base-patch16-256-multilingual \
    --image_projector_type mlp2x_gelu \
    --pretrain_mm_mlp_adapter /kaggle/temp/pt-llavaqwen1.5-1.8b/mm_projector.bin \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --image_aspect_ratio pad \
    --group_by_modality_length True \
    --fp16 True \
    --output_dir ./checkpoints/ft-lora-llavaqwen1.5-1.8b-complex_reasoning \
    --num_train_epochs 1 \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 8 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 100 \
    --save_total_limit 1 \
    --learning_rate 1e-5 \
    --weight_decay 0. \
    --warmup_ratio 0 \
    --lr_scheduler_type "cosine" \
    --logging_steps 5 \
    --tf32 False \
    --model_max_length 1024 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb \
    --run_name ft-llava-qwen1.5-1.8b-lora-vista_reasoning-cont \
    --push_to_hub True
```
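
For reference, the flags above imply an effective global batch size of `per_device_train_batch_size × gradient_accumulation_steps × num_gpus`. The sketch below just spells out that arithmetic; the GPU count is an assumption, so substitute the actual world size of your DeepSpeed launch.
```python
# Back-of-the-envelope check of the global batch size implied by the training flags above.
per_device_train_batch_size = 2   # --per_device_train_batch_size
gradient_accumulation_steps = 8   # --gradient_accumulation_steps
num_gpus = 2                      # assumed world size -- adjust to your DeepSpeed launch

effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_gpus
print(effective_batch_size)  # 32 with the example values above
```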

Python code to merge the LoRA adapter into the base model:
```python
from typing import Optional, List


class ModelArguments:
    model_name_or_path: Optional[str] = "facebook/opt-125m"
    version: Optional[str] = "v0"
    freeze_backbone: bool = False
    tune_mm_mlp_adapter: bool = False
    mm_vision_select_layer: Optional[int] = -1  # default to the last layer
    pretrain_mm_mlp_adapter: Optional[str] = None
    mm_use_im_start_end: bool = False
    mm_use_im_patch_token: bool = True
    mm_vision_select_feature: Optional[str] = "patch"
    # ===================================================================
    image_tower: Optional[str] = 'google/siglip-base-patch16-256-multilingual'
    video_tower: Optional[str] = None
    image_projector_type: Optional[str] = 'linear'
    video_projector_type: Optional[str] = 'linear'
    video_global_proj: bool = False
    video_temproal_proj: bool = False
    video_spatial_proj: bool = False
    # ===================================================================

    # =============================================================
    only_lora_ffn: bool = True
    moe_enable: bool = False
    train_modules: Optional[List[str]] = None
    moe_mode: str = "sparse"
    moe_layers_idx: Optional[List[int]] = None
    ep_size: int = 1
    num_experts: Optional[List[int]] = 4
    top_k_experts: int = 2
    capacity_factor: float = 1.
    eval_capacity_factor: float = 2.
    min_capacity: int = 0
    use_residual: bool = False
    router_aux_loss_coef: float = 0.01


class DataArguments:
    lazy_preprocess: bool = False
    is_multimodal: bool = False
    image_aspect_ratio: str = 'pad'
    # ===================================================================
    data_path: Optional[List[str]] = None
    image_folder: Optional[str] = None
    video_folder: Optional[str] = None
    num_frames: int = 8


model_args = ModelArguments()
data_args = DataArguments()

import torch
from peft import PeftModel
from moellava.model import LlavaQwen1_5ForCausalLM

model_name_or_path = 'Qwen/Qwen1.5-1.8B'
lora_path = 'llavaqwen1.5-lora'
# Point the argument object at the actual backbone (the class default above is a placeholder),
# so the tokenizer below is loaded from the Qwen checkpoint rather than facebook/opt-125m.
model_args.model_name_or_path = model_name_or_path

# Load the base Qwen1.5 backbone and wrap it with the trained LoRA adapter.
model = LlavaQwen1_5ForCausalLM.from_pretrained(
    model_name_or_path,
)

model.to(torch.float16)
model = PeftModel.from_pretrained(model, lora_path)
print(model)

import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    model_max_length=1024,
    padding_side="right",
    use_fast=False,
)
tokenizer.add_special_tokens({'unk_token': '<|extra_0|>'})

# Re-attach the SigLIP image tower and multimodal projector so the merged
# checkpoint carries the full vision configuration.
model.get_model().initialize_vision_modules(
    model_args=model_args,
)

image_tower = model.get_image_tower()
image_tower.to(dtype=torch.float16)

data_args.image_processor = image_tower.image_processor
data_args.is_multimodal = True

model.config.image_aspect_ratio = data_args.image_aspect_ratio
model.config.tokenizer_padding_side = tokenizer.padding_side

model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end
model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token
model.initialize_vision_tokenizer(model_args, tokenizer=tokenizer)

# Fold the LoRA weights back into the base weights and save a standalone checkpoint.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("llava-qwen1.5-1.8b-complex_reasoning-merged")
```
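
After merging, the checkpoint in `llava-qwen1.5-1.8b-complex_reasoning-merged` can be reloaded without PEFT. A minimal sketch, reusing only the classes and calls already shown above; the tokenizer is reloaded from the base Qwen1.5 checkpoint, since the merge snippet does not save it alongside the weights.
```python
import torch
import transformers
from moellava.model import LlavaQwen1_5ForCausalLM

merged_path = "llava-qwen1.5-1.8b-complex_reasoning-merged"  # folder written by save_pretrained above

# Load the merged (LoRA-free) model in half precision, plus the matching tokenizer.
model = LlavaQwen1_5ForCausalLM.from_pretrained(merged_path, torch_dtype=torch.float16)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "Qwen/Qwen1.5-1.8B",
    model_max_length=1024,
    padding_side="right",
    use_fast=False,
)
tokenizer.add_special_tokens({'unk_token': '<|extra_0|>'})
```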