gurumurthy3 committed
Commit cef7c92 · verified · 1 Parent(s): abcc75e

Upload Vision-GPT model (FP32 and FP16 versions)

.gitattributes CHANGED
@@ -1,35 +1,4 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
+ # Git LFS tracking
  *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
  *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,189 @@
+ ---
+ language:
+ - en
+ license: mit
+ tags:
+ - vision
+ - image-to-text
+ - image-captioning
+ - gpt2
+ - vision-transformer
+ - flickr8k
+ pipeline_tag: image-to-text
+ datasets:
+ - jxie/flickr8k
+ ---
+
+ # Vision-GPT: Image Captioning Model
+
+ A multimodal model that combines a Vision Transformer (ViT-B/16) encoder with GPT-2 for image captioning, trained on the Flickr8K dataset.
+
+ ## Model Description
+
+ This model generates natural language captions for images in three stages, sketched in code below:
+ 1. Encoding images with a pre-trained ViT-B/16 vision encoder
+ 2. Projecting the visual features into GPT-2's embedding space
+ 3. Generating captions autoregressively with GPT-2
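+
+ In outline, the forward path looks like the following sketch. It is illustrative only; `VisionGPT`, its attribute names, and the `visual_context` argument are placeholders, not the exact modules stored in this checkpoint.
+
+ ```python
+ import torch.nn as nn
+
+ class VisionGPT(nn.Module):
+     """Illustrative three-stage pipeline sketch (not the exact classes in this repo)."""
+     def __init__(self, vision_encoder, projection, gpt2):
+         super().__init__()
+         self.vision_encoder = vision_encoder   # frozen ViT-B/16
+         self.projection = projection           # trainable vision-to-text projection
+         self.gpt2 = gpt2                       # frozen GPT-2 backbone
+
+     def forward(self, images, input_ids):
+         visual_features = self.vision_encoder(images)      # patch features from the image
+         visual_tokens = self.projection(visual_features)   # mapped into GPT-2's embedding space
+         # GPT-2 attends to the projected visual tokens while predicting caption tokens
+         return self.gpt2(input_ids, visual_context=visual_tokens)
+ ```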
+
+ ## Training Details
+
+ - **Dataset**: Flickr8K (all splits: train, validation, test)
+ - **Vision Encoder**: ViT-B/16 (frozen)
+ - **Language Model**: GPT-2 (frozen backbone, trainable projection)
+ - **Training**: Only the vision-to-text projection layer is trained (see the sketch below)
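+
+ A minimal sketch of how this freezing scheme is typically wired is shown below; `vision_encoder`, `gpt2`, and `projection` are illustrative attribute names, while the learning rate and weight decay values come from `config.json`.
+
+ ```python
+ import torch
+
+ # `model` is the assembled Vision-GPT module from the sketch above.
+ # Freeze the vision encoder and the GPT-2 backbone; only the projection is updated.
+ for p in model.vision_encoder.parameters():
+     p.requires_grad = False
+ for p in model.gpt2.parameters():
+     p.requires_grad = False
+
+ optimizer = torch.optim.AdamW(
+     model.projection.parameters(),  # trainable vision-to-text projection only
+     lr=1e-4,                        # "learning_rate" in config.json
+     weight_decay=0.01,              # "weight_decay" in config.json
+ )
+ ```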
+
+ ## Model Versions
+
+ ### 📦 FP32 (Full Precision)
+ - **Size**: ~0.83 GB
+ - **Precision**: 32-bit floating point
+ - **Use case**: Maximum accuracy, research
+
+ ### 📦 FP16 (Half Precision)
+ - **Size**: ~0.83 GB
+ - **Precision**: 16-bit floating point
+ - **Use case**: Faster inference; roughly half the weight memory once the weights are cast to FP16 (a conversion sketch follows below)
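+
+ If you need to regenerate an FP16 copy from the FP32 checkpoint yourself, a minimal sketch (assuming the `model_state_dict` key used in the loading examples below) is:
+
+ ```python
+ import torch
+
+ ckpt = torch.load("model_fp32/model_checkpoint.pth", map_location="cpu")
+ fp16_state = {
+     k: v.half() if torch.is_tensor(v) and v.is_floating_point() else v
+     for k, v in ckpt["model_state_dict"].items()
+ }
+ torch.save({"model_state_dict": fp16_state}, "model_fp16/model_checkpoint.pth")
+ ```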
+
+ ## Usage
+
+ ### Installation
+
+ ```bash
+ pip install torch torchvision transformers pillow huggingface_hub
+ ```
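+
+ To fetch the checkpoints and tokenizer files from the Hub, you can use `snapshot_download`; the repository id below matches the citation URL, so adjust it if you host a fork:
+
+ ```python
+ from huggingface_hub import snapshot_download
+
+ # Downloads model_fp32/, model_fp16/, and the tokenizer files into a local directory.
+ local_dir = snapshot_download(repo_id="gurumurthy3/vision-gpt-flickr8k")
+ print(local_dir)
+ ```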
+
+ ### Loading the Model (FP32)
+
+ ```python
+ import torch
+ from transformers import GPT2Tokenizer
+ from PIL import Image
+ from torchvision import transforms
+
+ # Load checkpoint
+ checkpoint = torch.load("model_fp32/model_checkpoint.pth", map_location="cpu")
+
+ # Load tokenizer
+ tokenizer = GPT2Tokenizer.from_pretrained("model_fp32/tokenizer")
+
+ # Instantiate your model architecture (not included in this repo; see model_fp32/config.json)
+ # model = YourVisionGPTModel(config)
+ # model.load_state_dict(checkpoint['model_state_dict'])
+ # model.eval()
+
+ print("Model loaded successfully!")
+ ```
+
+ ### Loading the Model (FP16)
+
+ ```python
+ # Load FP16 checkpoint
+ checkpoint = torch.load("model_fp16/model_checkpoint.pth", map_location="cpu")
+
+ # Load model and convert to FP16
+ # model = YourVisionGPTModel(config)
+ # model.load_state_dict(checkpoint['model_state_dict'])
+ # model.half()  # Convert to FP16
+ # model.eval()
+
+ # For inference with FP16, also convert input images to FP16 (see the example below)
+ ```
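+
+ Concretely, the input cast might look like this (a minimal sketch; `image_tensor` comes from the preprocessing step in the next section):
+
+ ```python
+ # Match the input dtype to the FP16 weights before generation.
+ image_tensor = image_tensor.half()
+ ```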
+
+ ### Image Preprocessing
+
+ ```python
+ image_transform = transforms.Compose([
+     transforms.Resize((224, 224)),
+     transforms.Lambda(lambda x: x.convert('RGB')),
+     transforms.ToTensor(),
+     transforms.Normalize(
+         mean=[0.485, 0.456, 0.406],
+         std=[0.229, 0.224, 0.225]
+     ),
+ ])
+
+ # Load and preprocess image
+ image = Image.open("your_image.jpg")
+ image_tensor = image_transform(image).unsqueeze(0)  # Add batch dimension
+ ```
+
+ ### Generate Caption
+
+ ```python
+ # Generate caption
+ with torch.no_grad():
+     # Beam-search decoding (requires the model instantiated above)
+     generated_ids = model.generate(
+         image_tensor,
+         max_length=50,
+         num_beams=5,
+         temperature=0.7
+     )
+
+ # Decode caption
+ caption = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+ print(f"Generated caption: {caption}")
+ ```
+
+ ## Model Architecture
+
+ ```
+ ┌─────────────────┐
+ │   Input Image   │
+ │    (224x224)    │
+ └────────┬────────┘
+          │
+          ▼
+ ┌─────────────────┐
+ │    ViT-B/16     │
+ │    (frozen)     │
+ └────────┬────────┘
+          │
+          ▼
+ ┌─────────────────┐
+ │   Projection    │
+ │   (trainable)   │
+ └────────┬────────┘
+          │
+          ▼
+ ┌─────────────────┐
+ │      GPT-2      │
+ │    (frozen)     │
+ └────────┬────────┘
+          │
+          ▼
+ ┌─────────────────┐
+ │ Caption Output  │
+ └─────────────────┘
+ ```
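+
+ Based on the values in `model_fp32/config.json` (768-dimensional embeddings, a 64-latent perceiver resampler, and cross-attention at GPT-2 blocks 3, 6, and 9), the rough shape flow is sketched below. Exact shapes depend on the implementation, so treat this as an orientation aid rather than a specification.
+
+ ```python
+ # Illustrative tensor shapes through the pipeline (batch size B):
+ # image:              (B, 3, 224, 224)
+ # ViT-B/16 features:  (B, 197, 768)   # 14x14 = 196 patch tokens + 1 class token
+ # perceiver latents:  (B, 64, 768)    # perceiver_num_latents = 64
+ # GPT-2 decoding:     cross-attends to the latents at blocks 3, 6, and 9
+ # output logits:      (B, seq_len, 50258)  # vocab_size from config.json
+ ```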
+
+ ## Limitations
+
+ - Trained only on Flickr8K (limited domain)
+ - English captions only
+ - Input images are resized to 224x224
+ - May generate generic captions for out-of-domain images
+
+ ## Citation
+
+ If you use this model, please cite:
+
+ ```bibtex
+ @misc{vision-gpt-flickr8k,
+   author = {gurumurthy3},
+   title = {Vision-GPT: Image Captioning with ViT and GPT-2},
+   year = {2025},
+   publisher = {Hugging Face},
+   howpublished = {\url{https://huggingface.co/gurumurthy3/vision-gpt-flickr8k}}
+ }
+ ```
+
+ ## License
+
+ MIT License
+
+ ## Acknowledgments
+
+ - Vision Transformer (ViT): Dosovitskiy et al.
+ - GPT-2: OpenAI
+ - Flickr8K Dataset: Hodosh et al.
model_fp16/config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "vocab_size": 50258,
+   "emb_dim": 768,
+   "context_length": 256,
+   "n_heads": 12,
+   "n_layers": 12,
+   "drop_rate": 0.1,
+   "qkv_bias": true,
+   "cross_attention_pos": [
+     3,
+     6,
+     9
+   ],
+   "vision_enabled": true,
+   "vision_encoder_type": "torchvision_vit_b_16",
+   "vision_pretrained": true,
+   "vision_freeze": true,
+   "perceiver_num_latents": 64,
+   "perceiver_depth": 2,
+   "perceiver_heads": 8,
+   "perceiver_dim_head": 64,
+   "weight_decay": 0.01,
+   "learning_rate": 0.0001
+ }
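
The same configuration ships in both the FP32 and FP16 folders. A minimal sketch of reading it and handing it to a model constructor (`YourVisionGPTModel` is a placeholder, as in the README):

```python
import json

with open("model_fp16/config.json") as f:
    config = json.load(f)

# e.g. config["emb_dim"] == 768, config["cross_attention_pos"] == [3, 6, 9]
# model = YourVisionGPTModel(config)
```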
model_fp16/model_checkpoint.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d0a5fe0249503f0deecd8c319a1133c010cadef4294d4e301970eb7498de357
+ size 891677724
model_fp16/tokenizer/added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "<|pad|>": 50257
+ }
model_fp16/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model_fp16/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "bos_token": "<|endoftext|>",
+   "eos_token": "<|endoftext|>",
+   "pad_token": {
+     "content": "<|pad|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": "<|endoftext|>"
+ }
model_fp16/tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model_fp16/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "50256": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50257": {
+       "content": "<|pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<|endoftext|>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|endoftext|>",
+   "extra_special_tokens": {},
+   "model_max_length": 1024,
+   "pad_token": "<|pad|>",
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>"
+ }
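
Since `added_tokens.json` registers `<|pad|>` at id 50257 (which is why `vocab_size` is 50258 in `config.json`), a quick sanity check after loading the tokenizer might look like:

```python
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("model_fp16/tokenizer")
assert tokenizer.pad_token == "<|pad|>"
assert tokenizer.pad_token_id == 50257
```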
model_fp16/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
model_fp32/config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "vocab_size": 50258,
+   "emb_dim": 768,
+   "context_length": 256,
+   "n_heads": 12,
+   "n_layers": 12,
+   "drop_rate": 0.1,
+   "qkv_bias": true,
+   "cross_attention_pos": [
+     3,
+     6,
+     9
+   ],
+   "vision_enabled": true,
+   "vision_encoder_type": "torchvision_vit_b_16",
+   "vision_pretrained": true,
+   "vision_freeze": true,
+   "perceiver_num_latents": 64,
+   "perceiver_depth": 2,
+   "perceiver_heads": 8,
+   "perceiver_dim_head": 64,
+   "weight_decay": 0.01,
+   "learning_rate": 0.0001
+ }
model_fp32/model_checkpoint.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d0a5fe0249503f0deecd8c319a1133c010cadef4294d4e301970eb7498de357
+ size 891677724
model_fp32/tokenizer/added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "<|pad|>": 50257
+ }
model_fp32/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model_fp32/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "bos_token": "<|endoftext|>",
+   "eos_token": "<|endoftext|>",
+   "pad_token": {
+     "content": "<|pad|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": "<|endoftext|>"
+ }
model_fp32/tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model_fp32/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "50256": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50257": {
+       "content": "<|pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<|endoftext|>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|endoftext|>",
+   "extra_special_tokens": {},
+   "model_max_length": 1024,
+   "pad_token": "<|pad|>",
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>"
+ }
model_fp32/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff