JH-C-k committed on
Commit
ec54512
·
verified ·
1 Parent(s): 36a9b01

Add files using upload-large-folder tool

__pycache__/model.cpython-310.pyc CHANGED
Binary files a/__pycache__/model.cpython-310.pyc and b/__pycache__/model.cpython-310.pyc differ
 
__pycache__/transformer.cpython-310.pyc CHANGED
Binary files a/__pycache__/transformer.cpython-310.pyc and b/__pycache__/transformer.cpython-310.pyc differ
 
config.json CHANGED
@@ -22,7 +22,123 @@
22
  "num_hidden_layers": 12,
23
  "max_position_embeddings": 77
24
  },
25
- "neuron_dict": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  "projection_dim": 768,
27
  "torch_dtype": "float32",
28
  "transformers_version": "4.21.0"
 
22
  "num_hidden_layers": 12,
23
  "max_position_embeddings": 77
24
  },
25
+ "num_register_tokens": 1,
26
+ "neuron_dict": {
27
+ "9": [
28
+ 815,
29
+ 4078,
30
+ 3618,
31
+ 2693,
32
+ 3973,
33
+ 1744,
34
+ 1983,
35
+ 1157,
36
+ 1309,
37
+ 1335,
38
+ 2607,
39
+ 2396,
40
+ 3049,
41
+ 1610,
42
+ 2621,
43
+ 2867,
44
+ 2012,
45
+ 1924,
46
+ 2394,
47
+ 3097,
48
+ 3125,
49
+ 3959,
50
+ 3210,
51
+ 2855,
52
+ 3609,
53
+ 526,
54
+ 3362,
55
+ 3395,
56
+ 2626,
57
+ 503,
58
+ 2941,
59
+ 3696,
60
+ 1823,
61
+ 2000,
62
+ 129,
63
+ 3667,
64
+ 1372,
65
+ 147,
66
+ 1150,
67
+ 852,
68
+ 3222
69
+ ],
70
+ "8": [
71
+ 745,
72
+ 3249,
73
+ 2585,
74
+ 1537,
75
+ 200,
76
+ 1603,
77
+ 1851,
78
+ 3523,
79
+ 3697,
80
+ 3137,
81
+ 2563,
82
+ 2293,
83
+ 730,
84
+ 906,
85
+ 1528,
86
+ 3348,
87
+ 2438,
88
+ 1564,
89
+ 1540,
90
+ 3238,
91
+ 3606
92
+ ],
93
+ "10": [
94
+ 357,
95
+ 1654,
96
+ 3940,
97
+ 2319,
98
+ 2560,
99
+ 2559,
100
+ 4009,
101
+ 3029,
102
+ 951,
103
+ 1903,
104
+ 738,
105
+ 1602,
106
+ 1807,
107
+ 2018,
108
+ 1281,
109
+ 267,
110
+ 3539,
111
+ 1015,
112
+ 496,
113
+ 693,
114
+ 2278,
115
+ 7,
116
+ 856,
117
+ 2785,
118
+ 2690,
119
+ 1367
120
+ ],
121
+ "7": [
122
+ 3228,
123
+ 2550,
124
+ 2977,
125
+ 3716,
126
+ 2467
127
+ ],
128
+ "0": [
129
+ 2890,
130
+ 1779,
131
+ 3761
132
+ ],
133
+ "6": [
134
+ 1042,
135
+ 2315,
136
+ 1674
137
+ ],
138
+ "3": [
139
+ 410
140
+ ]
141
+ },
142
  "projection_dim": 768,
143
  "torch_dtype": "float32",
144
  "transformers_version": "4.21.0"
model.py CHANGED
@@ -270,14 +270,23 @@ class CLIP(nn.Module):
270
  self.visual.set_grad_checkpointing(enable)
271
  self.transformer.grad_checkpointing = enable
272
 
273
- def encode_image(self, image, normalize: bool = False, attn_method: Text = 'direct', num_register_tokens = None, neuron_dict=None):
274
  if num_register_tokens is None and neuron_dict is None:
275
  num_register_tokens = self.num_register_tokens
276
  neuron_dict = self.neuron_dict
277
-
278
 
279
- features = self.visual(image, attn_method=attn_method, num_register_tokens=num_register_tokens, neuron_dict=neuron_dict)
280
- return F.normalize(features, dim=-1) if normalize else features
281
 
282
  def encode_text(self, text, normalize: bool = False):
283
  cast_dtype = self.transformer.get_cast_dtype()
 
270
  self.visual.set_grad_checkpointing(enable)
271
  self.transformer.grad_checkpointing = enable
272
 
273
+ def encode_image(self, image, normalize: bool = False, attn_method: Text = 'direct', num_register_tokens = None, neuron_dict=None, get_hidden_states=False):
274
  if num_register_tokens is None and neuron_dict is None:
275
  num_register_tokens = self.num_register_tokens
276
  neuron_dict = self.neuron_dict
 
277
 
278
+ if get_hidden_states:
279
+ ret = self.visual(image, attn_method=attn_method, num_register_tokens=num_register_tokens, neuron_dict=neuron_dict, get_hidden_states=get_hidden_states)
280
+ # warning: only the global cls token is normalized
281
+ return {
282
+ "pooled": F.normalize(ret["pooled"], dim=-1) if normalize else ret["pooled"],
283
+ "tokens": ret["tokens"],
284
+ "hidden_states": ret["hidden_states"]
285
+ }
286
+ else:
287
+ features = self.visual(image, attn_method=attn_method, num_register_tokens=num_register_tokens, neuron_dict=neuron_dict)
288
+ return F.normalize(features, dim=-1) if normalize else features
289
+
290
 
291
  def encode_text(self, text, normalize: bool = False):
292
  cast_dtype = self.transformer.get_cast_dtype()
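
With get_hidden_states=True, encode_image now returns a dict rather than a bare feature tensor, and (as the in-line warning notes) only the pooled global CLS feature is normalized. A minimal usage sketch, assuming model is the CLIP instance defined in model.py and image is an already preprocessed [B, 3, H, W] tensor:

import torch

with torch.no_grad():
    feats = model.encode_image(image, normalize=True)                        # tensor, unchanged behaviour
    out = model.encode_image(image, normalize=True, get_hidden_states=True)  # dict

print(sorted(out.keys()))         # ['hidden_states', 'pooled', 'tokens']
print(out["pooled"].shape)        # normalized global CLS feature
print(len(out["hidden_states"]))  # input embedding plus one entry per transformer block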
model_sanity_check.ipynb CHANGED
@@ -13,7 +13,7 @@
13
  },
14
  {
15
  "cell_type": "code",
16
- "execution_count": 2,
17
  "id": "e7cec94e",
18
  "metadata": {},
19
  "outputs": [],
@@ -65,7 +65,7 @@
65
  },
66
  {
67
  "cell_type": "code",
68
- "execution_count": null,
69
  "id": "b4c7a750",
70
  "metadata": {},
71
  "outputs": [
@@ -77,6 +77,7 @@
77
  "โœ“ Added '/workspace/code/clipL336_TTR' to Python path.\n",
78
  "โœ“ Successfully imported 'model' from '/workspace/code/clipL336_TTR'\n",
79
  "Building vision tower with config: CLIPVisionCfg(layers=24, width=1024, head_width=64, mlp_ratio=4.0, patch_size=14, image_size=336, ls_init_value=None, patch_dropout=0.0, input_patchnorm=False, global_average_pool=False, attentional_pool=False, n_queries=256, attn_pooler_heads=8, output_tokens=False, timm_model_name=None, timm_model_pretrained=False, timm_pool='avg', timm_proj='linear', timm_proj_bias=False, timm_drop=0.0, timm_drop_path=None)\n",
 
80
  "โœ“ Added '/workspace/data/cache/huggingface/modules/transformers_modules/clipL336_TTR' to Python path.\n",
81
  "โœ“ Successfully imported 'tokenizer' from '/workspace/data/cache/huggingface/modules/transformers_modules/clipL336_TTR'\n",
82
  "Custom CLIP model loaded successfully!\n"
@@ -104,7 +105,7 @@
104
  },
105
  {
106
  "cell_type": "code",
107
- "execution_count": 4,
108
  "id": "ed3cbfdc",
109
  "metadata": {},
110
  "outputs": [
@@ -112,7 +113,7 @@
112
  "name": "stderr",
113
  "output_type": "stream",
114
  "text": [
115
- "100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 1000/1000 [00:23<00:00, 41.71it/s]"
116
  ]
117
  },
118
  {
@@ -133,36 +134,29 @@
133
  "source": [
134
  "# langauge head\n",
135
  "### zeroshot head construction (text encoding) ###\n",
136
- "with torch.no_grad():\n",
137
- " zeroshot_weight = []\n",
138
- " for classname in tqdm(IMAGENET_CLASSNAMES):\n",
139
- " texts = [template(classname) for template in OPENAI_IMAGENET_TEMPLATES]\n",
140
- " text_inputs = preprocessor(text=texts, return_tensors=\"pt\", padding=\"max_length\").to(device)\n",
141
- " # text_inputs = model.tokenize(texts).to(device)\n",
142
- " # text_features = model.encode_text(text_inputs.input_ids)\n",
143
- " text_features = model_clip.get_text_features(**text_inputs)\n",
144
- " text_feature = F.normalize(text_features, dim=-1).mean(dim=0)\n",
145
- " # text_feature = text_features.mean(dim=0)\n",
146
- " text_feature = text_feature / text_feature.norm()\n",
147
- " zeroshot_weight.append(text_feature)\n",
148
- " \n",
149
- " text_features = torch.stack(zeroshot_weight, dim=1).to(device)\n",
150
- "print(\"Built text features:\", text_features.shape)"
151
- ]
152
- },
153
- {
154
- "cell_type": "code",
155
- "execution_count": 10,
156
- "id": "e1bd37d1",
157
- "metadata": {},
158
- "outputs": [],
159
- "source": [
160
- "torch.save(text_features, \"./zeroshot_classifier.pt\")"
161
  ]
162
  },
163
  {
164
  "cell_type": "code",
165
- "execution_count": 5,
166
  "id": "dbfeaedf",
167
  "metadata": {},
168
  "outputs": [],
@@ -173,7 +167,7 @@
173
  },
174
  {
175
  "cell_type": "code",
176
- "execution_count": 6,
177
  "id": "b0000195",
178
  "metadata": {},
179
  "outputs": [],
@@ -204,44 +198,170 @@
204
  " return top1 / n * 100, top5 / n * 100\n"
205
  ]
206
  },
207
  {
208
  "cell_type": "code",
209
  "execution_count": 8,
210
- "id": "8795b394",
211
  "metadata": {},
212
  "outputs": [
213
  {
214
- "name": "stderr",
215
- "output_type": "stream",
216
- "text": [
217
- "Evaluating: 0%| | 0/391 [00:00<?, ?batch/s]"
218
- ]
219
- },
220
  {
221
- "name": "stderr",
222
- "output_type": "stream",
223
- "text": [
224
- "Evaluating: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 391/391 [10:38<00:00, 1.63s/batch, samples=5e+4, top1=74.9, top5=94.4] "
225
- ]
226
- },
227
  {
228
- "name": "stdout",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  "output_type": "stream",
230
  "text": [
231
- "Baseline (Topโ€‘1 / Topโ€‘5) on 50,000 imgs: 74.87% / 94.37%\n"
232
  ]
233
  },
234
  {
235
- "name": "stderr",
236
- "output_type": "stream",
237
- "text": [
238
- "\n"
 
 
 
 
 
239
  ]
240
  }
241
  ],
242
  "source": [
243
  "\n",
244
  "### baseline evaluator ###\n",
 
 
 
 
245
  "\n",
246
  "BASELINE_SAMPLES = 50000 # set to None for full 50โ€ฏk\n",
247
  "acc1, acc5 = evaluate(model, eval_loader, text_features, max_samples=BASELINE_SAMPLES)\n",
 
13
  },
14
  {
15
  "cell_type": "code",
16
+ "execution_count": 1,
17
  "id": "e7cec94e",
18
  "metadata": {},
19
  "outputs": [],
 
65
  },
66
  {
67
  "cell_type": "code",
68
+ "execution_count": 2,
69
  "id": "b4c7a750",
70
  "metadata": {},
71
  "outputs": [
 
77
  "โœ“ Added '/workspace/code/clipL336_TTR' to Python path.\n",
78
  "โœ“ Successfully imported 'model' from '/workspace/code/clipL336_TTR'\n",
79
  "Building vision tower with config: CLIPVisionCfg(layers=24, width=1024, head_width=64, mlp_ratio=4.0, patch_size=14, image_size=336, ls_init_value=None, patch_dropout=0.0, input_patchnorm=False, global_average_pool=False, attentional_pool=False, n_queries=256, attn_pooler_heads=8, output_tokens=False, timm_model_name=None, timm_model_pretrained=False, timm_pool='avg', timm_proj='linear', timm_proj_bias=False, timm_drop=0.0, timm_drop_path=None)\n",
80
+ "Currently text tower is removed, using only image encoder for feature extraction\n",
81
  "โœ“ Added '/workspace/data/cache/huggingface/modules/transformers_modules/clipL336_TTR' to Python path.\n",
82
  "โœ“ Successfully imported 'tokenizer' from '/workspace/data/cache/huggingface/modules/transformers_modules/clipL336_TTR'\n",
83
  "Custom CLIP model loaded successfully!\n"
 
105
  },
106
  {
107
  "cell_type": "code",
108
+ "execution_count": 3,
109
  "id": "ed3cbfdc",
110
  "metadata": {},
111
  "outputs": [
 
113
  "name": "stderr",
114
  "output_type": "stream",
115
  "text": [
116
+ "100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 1000/1000 [00:23<00:00, 41.78it/s]"
117
  ]
118
  },
119
  {
 
134
  "source": [
135
  "# langauge head\n",
136
  "### zeroshot head construction (text encoding) ###\n",
137
+ "construction_language_cls_head = True\n",
138
+ "\n",
139
+ "if construction_language_cls_head:\n",
140
+ " with torch.no_grad():\n",
141
+ " zeroshot_weight = []\n",
142
+ " for classname in tqdm(IMAGENET_CLASSNAMES):\n",
143
+ " texts = [template(classname) for template in OPENAI_IMAGENET_TEMPLATES]\n",
144
+ " text_inputs = preprocessor(text=texts, return_tensors=\"pt\", padding=\"max_length\").to(device)\n",
145
+ " # text_inputs = model.tokenize(texts).to(device)\n",
146
+ " # text_features = model.encode_text(text_inputs.input_ids)\n",
147
+ " text_features = model_clip.get_text_features(**text_inputs)\n",
148
+ " text_feature = F.normalize(text_features, dim=-1).mean(dim=0)\n",
149
+ " # text_feature = text_features.mean(dim=0)\n",
150
+ " text_feature = text_feature / text_feature.norm()\n",
151
+ " zeroshot_weight.append(text_feature)\n",
152
+ " \n",
153
+ " text_features = torch.stack(zeroshot_weight, dim=1).to(device)\n",
154
+ " print(\"Built text features:\", text_features.shape)"
 
 
 
 
 
 
 
155
  ]
156
  },
157
  {
158
  "cell_type": "code",
159
+ "execution_count": 4,
160
  "id": "dbfeaedf",
161
  "metadata": {},
162
  "outputs": [],
 
167
  },
168
  {
169
  "cell_type": "code",
170
+ "execution_count": 5,
171
  "id": "b0000195",
172
  "metadata": {},
173
  "outputs": [],
 
198
  " return top1 / n * 100, top5 / n * 100\n"
199
  ]
200
  },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": 6,
204
+ "id": "5806f422",
205
+ "metadata": {},
206
+ "outputs": [],
207
+ "source": [
208
+ "model = model.half().to(device)"
209
+ ]
210
+ },
211
+ {
212
+ "cell_type": "code",
213
+ "execution_count": 7,
214
+ "id": "21372f58",
215
+ "metadata": {},
216
+ "outputs": [],
217
+ "source": [
218
+ "sample_image = imagenet_dataset[0][0].unsqueeze(0).half().to(device)\n",
219
+ "result = model.encode_image(sample_image, get_hidden_states=True) # test\n",
220
+ "# ์ด๊ฑฐ ๊ทธ๋Œ€๋กœ ์ด์ œ foward"
221
+ ]
222
+ },
223
+ {
224
+ "cell_type": "code",
225
+ "execution_count": 16,
226
+ "id": "741d9cac",
227
+ "metadata": {},
228
+ "outputs": [],
229
+ "source": [
230
+ "vision_forward_result = model_clip.vision_model(pixel_values=sample_image, output_hidden_states=True)"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": 20,
236
+ "id": "c3dd134e",
237
+ "metadata": {},
238
+ "outputs": [
239
+ {
240
+ "data": {
241
+ "text/plain": [
242
+ "torch.Size([1, 577, 1024])"
243
+ ]
244
+ },
245
+ "execution_count": 20,
246
+ "metadata": {},
247
+ "output_type": "execute_result"
248
+ }
249
+ ],
250
+ "source": [
251
+ "vision_forward_result.hidden_states[0].shape\n",
252
+ "# ์—ฌ๊ธฐ์„œ๋„ 25์ด๋‹ค."
253
+ ]
254
+ },
255
  {
256
  "cell_type": "code",
257
  "execution_count": 8,
258
+ "id": "04d16694",
259
  "metadata": {},
260
  "outputs": [
261
  {
262
+ "data": {
263
+ "text/plain": [
264
+ "dict_keys(['pooled', 'tokens', 'hidden_states'])"
265
+ ]
266
+ },
267
+ "execution_count": 8,
268
+ "metadata": {},
269
+ "output_type": "execute_result"
270
+ }
271
+ ],
272
+ "source": [
273
+ "result.keys()\n",
274
+ "# ํ•˜๋‚˜๋งŒ ๋” ํ™•์ธํ•˜๊ธฐ CLS token์„ ๋ถ™์ด๋Š” ๊ฐ€ ์•„๋‹Œ๊ฐ€?"
275
+ ]
276
+ },
277
+ {
278
+ "cell_type": "code",
279
+ "execution_count": 9,
280
+ "id": "36c68c45",
281
+ "metadata": {},
282
+ "outputs": [
283
  {
284
+ "data": {
285
+ "text/plain": [
286
+ "torch.Size([1, 578, 1024])"
287
+ ]
288
+ },
289
+ "execution_count": 9,
290
+ "metadata": {},
291
+ "output_type": "execute_result"
292
+ }
293
+ ],
294
+ "source": [
295
+ "# ํ˜„ ์ƒํ™ฉ์„ ๋ณด๋ฉด register token์€ ์•ˆ ๋“ค์–ด๊ฐ€ ๊ฐ„ ๊ฒƒ์„ ๋ณผ ์ˆ˜ ์žˆ๋‹ค.\n",
296
+ "result[\"hidden_states\"][0].shape"
297
+ ]
298
+ },
299
+ {
300
+ "cell_type": "code",
301
+ "execution_count": null,
302
+ "id": "d9f06ec5",
303
+ "metadata": {},
304
+ "outputs": [
305
  {
306
+ "data": {
307
+ "text/plain": [
308
+ "1"
309
+ ]
310
+ },
311
+ "execution_count": 12,
312
+ "metadata": {},
313
+ "output_type": "execute_result"
314
+ }
315
+ ],
316
+ "source": [
317
+ "model.num_register_tokens\n",
318
+ "# ok, hidden state ๋„ฃ์–ด์ค„ ๋•Œ, layer_idx, ๊ทธ๋ฆฌ๊ณ  num_register ์ž˜ ์ธ์ง€ํ•ด์„œ, parsing์„ ํ•ด์ฃผ๋„๋ก ํ•ด์•ผ ๊ฒ ๋‹ค.\n",
319
+ "# model.neuron_dict"
320
+ ]
321
+ },
322
+ {
323
+ "cell_type": "code",
324
+ "execution_count": null,
325
+ "id": "4504ccd6",
326
+ "metadata": {},
327
+ "outputs": [],
328
+ "source": [
329
+ "raise StopIteration()"
330
+ ]
331
+ },
332
+ {
333
+ "cell_type": "code",
334
+ "execution_count": 10,
335
+ "id": "8795b394",
336
+ "metadata": {},
337
+ "outputs": [
338
+ {
339
+ "name": "stderr",
340
  "output_type": "stream",
341
  "text": [
342
+ "Evaluating: 2%|โ– | 6/391 [00:13<14:35, 2.27s/batch, samples=768, top1=89.1, top5=98.3]\n"
343
  ]
344
  },
345
  {
346
+ "ename": "KeyboardInterrupt",
347
+ "evalue": "",
348
+ "output_type": "error",
349
+ "traceback": [
350
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
351
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
352
+ "Cell \u001b[0;32mIn[10], line 8\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m### baseline evaluator ###\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m### ์ด๊ฑฐ๋Š” ์ง€๊ธˆ ๋‹น์žฅ์€ ๋ชป ์จ๋จน๋Š”๋‹ค... ๋ฏธ์นœ ๋„ˆ๋ฌด ๋А๋ฆฌ๋‹ค ์–ด๋””์„œ ๋ฌธ์ œ์ง€ ###\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# ์”จ๋ฐœ ์ด๋ฒˆ์— ๋ญ์ง€\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# architecture define์ด ์–ด๋”˜๊ฐ€์—์„œ ์†์ƒ ๋œ ๊ฒƒ์œผ๋กœ ๋ณด์ธ๋‹ค\u001b[39;00m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# ์„ฑ๋Šฅ reproduce...\u001b[39;00m\n\u001b[1;32m 7\u001b[0m BASELINE_SAMPLES \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m50000\u001b[39m \u001b[38;5;66;03m# set to None for full 50โ€ฏk\u001b[39;00m\n\u001b[0;32m----> 8\u001b[0m acc1, acc5 \u001b[38;5;241m=\u001b[39m \u001b[43mevaluate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43meval_loader\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtext_features\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_samples\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mBASELINE_SAMPLES\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBaseline (Topโ€‘1 / Topโ€‘5) on \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mBASELINE_SAMPLES\u001b[38;5;250m \u001b[39m\u001b[38;5;129;01mor\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28mlen\u001b[39m(imagenet_dataset)\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m,\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m imgs: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00macc1\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m.2f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m% / \u001b[39m\u001b[38;5;132;01m{\u001b[39;00macc5\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m.2f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
353
+ "Cell \u001b[0;32mIn[5], line 20\u001b[0m, in \u001b[0;36mevaluate\u001b[0;34m(model, loader, text_feats, max_samples)\u001b[0m\n\u001b[1;32m 18\u001b[0m logits \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39mlogit_scale\u001b[38;5;241m.\u001b[39mexp() \u001b[38;5;241m*\u001b[39m feats \u001b[38;5;241m@\u001b[39m text_feats \n\u001b[1;32m 19\u001b[0m _, pred \u001b[38;5;241m=\u001b[39m logits\u001b[38;5;241m.\u001b[39mtopk(\u001b[38;5;241m5\u001b[39m, dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m---> 20\u001b[0m top1 \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[43m(\u001b[49m\u001b[43mpred\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m:\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43munsqueeze\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msum\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mitem\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 21\u001b[0m top5 \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m (pred \u001b[38;5;241m==\u001b[39m labels\u001b[38;5;241m.\u001b[39munsqueeze(\u001b[38;5;241m1\u001b[39m))\u001b[38;5;241m.\u001b[39msum()\u001b[38;5;241m.\u001b[39mitem()\n\u001b[1;32m 22\u001b[0m n \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m images\u001b[38;5;241m.\u001b[39msize(\u001b[38;5;241m0\u001b[39m)\n",
354
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
355
  ]
356
  }
357
  ],
358
  "source": [
359
  "\n",
360
  "### baseline evaluator ###\n",
361
+ "### ์ด๊ฑฐ๋Š” ์ง€๊ธˆ ๋‹น์žฅ์€ ๋ชป ์จ๋จน๋Š”๋‹ค... ๋ฏธ์นœ ๋„ˆ๋ฌด ๋А๋ฆฌ๋‹ค ์–ด๋””์„œ ๋ฌธ์ œ์ง€ ###\n",
362
+ "# ์”จ๋ฐœ ์ด๋ฒˆ์— ๋ญ์ง€\n",
363
+ "# architecture define์ด ์–ด๋”˜๊ฐ€์—์„œ ์†์ƒ ๋œ ๊ฒƒ์œผ๋กœ ๋ณด์ธ๋‹ค\n",
364
+ "# ์„ฑ๋Šฅ reproduce...\n",
365
  "\n",
366
  "BASELINE_SAMPLES = 50000 # set to None for full 50โ€ฏk\n",
367
  "acc1, acc5 = evaluate(model, eval_loader, text_features, max_samples=BASELINE_SAMPLES)\n",
transformer.py CHANGED
@@ -560,8 +560,12 @@ class Transformer(nn.Module):
560
  attn_mask: Optional[torch.Tensor] = None,
561
  attn_method: Text = "direct",
562
  neuron_dict=None,
563
- num_register_tokens=0
564
- ):
565
  for r in self.resblocks:
566
  if self.grad_checkpointing and not torch.jit.is_scripting():
567
  raise ValueError("grad_checkpointing not implemented")
@@ -573,7 +577,16 @@ class Transformer(nn.Module):
573
  neuron_dict=neuron_dict,
574
  num_register_tokens=num_register_tokens
575
  )
576
- return x
 
 
 
 
 
 
 
 
 
577
 
578
 
579
  class VisionTransformer(nn.Module):
@@ -672,7 +685,8 @@ class VisionTransformer(nn.Module):
672
  else:
673
  return x[:, 0], x[:, 1:]
674
 
675
- def forward(self, x: torch.Tensor, attn_method: Text = "direct", num_register_tokens = None, neuron_dict=None):
 
676
  # to patches
677
 
678
  if num_register_tokens is None and neuron_dict is None:
@@ -725,7 +739,14 @@ class VisionTransformer(nn.Module):
725
  x = self.patch_dropout(x)
726
  x = self.ln_pre(x)
727
 
728
- x = self.transformer(x, attn_mask=None, attn_method=attn_method, neuron_dict=neuron_dict, num_register_tokens=num_register_tokens)
729
 
730
  if self.attn_pool is not None:
731
  x = self.attn_pool(x)
@@ -740,6 +761,13 @@ class VisionTransformer(nn.Module):
740
 
741
  if self.output_tokens:
742
  return pooled, tokens
743
 
744
  return pooled
745
 
 
560
  attn_mask: Optional[torch.Tensor] = None,
561
  attn_method: Text = "direct",
562
  neuron_dict=None,
563
+ num_register_tokens=0,
564
+ get_hidden_states: bool = False,
565
+ ):
566
+ if "hidden_states" not in locals():
567
+ hidden_states = []
568
+ hidden_states.append(x) # store the input embedding
569
  for r in self.resblocks:
570
  if self.grad_checkpointing and not torch.jit.is_scripting():
571
  raise ValueError("grad_checkpointing not implemented")
 
577
  neuron_dict=neuron_dict,
578
  num_register_tokens=num_register_tokens
579
  )
580
+ if get_hidden_states:
581
+ hidden_states.append(x)
582
+
583
+ if get_hidden_states:
584
+ return {
585
+ "hidden_states": hidden_states,
586
+ "last_hidden_state": x,
587
+ }
588
+ else:
589
+ return x
590
 
591
 
592
  class VisionTransformer(nn.Module):
 
685
  else:
686
  return x[:, 0], x[:, 1:]
687
 
688
+ # from here
689
+ def forward(self, x: torch.Tensor, attn_method: Text = "direct", num_register_tokens = None, neuron_dict=None, get_hidden_states:bool=False):
690
  # to patches
691
 
692
  if num_register_tokens is None and neuron_dict is None:
 
739
  x = self.patch_dropout(x)
740
  x = self.ln_pre(x)
741
 
742
+ # what goes in here is [B, 1 + 576 + num_register_tokens, C]
743
+ if get_hidden_states:
744
+ ret = self.transformer(x, attn_mask=None, attn_method=attn_method, neuron_dict=neuron_dict, num_register_tokens=num_register_tokens,get_hidden_states=get_hidden_states)
745
+
746
+ hidden_states = ret["hidden_states"]
747
+ x = ret["last_hidden_state"]
748
+ else:
749
+ x = self.transformer(x, attn_mask=None, attn_method=attn_method, neuron_dict=neuron_dict, num_register_tokens=num_register_tokens,get_hidden_states=get_hidden_states)
750
 
751
  if self.attn_pool is not None:
752
  x = self.attn_pool(x)
 
761
 
762
  if self.output_tokens:
763
  return pooled, tokens
764
+
765
+ if get_hidden_states:
766
+ return {
767
+ "pooled": pooled, # GLOBAL CLS
768
+ "tokens": tokens, # ALL TOKENS
769
+ "hidden_states": hidden_states # layer-wise hidden states
770
+ }
771
 
772
  return pooled
773
 
zeroshot_classifier.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7dff47ac37ed4b67771bf6cf651a55dcf95d22eddc91acce2f54638ec82c6783
3
- size 1537240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:629c9c73b717ffa38a56f57b20ebe4fd5470cc03d730f7919c2bacf2c388f560
3
+ size 124120
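
zeroshot_classifier.pt is the text-side head the sanity-check notebook builds and saves with torch.save(text_features, "./zeroshot_classifier.pt"): per-class, L2-normalized text embeddings stacked along dim=1. A hedged sketch of reusing the saved head instead of re-encoding the prompts; the logit_scale access path mirrors the notebook's evaluate(), while dtype/device handling and the variable names are assumptions:

import torch

text_features = torch.load("zeroshot_classifier.pt", map_location=device)  # [embed_dim, num_classes]

with torch.no_grad():
    feats = model.encode_image(images, normalize=True)              # [B, embed_dim]
    logits = model.model.logit_scale.exp() * feats @ text_features  # scaling as in evaluate()
    top5_pred = logits.topk(5, dim=-1).indices                      # [B, 5] predicted class ids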