Spaces:

TongkunGuan
/

Token-level_Text_Image_Foundation_Model

Running on Zero

App Files Files Community

TongkunGuan committed on 6 days ago

Commit

0afd727

verified ·

1 Parent(s): 63e22db

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -7

app.py CHANGED Viewed

@@ -11,11 +11,12 @@ from utils import generate_similiarity_map, post_process, load_tokenizer, build_
 from utils import IMAGENET_MEAN, IMAGENET_STD
 from internvl.train.dataset import dynamic_preprocess
 from internvl.model.internvl_chat import InternVLChatModel
 # 模型配置
 CHECKPOINTS = {
-    "TokenFD-4096-English-seg": "TongkunGuan/TokenFD_4096_English_seg",
-    "TokenFD-2048-Bilingual-seg": "TongkunGuan/TokenFD_2048_Bilingual_seg",
 }
 # 全局变量
@@ -24,9 +25,10 @@ current_vis = []
 current_bpe = []
 current_index = 0
 def load_model(check_type):
-    device = torch.device("cpu")
     if check_type == 'R50':
         tokenizer = load_tokenizer('tokenizer_path')
         model = build_model(argparse.Namespace()).eval()
@@ -39,7 +41,7 @@ def load_model(check_type):
         model.load_state_dict(torch.load(CHECKPOINTS['R50_siglip'], map_location='cpu')['model'])
         transform = build_transform_R50(normalize_type='imagenet')
-    elif 'TokenOCR' in check_type:
         model_path = CHECKPOINTS[check_type]
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False, use_auth_token=HF_TOKEN)
         model = InternVLChatModel.from_pretrained(model_path, torch_dtype=torch.bfloat16).eval()
@@ -121,9 +123,9 @@ with gr.Blocks(title="BPE Visualization Demo") as demo:
     with gr.Row():
         with gr.Column(scale=0.5):
             model_type = gr.Dropdown(
-                choices=["TokenOCR-4096-English-seg", "TokenOCR-2048-Bilingual-seg", "R50", "R50_siglip"],
                 label="Select model type",
-                value="R50"  # 设置默认值为第一个选项
             )
             image_input = gr.Image(label="Upload images", type="pil")
             text_input = gr.Textbox(label="Input text")
@@ -155,6 +157,7 @@ with gr.Blocks(title="BPE Visualization Demo") as demo:
             bpe_display = gr.Markdown("Current BPE: ", visible=False)
     # 事件处理
     def on_run_clicked(model_type, image, text):
         global current_vis, current_bpe, current_index
         current_index = 0  # Reset index when new image is processed

 from utils import IMAGENET_MEAN, IMAGENET_STD
 from internvl.train.dataset import dynamic_preprocess
 from internvl.model.internvl_chat import InternVLChatModel
+import spaces
 # 模型配置
 CHECKPOINTS = {
+    "TokenFD_4096_English_seg": "TongkunGuan/TokenFD_4096_English_seg",
+    "TokenFD_2048_Bilingual_seg": "TongkunGuan/TokenFD_2048_Bilingual_seg",
 }
 # 全局变量
 current_bpe = []
 current_index = 0
 def load_model(check_type):
+    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    device = torch.device("cuda")
     if check_type == 'R50':
         tokenizer = load_tokenizer('tokenizer_path')
         model = build_model(argparse.Namespace()).eval()
         model.load_state_dict(torch.load(CHECKPOINTS['R50_siglip'], map_location='cpu')['model'])
         transform = build_transform_R50(normalize_type='imagenet')
+    elif 'TokenFD' in check_type:
         model_path = CHECKPOINTS[check_type]
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False, use_auth_token=HF_TOKEN)
         model = InternVLChatModel.from_pretrained(model_path, torch_dtype=torch.bfloat16).eval()
     with gr.Row():
         with gr.Column(scale=0.5):
             model_type = gr.Dropdown(
+                choices=["TokenOCR_4096_English_seg", "TokenOCR_2048_Bilingual_seg", "R50", "R50_siglip"],
                 label="Select model type",
+                value="TokenOCR_4096_English_seg"  # 设置默认值为第一个选项
             )
             image_input = gr.Image(label="Upload images", type="pil")
             text_input = gr.Textbox(label="Input text")
             bpe_display = gr.Markdown("Current BPE: ", visible=False)
     # 事件处理
+    @spaces.GPU
     def on_run_clicked(model_type, image, text):
         global current_vis, current_bpe, current_index
         current_index = 0  # Reset index when new image is processed