AXERA-TECH
/

Qwen2.5-VL-3B-Instruct

Image-Text-to-Text

Qwen2.5-VL-3B-Instruct

Model card Files Files and versions

lihongjie committed 28 days ago

Commit

891634b

·

1 Parent(s): df33ba5

支持多图和任意分辨率

Files changed (1) hide show

qwen2_tokenizer_images.py +5 -4

qwen2_tokenizer_images.py CHANGED Viewed

@@ -79,14 +79,15 @@ class Tokenizer_Http():
     def encode(self, content):
         """Tokenize a text-only user message wrapped in the chat template.

         The message is placed in the user turn after a fixed system prompt,
         and the prompt ends with an opened assistant turn.

         Args:
             content: Text inserted verbatim into the user turn.

         Returns:
             The ``"input_ids"`` entry for the single prompt in the batch.
         """
         prompt = f'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{content}<|im_end|>\n<|im_start|>assistant\n'
         # The tokenizer is called with a one-element batch, so the ids for
         # this prompt sit at index 0 of the batched result.
         batch = self.tokenizer([prompt])
         return batch["input_ids"][0]
-    def encode_vpm(self, content="Describe this image."):
         # official implementation
-        text = f'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|>' + '<|image_pad|>' * 256 + f'<|vision_end|>{content}<|im_end|>\n<|im_start|>assistant\n'
         output_kwargs = {'text_kwargs': {'padding': True, 'return_tensors': 'pt'}, 'images_kwargs': {'return_tensors': 'pt'}, 'audio_kwargs': {'padding': True, 'return_tensors': 'pt'}, 'videos_kwargs': {'fps': 2.0, 'return_tensors': 'pt'}, 'common_kwargs': {'return_tensors': 'pt'}}
@@ -205,7 +206,7 @@ class Request(BaseHTTPRequestHandler):
             if 'img_prompt' in req:
                 b_img_prompt = req['img_prompt']
             if b_img_prompt:
-                token_ids = tokenizer.encode_vpm(prompt)
             else:
                 token_ids = tokenizer.encode(prompt)

     def encode(self, content):
         """Build the text-only chat prompt for ``content`` and tokenize it.

         The prompt consists of a fixed system turn, a user turn holding
         ``content``, and the opening of the assistant turn.

         Args:
             content: Text inserted verbatim into the user turn.

         Returns:
             The ``"input_ids"`` entry for the single prompt in the batch.
         """
         chat_prompt = f'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{content}<|im_end|>\n<|im_start|>assistant\n'
         # Wrapped in a list so the tokenizer sees a batch of one; index 0
         # selects that prompt's ids from the batched output.
         tokenized = self.tokenizer([chat_prompt])
         first_ids = tokenized["input_ids"][0]
         return first_ids
+    def encode_vpm(self, content="Describe this image.", num_img=1, img_token_num=256):
         # official implementation
+        imgs_token = '<|vision_start|>' +  '<|image_pad|>'*img_token_num + '<|vision_end|>'
+        imgs_token *= num_img
+        text = f'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{imgs_token}{content}<|im_end|>\n<|im_start|>assistant\n'
         output_kwargs = {'text_kwargs': {'padding': True, 'return_tensors': 'pt'}, 'images_kwargs': {'return_tensors': 'pt'}, 'audio_kwargs': {'padding': True, 'return_tensors': 'pt'}, 'videos_kwargs': {'fps': 2.0, 'return_tensors': 'pt'}, 'common_kwargs': {'return_tensors': 'pt'}}
             if 'img_prompt' in req:
                 b_img_prompt = req['img_prompt']
             if b_img_prompt:
+                token_ids = tokenizer.encode_vpm(prompt, req["num_img"], req["img_token_num"])
             else:
                 token_ids = tokenizer.encode(prompt)