from vlm_model import VLMConfig, VLM
from transformers import AutoProcessor, AutoTokenizer
from PIL import Image
import torch

# Load the trained VLM, its config, and its tokenizer from the Hub; the image
# processor comes from the SigLIP vision encoder checkpoint.
config = VLMConfig.from_pretrained("YOUR_USERNAME/vlm-qwen-siglip")
model = VLM.from_pretrained("YOUR_USERNAME/vlm-qwen-siglip")
tokenizer = AutoTokenizer.from_pretrained("YOUR_USERNAME/vlm-qwen-siglip")
processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

# Preprocess the image into pixel values for the vision encoder.
image = Image.open("your_image.jpg").convert("RGB")
pixel_values = processor(images=image, return_tensors="pt")["pixel_values"]
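
# Optional sanity check (not in the original snippet): the processor for
# siglip-base-patch16-224 resizes images to 224x224, so this should print
# torch.Size([1, 3, 224, 224]).
print(pixel_values.shape)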

# Build the chat prompt. The user turn appends one <|image_pad|> placeholder per
# visual token slot (config.image_pad_num); these placeholders are filled with
# the projected image features inside the model.
chat = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": f"What's in this image?{'<|image_pad|>' * config.image_pad_num}"},
]
input_text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
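
# Optional: run on a GPU if one is available. This assumes the model follows the
# standard torch/transformers API and supports .to() placement (it already
# exposes from_pretrained and generate, so this should hold).
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
input_ids = input_ids.to(device)
pixel_values = pixel_values.to(device)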

# Generate a response with nucleus sampling.
with torch.no_grad():
    generated_ids = model.generate(
        input_ids=input_ids,
        pixel_values=pixel_values,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )

# Decode only the newly generated tokens, skipping the prompt.
response = tokenizer.decode(generated_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
print(response)
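
# For reproducible output, a greedy-decoding variant of the same call (a sketch,
# assuming generate follows the standard transformers semantics that the sampling
# arguments above suggest).
with torch.no_grad():
    greedy_ids = model.generate(
        input_ids=input_ids,
        pixel_values=pixel_values,
        max_new_tokens=200,
        do_sample=False,
    )
print(tokenizer.decode(greedy_ids[0][input_ids.shape[1]:], skip_special_tokens=True))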