---
license: apache-2.0
datasets:
- jxu124/invig
language:
- en
---
## TiO - An Interactive Visual Grounding Model for Disambiguation
TiO is an interactive visual grounding model for disambiguation: it grounds referring expressions in images through multi-turn dialogue, playing the questioner, guesser, or oracle role. (Work in progress.)
## Online / Offline Demo
- [Colab Online Demo](https://colab.research.google.com/drive/195eDITKi6dahnVz8Cum91sNUCF_lFle8?usp=sharing) - runs on Google Colab's free T4 GPU tier.
- Gradio Offline Demo:
```python
import os; os.system("pip3 install transformers gradio fire accelerate bitsandbytes > /dev/null")
from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
import torch
model_id = "jxu124/TiO"
model = AutoModel.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float16).cuda()
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
image_processor = AutoImageProcessor.from_pretrained(model_id)
# ---- gradio demo ----
model.get_gradio_demo(tokenizer, image_processor).queue(max_size=20).launch(server_name="0.0.0.0", server_port=7860)
```
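The install line above pulls in `bitsandbytes`, while the snippet loads the model in fp16. If GPU memory is tight, 8-bit loading may also work; a minimal sketch, assuming the model's remote code is compatible with the standard `transformers` quantized loading path:

```python
# Hedged alternative: 8-bit loading via bitsandbytes (assumes TiO's custom
# code is compatible with transformers' standard quantized loading).
from transformers import AutoModel, BitsAndBytesConfig

model_8bit = AutoModel.from_pretrained(
    "jxu124/TiO",
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",  # place weights automatically; no explicit .cuda() needed
)
```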
## Mini-Example
```python
import os; os.system("pip3 install transformers accelerate bitsandbytes gradio fire")
from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
import torch
model_id = "jxu124/TiO"
model = AutoModel.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float16).cuda()
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
image_processor = AutoImageProcessor.from_pretrained(model_id)
# ---- mini example ----
from PIL import Image
from io import BytesIO
import requests
# Prepare example
image = Image.open(BytesIO(requests.get("http://images.cocodataset.org/val2014/COCO_val2014_000000429913.jpg").content))
text = """\
#instruction: can you specify which region the context describes?
#context:
human: look that man in white!"""
# Inference
with torch.no_grad():
    pt_txt = tokenizer([text], return_tensors="pt").input_ids.cuda()
    pt_img = image_processor([image], return_tensors="pt").pixel_values.to(torch.float16).cuda()
    gen = model.generate(pt_txt, patch_images=pt_img, top_p=0.5, do_sample=True, no_repeat_ngram_size=3, max_length=256)
# batch_decode returns a list, so apply the replacement per element
print([s.replace("not yet.", "") for s in tokenizer.batch_decode(gen, skip_special_tokens=True)])
# e.g. [' is he the one who just threw the ball?'] -- sampling is enabled, so outputs vary between runs
```
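The `#context:` format above suggests a simple way to run a multi-turn dialogue: append each model reply as an `agent:` line and each user reply as a `human:` line, then regenerate. A minimal sketch built on the objects from the Mini-Example (`model`, `tokenizer`, `image_processor`, `image`); the `ask` helper is illustrative, not part of the model's API:

```python
# Hypothetical helper: multi-turn disambiguation loop reusing the
# Mini-Example objects. Not part of the TiO API.
def ask(dialog_lines, instruction="can you specify which region the context describes?"):
    text = f"#instruction: {instruction}\n#context:\n" + "\n".join(dialog_lines)
    with torch.no_grad():
        pt_txt = tokenizer([text], return_tensors="pt").input_ids.cuda()
        pt_img = image_processor([image], return_tensors="pt").pixel_values.to(torch.float16).cuda()
        gen = model.generate(pt_txt, patch_images=pt_img, top_p=0.5, do_sample=True,
                             no_repeat_ngram_size=3, max_length=256)
    return tokenizer.batch_decode(gen, skip_special_tokens=True)[0].replace("not yet.", "").strip()

dialog = ["human: look that man in white!"]
reply = ask(dialog)  # e.g. "is he the one who just threw the ball?"
dialog += [f"agent: {reply}", "human: yes. I mean the pitcher."]
print(ask(dialog))   # may now emit region tokens such as <bin_...>
```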
## Other Examples (text)
Guesser (grounding):
```python
text = """\
#instruction: which region does the context describe?
#context:
human: look that man in white!
agent: is he the one who just threw the ball?
human: yes. I mean the pitcher."""
```
Questioner (question generation):
```python
text = """\
#instruction: guess what I want?
#context:
human: look that man in white!"""
```
Oracle (answering):
```python
text = """\
#instruction: answer the question based on the region.
#context:
agent: look that man in white!
human: is he the one who just threw the ball?
#region: <bin_847> <bin_319> <bin_923> <bin_467>"""
```
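The `#region:` field encodes a bounding box as four quantized coordinate tokens. Assuming an OFA-style scheme where `<bin_0>` … `<bin_999>` evenly span the image width and height (an assumption, not confirmed by this card), the tokens can be mapped back to pixels roughly like this:

```python
import re

# Hedged sketch: decode <bin_*> tokens into pixel coordinates, assuming
# 1000 bins (<bin_0>..<bin_999>) spanning the image, OFA-style.
def decode_region(region_text, image_width, image_height, num_bins=1000):
    bins = [int(b) for b in re.findall(r"<bin_(\d+)>", region_text)]
    x0, y0, x1, y1 = bins  # assumed order: left, top, right, bottom
    return (x0 / (num_bins - 1) * image_width,
            y0 / (num_bins - 1) * image_height,
            x1 / (num_bins - 1) * image_width,
            y1 / (num_bins - 1) * image_height)

# `image` is the PIL image from the Mini-Example; image.size is (width, height)
print(decode_region("<bin_847> <bin_319> <bin_923> <bin_467>", *image.size))
```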