xj committed
Commit a2c1aea
Parents: ea05f46 0bebf2f

Merge branch 'main' of https://huggingface.co/jxu124/TiO

Files changed (1):
  1. README.md +32 -29
README.md CHANGED
@@ -9,41 +9,41 @@ language:

  TiO is an Interactive Visual Grounding Model for Disambiguation. (WIP)

- ## Online / offline Demo
+ ## Online / Offline Demo
+
+ - [Colab Online Demo](https://colab.research.google.com/drive/195eDITKi6dahnVz8Cum91sNUCF_lFle8?usp=sharing) - Free T4 is available on Google Colab.
+ - Gradio Offline Demo:

  ```python
+ import os; os.system("pip3 install transformers gradio fire accelerate bitsandbytes > /dev/null")
  from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
+ import torch

  model_id = "jxu124/TiO"
- model = AutoModel.from_pretrained(
-     model_id,
-     trust_remote_code=True,
-     torch_dtype=torch.float16,
-     device_map='cuda',
-     # load_in_4bit=True,
-     # bnb_4bit_compute_dtype=torch.float16,
- )
+ model = AutoModel.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float16).cuda()
  tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
  image_processor = AutoImageProcessor.from_pretrained(model_id)
- # setup gradio demo
- model.get_gradio_demo(tokenizer, image_processor).\
-     queue(max_size=20).launch(server_name="0.0.0.0", server_port=7860)
+
+ # ---- gradio demo ----
+ model.get_gradio_demo(tokenizer, image_processor).queue(max_size=20).launch(server_name="0.0.0.0", server_port=7860)
  ```

  ## Mini-Example
  ```python
+ import os; os.system("pip3 install transformers accelerate bitsandbytes gradio fire")
  from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
+ import torch
+
+ model_id = "jxu124/TiO"
+ model = AutoModel.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float16).cuda()
+ tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
+ image_processor = AutoImageProcessor.from_pretrained(model_id)
+
+ # ---- mini example ----
  from PIL import Image
  from io import BytesIO
- import torch
  import requests

- # Load model, tokenizer, image_processor
- tokenizer = AutoTokenizer.from_pretrained("jxu124/TiO", use_fast=False)
- image_processor = AutoImageProcessor.from_pretrained("jxu124/TiO")
- model = AutoModel.from_pretrained("jxu124/TiO", trust_remote_code=True)
- model = model.to(torch.float16).cuda()  # It would be faster.
-
  # Prepare example
  image = Image.open(BytesIO(requests.get("http://images.cocodataset.org/val2014/COCO_val2014_000000429913.jpg").content))
  text = """\
@@ -64,25 +64,28 @@ print(tokenizer.batch_decode(gen, skip_special_tokens=True).replace("not yet.",

  Guesser(grounding):
  ```python
- text = """ #instruction: which region does the context describe? \n #context: \"\
+ text = """\
+ #instruction: which region does the context describe?
+ #context:
  human: look that man in white!
  agent: is he the one who just threw the ball?
- human: yes. I mean the pitcher.\"
- """
+ human: yes. I mean the pitcher."""
  ```

  Questioner(question generation):
  ```python
- text = """ #instruction: guess what I want? \n #context: \"\
- human: look that man in white! \"
- """
+ text = """\
+ #instruction: guess what I want?
+ #context:
+ human: look that man in white!"""
  ```

  Oracle(answering):
  ```python
- text = """ #instruction: answer the question based on the region. \n #context: \"\
+ text = """\
+ #instruction: answer the question based on the region.
+ #context:
  agent: look that man in white!
- human: is he the one who just threw the ball? \"
- #region: <bin_847> <bin_319> <bin_923> <bin_467>
- """
+ human: is he the one who just threw the ball?
+ #region: <bin_847> <bin_319> <bin_923> <bin_467>"""
  ```
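
The Mini-Example's actual generation step (README lines 50-63) sits between the two hunks, so it does not appear in this diff. For orientation only, a minimal sketch of what that step might look like, assuming an OFA-style interface in the repo's remote code; the `patch_images` keyword and the generate arguments are assumptions, not the repository's confirmed API:

```python
# Hypothetical sketch of the elided generation step; `patch_images` and the
# generate kwargs are assumptions modelled on OFA-style code, not the repo API.
inputs = tokenizer([text], return_tensors="pt").to(model.device)
pixel_values = image_processor([image], return_tensors="pt")["pixel_values"]
pixel_values = pixel_values.to(model.device, torch.float16)
gen = model.generate(**inputs, patch_images=pixel_values, max_new_tokens=64)
# batch_decode returns a list of strings, so index before using str methods
print(tokenizer.batch_decode(gen, skip_special_tokens=True)[0])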
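
After this commit, the three role prompts (Guesser, Questioner, Oracle) share one format: an `#instruction:` line, a `#context:` block of dialogue, and an optional `#region:` line. A small helper, illustrative only and not part of the repository, that assembles such prompts:

```python
def build_prompt(instruction, context, region=None):
    """Assemble a TiO-style prompt string (illustrative helper, not repo code)."""
    text = f"#instruction: {instruction}\n#context:\n" + "\n".join(context)
    if region is not None:
        text += f"\n#region: {region}"
    return text

# Reproduces the Guesser prompt from the diff above.
text = build_prompt(
    "which region does the context describe?",
    ["human: look that man in white!",
     "agent: is he the one who just threw the ball?",
     "human: yes. I mean the pitcher."],
)
```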
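
The `#region:` value encodes a box as quantized coordinate tokens. Assuming OFA-style uniform quantization of `x1 y1 x2 y2` into 1000 bins (an assumption; the exact scheme lives in the model's remote code), a sketch that maps the tokens back to pixel coordinates:

```python
import re

def bins_to_box(region, width, height, num_bins=1000):
    """Map '<bin_i>' tokens to pixel coords; assumes OFA-style x1 y1 x2 y2
    order and uniform bins over the image size (unverified assumption)."""
    x1, y1, x2, y2 = (int(b) / (num_bins - 1) for b in re.findall(r"<bin_(\d+)>", region))
    return (x1 * width, y1 * height, x2 * width, y2 * height)

# e.g. the Oracle example's region on a 640x480 image:
print(bins_to_box("<bin_847> <bin_319> <bin_923> <bin_467>", 640, 480))
```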