cnmoro committed on
Commit
5162a2a
·
verified ·
1 Parent(s): 28b8643

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +59 -3
README.md CHANGED
@@ -1,3 +1,59 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - en
5
+ base_model:
6
+ - WinKawaks/vit-tiny-patch16-224
7
+ - google/bert_uncased_L-2_H-128_A-2
8
+ pipeline_tag: image-to-text
9
+ library_name: transformers
10
+ tags:
11
+ - vit
12
+ - bert
13
+ - vision
14
+ - caption
15
+ - captioning
16
+ - image
17
+ ---
18
+ An image captioning model, based on bert-tiny and vit-tiny, weighing only 40 MB!
19
+
20
+ Works very fast on CPU.
21
+
22
+ ```python
23
+ from transformers import AutoTokenizer, AutoImageProcessor, VisionEncoderDecoderModel
24
+ import requests, time
25
+ from PIL import Image
26
+
27
+ model_path = "cnmoro/nano-image-captioning"
28
+
29
+ # load the image captioning model and corresponding tokenizer and image processor
30
+ model = VisionEncoderDecoderModel.from_pretrained(model_path)
31
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
32
+ image_processor = AutoImageProcessor.from_pretrained(model_path)
33
+
34
+ # preprocess an image
35
+ url = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/New_york_times_square-terabass.jpg/800px-New_york_times_square-terabass.jpg"
36
+ image = Image.open(requests.get(url, stream=True).raw)
37
+ pixel_values = image_processor(image, return_tensors="pt").pixel_values
38
+
39
+ start = time.time()
40
+
41
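+ # generate caption - suggested settings (note: temperature/top_p/top_k only take effect when sampling is enabled, i.e. do_sample=True here or in the model's generation config)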
42
+ generated_ids = model.generate(
43
+     pixel_values,
44
+     temperature=0.7,
45
+     top_p=0.8,
46
+     top_k=50,
47
+     num_beams=3 # you can use 1 for even faster inference with a small drop in quality
48
+ )
49
+ generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
50
+
51
+ end = time.time()
52
+
53
+ print(generated_text)
54
+ # a group of people are in the middle of a city.
55
+
56
+ print(f"Time taken: {end - start} seconds")
57
+ # Time taken: 0.07550048828125 seconds
58
+ # on CPU!
59
+ ```