Spaces:
Running
Running
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Describe an image with the MolmoE-1B vision-language model.

Downloads a sample photo, feeds it to allenai/MolmoE-1B-0924 together with
the prompt "Describe this image.", and prints the generated description.
"""

import requests
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

# Single source of truth for the checkpoint id (previously duplicated as a
# string literal in each from_pretrained call).
MODEL_NAME = "allenai/MolmoE-1B-0924"

# Load the processor. trust_remote_code is required because Molmo ships its
# own processing/modeling code on the Hub.
processor = AutoProcessor.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto',
)

# Load the model exactly once. The original script performed an extra
# from_pretrained call (without torch_dtype/device_map) whose result was
# immediately discarded by this load -- doubling download and RAM cost.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto',
)

# Fetch a sample image. A timeout makes a stalled download fail instead of
# hanging forever; raise_for_status stops us from handing an HTML error page
# to Image.open.
response = requests.get(
    "https://picsum.photos/id/237/536/354", stream=True, timeout=30
)
response.raise_for_status()

# Build model inputs from the image and the text prompt.
inputs = processor.process(
    images=[Image.open(response.raw)],
    text="Describe this image.",
)

# Move inputs to the model's device and add a batch dimension of 1.
inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

# Generate at most 200 new tokens; stop when <|endoftext|> is produced.
output = model.generate_from_batch(
    inputs,
    GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
    tokenizer=processor.tokenizer,
)

# Keep only the newly generated tokens (everything after the prompt tokens)
# and decode them to text.
generated_tokens = output[0, inputs['input_ids'].size(1):]
generated_text = processor.tokenizer.decode(
    generated_tokens, skip_special_tokens=True
)

print(generated_text)

# Expected output, e.g.:
# >>> This photograph captures a small black puppy, likely a Labrador or a
# similar breed, sitting attentively on a weathered wooden deck. The deck,
# composed of three...