update app
app.py (CHANGED)
@@ -29,6 +29,8 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
+# --- Model Loading ---
+
 # Load Qwen2.5-VL-7B-Instruct
 MODEL_ID_M = "Qwen/Qwen2.5-VL-7B-Instruct"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
@@ -56,15 +58,39 @@ model_q = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load …
-…
-…
-…
-…
+# Load prithivMLmods/DeepCaption-VLA-7B
+MODEL_ID_DC = "prithivMLmods/DeepCaption-VLA-7B"
+processor_dc = AutoProcessor.from_pretrained(MODEL_ID_DC, trust_remote_code=True)
+model_dc = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_DC,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 
+
+# --- System Prompt for DeepCaption-VLA-7B ---
+CAPTION_SYSTEM_PROMPT = """
+You are an AI assistant that rigorously follows this response protocol:
+
+1. For every input image, your primary task is to write a **precise caption**. The caption must capture the **essence of the image** in clear, concise, and contextually accurate language.
+
+2. Along with the caption, provide a structured set of **attributes** that describe the visual elements. Attributes should include details such as objects, people, actions, colors, environment, mood, and other notable characteristics.
+
+3. Always include a **class_name** field. This must represent the **core theme or main subject** of the image in a compact format.
+   - Use the syntax: `{class_name==write_the_core_theme}`
+   - Example: `{class_name==dog_playing}` or `{class_name==city_sunset}`
+
+4. Maintain the following strict format in your output:
+   - **Caption:** <one-sentence description>
+   - **Attributes:** <comma-separated list of visual attributes>
+   - **{class_name==core_theme}**
+
+5. Ensure captions are **precise, neutral, and descriptive**, avoiding unnecessary elaboration or subjective interpretation unless explicitly required.
+
+6. Do not reference the rules or instructions in the output. Only return the formatted caption, attributes, and class_name.
+""".strip()
+
+
 def downsample_video(video_path):
     """
     Downsamples the video to evenly spaced frames.
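The protocol above pins DeepCaption-VLA-7B to a machine-parseable output shape. As a minimal downstream sketch, a parser for that shape might look like this (the helper name and regexes are hypothetical, not part of this commit):

```python
import re

def parse_deepcaption_output(raw: str) -> dict:
    """Hypothetical helper: extract caption, attributes, and class_name
    from the strict format requested by CAPTION_SYSTEM_PROMPT."""
    caption = re.search(r"\*\*Caption:\*\*\s*(.+)", raw)
    attributes = re.search(r"\*\*Attributes:\*\*\s*(.+)", raw)
    class_name = re.search(r"\{class_name==([^}]+)\}", raw)
    return {
        "caption": caption.group(1).strip() if caption else None,
        "attributes": [a.strip() for a in attributes.group(1).split(",")] if attributes else [],
        "class_name": class_name.group(1) if class_name else None,
    }

sample = (
    "**Caption:** A dog leaps to catch a frisbee in a park.\n"
    "**Attributes:** dog, frisbee, grass, midair, daylight\n"
    "**{class_name==dog_playing}**"
)
print(parse_deepcaption_output(sample))
# {'caption': 'A dog leaps to catch a frisbee in a park.',
#  'attributes': ['dog', 'frisbee', 'grass', 'midair', 'daylight'],
#  'class_name': 'dog_playing'}
```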
@@ -74,6 +100,7 @@ def downsample_video(video_path):
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
+    # Use a denser sampling for better video understanding
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
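Note that the new comment sits on the same 10-frame `np.linspace` grid as before. A standalone sketch of what that index math yields (example frame count and fps; the timestamps are assumed to be `index / fps`, which is what the "Frame at {t}s:" labels further down suggest):

```python
import numpy as np

total_frames, fps = 300, 30.0   # e.g. a 10-second clip at 30 fps
frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
# Timestamps as used in the "Frame at {t}s:" labels, assuming t = index / fps
timestamps = [round(int(i) / fps, 2) for i in frame_indices]
print(list(frame_indices))   # [0, 33, 66, 99, 132, 166, 199, 232, 265, 299]
print(timestamps)            # [0.0, 1.1, 2.2, 3.3, 4.4, 5.53, 6.63, 7.73, 8.83, 9.97]
```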
@@ -97,6 +124,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     Generates responses using the selected model for image input.
     Yields raw text and Markdown-formatted text.
     """
+    processor = None
+    model = None
+
     if model_name == "Qwen2.5-VL-7B-Instruct":
         processor = processor_m
         model = model_m
@@ -106,9 +136,11 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "Qwen2.5-VL-7B-Abliterated-Caption-it":
         processor = processor_q
         model = model_q
-    elif model_name == "…
-        processor = …
-        model = …
+    elif model_name == "DeepCaption-VLA-7B":
+        processor = processor_dc
+        model = model_dc
+        # Prepend system prompt for this model
+        text = f"{CAPTION_SYSTEM_PROMPT}\n\n{text}"
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
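One design choice worth noting: the new branch injects the captioning protocol by prepending it to the user text rather than placing it in the chat's system turn. A hedged sketch of the system-role alternative (not what the commit does), for comparison:

```python
# Alternative (NOT what this commit does): carry the captioning protocol as a
# proper system turn instead of prepending it to the user text.
# CAPTION_SYSTEM_PROMPT is the constant added above; "images/A.jpg" is one of
# the Space's bundled example images.
messages = [
    {"role": "system", "content": [{"type": "text", "text": CAPTION_SYSTEM_PROMPT}]},
    {"role": "user", "content": [
        {"type": "image", "image": "images/A.jpg"},
        {"type": "text", "text": "Provide a detailed caption for the image."},
    ]},
]
```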
@@ -133,10 +165,21 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
+
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {…
+    generation_kwargs = {
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repetition_penalty": repetition_penalty,
+        "do_sample": True,
+    }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
+
     buffer = ""
     for new_text in streamer:
         buffer += new_text
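The rebuilt block is the standard transformers streaming pattern: `generate` blocks until done, so it runs on a worker thread while the caller drains the `TextIteratorStreamer`. A minimal, self-contained text-only sketch (using `gpt2` as a stand-in, not one of the Space's VLMs):

```python
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok("The quick brown fox", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until finished, so it runs on a worker thread while the
# main thread consumes decoded chunks as they arrive.
thread = Thread(
    target=model.generate,
    kwargs={**inputs, "streamer": streamer, "max_new_tokens": 32},
)
thread.start()

buffer = ""
for new_text in streamer:   # the iterator ends when generation finishes
    buffer += new_text
print(buffer)
```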
@@ -154,6 +197,9 @@ def generate_video(model_name: str, text: str, video_path: str,
     Generates responses using the selected model for video input.
     Yields raw text and Markdown-formatted text.
     """
+    processor = None
+    model = None
+
     if model_name == "Qwen2.5-VL-7B-Instruct":
         processor = processor_m
         model = model_m
@@ -163,9 +209,11 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "Qwen2.5-VL-7B-Abliterated-Caption-it":
         processor = processor_q
         model = model_q
-    elif model_name == "…
-        processor = …
-        model = …
+    elif model_name == "DeepCaption-VLA-7B":
+        processor = processor_dc
+        model = model_dc
+        # Prepend system prompt for this model
+        text = f"{CAPTION_SYSTEM_PROMPT}\n\n{text}"
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -175,14 +223,19 @@ def generate_video(model_name: str, text: str, video_path: str,
         return
 
     frames = downsample_video(video_path)
+    # Create the message structure with a system prompt and user query
     messages = [
         {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
         {"role": "user", "content": [{"type": "text", "text": text}]}
     ]
+
+    # Add each frame to the user content
     for frame in frames:
         image, timestamp = frame
-        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
+        messages[1]["content"].append({"type": "text", "text": f"Frame at {timestamp}s:"})
         messages[1]["content"].append({"type": "image", "image": image})
+
+    # Prepare inputs for the model
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
@@ -192,6 +245,7 @@ def generate_video(model_name: str, text: str, video_path: str,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
+
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
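Both code paths funnel the message list through `processor.apply_chat_template(...)`, which renders the chat turns and tokenizes text and image placeholders in one call. A hedged reconstruction of that call (only `tokenize`, `truncation`, and `max_length` are visible in this diff; the other flags are assumptions based on common Qwen2.5-VL usage and may differ in the real app.py):

```python
# Hedged reconstruction of the call the surrounding hunks wrap.
# tokenize / truncation / max_length appear in the diff; the flags marked
# "assumed" follow common Qwen2.5-VL usage.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,   # assumed
    return_dict=True,             # assumed
    return_tensors="pt",          # assumed
    truncation=False,
    max_length=MAX_INPUT_TOKEN_LENGTH,
).to(device)
```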
@@ -205,12 +259,14 @@ def generate_video(model_name: str, text: str, video_path: str,
     }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
+
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         time.sleep(0.01)
         yield buffer, buffer
 
+
 # Define examples for image and video inference
 image_examples = [
     ["Provide a detailed caption for the image..", "images/A.jpg"],
@@ -260,24 +316,29 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                     gr.Examples(
                         examples=video_examples,
                         inputs=[video_query, video_upload]
-                    )
+                    )
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-
+
         with gr.Column():
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
                 output = gr.Textbox(label="Raw Output", interactive=False, lines=2, scale=2)
-
+
                 with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown()
-
+
             model_choice = gr.Radio(
-                choices=[…
+                choices=[
+                    "Qwen2.5-VL-7B-Instruct",
+                    "Qwen2.5-VL-3B-Instruct",
+                    "Qwen2.5-VL-7B-Abliterated-Caption-it",
+                    "DeepCaption-VLA-7B"
+                ],
                 label="Select Model",
                 value="Qwen2.5-VL-7B-Instruct"
             )
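The expanded `choices` list is what routes requests: the Radio's selected string arrives as the `model_name` argument of `generate_image`/`generate_video`. A stripped-down sketch of that wiring (toy callback, not the Space's actual UI):

```python
import gradio as gr

def answer(model_name: str, query: str) -> str:
    # Stand-in for generate_image: the Radio's current string lands in model_name
    return f"[{model_name}] would answer: {query}"

with gr.Blocks() as demo:
    choice = gr.Radio(
        choices=["Qwen2.5-VL-7B-Instruct", "DeepCaption-VLA-7B"],
        label="Select Model",
        value="Qwen2.5-VL-7B-Instruct",
    )
    query = gr.Textbox(label="Query")
    out = gr.Textbox(label="Raw Output")
    gr.Button("Submit").click(fn=answer, inputs=[choice, query], outputs=out)

demo.launch()
```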
@@ -285,9 +346,9 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
             gr.Markdown("> [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct): The Qwen2.5-VL-7B-Instruct model is a multimodal AI model developed by Alibaba Cloud that excels at understanding both text and images. It's a Vision-Language Model (VLM) designed to handle various visual understanding tasks, including image understanding, video analysis, and even multilingual support.")
             gr.Markdown("> [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct): Qwen2.5-VL-3B-Instruct is an instruction-tuned vision-language model from Alibaba Cloud, built upon the Qwen2-VL series. It excels at understanding and generating text related to both visual and textual inputs, making it capable of tasks like image captioning, visual question answering, and object localization. The model also supports long video understanding and structured data extraction")
             gr.Markdown("> [Qwen2.5-VL-7B-Abliterated-Caption-it](https://huggingface.co/prithivMLmods/Qwen2.5-VL-7B-Abliterated-Caption-it): Qwen2.5-VL-7B-Abliterated-Caption-it is a fine-tuned version of Qwen2.5-VL-7B-Instruct, optimized for Abliterated Captioning / Uncensored Captioning. This model excels at generating detailed, context-rich, and high-fidelity captions across diverse image categories and variational aspect ratios, offering robust visual understanding without filtering or censorship.")
-            gr.Markdown("> […
+            gr.Markdown("> [prithivMLmods/DeepCaption-VLA-7B](https://huggingface.co/prithivMLmods/DeepCaption-VLA-7B): DeepCaption-VLA-7B is a fine-tuned model based on Qwen2.5-VL, designed for generating precise, structured captions and attributes for images. It follows a strict protocol to provide a main caption, a list of visual attributes, and a core class name, making it ideal for detailed and organized visual analysis.")
             gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
-
+
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
@@ -300,4 +361,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     )
 
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(share=True, …
+    demo.queue(max_size=50).launch(share=True, show_error=True)