prithivMLmods committed on
Commit 93c4756 · verified · 1 Parent(s): b13383c

update app

Files changed (1):
  1. app.py (+82 -21)
app.py CHANGED
@@ -29,6 +29,8 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
+# --- Model Loading ---
+
 # Load Qwen2.5-VL-7B-Instruct
 MODEL_ID_M = "Qwen/Qwen2.5-VL-7B-Instruct"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
@@ -56,15 +58,39 @@ model_q = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load allenai/olmOCR-7B-0825
-MODEL_ID_F = "allenai/olmOCR-7B-0825"
-processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
-model_f = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_F,
+# Load prithivMLmods/DeepCaption-VLA-7B
+MODEL_ID_DC = "prithivMLmods/DeepCaption-VLA-7B"
+processor_dc = AutoProcessor.from_pretrained(MODEL_ID_DC, trust_remote_code=True)
+model_dc = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_DC,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 
+
+# --- System Prompt for DeepCaption-VLA-7B ---
+CAPTION_SYSTEM_PROMPT = """
+You are an AI assistant that rigorously follows this response protocol:
+
+1. For every input image, your primary task is to write a **precise caption**. The caption must capture the **essence of the image** in clear, concise, and contextually accurate language.
+
+2. Along with the caption, provide a structured set of **attributes** that describe the visual elements. Attributes should include details such as objects, people, actions, colors, environment, mood, and other notable characteristics.
+
+3. Always include a **class_name** field. This must represent the **core theme or main subject** of the image in a compact format.
+   - Use the syntax: `{class_name==write_the_core_theme}`
+   - Example: `{class_name==dog_playing}` or `{class_name==city_sunset}`
+
+4. Maintain the following strict format in your output:
+   - **Caption:** <one-sentence description>
+   - **Attributes:** <comma-separated list of visual attributes>
+   - **{class_name==core_theme}**
+
+5. Ensure captions are **precise, neutral, and descriptive**, avoiding unnecessary elaboration or subjective interpretation unless explicitly required.
+
+6. Do not reference the rules or instructions in the output. Only return the formatted caption, attributes, and class_name.
+""".strip()
+
+
 def downsample_video(video_path):
     """
     Downsamples the video to evenly spaced frames.
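For reference, here is a minimal standalone sketch of calling the newly added DeepCaption-VLA-7B checkpoint with the system prompt above, mirroring the `apply_chat_template` + `generate` pattern used elsewhere in app.py. The image path, user text, and truncated prompt string are placeholders, not part of the commit.

```python
# Minimal sketch (not part of app.py): single-image captioning with DeepCaption-VLA-7B.
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

MODEL_ID_DC = "prithivMLmods/DeepCaption-VLA-7B"
processor_dc = AutoProcessor.from_pretrained(MODEL_ID_DC, trust_remote_code=True)
model_dc = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_DC, trust_remote_code=True, torch_dtype=torch.float16
).to(device).eval()

CAPTION_SYSTEM_PROMPT = "..."        # use the full protocol prompt defined above
image = Image.open("images/A.jpg")   # placeholder image path
text = f"{CAPTION_SYSTEM_PROMPT}\n\nProvide a detailed caption for the image."

messages = [{"role": "user", "content": [
    {"type": "image", "image": image},
    {"type": "text", "text": text},
]}]
inputs = processor_dc.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True,
    return_dict=True, return_tensors="pt",
).to(device)

with torch.inference_mode():
    output_ids = model_dc.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens
new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
print(processor_dc.batch_decode(new_tokens, skip_special_tokens=True)[0])
```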
@@ -74,6 +100,7 @@ def downsample_video(video_path):
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
+    # Use a denser sampling for better video understanding
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
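Only a fragment of `downsample_video` appears in this hunk. For context, the full helper looks roughly like the following sketch; only the lines shown in the hunk above are confirmed by the diff, and the BGR-to-RGB conversion and timestamp rounding are assumptions.

```python
# Reconstruction sketch of the frame-sampling helper used for video inference.
import cv2
import numpy as np
from PIL import Image

def downsample_video(video_path):
    """Extract 10 evenly spaced frames as (PIL.Image, timestamp) pairs."""
    vidcap = cv2.VideoCapture(video_path)
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    frames = []
    # Evenly spaced indices across the whole clip
    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
    for i in frame_indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
        success, frame = vidcap.read()
        if success:
            # OpenCV returns BGR; convert to RGB before wrapping in PIL
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(frame)
            timestamp = round(i / fps, 2) if fps else 0.0
            frames.append((pil_image, timestamp))
    vidcap.release()
    return frames
```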
@@ -97,6 +124,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     Generates responses using the selected model for image input.
     Yields raw text and Markdown-formatted text.
     """
+    processor = None
+    model = None
+
     if model_name == "Qwen2.5-VL-7B-Instruct":
         processor = processor_m
         model = model_m
@@ -106,9 +136,11 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "Qwen2.5-VL-7B-Abliterated-Caption-it":
         processor = processor_q
         model = model_q
-    elif model_name == "olmOCR-7B-0825":
-        processor = processor_f
-        model = model_f
+    elif model_name == "DeepCaption-VLA-7B":
+        processor = processor_dc
+        model = model_dc
+        # Prepend system prompt for this model
+        text = f"{CAPTION_SYSTEM_PROMPT}\n\n{text}"
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -133,10 +165,21 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
+
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+    generation_kwargs = {
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repetition_penalty": repetition_penalty,
+        "do_sample": True,
+    }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
+
     buffer = ""
     for new_text in streamer:
         buffer += new_text
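This hunk expands the one-line `generation_kwargs` so that the sampling sliders (temperature, top-p, top-k, repetition penalty) actually reach `model.generate`. The underlying pattern is the standard transformers threaded-streaming idiom; a compact sketch follows, where the `stream_generate` wrapper is illustrative and not part of app.py.

```python
# Illustrative wrapper: generate() runs in a background thread while the
# TextIteratorStreamer yields decoded text chunks to the caller.
from threading import Thread
from transformers import TextIteratorStreamer

def stream_generate(model, processor, inputs, max_new_tokens=1024,
                    temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2):
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
        "do_sample": True,
    }
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    buffer = ""
    for new_text in streamer:   # blocks until the next decoded chunk arrives
        buffer += new_text
        yield buffer            # yield accumulated text so the UI can update live
```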
@@ -154,6 +197,9 @@ def generate_video(model_name: str, text: str, video_path: str,
     Generates responses using the selected model for video input.
     Yields raw text and Markdown-formatted text.
     """
+    processor = None
+    model = None
+
     if model_name == "Qwen2.5-VL-7B-Instruct":
         processor = processor_m
         model = model_m
@@ -163,9 +209,11 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "Qwen2.5-VL-7B-Abliterated-Caption-it":
         processor = processor_q
         model = model_q
-    elif model_name == "olmOCR-7B-0825":
-        processor = processor_f
-        model = model_f
+    elif model_name == "DeepCaption-VLA-7B":
+        processor = processor_dc
+        model = model_dc
+        # Prepend system prompt for this model
+        text = f"{CAPTION_SYSTEM_PROMPT}\n\n{text}"
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -175,14 +223,19 @@ def generate_video(model_name: str, text: str, video_path: str,
         return
 
     frames = downsample_video(video_path)
+    # Create the message structure with a system prompt and user query
     messages = [
         {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
         {"role": "user", "content": [{"type": "text", "text": text}]}
     ]
+
+    # Add each frame to the user content
     for frame in frames:
         image, timestamp = frame
-        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
+        messages[1]["content"].append({"type": "text", "text": f"Frame at {timestamp}s:"})
         messages[1]["content"].append({"type": "image", "image": image})
+
+    # Prepare inputs for the model
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
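The video path builds a single multimodal chat turn that interleaves a timestamp label with each sampled frame. A minimal sketch of that message-building step is below; the `build_video_messages` helper name is illustrative and does not exist in app.py.

```python
# Illustrative helper (not in app.py): assemble the chat messages used for
# video inference, pairing each sampled frame with a "Frame at <t>s:" label.
def build_video_messages(text, frames):
    messages = [
        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
        {"role": "user", "content": [{"type": "text", "text": text}]},
    ]
    for image, timestamp in frames:
        messages[1]["content"].append({"type": "text", "text": f"Frame at {timestamp}s:"})
        messages[1]["content"].append({"type": "image", "image": image})
    return messages

# Usage: messages = build_video_messages(text, downsample_video(video_path))
```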
@@ -192,6 +245,7 @@ def generate_video(model_name: str, text: str, video_path: str,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
+
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
@@ -205,12 +259,14 @@ def generate_video(model_name: str, text: str, video_path: str,
     }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
+
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         time.sleep(0.01)
         yield buffer, buffer
 
+
 # Define examples for image and video inference
 image_examples = [
     ["Provide a detailed caption for the image..", "images/A.jpg"],
@@ -260,24 +316,29 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                     gr.Examples(
                         examples=video_examples,
                         inputs=[video_query, video_upload]
-                    )
+                    )
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-
+
         with gr.Column():
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
                 output = gr.Textbox(label="Raw Output", interactive=False, lines=2, scale=2)
-
+
                 with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown()
-
+
             model_choice = gr.Radio(
-                choices=["Qwen2.5-VL-7B-Instruct", "Qwen2.5-VL-3B-Instruct", "Qwen2.5-VL-7B-Abliterated-Caption-it", "olmOCR-7B-0825"],
+                choices=[
+                    "Qwen2.5-VL-7B-Instruct",
+                    "Qwen2.5-VL-3B-Instruct",
+                    "Qwen2.5-VL-7B-Abliterated-Caption-it",
+                    "DeepCaption-VLA-7B"
+                ],
                 label="Select Model",
                 value="Qwen2.5-VL-7B-Instruct"
             )
@@ -285,9 +346,9 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
             gr.Markdown("> [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct): The Qwen2.5-VL-7B-Instruct model is a multimodal AI model developed by Alibaba Cloud that excels at understanding both text and images. It's a Vision-Language Model (VLM) designed to handle various visual understanding tasks, including image understanding, video analysis, and even multilingual support.")
             gr.Markdown("> [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct): Qwen2.5-VL-3B-Instruct is an instruction-tuned vision-language model from Alibaba Cloud, built upon the Qwen2-VL series. It excels at understanding and generating text related to both visual and textual inputs, making it capable of tasks like image captioning, visual question answering, and object localization. The model also supports long video understanding and structured data extraction")
             gr.Markdown("> [Qwen2.5-VL-7B-Abliterated-Caption-it](https://huggingface.co/prithivMLmods/Qwen2.5-VL-7B-Abliterated-Caption-it): Qwen2.5-VL-7B-Abliterated-Caption-it is a fine-tuned version of Qwen2.5-VL-7B-Instruct, optimized for Abliterated Captioning / Uncensored Captioning. This model excels at generating detailed, context-rich, and high-fidelity captions across diverse image categories and variational aspect ratios, offering robust visual understanding without filtering or censorship.")
-            gr.Markdown("> [olmOCR-7B-0825](https://huggingface.co/allenai/olmOCR-7B-0825): olmOCR-7B-0825 is a 7B parameter open large model designed for OCR tasks with robust text extraction, especially in complex document layouts. Multimodal model emphasizing strong document reading and extraction capabilities combined with vision-language understanding to support detailed document parsing tasks.")
+            gr.Markdown("> [prithivMLmods/DeepCaption-VLA-7B](https://huggingface.co/prithivMLmods/DeepCaption-VLA-7B): DeepCaption-VLA-7B is a fine-tuned model based on Qwen2.5-VL, designed for generating precise, structured captions and attributes for images. It follows a strict protocol to provide a main caption, a list of visual attributes, and a core class name, making it ideal for detailed and organized visual analysis.")
             gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
-
+
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
@@ -300,4 +361,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     )
 
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
+    demo.queue(max_size=50).launch(share=True, show_error=True)