update app
app.py (CHANGED)
@@ -29,6 +29,8 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
+# --- Model Loading ---
+
 # Load Qwen2.5-VL-7B-Instruct
 MODEL_ID_M = "Qwen/Qwen2.5-VL-7B-Instruct"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
@@ -56,15 +58,39 @@ model_q = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load …
-…
-…
-…
-…
+# Load prithivMLmods/DeepCaption-VLA-7B
+MODEL_ID_DC = "prithivMLmods/DeepCaption-VLA-7B"
+processor_dc = AutoProcessor.from_pretrained(MODEL_ID_DC, trust_remote_code=True)
+model_dc = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_DC,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 
+
+# --- System Prompt for DeepCaption-VLA-7B ---
+CAPTION_SYSTEM_PROMPT = """
+You are an AI assistant that rigorously follows this response protocol:
+
+1. For every input image, your primary task is to write a **precise caption**. The caption must capture the **essence of the image** in clear, concise, and contextually accurate language.
+
+2. Along with the caption, provide a structured set of **attributes** that describe the visual elements. Attributes should include details such as objects, people, actions, colors, environment, mood, and other notable characteristics.
+
+3. Always include a **class_name** field. This must represent the **core theme or main subject** of the image in a compact format.
+   - Use the syntax: `{class_name==write_the_core_theme}`
+   - Example: `{class_name==dog_playing}` or `{class_name==city_sunset}`
+
+4. Maintain the following strict format in your output:
+   - **Caption:** <one-sentence description>
+   - **Attributes:** <comma-separated list of visual attributes>
+   - **{class_name==core_theme}**
+
+5. Ensure captions are **precise, neutral, and descriptive**, avoiding unnecessary elaboration or subjective interpretation unless explicitly required.
+
+6. Do not reference the rules or instructions in the output. Only return the formatted caption, attributes, and class_name.
+""".strip()
+
+
 def downsample_video(video_path):
     """
     Downsamples the video to evenly spaced frames.
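The protocol above pins DeepCaption-VLA-7B to a machine-parseable output shape. As a minimal downstream sketch, a parser for that shape might look like this (the helper name and regexes are hypothetical, not part of this commit):

```python
import re

def parse_deepcaption_output(raw: str) -> dict:
    """Hypothetical helper: extract caption, attributes, and class_name
    from the strict format requested by CAPTION_SYSTEM_PROMPT."""
    caption = re.search(r"\*\*Caption:\*\*\s*(.+)", raw)
    attributes = re.search(r"\*\*Attributes:\*\*\s*(.+)", raw)
    class_name = re.search(r"\{class_name==([^}]+)\}", raw)
    return {
        "caption": caption.group(1).strip() if caption else None,
        "attributes": [a.strip() for a in attributes.group(1).split(",")] if attributes else [],
        "class_name": class_name.group(1) if class_name else None,
    }

sample = (
    "**Caption:** A dog leaps to catch a frisbee in a park.\n"
    "**Attributes:** dog, frisbee, grass, midair, daylight\n"
    "**{class_name==dog_playing}**"
)
print(parse_deepcaption_output(sample))
# {'caption': 'A dog leaps to catch a frisbee in a park.',
#  'attributes': ['dog', 'frisbee', 'grass', 'midair', 'daylight'],
#  'class_name': 'dog_playing'}
```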
@@ -74,6 +100,7 @@ def downsample_video(video_path):
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
+    # Use a denser sampling for better video understanding
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
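Note that the new comment sits on the same 10-frame `np.linspace` grid as before. A standalone sketch of what that index math yields (example frame count and fps; the timestamps are assumed to be `index / fps`, which is what the "Frame at {t}s:" labels further down suggest):

```python
import numpy as np

total_frames, fps = 300, 30.0   # e.g. a 10-second clip at 30 fps
frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
# Timestamps as used in the "Frame at {t}s:" labels, assuming t = index / fps
timestamps = [round(int(i) / fps, 2) for i in frame_indices]
print(list(frame_indices))   # [0, 33, 66, 99, 132, 166, 199, 232, 265, 299]
print(timestamps)            # [0.0, 1.1, 2.2, 3.3, 4.4, 5.53, 6.63, 7.73, 8.83, 9.97]
```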
@@ -97,6 +124,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     Generates responses using the selected model for image input.
     Yields raw text and Markdown-formatted text.
     """
+    processor = None
+    model = None
+
     if model_name == "Qwen2.5-VL-7B-Instruct":
         processor = processor_m
         model = model_m
@@ -106,9 +136,11 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "Qwen2.5-VL-7B-Abliterated-Caption-it":
         processor = processor_q
         model = model_q
-    elif model_name == "…
-        processor = …
-        model = …
+    elif model_name == "DeepCaption-VLA-7B":
+        processor = processor_dc
+        model = model_dc
+        # Prepend system prompt for this model
+        text = f"{CAPTION_SYSTEM_PROMPT}\n\n{text}"
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
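One design choice worth noting: the new branch injects the captioning protocol by prepending it to the user text rather than placing it in the chat's system turn. A hedged sketch of the system-role alternative (not what the commit does), for comparison:

```python
# Alternative (NOT what this commit does): carry the captioning protocol as a
# proper system turn instead of prepending it to the user text.
# CAPTION_SYSTEM_PROMPT is the constant added above; "images/A.jpg" is one of
# the Space's bundled example images.
messages = [
    {"role": "system", "content": [{"type": "text", "text": CAPTION_SYSTEM_PROMPT}]},
    {"role": "user", "content": [
        {"type": "image", "image": "images/A.jpg"},
        {"type": "text", "text": "Provide a detailed caption for the image."},
    ]},
]
```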
@@ -133,10 +165,21 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
+
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {…
+    generation_kwargs = {
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repetition_penalty": repetition_penalty,
+        "do_sample": True,
+    }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
+
     buffer = ""
     for new_text in streamer:
         buffer += new_text
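The rebuilt block is the standard transformers streaming pattern: `generate` blocks until done, so it runs on a worker thread while the caller drains the `TextIteratorStreamer`. A minimal, self-contained text-only sketch (using `gpt2` as a stand-in, not one of the Space's VLMs):

```python
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok("The quick brown fox", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until finished, so it runs on a worker thread while the
# main thread consumes decoded chunks as they arrive.
thread = Thread(
    target=model.generate,
    kwargs={**inputs, "streamer": streamer, "max_new_tokens": 32},
)
thread.start()

buffer = ""
for new_text in streamer:   # the iterator ends when generation finishes
    buffer += new_text
print(buffer)
```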
@@ -154,6 +197,9 @@ def generate_video(model_name: str, text: str, video_path: str,
     Generates responses using the selected model for video input.
     Yields raw text and Markdown-formatted text.
     """
+    processor = None
+    model = None
+
     if model_name == "Qwen2.5-VL-7B-Instruct":
         processor = processor_m
         model = model_m
@@ -163,9 +209,11 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "Qwen2.5-VL-7B-Abliterated-Caption-it":
         processor = processor_q
         model = model_q
-    elif model_name == "…
-        processor = …
-        model = …
+    elif model_name == "DeepCaption-VLA-7B":
+        processor = processor_dc
+        model = model_dc
+        # Prepend system prompt for this model
+        text = f"{CAPTION_SYSTEM_PROMPT}\n\n{text}"
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -175,14 +223,19 @@ def generate_video(model_name: str, text: str, video_path: str,
         return
 
     frames = downsample_video(video_path)
+    # Create the message structure with a system prompt and user query
     messages = [
         {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
         {"role": "user", "content": [{"type": "text", "text": text}]}
     ]
+
+    # Add each frame to the user content
     for frame in frames:
         image, timestamp = frame
-        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
+        messages[1]["content"].append({"type": "text", "text": f"Frame at {timestamp}s:"})
         messages[1]["content"].append({"type": "image", "image": image})
+
+    # Prepare inputs for the model
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
@@ -192,6 +245,7 @@ def generate_video(model_name: str, text: str, video_path: str,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
+
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
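Both code paths funnel the message list through `processor.apply_chat_template(...)`, which renders the chat turns and tokenizes text and image placeholders in one call. A hedged reconstruction of that call (only `tokenize`, `truncation`, and `max_length` are visible in this diff; the other flags are assumptions based on common Qwen2.5-VL usage and may differ in the real app.py):

```python
# Hedged reconstruction of the call the surrounding hunks wrap.
# tokenize / truncation / max_length appear in the diff; the flags marked
# "assumed" follow common Qwen2.5-VL usage.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,   # assumed
    return_dict=True,             # assumed
    return_tensors="pt",          # assumed
    truncation=False,
    max_length=MAX_INPUT_TOKEN_LENGTH,
).to(device)
```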
@@ -205,12 +259,14 @@ def generate_video(model_name: str, text: str, video_path: str,
     }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
+
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         time.sleep(0.01)
         yield buffer, buffer
 
+
 # Define examples for image and video inference
 image_examples = [
     ["Provide a detailed caption for the image..", "images/A.jpg"],
@@ -260,24 +316,29 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                     gr.Examples(
                         examples=video_examples,
                         inputs=[video_query, video_upload]
-                    )
+                    )
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-
+
         with gr.Column():
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
                 output = gr.Textbox(label="Raw Output", interactive=False, lines=2, scale=2)
-
+
                 with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown()
-
+
             model_choice = gr.Radio(
-                choices=[…
+                choices=[
+                    "Qwen2.5-VL-7B-Instruct",
+                    "Qwen2.5-VL-3B-Instruct",
+                    "Qwen2.5-VL-7B-Abliterated-Caption-it",
+                    "DeepCaption-VLA-7B"
+                ],
                 label="Select Model",
                 value="Qwen2.5-VL-7B-Instruct"
             )
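The expanded `choices` list is what routes requests: the Radio's selected string arrives as the `model_name` argument of `generate_image`/`generate_video`. A stripped-down sketch of that wiring (toy callback, not the Space's actual UI):

```python
import gradio as gr

def answer(model_name: str, query: str) -> str:
    # Stand-in for generate_image: the Radio's current string lands in model_name
    return f"[{model_name}] would answer: {query}"

with gr.Blocks() as demo:
    choice = gr.Radio(
        choices=["Qwen2.5-VL-7B-Instruct", "DeepCaption-VLA-7B"],
        label="Select Model",
        value="Qwen2.5-VL-7B-Instruct",
    )
    query = gr.Textbox(label="Query")
    out = gr.Textbox(label="Raw Output")
    gr.Button("Submit").click(fn=answer, inputs=[choice, query], outputs=out)

demo.launch()
```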
@@ -285,9 +346,9 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
             gr.Markdown("> [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct): The Qwen2.5-VL-7B-Instruct model is a multimodal AI model developed by Alibaba Cloud that excels at understanding both text and images. It's a Vision-Language Model (VLM) designed to handle various visual understanding tasks, including image understanding, video analysis, and even multilingual support.")
             gr.Markdown("> [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct): Qwen2.5-VL-3B-Instruct is an instruction-tuned vision-language model from Alibaba Cloud, built upon the Qwen2-VL series. It excels at understanding and generating text related to both visual and textual inputs, making it capable of tasks like image captioning, visual question answering, and object localization. The model also supports long video understanding and structured data extraction")
             gr.Markdown("> [Qwen2.5-VL-7B-Abliterated-Caption-it](https://huggingface.co/prithivMLmods/Qwen2.5-VL-7B-Abliterated-Caption-it): Qwen2.5-VL-7B-Abliterated-Caption-it is a fine-tuned version of Qwen2.5-VL-7B-Instruct, optimized for Abliterated Captioning / Uncensored Captioning. This model excels at generating detailed, context-rich, and high-fidelity captions across diverse image categories and variational aspect ratios, offering robust visual understanding without filtering or censorship.")
-            gr.Markdown("> […
+            gr.Markdown("> [prithivMLmods/DeepCaption-VLA-7B](https://huggingface.co/prithivMLmods/DeepCaption-VLA-7B): DeepCaption-VLA-7B is a fine-tuned model based on Qwen2.5-VL, designed for generating precise, structured captions and attributes for images. It follows a strict protocol to provide a main caption, a list of visual attributes, and a core class name, making it ideal for detailed and organized visual analysis.")
             gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
-
+
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
@@ -300,4 +361,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     )
 
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(share=True, …
+    demo.queue(max_size=50).launch(share=True, show_error=True)