app.py
CHANGED
@@ -78,6 +78,22 @@ def visualize(pred_mask, image_path, work_dir):
 
 @spaces.GPU
 def image_vision(image_input_path, prompt):
+    """Perform image-based visual question answering and segmentation.
+
+    This function takes an image and a text prompt (instruction) as input, processes the image with a
+    multimodal model, and returns a textual answer. If the model response includes a segmentation token ("[SEG]")
+    and segmentation visualization is available, a visual output is also generated.
+
+    Args:
+        image_input_path (str): The path to the input image file.
+        prompt (str): The instruction or question about the image.
+
+    Returns:
+        Tuple[str, Optional[str]]:
+            - A textual answer generated by the model.
+            - If segmentation is requested (indicated by '[SEG]' in the answer), the path to the segmented image file;
+              otherwise, None.
+    """
     image_path = image_input_path
     text_prompts = f"<image>{prompt}"
     image = Image.open(image_path).convert('RGB')
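The docstring above doubles as a usage contract for the endpoint. A minimal in-process sketch of exercising image_vision, assuming the Space is running with its model loaded (the image path and prompt are illustrative):

# Hypothetical smoke test; assumes the model used inside image_vision is initialized.
answer, seg_path = image_vision("examples/dog.jpg", "Please segment the dog.")
print(answer)                       # the model's textual answer
if "[SEG]" in answer and seg_path:  # segmentation token present in the answer
    print(f"Segmented image saved at: {seg_path}")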
@@ -106,6 +122,23 @@ def image_vision(image_input_path, prompt):
 
 @spaces.GPU(duration=80)
 def video_vision(video_input_path, prompt, video_interval):
+    """Perform video-based visual question answering and segmentation.
+
+    This function analyzes a video file using a multimodal vision-language model. It extracts frames based
+    on a sampling interval, feeds the frames and prompt to the model, and returns a response. If segmentation
+    is requested, it produces two videos: one with overlaid masks and one with binary masks only.
+
+    Args:
+        video_input_path (str): The path to the input video file.
+        prompt (str): The instruction or question about the video.
+        video_interval (int): Frame sampling interval. A value of 1 processes every frame, 2 every second frame, etc.
+
+    Returns:
+        Tuple[str, Optional[str], Optional[str]]:
+            - The model-generated textual answer.
+            - If segmentation is requested (contains '[SEG]'), the path to the segmented output video file; otherwise, None.
+            - If segmentation is requested, the path to a binary mask-only video; otherwise, None.
+    """
     # Open the original video
     cap = cv2.VideoCapture(video_input_path)
 
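Likewise, a minimal sketch for video_vision under the same assumptions (file name and interval are illustrative); with video_interval=6, every sixth frame is sampled before inference:

# Hypothetical call; returns (answer, overlay video path, binary mask video path).
answer, overlay_video, mask_video = video_vision("examples/clip.mp4", "Track the red car.", 6)
print(answer)
if "[SEG]" in answer:
    print(f"Masks overlaid on frames: {overlay_video}")
    print(f"Binary mask-only video:   {mask_video}")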
@@ -243,4 +276,4 @@ with gr.Blocks(analytics_enabled=False) as demo:
         outputs = [vid_output_res, output_video, masked_output]
     )
 
-demo.queue().launch(show_api=True, show_error=True, ssr_mode=False)
+demo.queue().launch(show_api=True, show_error=True, ssr_mode=False, mcp_server=True)
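The launch change is what enables MCP: in recent Gradio releases, mcp_server=True serves the app's API endpoints as MCP tools, with the docstrings added above becoming the tool descriptions, while show_api=True keeps the regular client API. A sketch of calling the updated Space with gradio_client; the Space id and endpoint name here are assumptions:

from gradio_client import Client, handle_file

client = Client("user/space-name")        # hypothetical Space id
answer, seg_path = client.predict(
    handle_file("examples/dog.jpg"),      # image_input_path
    "Please segment the dog.",            # prompt
    api_name="/image_vision",             # assumed endpoint name derived from the function
)
print(answer, seg_path)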