app.py
CHANGED
@@ -78,6 +78,22 @@ def visualize(pred_mask, image_path, work_dir):
 
 @spaces.GPU
 def image_vision(image_input_path, prompt):
+    """Perform image-based visual question answering and segmentation.
+
+    This function takes an image and a text prompt (instruction) as input, processes the image with a
+    multimodal model, and returns a textual answer. If the model response includes a segmentation token ("[SEG]")
+    and segmentation visualization is available, a visual output is also generated.
+
+    Args:
+        image_input_path (str): The path to the input image file.
+        prompt (str): The instruction or question about the image.
+
+    Returns:
+        Tuple[str, Optional[str]]:
+            - A textual answer generated by the model.
+            - If segmentation is requested (indicated by '[SEG]' in the answer), the path to the segmented image file;
+              otherwise, None.
+    """
     image_path = image_input_path
     text_prompts = f"<image>{prompt}"
     image = Image.open(image_path).convert('RGB')
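The docstring above doubles as a usage contract for the endpoint. A minimal in-process sketch of exercising image_vision, assuming the Space is running with its model loaded (the image path and prompt are illustrative):

# Hypothetical smoke test; assumes the model used inside image_vision is initialized.
answer, seg_path = image_vision("examples/dog.jpg", "Please segment the dog.")
print(answer)                       # the model's textual answer
if "[SEG]" in answer and seg_path:  # segmentation token present in the answer
    print(f"Segmented image saved at: {seg_path}")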
@@ -106,6 +122,23 @@ def image_vision(image_input_path, prompt):
 
 @spaces.GPU(duration=80)
 def video_vision(video_input_path, prompt, video_interval):
+    """Perform video-based visual question answering and segmentation.
+
+    This function analyzes a video file using a multimodal vision-language model. It extracts frames based
+    on a sampling interval, feeds the frames and prompt to the model, and returns a response. If segmentation
+    is requested, it produces two videos: one with overlaid masks and one with binary masks only.
+
+    Args:
+        video_input_path (str): The path to the input video file.
+        prompt (str): The instruction or question about the video.
+        video_interval (int): Frame sampling interval. A value of 1 processes every frame, 2 every second frame, etc.
+
+    Returns:
+        Tuple[str, Optional[str], Optional[str]]:
+            - The model-generated textual answer.
+            - If segmentation is requested (contains '[SEG]'), the path to the segmented output video file; otherwise, None.
+            - If segmentation is requested, the path to a binary mask-only video; otherwise, None.
+    """
     # Open the original video
     cap = cv2.VideoCapture(video_input_path)
 
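Likewise, a minimal sketch for video_vision under the same assumptions (file name and interval are illustrative); with video_interval=6, every sixth frame is sampled before inference:

# Hypothetical call; returns (answer, overlay video path, binary mask video path).
answer, overlay_video, mask_video = video_vision("examples/clip.mp4", "Track the red car.", 6)
print(answer)
if "[SEG]" in answer:
    print(f"Masks overlaid on frames: {overlay_video}")
    print(f"Binary mask-only video:   {mask_video}")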
@@ -243,4 +276,4 @@ with gr.Blocks(analytics_enabled=False) as demo:
         outputs = [vid_output_res, output_video, masked_output]
     )
 
-demo.queue().launch(show_api=True, show_error=True, ssr_mode=False)
+demo.queue().launch(show_api=True, show_error=True, ssr_mode=False, mcp_server=True)
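The launch change is what enables MCP: in recent Gradio releases, mcp_server=True serves the app's API endpoints as MCP tools, with the docstrings added above becoming the tool descriptions, while show_api=True keeps the regular client API. A sketch of calling the updated Space with gradio_client; the Space id and endpoint name here are assumptions:

from gradio_client import Client, handle_file

client = Client("user/space-name")        # hypothetical Space id
answer, seg_path = client.predict(
    handle_file("examples/dog.jpg"),      # image_input_path
    "Please segment the dog.",            # prompt
    api_name="/image_vision",             # assumed endpoint name derived from the function
)
print(answer, seg_path)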