fffiloni committed on
Commit
b0132e3
·
verified ·
1 Parent(s): d0ffca0
Files changed (1) hide show
  1. app.py +34 -1
app.py CHANGED
@@ -78,6 +78,22 @@ def visualize(pred_mask, image_path, work_dir):
78
 
79
  @spaces.GPU
80
  def image_vision(image_input_path, prompt):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  image_path = image_input_path
82
  text_prompts = f"<image>{prompt}"
83
  image = Image.open(image_path).convert('RGB')
@@ -106,6 +122,23 @@ def image_vision(image_input_path, prompt):
106
 
107
  @spaces.GPU(duration=80)
108
  def video_vision(video_input_path, prompt, video_interval):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  # Open the original video
110
  cap = cv2.VideoCapture(video_input_path)
111
 
@@ -243,4 +276,4 @@ with gr.Blocks(analytics_enabled=False) as demo:
243
  outputs = [vid_output_res, output_video, masked_output]
244
  )
245
 
246
- demo.queue().launch(show_api=False, show_error=True)
 
78
 
79
  @spaces.GPU
80
  def image_vision(image_input_path, prompt):
81
+ """Perform image-based visual question answering and segmentation.
82
+
83
+ This function takes an image and a text prompt (instruction) as input, processes the image with a
84
+ multimodal model, and returns a textual answer. If the model response includes a segmentation token ("[SEG]"),
85
+ and segmentation visualization is available, a visual output is also generated.
86
+
87
+ Args:
88
+ image_input_path (str): The path to the input image file.
89
+ prompt (str): The instruction or question about the image.
90
+
91
+ Returns:
92
+ Tuple[str, Optional[str]]:
93
+ - A textual answer generated by the model.
94
+ - If segmentation is requested (indicated by '[SEG]' in the answer), the path to the segmented image file;
95
+ otherwise, returns None.
96
+ """
97
  image_path = image_input_path
98
  text_prompts = f"<image>{prompt}"
99
  image = Image.open(image_path).convert('RGB')
 
122
 
123
  @spaces.GPU(duration=80)
124
  def video_vision(video_input_path, prompt, video_interval):
125
+ """Perform video-based visual question answering and segmentation.
126
+
127
+ This function analyzes a video file using a multimodal vision-language model. It extracts frames based
128
+ on a sampling interval, feeds the frames and prompt to the model, and returns a response. If segmentation
129
+ is requested, it produces two videos: one with overlaid masks, and one with binary masks only.
130
+
131
+ Args:
132
+ video_input_path (str): The path to the input video file.
133
+ prompt (str): The instruction or question about the video.
134
+ video_interval (int): Frame sampling interval. A value of 1 processes every frame, 2 every second frame, etc.
135
+
136
+ Returns:
137
+ Tuple[str, Optional[str], Optional[str]]:
138
+ - The model-generated textual answer.
139
+ - If segmentation is requested (contains '[SEG]'), the path to the segmented output video file.
140
+ - If segmentation is requested, the path to a binary mask-only video; otherwise, None.
141
+ """
142
  # Open the original video
143
  cap = cv2.VideoCapture(video_input_path)
144
 
 
276
  outputs = [vid_output_res, output_video, masked_output]
277
  )
278
 
279
+ demo.queue().launch(show_api=True, show_error=True, ssr_mode=False, mcp_server=True)