MonilM committed
Commit 488a535 · Parent(s): 39d7794

Updated Requirements, Last Straw

Files changed (3):
  1. app.py +193 -184
  2. requirements.txt +21 -17
  3. runtime.txt +1 -0
app.py CHANGED
@@ -154,194 +154,203 @@ async def detect_objects_yolo_world(
 
 # --- Gradio UI Functions ---
 
-# Keep detect_objects_ui function
-def detect_objects_ui(image_pil: Image.Image, profile: str, confidence: float, iou: float): # Add iou parameter
-    """Gradio function for YOLO-World object detection."""
-    # Create a placeholder image for errors or no input
-    placeholder_img = Image.new('RGB', (640, 480), color = (150, 150, 150))
-    draw_placeholder = ImageDraw.Draw(placeholder_img)
-
-    if image_pil is None:
-        draw_placeholder.text((10, 10), "Please upload an image.", fill=(255,255,255))
-        return placeholder_img, "Please upload an image."
-
-    # Check if the correct model structure is available
-    profile_lower = profile.lower()
-    if profile_lower not in profile_models:
-        error_msg = f"Error: Model for profile '{profile_lower}' not loaded."
-        logger.error(f"UI requested profile '{profile_lower}' but model not loaded.")
-        # Return original image with error drawn on it
-        try:
-            error_img_out = image_pil.copy()
-            draw_error = ImageDraw.Draw(error_img_out)
-            draw_error.text((10, 10), error_msg, fill="red", font=ImageFont.load_default())
-            return error_img_out, error_msg
-        except Exception: # Fallback if drawing on input fails
-            draw_placeholder.text((10, 10), error_msg, fill="red")
-            return placeholder_img, error_msg
-
-
-    model_to_use = profile_models[profile_lower]
-    name_map_to_use = profile_class_maps[profile_lower]
-
-    try:
-        # Ensure image is PIL Image and in RGB
-        if not isinstance(image_pil, Image.Image):
-            if isinstance(image_pil, np.ndarray):
-                image_pil = Image.fromarray(image_pil).convert("RGB")
-            else:
-                error_msg = "Error: Invalid image input type."
-                draw_placeholder.text((10, 10), error_msg, fill="red")
-                return placeholder_img, error_msg
-        else:
-            image_pil = image_pil.convert("RGB")
-
-        # Run detection using the pre-configured model
-        logger.info(f"Running YOLO-World detection (UI) with profile: {profile_lower}, confidence: {confidence}, iou: {iou}")
-        results = model_to_use.predict(image_pil, conf=confidence, iou=iou, verbose=False)
-
-        # Process results using the helper and the stored map
-        original_w, original_h = image_pil.width, image_pil.height
-        if results and results[0] and results[0].orig_shape:
-            original_h, original_w = results[0].orig_shape[:2]
-        detections = process_prediction_results(
-            results, original_w, original_h, name_map_to_use
-        )
-
-        # Draw boxes on a copy of the image for Gradio output
-        output_image = image_pil.copy()
-        draw = ImageDraw.Draw(output_image)
-        try:
-            font = ImageFont.truetype("arial.ttf", 15)
-        except IOError:
-            font = ImageFont.load_default()
-
-        labels = []
-        if not detections:
-            labels.append("No objects detected.")
-        else:
-            for det in detections:
-                box = det['box']
-                label = f"{det['class_name']}: {det['confidence']:.2f}"
-                labels.append(label)
-                color = "red"
-                draw.rectangle(
-                    [(box['x1'], box['y1']), (box['x2'], box['y2'])],
-                    outline=color, width=3
-                )
-                text_position = (box['x1'], box['y1'] - 15 if box['y1'] > 15 else box['y1'])
-                # Use textbbox for better background calculation
-                try:
-                    text_bbox = draw.textbbox(text_position, label, font=font)
-                    # Adjust background size slightly
-                    bg_coords = (text_bbox[0]-1, text_bbox[1]-1, text_bbox[2]+1, text_bbox[3]+1)
-                    draw.rectangle(bg_coords, fill=color)
-                    draw.text(text_position, label, fill="white", font=font)
-                except AttributeError: # Fallback for older Pillow versions without textbbox
-                    draw.text(text_position, label, fill=color, font=font)
-
-
-        logger.info(f"UI Detection Results: {labels}")
-        return output_image, "\n".join(labels)
-
-    except Exception as e:
-        error_msg = f"Error: {str(e)}"
-        logger.error(f"Error in detect_objects_ui: {e}", exc_info=True)
-        # Return original image with error message drawn
-        try:
-            error_img_out = image_pil.copy()
-            draw_error = ImageDraw.Draw(error_img_out)
-            draw_error.text((10, 10), error_msg, fill="red", font=ImageFont.load_default())
-            return error_img_out, error_msg
-        except Exception: # Fallback if drawing on input fails
-            draw_placeholder.text((10, 10), error_msg, fill="red")
-            return placeholder_img, error_msg
-
-
-# --- Create Gradio Interface ---
-# Add theme and descriptions
-theme = gr.themes.Soft() # Example theme
-
-with gr.Blocks(title="IPD-Lingual API", theme=theme) as demo:
-    gr.Markdown("# IPD-Lingual: Speech & Vision API")
-    gr.Markdown("An API providing speech transcription/translation and object detection capabilities.")
-
-    with gr.Tab("Home / About"):
-        gr.Markdown("## Welcome!")
-        gr.Markdown(
-            """
-            This application provides two main functionalities accessible via API endpoints and a demonstration UI:
-
-            1. **Speech Processing (`/api/speech`):**
-                * Accepts an audio file and two language codes (e.g., 'en', 'es').
-                * Uses **OpenAI Whisper (base model)** to transcribe the audio, automatically detecting which of the two provided languages is spoken.
-                * Uses the **googletrans library** (unofficial Google Translate API) to translate the transcribed text into the *other* provided language.
-                * Returns the detected language, original transcription, and translation.
-
-            2. **Object Detection (`/api/detect_objects`):**
-                * Accepts an image file, a detection profile (e.g., 'casual', 'vehicles'), optional extra object names, confidence threshold, and IoU threshold.
-                * Uses **YOLO-World (yolov8l-worldv2.pt)**, a powerful zero-shot object detection model from Ultralytics.
-                * It can detect objects based on predefined profiles or dynamically based on user-provided text prompts (extra words).
-                * Returns a list of detected objects with their bounding boxes, class names, and confidence scores.
-
-            Use the tabs above to try out the object detection functionality or see the API endpoint details below.
-            *(Note: The speech processing functionality is currently only available via the API endpoint).*
-            """
-        )
-        gr.Markdown("---")
-        gr.Markdown("### API Endpoint Summary")
-        gr.Markdown("- **POST `/api/speech`**: Transcribe and Translate audio.\n - **Type**: `multipart/form-data`\n - **Fields**: `audio` (file), `lang1` (string), `lang2` (string)")
-        gr.Markdown("- **POST `/api/detect_objects`**: Detect objects using YOLO-World.\n - **Type**: `multipart/form-data`\n - **Fields**: `image` (file), `profile` (string), `extra_words` (string, optional, comma-separated or JSON list), `confidence` (float, optional), `iou` (float, optional)")
-
-
-    # Keep the "Object Detection" Tab
-    with gr.Tab("Object Detection Demo"):
-        gr.Markdown("## Detect Objects in Image (using YOLO-World)")
-        gr.Markdown("Upload an image and select a detection profile. The model will identify objects belonging to that profile.")
-        with gr.Row():
-            with gr.Column(scale=1): # Input column slightly smaller
-                image_input = gr.Image(type="pil", label="Upload Image")
-                profile_select = gr.Dropdown(
-                    choices=sorted(list(PREDEFINED_CLASSES.keys())),
-                    value="casual",
-                    label="Detection Profile"
-                )
-                confidence_slider = gr.Slider(
-                    minimum=0.001, maximum=1.0, value=0.01, step=0.001,
-                    label="Confidence Threshold"
-                )
-                iou_slider = gr.Slider(
-                    minimum=0.01, maximum=1.0, value=0.2, step=0.01,
-                    label="IoU Threshold (NMS)"
-                )
-                detect_btn = gr.Button("Detect Objects", variant="primary") # Make button primary
-            with gr.Column(scale=2): # Output column larger
-                image_output = gr.Image(label="Detection Result", interactive=False) # Output not interactive
-                labels_output = gr.Textbox(label="Detected Objects", lines=10, interactive=False)
-
-        # Ensure the click event is correctly wired
-        detect_btn.click(
-            fn=detect_objects_ui,
-            inputs=[image_input, profile_select, confidence_slider, iou_slider],
-            outputs=[image_output, labels_output]
-        )
-
-# Mount both FastAPI and Gradio
-# Ensure the Gradio app uses the FastAPI instance `app`
-app = gr.mount_gradio_app(app, demo, path="/")
-
-# ... (rest of the file remains the same) ...
+# # Keep detect_objects_ui function
+# def detect_objects_ui(image_pil: Image.Image, profile: str, confidence: float, iou: float): # Add iou parameter
+#     """Gradio function for YOLO-World object detection."""
+#     # Create a placeholder image for errors or no input
+#     placeholder_img = Image.new('RGB', (640, 480), color = (150, 150, 150))
+#     draw_placeholder = ImageDraw.Draw(placeholder_img)
+
+#     if image_pil is None:
+#         draw_placeholder.text((10, 10), "Please upload an image.", fill=(255,255,255))
+#         return placeholder_img, "Please upload an image."
+
+#     # Check if the correct model structure is available
+#     profile_lower = profile.lower()
+#     if profile_lower not in profile_models:
+#         error_msg = f"Error: Model for profile '{profile_lower}' not loaded."
+#         logger.error(f"UI requested profile '{profile_lower}' but model not loaded.")
+#         # Return original image with error drawn on it
+#         try:
+#             error_img_out = image_pil.copy()
+#             draw_error = ImageDraw.Draw(error_img_out)
+#             draw_error.text((10, 10), error_msg, fill="red", font=ImageFont.load_default())
+#             return error_img_out, error_msg
+#         except Exception: # Fallback if drawing on input fails
+#             draw_placeholder.text((10, 10), error_msg, fill="red")
+#             return placeholder_img, error_msg
+
+
+#     model_to_use = profile_models[profile_lower]
+#     name_map_to_use = profile_class_maps[profile_lower]
+
+#     try:
+#         # Ensure image is PIL Image and in RGB
+#         if not isinstance(image_pil, Image.Image):
+#             if isinstance(image_pil, np.ndarray):
+#                 image_pil = Image.fromarray(image_pil).convert("RGB")
+#             else:
+#                 error_msg = "Error: Invalid image input type."
+#                 draw_placeholder.text((10, 10), error_msg, fill="red")
+#                 return placeholder_img, error_msg
+#         else:
+#             image_pil = image_pil.convert("RGB")
+
+#         # Run detection using the pre-configured model
+#         logger.info(f"Running YOLO-World detection (UI) with profile: {profile_lower}, confidence: {confidence}, iou: {iou}")
+#         results = model_to_use.predict(image_pil, conf=confidence, iou=iou, verbose=False)
+
+#         # Process results using the helper and the stored map
+#         original_w, original_h = image_pil.width, image_pil.height
+#         if results and results[0] and results[0].orig_shape:
+#             original_h, original_w = results[0].orig_shape[:2]
+#         detections = process_prediction_results(
+#             results, original_w, original_h, name_map_to_use
+#         )
+
+#         # Draw boxes on a copy of the image for Gradio output
+#         output_image = image_pil.copy()
+#         draw = ImageDraw.Draw(output_image)
+#         try:
+#             font = ImageFont.truetype("arial.ttf", 15)
+#         except IOError:
+#             font = ImageFont.load_default()
+
+#         labels = []
+#         if not detections:
+#             labels.append("No objects detected.")
+#         else:
+#             for det in detections:
+#                 box = det['box']
+#                 label = f"{det['class_name']}: {det['confidence']:.2f}"
+#                 labels.append(label)
+#                 color = "red"
+#                 draw.rectangle(
+#                     [(box['x1'], box['y1']), (box['x2'], box['y2'])],
+#                     outline=color, width=3
+#                 )
+#                 text_position = (box['x1'], box['y1'] - 15 if box['y1'] > 15 else box['y1'])
+#                 # Use textbbox for better background calculation
+#                 try:
+#                     text_bbox = draw.textbbox(text_position, label, font=font)
+#                     # Adjust background size slightly
+#                     bg_coords = (text_bbox[0]-1, text_bbox[1]-1, text_bbox[2]+1, text_bbox[3]+1)
+#                     draw.rectangle(bg_coords, fill=color)
+#                     draw.text(text_position, label, fill="white", font=font)
+#                 except AttributeError: # Fallback for older Pillow versions without textbbox
+#                     draw.text(text_position, label, fill=color, font=font)
+
+
+#         logger.info(f"UI Detection Results: {labels}")
+#         return output_image, "\n".join(labels)
+
+#     except Exception as e:
+#         error_msg = f"Error: {str(e)}"
+#         logger.error(f"Error in detect_objects_ui: {e}", exc_info=True)
+#         # Return original image with error message drawn
+#         try:
+#             error_img_out = image_pil.copy()
+#             draw_error = ImageDraw.Draw(error_img_out)
+#             draw_error.text((10, 10), error_msg, fill="red", font=ImageFont.load_default())
+#             return error_img_out, error_msg
+#         except Exception: # Fallback if drawing on input fails
+#             draw_placeholder.text((10, 10), error_msg, fill="red")
+#             return placeholder_img, error_msg
+
+
+# # --- Create Gradio Interface ---
+# # Add theme and descriptions
+# theme = gr.themes.Soft() # Example theme
+
+# with gr.Blocks(title="IPD-Lingual API", theme=theme) as demo:
+#     gr.Markdown("# IPD-Lingual: Speech & Vision API")
+#     gr.Markdown("An API providing speech transcription/translation and object detection capabilities.")
+
+#     with gr.Tab("Home / About"):
+#         gr.Markdown("## Welcome!")
+#         gr.Markdown(
+#             """
+#             This application provides two main functionalities accessible via API endpoints and a demonstration UI:
+
+#             1. **Speech Processing (`/api/speech`):**
+#                 * Accepts an audio file and two language codes (e.g., 'en', 'es').
+#                 * Uses **OpenAI Whisper (base model)** to transcribe the audio, automatically detecting which of the two provided languages is spoken.
+#                 * Uses the **googletrans library** (unofficial Google Translate API) to translate the transcribed text into the *other* provided language.
+#                 * Returns the detected language, original transcription, and translation.
+
+#             2. **Object Detection (`/api/detect_objects`):**
+#                 * Accepts an image file, a detection profile (e.g., 'casual', 'vehicles'), optional extra object names, confidence threshold, and IoU threshold.
+#                 * Uses **YOLO-World (yolov8l-worldv2.pt)**, a powerful zero-shot object detection model from Ultralytics.
+#                 * It can detect objects based on predefined profiles or dynamically based on user-provided text prompts (extra words).
+#                 * Returns a list of detected objects with their bounding boxes, class names, and confidence scores.
+
+#             Use the tabs above to try out the object detection functionality or see the API endpoint details below.
+#             *(Note: The speech processing functionality is currently only available via the API endpoint).*
+#             """
+#         )
+#         gr.Markdown("---")
+#         gr.Markdown("### API Endpoint Summary")
+#         gr.Markdown("- **POST `/api/speech`**: Transcribe and Translate audio.\n - **Type**: `multipart/form-data`\n - **Fields**: `audio` (file), `lang1` (string), `lang2` (string)")
+#         gr.Markdown("- **POST `/api/detect_objects`**: Detect objects using YOLO-World.\n - **Type**: `multipart/form-data`\n - **Fields**: `image` (file), `profile` (string), `extra_words` (string, optional, comma-separated or JSON list), `confidence` (float, optional), `iou` (float, optional)")
+
+
+#     # Keep the "Object Detection" Tab
+#     with gr.Tab("Object Detection Demo"):
+#         gr.Markdown("## Detect Objects in Image (using YOLO-World)")
+#         gr.Markdown("Upload an image and select a detection profile. The model will identify objects belonging to that profile.")
+#         with gr.Row():
+#             with gr.Column(scale=1): # Input column slightly smaller
+#                 image_input = gr.Image(type="pil", label="Upload Image")
+#                 profile_select = gr.Dropdown(
+#                     choices=sorted(list(PREDEFINED_CLASSES.keys())),
+#                     value="casual",
+#                     label="Detection Profile"
+#                 )
+#                 confidence_slider = gr.Slider(
+#                     minimum=0.001, maximum=1.0, value=0.01, step=0.001,
+#                     label="Confidence Threshold"
+#                 )
+#                 iou_slider = gr.Slider(
+#                     minimum=0.01, maximum=1.0, value=0.2, step=0.01,
+#                     label="IoU Threshold (NMS)"
+#                 )
+#                 detect_btn = gr.Button("Detect Objects", variant="primary") # Make button primary
+#             with gr.Column(scale=2): # Output column larger
+#                 image_output = gr.Image(label="Detection Result", interactive=False) # Output not interactive
+#                 labels_output = gr.Textbox(label="Detected Objects", lines=10, interactive=False)
+
+#         # Ensure the click event is correctly wired
+#         detect_btn.click(
+#             fn=detect_objects_ui,
+#             inputs=[image_input, profile_select, confidence_slider, iou_slider],
+#             outputs=[image_output, labels_output]
+#         )
+
+#     # Mount both FastAPI and Gradio
+#     # Ensure the Gradio app uses the FastAPI instance `app`
+#     app = gr.mount_gradio_app(app, demo, path="/")
+
+# # ... (rest of the file remains the same) ...
+
+# if __name__ == "__main__":
+#     import uvicorn
+#     # Check if YOLO models initialized before starting server
+#     # Update check to use the new model variables
+#     if not profile_models or dynamic_model is None:
+#         logger.error(f"CRITICAL: One or more YOLO-World models ({MODEL_NAME}) failed to initialize. API endpoint /api/detect_objects might not work correctly.")
+#         # Decide if you want to exit or run with degraded functionality
+#         # exit(1) # Optional: exit if model loading fails
+#     else:
+#         logger.info("All required YOLO models initialized successfully.")
+
+#     print("Starting Uvicorn server on http://0.0.0.0:7860")
+#     uvicorn.run(app, host="0.0.0.0", port=7860)
 
 if __name__ == "__main__":
     import uvicorn
-    # Check if YOLO models initialized before starting server
-    # Update check to use the new model variables
     if not profile_models or dynamic_model is None:
-        logger.error(f"CRITICAL: One or more YOLO-World models ({MODEL_NAME}) failed to initialize. API endpoint /api/detect_objects might not work correctly.")
-        # Decide if you want to exit or run with degraded functionality
-        # exit(1) # Optional: exit if model loading fails
+        logger.error(f"CRITICAL: One or more YOLO-World models failed to initialize. API endpoint /api/detect_objects might not work correctly.")
     else:
         logger.info("All required YOLO models initialized successfully.")
-
     print("Starting Uvicorn server on http://0.0.0.0:7860")
-    uvicorn.run(app, host="0.0.0.0", port=7860)
+    uvicorn.run(app, host="0.0.0.0", port=7860)
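For reference, here is a minimal sketch of calling the two endpoints listed in the API Endpoint Summary above. The base URL, file names, and parameter values are illustrative assumptions; only the endpoint paths and form field names come from the summary in the diff.

```python
# Hypothetical client calls; assumes the server is running locally on port 7860
# and that sample.jpg / sample.wav exist (placeholder file names).
import requests

BASE = "http://localhost:7860"

# POST /api/detect_objects: multipart/form-data with an image plus optional thresholds.
with open("sample.jpg", "rb") as f:
    resp = requests.post(
        f"{BASE}/api/detect_objects",
        files={"image": f},
        data={"profile": "casual", "confidence": "0.25", "iou": "0.5"},
    )
print(resp.json())  # expected: bounding boxes, class names, confidence scores

# POST /api/speech: audio file plus the two candidate language codes.
with open("sample.wav", "rb") as f:
    resp = requests.post(
        f"{BASE}/api/speech",
        files={"audio": f},
        data={"lang1": "en", "lang2": "es"},
    )
print(resp.json())  # expected: detected language, transcription, translation
```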
requirements.txt CHANGED
@@ -1,20 +1,24 @@
-fastapi==0.110.0
-uvicorn==0.29.0
-gradio==3.50.2
-Pillow==10.2.0
-numpy==1.26.4
-openai-whisper==20240930
-ultralytics==8.0.203
-torch==2.2.1
-pydantic==1.10.13
-python-multipart==0.0.9
-opencv-python==4.9.0.80
-transformers==4.39.3
-sentencepiece==0.1.99
-scikit-image==0.25.2
-scipy==1.15.2
+pip>=23.0
+setuptools>=68.0
+wheel
+fastapi>=0.100
+uvicorn>=0.34.0
+# gradio==5.29.0
+Pillow>=11.0.0
+numpy==2.2.5
+openai-whisper==20240927
+ultralytics>=8.3.40
+torch>=2.6.0
+pydantic>=2.11.3
+python-multipart>=0.0.20
+opencv-python>=4.10.0.84
+transformers>=4.51.3
+sentencepiece>=0.2.0
+scikit-image>=0.25.2
+scipy>=1.15.2
 googletrans==4.0.2
 httpcore==1.0.9
-inference-gpu[yolo-world]==0.48.1
+roboflow==1.1.63
+# inference-gpu[yolo-world]==0.48.1  # Commented out due to numpy version conflicts
 git+https://github.com/ultralytics/CLIP.git
-git+https://github.com/roboflow/roboflow-python.git
+
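Since the commit message points at dependency trouble, a quick sanity check of an installed environment against these pins is to read back the resolved versions. A minimal sketch; the package list here is illustrative, not exhaustive:

```python
# Print the installed version of a few packages pinned in requirements.txt.
from importlib.metadata import PackageNotFoundError, version

for pkg in ("numpy", "torch", "ultralytics", "pydantic", "googletrans"):
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg} is not installed")
```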
runtime.txt ADDED
@@ -0,0 +1 @@
+python-3.11
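runtime.txt pins the interpreter version for the hosting platform. A one-line check that the active interpreter matches the pin (an illustrative assertion, not part of the commit):

```python
import sys

assert sys.version_info[:2] == (3, 11), f"expected Python 3.11, got {sys.version}"
```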