MonilM committed
Commit 488a535 · Parent(s): 39d7794

Updated Requirements, Last Straw

Files changed (3):
  1. app.py +193 -184
  2. requirements.txt +21 -17
  3. runtime.txt +1 -0
app.py CHANGED
@@ -154,194 +154,203 @@ async def detect_objects_yolo_world(
 
 # --- Gradio UI Functions ---
 
-# Keep detect_objects_ui function
-def detect_objects_ui(image_pil: Image.Image, profile: str, confidence: float, iou: float): # Add iou parameter
-    """Gradio function for YOLO-World object detection."""
-    # Create a placeholder image for errors or no input
-    placeholder_img = Image.new('RGB', (640, 480), color = (150, 150, 150))
-    draw_placeholder = ImageDraw.Draw(placeholder_img)
-
-    if image_pil is None:
-        draw_placeholder.text((10, 10), "Please upload an image.", fill=(255,255,255))
-        return placeholder_img, "Please upload an image."
-
-    # Check if the correct model structure is available
-    profile_lower = profile.lower()
-    if profile_lower not in profile_models:
-        error_msg = f"Error: Model for profile '{profile_lower}' not loaded."
-        logger.error(f"UI requested profile '{profile_lower}' but model not loaded.")
-        # Return original image with error drawn on it
-        try:
-            error_img_out = image_pil.copy()
-            draw_error = ImageDraw.Draw(error_img_out)
-            draw_error.text((10, 10), error_msg, fill="red", font=ImageFont.load_default())
-            return error_img_out, error_msg
-        except Exception: # Fallback if drawing on input fails
-            draw_placeholder.text((10, 10), error_msg, fill="red")
-            return placeholder_img, error_msg
-
-
-    model_to_use = profile_models[profile_lower]
-    name_map_to_use = profile_class_maps[profile_lower]
-
-    try:
-        # Ensure image is PIL Image and in RGB
-        if not isinstance(image_pil, Image.Image):
-            if isinstance(image_pil, np.ndarray):
-                image_pil = Image.fromarray(image_pil).convert("RGB")
-            else:
-                error_msg = "Error: Invalid image input type."
-                draw_placeholder.text((10, 10), error_msg, fill="red")
-                return placeholder_img, error_msg
-        else:
-            image_pil = image_pil.convert("RGB")
-
-        # Run detection using the pre-configured model
-        logger.info(f"Running YOLO-World detection (UI) with profile: {profile_lower}, confidence: {confidence}, iou: {iou}")
-        results = model_to_use.predict(image_pil, conf=confidence, iou=iou, verbose=False)
-
-        # Process results using the helper and the stored map
-        original_w, original_h = image_pil.width, image_pil.height
-        if results and results[0] and results[0].orig_shape:
-            original_h, original_w = results[0].orig_shape[:2]
-        detections = process_prediction_results(
-            results, original_w, original_h, name_map_to_use
-        )
-
-        # Draw boxes on a copy of the image for Gradio output
-        output_image = image_pil.copy()
-        draw = ImageDraw.Draw(output_image)
-        try:
-            font = ImageFont.truetype("arial.ttf", 15)
-        except IOError:
-            font = ImageFont.load_default()
-
-        labels = []
-        if not detections:
-            labels.append("No objects detected.")
-        else:
-            for det in detections:
-                box = det['box']
-                label = f"{det['class_name']}: {det['confidence']:.2f}"
-                labels.append(label)
-                color = "red"
-                draw.rectangle(
-                    [(box['x1'], box['y1']), (box['x2'], box['y2'])],
-                    outline=color, width=3
-                )
-                text_position = (box['x1'], box['y1'] - 15 if box['y1'] > 15 else box['y1'])
-                # Use textbbox for better background calculation
-                try:
-                    text_bbox = draw.textbbox(text_position, label, font=font)
-                    # Adjust background size slightly
-                    bg_coords = (text_bbox[0]-1, text_bbox[1]-1, text_bbox[2]+1, text_bbox[3]+1)
-                    draw.rectangle(bg_coords, fill=color)
-                    draw.text(text_position, label, fill="white", font=font)
-                except AttributeError: # Fallback for older Pillow versions without textbbox
-                    draw.text(text_position, label, fill=color, font=font)
-
-
-        logger.info(f"UI Detection Results: {labels}")
-        return output_image, "\n".join(labels)
-
-    except Exception as e:
-        error_msg = f"Error: {str(e)}"
-        logger.error(f"Error in detect_objects_ui: {e}", exc_info=True)
-        # Return original image with error message drawn
-        try:
-            error_img_out = image_pil.copy()
-            draw_error = ImageDraw.Draw(error_img_out)
-            draw_error.text((10, 10), error_msg, fill="red", font=ImageFont.load_default())
-            return error_img_out, error_msg
-        except Exception: # Fallback if drawing on input fails
-            draw_placeholder.text((10, 10), error_msg, fill="red")
-            return placeholder_img, error_msg
-
-
-# --- Create Gradio Interface ---
-# Add theme and descriptions
-theme = gr.themes.Soft() # Example theme
-
-with gr.Blocks(title="IPD-Lingual API", theme=theme) as demo:
-    gr.Markdown("# IPD-Lingual: Speech & Vision API")
-    gr.Markdown("An API providing speech transcription/translation and object detection capabilities.")
-
-    with gr.Tab("Home / About"):
-        gr.Markdown("## Welcome!")
-        gr.Markdown(
-            """
-            This application provides two main functionalities accessible via API endpoints and a demonstration UI:
-
-            1. **Speech Processing (`/api/speech`):**
-                * Accepts an audio file and two language codes (e.g., 'en', 'es').
-                * Uses **OpenAI Whisper (base model)** to transcribe the audio, automatically detecting which of the two provided languages is spoken.
-                * Uses the **googletrans library** (unofficial Google Translate API) to translate the transcribed text into the *other* provided language.
-                * Returns the detected language, original transcription, and translation.
-
-            2. **Object Detection (`/api/detect_objects`):**
-                * Accepts an image file, a detection profile (e.g., 'casual', 'vehicles'), optional extra object names, confidence threshold, and IoU threshold.
-                * Uses **YOLO-World (yolov8l-worldv2.pt)**, a powerful zero-shot object detection model from Ultralytics.
-                * It can detect objects based on predefined profiles or dynamically based on user-provided text prompts (extra words).
-                * Returns a list of detected objects with their bounding boxes, class names, and confidence scores.
-
-            Use the tabs above to try out the object detection functionality or see the API endpoint details below.
-            *(Note: The speech processing functionality is currently only available via the API endpoint).*
-            """
-        )
-        gr.Markdown("---")
-        gr.Markdown("### API Endpoint Summary")
-        gr.Markdown("- **POST `/api/speech`**: Transcribe and Translate audio.\n - **Type**: `multipart/form-data`\n - **Fields**: `audio` (file), `lang1` (string), `lang2` (string)")
-        gr.Markdown("- **POST `/api/detect_objects`**: Detect objects using YOLO-World.\n - **Type**: `multipart/form-data`\n - **Fields**: `image` (file), `profile` (string), `extra_words` (string, optional, comma-separated or JSON list), `confidence` (float, optional), `iou` (float, optional)")
-
-
-    # Keep the "Object Detection" Tab
-    with gr.Tab("Object Detection Demo"):
-        gr.Markdown("## Detect Objects in Image (using YOLO-World)")
-        gr.Markdown("Upload an image and select a detection profile. The model will identify objects belonging to that profile.")
-        with gr.Row():
-            with gr.Column(scale=1): # Input column slightly smaller
-                image_input = gr.Image(type="pil", label="Upload Image")
-                profile_select = gr.Dropdown(
-                    choices=sorted(list(PREDEFINED_CLASSES.keys())),
-                    value="casual",
-                    label="Detection Profile"
-                )
-                confidence_slider = gr.Slider(
-                    minimum=0.001, maximum=1.0, value=0.01, step=0.001,
-                    label="Confidence Threshold"
-                )
-                iou_slider = gr.Slider(
-                    minimum=0.01, maximum=1.0, value=0.2, step=0.01,
-                    label="IoU Threshold (NMS)"
-                )
-                detect_btn = gr.Button("Detect Objects", variant="primary") # Make button primary
-            with gr.Column(scale=2): # Output column larger
-                image_output = gr.Image(label="Detection Result", interactive=False) # Output not interactive
-                labels_output = gr.Textbox(label="Detected Objects", lines=10, interactive=False)
-
-        # Ensure the click event is correctly wired
-        detect_btn.click(
-            fn=detect_objects_ui,
-            inputs=[image_input, profile_select, confidence_slider, iou_slider],
-            outputs=[image_output, labels_output]
-        )
-
-# Mount both FastAPI and Gradio
-# Ensure the Gradio app uses the FastAPI instance `app`
-app = gr.mount_gradio_app(app, demo, path="/")
-
-# ... (rest of the file remains the same) ...
+# # Keep detect_objects_ui function
+# def detect_objects_ui(image_pil: Image.Image, profile: str, confidence: float, iou: float): # Add iou parameter
+#     """Gradio function for YOLO-World object detection."""
+#     # Create a placeholder image for errors or no input
+#     placeholder_img = Image.new('RGB', (640, 480), color = (150, 150, 150))
+#     draw_placeholder = ImageDraw.Draw(placeholder_img)
+
+#     if image_pil is None:
+#         draw_placeholder.text((10, 10), "Please upload an image.", fill=(255,255,255))
+#         return placeholder_img, "Please upload an image."
+
+#     # Check if the correct model structure is available
+#     profile_lower = profile.lower()
+#     if profile_lower not in profile_models:
+#         error_msg = f"Error: Model for profile '{profile_lower}' not loaded."
+#         logger.error(f"UI requested profile '{profile_lower}' but model not loaded.")
+#         # Return original image with error drawn on it
+#         try:
+#             error_img_out = image_pil.copy()
+#             draw_error = ImageDraw.Draw(error_img_out)
+#             draw_error.text((10, 10), error_msg, fill="red", font=ImageFont.load_default())
+#             return error_img_out, error_msg
+#         except Exception: # Fallback if drawing on input fails
+#             draw_placeholder.text((10, 10), error_msg, fill="red")
+#             return placeholder_img, error_msg
+
+
+#     model_to_use = profile_models[profile_lower]
+#     name_map_to_use = profile_class_maps[profile_lower]
+
+#     try:
+#         # Ensure image is PIL Image and in RGB
+#         if not isinstance(image_pil, Image.Image):
+#             if isinstance(image_pil, np.ndarray):
+#                 image_pil = Image.fromarray(image_pil).convert("RGB")
+#             else:
+#                 error_msg = "Error: Invalid image input type."
+#                 draw_placeholder.text((10, 10), error_msg, fill="red")
+#                 return placeholder_img, error_msg
+#         else:
+#             image_pil = image_pil.convert("RGB")
+
+#         # Run detection using the pre-configured model
+#         logger.info(f"Running YOLO-World detection (UI) with profile: {profile_lower}, confidence: {confidence}, iou: {iou}")
+#         results = model_to_use.predict(image_pil, conf=confidence, iou=iou, verbose=False)
+
+#         # Process results using the helper and the stored map
+#         original_w, original_h = image_pil.width, image_pil.height
+#         if results and results[0] and results[0].orig_shape:
+#             original_h, original_w = results[0].orig_shape[:2]
+#         detections = process_prediction_results(
+#             results, original_w, original_h, name_map_to_use
+#         )
+
+#         # Draw boxes on a copy of the image for Gradio output
+#         output_image = image_pil.copy()
+#         draw = ImageDraw.Draw(output_image)
+#         try:
+#             font = ImageFont.truetype("arial.ttf", 15)
+#         except IOError:
+#             font = ImageFont.load_default()
+
+#         labels = []
+#         if not detections:
+#             labels.append("No objects detected.")
+#         else:
+#             for det in detections:
+#                 box = det['box']
+#                 label = f"{det['class_name']}: {det['confidence']:.2f}"
+#                 labels.append(label)
+#                 color = "red"
+#                 draw.rectangle(
+#                     [(box['x1'], box['y1']), (box['x2'], box['y2'])],
+#                     outline=color, width=3
+#                 )
+#                 text_position = (box['x1'], box['y1'] - 15 if box['y1'] > 15 else box['y1'])
+#                 # Use textbbox for better background calculation
+#                 try:
+#                     text_bbox = draw.textbbox(text_position, label, font=font)
+#                     # Adjust background size slightly
+#                     bg_coords = (text_bbox[0]-1, text_bbox[1]-1, text_bbox[2]+1, text_bbox[3]+1)
+#                     draw.rectangle(bg_coords, fill=color)
+#                     draw.text(text_position, label, fill="white", font=font)
+#                 except AttributeError: # Fallback for older Pillow versions without textbbox
+#                     draw.text(text_position, label, fill=color, font=font)
+
+
+#         logger.info(f"UI Detection Results: {labels}")
+#         return output_image, "\n".join(labels)
+
+#     except Exception as e:
+#         error_msg = f"Error: {str(e)}"
+#         logger.error(f"Error in detect_objects_ui: {e}", exc_info=True)
+#         # Return original image with error message drawn
+#         try:
+#             error_img_out = image_pil.copy()
+#             draw_error = ImageDraw.Draw(error_img_out)
+#             draw_error.text((10, 10), error_msg, fill="red", font=ImageFont.load_default())
+#             return error_img_out, error_msg
+#         except Exception: # Fallback if drawing on input fails
+#             draw_placeholder.text((10, 10), error_msg, fill="red")
+#             return placeholder_img, error_msg
+
+
+# # --- Create Gradio Interface ---
+# # Add theme and descriptions
+# theme = gr.themes.Soft() # Example theme
+
+# with gr.Blocks(title="IPD-Lingual API", theme=theme) as demo:
+#     gr.Markdown("# IPD-Lingual: Speech & Vision API")
+#     gr.Markdown("An API providing speech transcription/translation and object detection capabilities.")
+
+#     with gr.Tab("Home / About"):
+#         gr.Markdown("## Welcome!")
+#         gr.Markdown(
+#             """
+#             This application provides two main functionalities accessible via API endpoints and a demonstration UI:
+
+#             1. **Speech Processing (`/api/speech`):**
+#                 * Accepts an audio file and two language codes (e.g., 'en', 'es').
+#                 * Uses **OpenAI Whisper (base model)** to transcribe the audio, automatically detecting which of the two provided languages is spoken.
+#                 * Uses the **googletrans library** (unofficial Google Translate API) to translate the transcribed text into the *other* provided language.
+#                 * Returns the detected language, original transcription, and translation.
+
+#             2. **Object Detection (`/api/detect_objects`):**
+#                 * Accepts an image file, a detection profile (e.g., 'casual', 'vehicles'), optional extra object names, confidence threshold, and IoU threshold.
+#                 * Uses **YOLO-World (yolov8l-worldv2.pt)**, a powerful zero-shot object detection model from Ultralytics.
+#                 * It can detect objects based on predefined profiles or dynamically based on user-provided text prompts (extra words).
+#                 * Returns a list of detected objects with their bounding boxes, class names, and confidence scores.
+
+#             Use the tabs above to try out the object detection functionality or see the API endpoint details below.
+#             *(Note: The speech processing functionality is currently only available via the API endpoint).*
+#             """
+#         )
+#         gr.Markdown("---")
+#         gr.Markdown("### API Endpoint Summary")
+#         gr.Markdown("- **POST `/api/speech`**: Transcribe and Translate audio.\n - **Type**: `multipart/form-data`\n - **Fields**: `audio` (file), `lang1` (string), `lang2` (string)")
+#         gr.Markdown("- **POST `/api/detect_objects`**: Detect objects using YOLO-World.\n - **Type**: `multipart/form-data`\n - **Fields**: `image` (file), `profile` (string), `extra_words` (string, optional, comma-separated or JSON list), `confidence` (float, optional), `iou` (float, optional)")
+
+
+#     # Keep the "Object Detection" Tab
+#     with gr.Tab("Object Detection Demo"):
+#         gr.Markdown("## Detect Objects in Image (using YOLO-World)")
+#         gr.Markdown("Upload an image and select a detection profile. The model will identify objects belonging to that profile.")
+#         with gr.Row():
+#             with gr.Column(scale=1): # Input column slightly smaller
+#                 image_input = gr.Image(type="pil", label="Upload Image")
+#                 profile_select = gr.Dropdown(
+#                     choices=sorted(list(PREDEFINED_CLASSES.keys())),
+#                     value="casual",
+#                     label="Detection Profile"
+#                 )
+#                 confidence_slider = gr.Slider(
+#                     minimum=0.001, maximum=1.0, value=0.01, step=0.001,
+#                     label="Confidence Threshold"
+#                 )
+#                 iou_slider = gr.Slider(
+#                     minimum=0.01, maximum=1.0, value=0.2, step=0.01,
+#                     label="IoU Threshold (NMS)"
+#                 )
+#                 detect_btn = gr.Button("Detect Objects", variant="primary") # Make button primary
+#             with gr.Column(scale=2): # Output column larger
+#                 image_output = gr.Image(label="Detection Result", interactive=False) # Output not interactive
+#                 labels_output = gr.Textbox(label="Detected Objects", lines=10, interactive=False)
+
+#         # Ensure the click event is correctly wired
+#         detect_btn.click(
+#             fn=detect_objects_ui,
+#             inputs=[image_input, profile_select, confidence_slider, iou_slider],
+#             outputs=[image_output, labels_output]
+#         )
+
+#     # Mount both FastAPI and Gradio
+#     # Ensure the Gradio app uses the FastAPI instance `app`
+#     app = gr.mount_gradio_app(app, demo, path="/")
+
+# # ... (rest of the file remains the same) ...
+
+# if __name__ == "__main__":
+#     import uvicorn
+#     # Check if YOLO models initialized before starting server
+#     # Update check to use the new model variables
+#     if not profile_models or dynamic_model is None:
+#         logger.error(f"CRITICAL: One or more YOLO-World models ({MODEL_NAME}) failed to initialize. API endpoint /api/detect_objects might not work correctly.")
+#         # Decide if you want to exit or run with degraded functionality
+#         # exit(1) # Optional: exit if model loading fails
+#     else:
+#         logger.info("All required YOLO models initialized successfully.")
+
+#     print("Starting Uvicorn server on http://0.0.0.0:7860")
+#     uvicorn.run(app, host="0.0.0.0", port=7860)
 
 if __name__ == "__main__":
     import uvicorn
-    # Check if YOLO models initialized before starting server
-    # Update check to use the new model variables
     if not profile_models or dynamic_model is None:
-        logger.error(f"CRITICAL: One or more YOLO-World models ({MODEL_NAME}) failed to initialize. API endpoint /api/detect_objects might not work correctly.")
-        # Decide if you want to exit or run with degraded functionality
-        # exit(1) # Optional: exit if model loading fails
+        logger.error(f"CRITICAL: One or more YOLO-World models failed to initialize. API endpoint /api/detect_objects might not work correctly.")
     else:
         logger.info("All required YOLO models initialized successfully.")
-
     print("Starting Uvicorn server on http://0.0.0.0:7860")
-    uvicorn.run(app, host="0.0.0.0", port=7860)
+    uvicorn.run(app, host="0.0.0.0", port=7860)
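For reference, here is a minimal sketch of calling the two endpoints listed in the API Endpoint Summary above. The base URL, file names, and parameter values are illustrative assumptions; only the endpoint paths and form field names come from the summary in the diff.

```python
# Hypothetical client calls; assumes the server is running locally on port 7860
# and that sample.jpg / sample.wav exist (placeholder file names).
import requests

BASE = "http://localhost:7860"

# POST /api/detect_objects: multipart/form-data with an image plus optional thresholds.
with open("sample.jpg", "rb") as f:
    resp = requests.post(
        f"{BASE}/api/detect_objects",
        files={"image": f},
        data={"profile": "casual", "confidence": "0.25", "iou": "0.5"},
    )
print(resp.json())  # expected: bounding boxes, class names, confidence scores

# POST /api/speech: audio file plus the two candidate language codes.
with open("sample.wav", "rb") as f:
    resp = requests.post(
        f"{BASE}/api/speech",
        files={"audio": f},
        data={"lang1": "en", "lang2": "es"},
    )
print(resp.json())  # expected: detected language, transcription, translation
```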
requirements.txt CHANGED
@@ -1,20 +1,24 @@
-fastapi==0.110.0
-uvicorn==0.29.0
-gradio==3.50.2
-Pillow==10.2.0
-numpy==1.26.4
-openai-whisper==20240930
-ultralytics==8.0.203
-torch==2.2.1
-pydantic==1.10.13
-python-multipart==0.0.9
-opencv-python==4.9.0.80
-transformers==4.39.3
-sentencepiece==0.1.99
-scikit-image==0.25.2
-scipy==1.15.2
+pip>=23.0
+setuptools>=68.0
+wheel
+fastapi>=0.100
+uvicorn>=0.34.0
+# gradio==5.29.0
+Pillow>=11.0.0
+numpy==2.2.5
+openai-whisper==20240927
+ultralytics>=8.3.40
+torch>=2.6.0
+pydantic>=2.11.3
+python-multipart>=0.0.20
+opencv-python>=4.10.0.84
+transformers>=4.51.3
+sentencepiece>=0.2.0
+scikit-image>=0.25.2
+scipy>=1.15.2
 googletrans==4.0.2
 httpcore==1.0.9
-inference-gpu[yolo-world]==0.48.1
+roboflow==1.1.63
+# inference-gpu[yolo-world]==0.48.1  # Commented out due to numpy version conflicts
 git+https://github.com/ultralytics/CLIP.git
-git+https://github.com/roboflow/roboflow-python.git
+
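Since the commit message points at dependency trouble, a quick sanity check of an installed environment against these pins is to read back the resolved versions. A minimal sketch; the package list here is illustrative, not exhaustive:

```python
# Print the installed version of a few packages pinned in requirements.txt.
from importlib.metadata import PackageNotFoundError, version

for pkg in ("numpy", "torch", "ultralytics", "pydantic", "googletrans"):
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg} is not installed")
```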
runtime.txt ADDED
@@ -0,0 +1 @@
+python-3.11
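runtime.txt pins the interpreter version for the hosting platform. A one-line check that the active interpreter matches the pin (an illustrative assertion, not part of the commit):

```python
import sys

assert sys.version_info[:2] == (3, 11), f"expected Python 3.11, got {sys.version}"
```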