Spaces:

MonilM
/

Lingual

Running

App Files Files Community

Lingual / detection_utils.py

MonilM

Jhol

9053779 3 months ago

raw

history blame contribute delete

12.3 kB

	import numpy as np
	from PIL import Image
	from typing import List, Dict, Any, Set, Tuple # Add Tuple
	import os
	import tempfile # Restored
	from googletrans import Translator # Restored
	import cv2
	from inference.models.yolo_world.yolo_world import YOLOWorld
	import onnxruntime as ort
	import requests
	import random # Restored
	# import json # Removed, was added for the direct API call

	# Patch ONNXRuntime so every InferenceSession created after this point,
	# by any module in the process, is restricted to the CPU execution provider.
	original_inference_session = ort.InferenceSession


	def patched_inference_session(*args, **kwargs):
	    """Create an ONNX Runtime session that is forced onto the CPU provider.

	    Drop-in replacement for ``ort.InferenceSession``: all positional and
	    keyword arguments are forwarded unchanged to the original constructor,
	    except that the ``providers`` keyword is always overwritten with
	    ``["CPUExecutionProvider"]`` (any value supplied by the caller is
	    discarded).

	    Returns:
	        Whatever the original ``ort.InferenceSession`` callable returns.
	    """
	    # kwargs must be a real dict (``**kwargs``) for this assignment to work.
	    kwargs["providers"] = ["CPUExecutionProvider"]
	    return original_inference_session(*args, **kwargs)


	ort.InferenceSession = patched_inference_session
	import warnings
	# Hide UserWarnings raised from the onnxruntime module.
	warnings.filterwarnings("ignore", category=UserWarning, module="onnxruntime")
	# Define Predefined Class Lists for YOLO-World
	# Keyed by profile name ("tourist", "casual", "kids"). One YOLOWorld model is
	# preloaded per key (see YOLOWORLD_MODELS) and the matching list is handed to
	# that model's set_classes(), so every string here is a detection prompt.
	PREDEFINED_CLASSES = {
	# Travel / sightseeing vocabulary.
	"tourist": [
	"person", "car", "bus", "train", "truck", "boat", "traffic light",
	"fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
	"dog", "backpack", "umbrella", "handbag", "tie", "suitcase", "building",
	"signboard", "taxi", "rickshaw", "camera", "map", "monument", "souvenir",
	"statue", "fountain", "street sign", "tour guide", "hotel", "restaurant",
	# Added more tourist/India-specific classes
	"temple", "mosque", "church", "fort", "palace", "museum", "market", "bazaar",
	"auto rickshaw", "cycle rickshaw", "metro", "heritage site",
	"ticket counter", "luggage", "water bottle", "scarf", "hat","bus stop",
	"information center", "shopping bag", "vendor", "street food", "food stall",
	"hawker", "street performer", "camel", "elephant ride", "tour bus", "minaret",
	"gopuram", "chhatri", "ghat", "river", "lake", "bridge", "park", "garden",

	],
	# Everyday objects plus road and office items.
	"casual": [
	"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train",
	"truck", "boat", "traffic light", "fire hydrant", "stop sign",
	"parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
	"elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
	"handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
	"sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
	"surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork",
	"knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
	"broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
	"couch", "potted plant", "bed", "dining table", "toilet", "tv",
	"laptop", "mouse", "tv remote","remote control", "keyboard", "cell phone", "microwave",
	"oven", "toaster", "book", "clock",
	"scissors", "teddy bear", "toothbrush", "tree", "flower", "park",
	"computer", "desk", "window", "door",
	# Added Indian road/office-specific classes
	"auto rickshaw", "cycle rickshaw", "scooter", "tempo", "tractor", "e-rickshaw",
	"delivery van", "ambulance", "police car", "roadside stall", "food cart",
	"street vendor", "helmet", "road sign", "speed breaker",
	"divider", "pothole", "bus stop", "petrol pump",
	"water dispenser", "printer", "file cabinet", "whiteboard", "projector",
	"security guard", "id card", "notice board",
	"elevator", "staircase", "canteen", "cafeteria", "tea cup", "tiffin box",
	"lunch box", "stationery", "pen", "notebook", "marker", "mouse pad"
	],
	# Objects, colours, shapes and toys.
	# NOTE(review): "orange" is listed twice in this list (once among the foods,
	# once among the colours) - confirm a duplicated prompt is intended.
	"kids": [
	"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train",
	"truck", "boat", "bird", "cat", "dog", "horse", "sheep", "cow",
	"elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
	"handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
	"sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
	"surfboard", "tennis racket", "bottle", "banana", "apple", "sandwich",
	"orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
	"chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv",
	"laptop", "mouse", "remote", "keyboard", "cell phone", "book", "clock",
	"scissors", "teddy bear", "hair drier", "toothbrush", "red", "blue",
	"green", "yellow", "orange", "purple", "pink", "black", "white", "gray",
	"brown", "circle", "square", "triangle", "rectangle", "star", "heart",
	"ball", "block", "toy", "doll", "crayon", "slide", "swing", "duck", "lion",
	"tiger", "monkey", "moon", "sun", "cloud", "rainbow",
	# Added geometric solids
	"cylinder", "rectangular prism", "pyramid", "cube", "cone", "sphere", "triangular prism"
	]
	}

	# --- Synonym Mapping ---
	# Known synonyms for potentially ambiguous terms.
	# Keys are the terms users might input; values are alternative terms the
	# model might recognize better.
	SYNONYM_MAP = {
	    "rickshaw": ["tuk-tuk", "auto rickshaw"],
	    # Add more synonyms as needed, e.g.:
	    "motorbike": ["motorcycle"],
	    "automobile": ["car"],
	}

	# Reverse map: synonym -> the SYNONYM_MAP key it was listed under.
	# If a synonym is listed under several keys, the last key in SYNONYM_MAP
	# iteration order wins.
	ORIGINAL_TERM_MAP = {
	    synonym: original
	    for original, synonyms in SYNONYM_MAP.items()
	    for synonym in synonyms
	}

	def expand_synonyms(class_list: List[str]) -> List[str]:
	    """Return ``class_list`` plus the synonyms of its terms, sorted and de-duplicated.

	    Args:
	        class_list: Class names to expand. Any iterable of strings works; it
	            is consumed exactly once.

	    Returns:
	        A new alphabetically sorted list holding every input term and, for
	        each term that is a key of ``SYNONYM_MAP``, all of its synonyms.
	        Duplicates are removed. The input is not modified.
	    """
	    # Materialise first: the terms are needed both to seed the set and to
	    # look up synonyms, and a generator could only be walked once.
	    terms = list(class_list)
	    expanded = set(terms)
	    for term in terms:
	        expanded.update(SYNONYM_MAP.get(term, ()))
	    return sorted(expanded)
	# --- End Synonym Mapping ---

	# --- YOLO-World Model Setup (Local, Preloaded) ---
	# Runs at import time: builds one "yolo_world/l" model per profile in
	# PREDEFINED_CLASSES and configures it with that profile's class list.
	YOLOWORLD_MODELS = {}
	for profile, profile_classes in PREDEFINED_CLASSES.items():
	    profile_model = YOLOWorld(model_id="yolo_world/l")
	    YOLOWORLD_MODELS[profile] = profile_model
	    profile_model.set_classes(profile_classes)

	# Install a randomly chosen proxy on requests.Session. The attribute is set on
	# the class itself, so the assignment is process-wide, not limited to this module.
	def patch_requests_with_proxy():
	    """Pick a random proxy from ``proxies.txt`` and assign it to ``requests.Session.proxies``.

	    ``proxies.txt`` is looked up in the same directory as this module and is
	    read as one proxy per line; blank lines are ignored. An entry that already
	    contains a scheme (``scheme://...``) is used verbatim, otherwise
	    ``http://`` is prepended. The chosen URL is used for both the ``"http"``
	    and ``"https"`` keys.

	    This is best effort: if the file is missing, unreadable, or contains no
	    entries, nothing is changed and no error is raised.

	    Returns:
	        None.
	    """
	    proxies_path = os.path.join(os.path.dirname(__file__), "proxies.txt")
	    try:
	        with open(proxies_path, "r", encoding="utf-8") as f:
	            proxies = [line.strip() for line in f if line.strip()]
	    except (OSError, UnicodeError):
	        # No usable proxy list: leave requests untouched.
	        return
	    if not proxies:
	        return

	    proxy = random.choice(proxies)
	    # Avoid producing "http://http://host:port" for entries that carry a scheme.
	    proxy_url = proxy if "://" in proxy else f"http://{proxy}"
	    # NOTE(review): requests.Session.__init__ appears to assign a fresh
	    # per-instance ``proxies`` dict, which would shadow this class attribute on
	    # every new Session - confirm the proxy is actually picked up.
	    requests.Session.proxies = {
	        "http": proxy_url,
	        "https": proxy_url,
	    }

	# --- Google Translate Helper (inline, replaces missing translate_text) ---
	def translate_text(text, dest_lang):
	    """Translate ``text`` into ``dest_lang`` with googletrans, falling back to ``text``.

	    A proxy is (re)selected via ``patch_requests_with_proxy()`` before every
	    call. If constructing the translator, performing the translation, or
	    reading the result's ``.text`` raises any ``Exception``, the original
	    ``text`` is returned unchanged.
	    """
	    patch_requests_with_proxy()  # choose a random proxy for this call
	    try:
	        client = Translator(service_urls=['translate.googleapis.com'])
	        translated = client.translate(text, dest=dest_lang).text
	    except Exception:
	        # Best effort: hand back the untranslated input on any failure.
	        return text
	    else:
	        return translated

	# --- Helper function to process YOLO-World prediction results ---
	def process_yoloworld_results(predictions, original_w: int, original_h: int, scale: float, pad_top: int, pad_left: int, class_filter=None, target_language="en") -> List[Dict[str, Any]]:
	    """Convert YOLO-World predictions into detection dicts in original-image space.

	    Each prediction is expected to expose ``class_name``, ``x``, ``y``
	    (box centre), ``width``, ``height`` and ``confidence``, with coordinates
	    expressed on the resized-and-padded image. The padding offset is removed,
	    the scale is undone, and the resulting box is clipped to the original
	    image bounds.

	    Args:
	        predictions: Iterable of prediction objects as described above.
	        original_w: Width of the original image in pixels.
	        original_h: Height of the original image in pixels.
	        scale: Factor the original image was multiplied by before padding.
	        pad_top: Padding rows added above the resized image.
	        pad_left: Padding columns added left of the resized image.
	        class_filter: Optional container of class names to keep. ``None`` or
	            an empty container keeps every prediction.
	        target_language: Language code for ``label``. ``None``, empty, or
	            ``"en"`` (case-insensitive) leaves the English name untranslated.

	    Returns:
	        A list of dicts with keys ``"box"`` (``[x, y, width, height]`` ints,
	        top-left based), ``"confidence"`` (float), ``"label"`` (translated or
	        English name), ``"label_en"`` (English name) and ``"centre"``
	        (``[x, y]`` ints of the clipped box centre).
	    """
	    needs_translation = bool(target_language) and target_language.lower() != "en"
	    # One translate_text() call per distinct class name instead of one per box.
	    translated_labels: Dict[str, str] = {}

	    detections = []
	    for pred in predictions:
	        class_name = pred.class_name  # English class name reported by the model
	        if class_filter and class_name not in class_filter:
	            continue

	        # 1. Un-pad and 2. un-scale the box centre; sizes only need un-scaling.
	        center_x = (float(pred.x) - pad_left) / scale
	        center_y = (float(pred.y) - pad_top) / scale
	        box_width = float(pred.width) / scale
	        box_height = float(pred.height) / scale

	        # Corners of the unclipped box in original-image coordinates.
	        raw_x1 = center_x - (box_width / 2)
	        raw_y1 = center_y - (box_height / 2)
	        raw_x2 = raw_x1 + box_width
	        raw_y2 = raw_y1 + box_height

	        # Clip each corner independently. The far corner must come from the
	        # unclipped near corner, otherwise a box overhanging the left/top edge
	        # is stretched right/down by the overhang.
	        x1 = max(0, min(raw_x1, original_w))
	        y1 = max(0, min(raw_y1, original_h))
	        x2 = max(0, min(raw_x2, original_w))
	        y2 = max(0, min(raw_y2, original_h))

	        final_width = x2 - x1
	        final_height = y2 - y1
	        final_center_x = x1 + final_width / 2
	        final_center_y = y1 + final_height / 2

	        if needs_translation:
	            if class_name not in translated_labels:
	                translated_labels[class_name] = translate_text(class_name, target_language)
	            label = translated_labels[class_name]
	        else:
	            label = class_name

	        detections.append({
	            "box": [int(x1), int(y1), int(final_width), int(final_height)],
	            "confidence": float(pred.confidence),
	            "label": label,  # translated or original label
	            "label_en": class_name,  # always the original (English) label
	            "centre": [int(final_center_x), int(final_center_y)],
	        })
	    return detections

	# --- YOLO-World Inference Function ---
	def run_yoloworld_detection(image: Image.Image, target_classes: set, confidence_threshold: float = 0.1, iou_threshold: float = 0.4, profile: str = "casual", target_language: str = "en"):
	    """Run YOLO-World on ``image`` and return detections in original-image coordinates.

	    The image is resized (aspect ratio preserved) so its longer side is 640
	    pixels, centred on a 640x640 canvas with grey padding, and passed to the
	    model preloaded for ``profile``. Predictions are then mapped back to the
	    original image and filtered by ``process_yoloworld_results``.

	    Args:
	        image: Input PIL image. Any mode is accepted; it is converted to RGB.
	        target_classes: Class names to keep. An empty set keeps everything,
	            because the filter is only applied when it is non-empty.
	        confidence_threshold: Forwarded to the model as ``confidence``.
	        iou_threshold: Forwarded to the model as ``iou``.
	        profile: Key into ``YOLOWORLD_MODELS``. An unknown profile falls back
	            to the first preloaded model.
	        target_language: Language code for the translated ``label`` field.

	    Returns:
	        The list of detection dicts built by ``process_yoloworld_results``.
	    """
	    model = YOLOWORLD_MODELS.get(profile, list(YOLOWORLD_MODELS.values())[0])

	    # Convert PIL Image to OpenCV (BGR) format. Normalise to RGB first so
	    # grayscale, palette and alpha-channel images yield a 3-channel array.
	    image_cv = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)

	    # --- Image Resizing (maintaining aspect ratio) ---
	    target_size = 640
	    h, w = image_cv.shape[:2]
	    scale = target_size / max(h, w)
	    # Keep at least one pixel per side: int() would truncate the short side of
	    # a very elongated image to 0, which cv2.resize rejects.
	    new_w, new_h = max(1, int(w * scale)), max(1, int(h * scale))

	    resized_image_cv = cv2.resize(image_cv, (new_w, new_h), interpolation=cv2.INTER_AREA)

	    # Pad to target_size x target_size, splitting the slack evenly per axis.
	    delta_w = target_size - new_w
	    delta_h = target_size - new_h
	    top, bottom = delta_h // 2, delta_h - (delta_h // 2)
	    left, right = delta_w // 2, delta_w - (delta_w // 2)

	    padded_image_cv = cv2.copyMakeBorder(resized_image_cv, top, bottom, left, right,
	                                         cv2.BORDER_CONSTANT, value=[114, 114, 114])  # Use a neutral padding color

	    # Force CPU provider for inference (required for Hugging Face free tier)
	    results = model.infer(padded_image_cv, confidence=confidence_threshold, iou=iou_threshold, providers=["CPUExecutionProvider"])

	    # Predictions are relative to padded_image_cv; the helper removes the
	    # padding (top/left) and the scale to return original-image coordinates.
	    detections = process_yoloworld_results(results.predictions, w, h, scale, top, left, class_filter=target_classes, target_language=target_language)
	    return detections