import argparse import os import random from collections import defaultdict import cv2 import re import math import numpy as np from PIL import Image import torch import html import gradio as gr import torchvision.transforms as T import torch.backends.cudnn as cudnn from geochat.conversation import conv_templates, Chat from geochat.model.builder import load_pretrained_model from geochat.mm_utils import get_model_name_from_path def parse_args(): parser = argparse.ArgumentParser(description="Demo") # parser = argparse.ArgumentParser() parser.add_argument("--model-path", type=str, default="facebook/opt-350m") parser.add_argument("--model-base", type=str, default=None) parser.add_argument("--gpu-id", type=str,default=0) parser.add_argument("--device", type=str, default="cuda") parser.add_argument("--conv-mode", type=str, default=None) parser.add_argument("--max-new-tokens", type=int, default=300) parser.add_argument("--load-8bit", action="store_true") parser.add_argument("--load-4bit", action="store_true") parser.add_argument("--debug", action="store_true") parser.add_argument("--image-aspect-ratio", type=str, default='pad') # args = parser.parse_args() args = parser.parse_args() return args random.seed(42) np.random.seed(42) torch.manual_seed(42) cudnn.benchmark = False cudnn.deterministic = True print('Initializing Chat') args = parse_args() # cfg = Config(args) model_name = get_model_name_from_path(args.model_path) print(model_name) tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit, device=args.device) device = 'cuda:{}'.format(args.gpu_id) # model_config = cfg.model_cfg # model_config.device_8bit = args.gpu_id # model_cls = registry.get_model_class(model_config.arch) # model = model_cls.from_config(model_config).to(device) bounding_box_size = 100 # vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train # vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg) model = model.eval() CONV_VISION = conv_templates['llava_v1'].copy() def bbox_and_angle_to_polygon(x1, y1, x2, y2, a): # Calculate center coordinates x_ctr = (x1 + x2) / 2 y_ctr = (y1 + y2) / 2 # Calculate width and height w = abs(x2 - x1) h = abs(y2 - y1) # Calculate the angle in radians angle_rad = math.radians(a) # Calculate coordinates of the four corners of the rotated bounding box cos_a = math.cos(angle_rad) sin_a = math.sin(angle_rad) x1_rot = cos_a * (-w / 2) - sin_a * (-h / 2) + x_ctr y1_rot = sin_a * (-w / 2) + cos_a * (-h / 2) + y_ctr x2_rot = cos_a * (w / 2) - sin_a * (-h / 2) + x_ctr y2_rot = sin_a * (w / 2) + cos_a * (-h / 2) + y_ctr x3_rot = cos_a * (w / 2) - sin_a * (h / 2) + x_ctr y3_rot = sin_a * (w / 2) + cos_a * (h / 2) + y_ctr x4_rot = cos_a * (-w / 2) - sin_a * (h / 2) + x_ctr y4_rot = sin_a * (-w / 2) + cos_a * (h / 2) + y_ctr # Return the polygon coordinates polygon_coords = np.array((x1_rot, y1_rot, x2_rot, y2_rot, x3_rot, y3_rot, x4_rot, y4_rot)) return polygon_coords def rotate_bbox(top_right, bottom_left, angle_degrees): # Convert angle to radians angle_radians = np.radians(angle_degrees) # Calculate the center of the rectangle center = ((top_right[0] + bottom_left[0]) / 2, (top_right[1] + bottom_left[1]) / 2) # Calculate the width and height of the rectangle width = top_right[0] - bottom_left[0] height = top_right[1] - bottom_left[1] # Create a rotation matrix rotation_matrix = cv2.getRotationMatrix2D(center, angle_degrees, 1) # Create an array of the rectangle corners rectangle_points = np.array([[bottom_left[0], bottom_left[1]], [top_right[0], bottom_left[1]], [top_right[0], top_right[1]], [bottom_left[0], top_right[1]]], dtype=np.float32) # Rotate the rectangle points rotated_rectangle = cv2.transform(np.array([rectangle_points]), rotation_matrix)[0] return rotated_rectangle def extract_substrings(string): # first check if there is no-finished bracket index = string.rfind('}') if index != -1: string = string[:index + 1] pattern = r'
(.*?)\}(?!<)' matches = re.findall(pattern, string) substrings = [match for match in matches] return substrings def is_overlapping(rect1, rect2): x1, y1, x2, y2 = rect1 x3, y3, x4, y4 = rect2 return not (x2 < x3 or x1 > x4 or y2 < y3 or y1 > y4) def computeIoU(bbox1, bbox2): x1, y1, x2, y2 = bbox1 x3, y3, x4, y4 = bbox2 intersection_x1 = max(x1, x3) intersection_y1 = max(y1, y3) intersection_x2 = min(x2, x4) intersection_y2 = min(y2, y4) intersection_area = max(0, intersection_x2 - intersection_x1 + 1) * max(0, intersection_y2 - intersection_y1 + 1) bbox1_area = (x2 - x1 + 1) * (y2 - y1 + 1) bbox2_area = (x4 - x3 + 1) * (y4 - y3 + 1) union_area = bbox1_area + bbox2_area - intersection_area iou = intersection_area / union_area return iou def save_tmp_img(visual_img): file_name = "".join([str(random.randint(0, 9)) for _ in range(5)]) + ".jpg" file_path = "/tmp/gradio" + file_name visual_img.save(file_path) return file_path def mask2bbox(mask): if mask is None: return '' mask = mask.resize([100, 100], resample=Image.NEAREST) mask = np.array(mask)[:, :, 0] rows = np.any(mask, axis=1) cols = np.any(mask, axis=0) if rows.sum(): # Get the top, bottom, left, and right boundaries rmin, rmax = np.where(rows)[0][[0, -1]] cmin, cmax = np.where(cols)[0][[0, -1]] bbox = '{{<{}><{}><{}><{}>}}'.format(cmin, rmin, cmax, rmax) else: bbox = '' return bbox def escape_markdown(text): # List of Markdown special characters that need to be escaped md_chars = ['<', '>'] # Escape each special character for char in md_chars: text = text.replace(char, '\\' + char) return text def reverse_escape(text): md_chars = ['\\<', '\\>'] for char in md_chars: text = text.replace(char, char[1:]) return text colors = [ (255, 0, 0), (0, 255, 0), (0, 0, 255), (210, 210, 0), (255, 0, 255), (0, 255, 255), (114, 128, 250), (0, 165, 255), (0, 128, 0), (144, 238, 144), (238, 238, 175), (255, 191, 0), (0, 128, 0), (226, 43, 138), (255, 0, 255), (0, 215, 255), ] color_map = { f"{color_id}": f"#{hex(color[2])[2:].zfill(2)}{hex(color[1])[2:].zfill(2)}{hex(color[0])[2:].zfill(2)}" for color_id, color in enumerate(colors) } used_colors = colors def visualize_all_bbox_together(image, generation): if image is None: return None, '' generation = html.unescape(generation) image_width, image_height = image.size image = image.resize([500, int(500 / image_width * image_height)]) image_width, image_height = image.size string_list = extract_substrings(generation) if string_list: # it is grounding or detection mode = 'all' entities = defaultdict(list) i = 0 j = 0 for string in string_list: try: obj, string = string.split('
') except ValueError: print('wrong string: ', string) continue if "}{" in string: string=string.replace("}{","}(.*?)
', colored_phrases, generation) else: generation_colored = '' pil_image = Image.fromarray(new_image) return pil_image, generation_colored def gradio_reset(chat_state, img_list): if chat_state is not None: chat_state.messages = [] if img_list is not None: img_list = [] return None, gr.update(value=None, interactive=True), gr.update(placeholder='Upload your image and chat', interactive=True), chat_state, img_list def image_upload_trigger(upload_flag, replace_flag, img_list): # set the upload flag to true when receive a new image. # if there is an old image (and old conversation), set the replace flag to true to reset the conv later. upload_flag = 1 if img_list: replace_flag = 1 return upload_flag, replace_flag def example_trigger(text_input, image, upload_flag, replace_flag, img_list): # set the upload flag to true when receive a new image. # if there is an old image (and old conversation), set the replace flag to true to reset the conv later. upload_flag = 1 if img_list or replace_flag == 1: replace_flag = 1 return upload_flag, replace_flag def gradio_ask(user_message, chatbot, chat_state, gr_img, img_list, upload_flag, replace_flag): if len(user_message) == 0: text_box_show = 'Input should not be empty!' else: text_box_show = '' if isinstance(gr_img, dict): gr_img, mask = gr_img['image'], gr_img['mask'] else: mask = None if '[identify]' in user_message: # check if user provide bbox in the text input integers = re.findall(r'-?\d+', user_message) if len(integers) != 4: # no bbox in text bbox = mask2bbox(mask) user_message = user_message + bbox if chat_state is None: chat_state = CONV_VISION.copy() if upload_flag: if replace_flag: chat_state = CONV_VISION.copy() # new image, reset everything replace_flag = 0 chatbot = [] img_list = [] llm_message = chat.upload_img(gr_img, chat_state, img_list) upload_flag = 0 chat.ask(user_message, chat_state) chatbot = chatbot + [[user_message, None]] if '[identify]' in user_message: visual_img, _ = visualize_all_bbox_together(gr_img, user_message) if visual_img is not None: file_path = save_tmp_img(visual_img) chatbot = chatbot + [[(file_path,), None]] return text_box_show, chatbot, chat_state, img_list, upload_flag, replace_flag # def gradio_answer(chatbot, chat_state, img_list, temperature): # llm_message = chat.answer(conv=chat_state, # img_list=img_list, # temperature=temperature, # max_new_tokens=500, # max_length=2000)[0] # chatbot[-1][1] = llm_message # return chatbot, chat_state def gradio_stream_answer(chatbot, chat_state, img_list, temperature): if len(img_list) > 0: if not isinstance(img_list[0], torch.Tensor): chat.encode_img(img_list) streamer = chat.stream_answer(conv=chat_state, img_list=img_list, temperature=temperature, max_new_tokens=500, max_length=2000) # chatbot[-1][1] = output # chat_state.messages[-1][1] = '' output = '' for new_output in streamer: # print(new_output) output=output+new_output print(output) # if "{" in output: # chatbot[-1][1]="Grounding and referring expression is still under work." # else: output = escape_markdown(output) # output += escapped chatbot[-1][1] = output yield chatbot, chat_state chat_state.messages[-1][1] = '' return chatbot, chat_state def gradio_visualize(chatbot, gr_img): if isinstance(gr_img, dict): gr_img, mask = gr_img['image'], gr_img['mask'] unescaped = reverse_escape(chatbot[-1][1]) visual_img, generation_color = visualize_all_bbox_together(gr_img, unescaped) if visual_img is not None: if len(generation_color): chatbot[-1][1] = generation_color file_path = save_tmp_img(visual_img) chatbot = chatbot + [[None, (file_path,)]] return chatbot def gradio_taskselect(idx): prompt_list = [ '', 'Classify the image in the following classes: ', '[identify] what is this ', ] instruct_list = [ '**Hint:** Type in whatever you want', '**Hint:** Type in the classes you want the model to classify in', '**Hint:** Draw a bounding box on the uploaded image then send the command. Click the "clear" botton on the top right of the image before redraw', ] return prompt_list[idx], instruct_list[idx] chat = Chat(model, image_processor,tokenizer, device=device) title = """