From d8730331f1bb8a4f595d1fbf101b269774051d19 Mon Sep 17 00:00:00 2001
From: Adam Kucera
Date: Fri, 25 Oct 2024 14:34:33 -0600
Subject: [PATCH] nit: update spelling

---
 demo.ipynb     |  8 ++++----
 gradio_demo.py |  8 ++++----
 omniparser.py  | 10 +++++-----
 utils.py       | 10 +++++-----
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/demo.ipynb b/demo.ipynb
index f78b52e..39e14f0 100644
--- a/demo.ipynb
+++ b/demo.ipynb
@@ -458,7 +458,7 @@
     "        'text_padding': 3,\n",
     "        'thickness': 3,\n",
     "    }\n",
-    "    BOX_TRESHOLD = 0.05\n",
+    "    BOX_THRESHOLD = 0.05\n",
     "elif platform == 'web':\n",
     "    draw_bbox_config = {\n",
     "        'text_scale': 0.8,\n",
@@ -466,7 +466,7 @@
     "        'text_padding': 3,\n",
     "        'thickness': 3,\n",
     "    }\n",
-    "    BOX_TRESHOLD = 0.05\n",
+    "    BOX_THRESHOLD = 0.05\n",
     "elif platform == 'mobile':\n",
     "    draw_bbox_config = {\n",
     "        'text_scale': 0.8,\n",
@@ -474,14 +474,14 @@
     "        'text_padding': 3,\n",
     "        'thickness': 3,\n",
     "    }\n",
-    "    BOX_TRESHOLD = 0.05\n",
+    "    BOX_THRESHOLD = 0.05\n",
     "image = Image.open(image_path)\n",
     "image_rgb = image.convert('RGB')\n",
     "\n",
     "ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9})\n",
     "text, ocr_bbox = ocr_bbox_rslt\n",
     "\n",
-    "dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, som_model, BOX_TRESHOLD = BOX_TRESHOLD, output_coord_in_ratio=False, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,use_local_semantics=False)\n",
+    "dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, som_model, BOX_THRESHOLD = BOX_THRESHOLD, output_coord_in_ratio=False, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,use_local_semantics=False)\n",
     "\n",
     "\n"
    ]
diff --git a/gradio_demo.py b/gradio_demo.py
index 5522f10..7d4d6f3 100644
--- a/gradio_demo.py
+++ b/gradio_demo.py
@@ -22,7 +22,7 @@
         'text_padding': 2,
         'thickness': 2,
     }
-    BOX_TRESHOLD = 0.05
+    BOX_THRESHOLD = 0.05
 elif platform == 'web':
     draw_bbox_config = {
         'text_scale': 0.8,
@@ -30,7 +30,7 @@
         'text_padding': 3,
         'thickness': 3,
     }
-    BOX_TRESHOLD = 0.05
+    BOX_THRESHOLD = 0.05
 elif platform == 'mobile':
     draw_bbox_config = {
         'text_scale': 0.8,
@@ -38,7 +38,7 @@
         'text_padding': 3,
         'thickness': 3,
     }
-    BOX_TRESHOLD = 0.05
+    BOX_THRESHOLD = 0.05
 
 
 
@@ -70,7 +70,7 @@ def process(
     ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_save_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9})
     text, ocr_bbox = ocr_bbox_rslt
     print('prompt:', prompt)
-    dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_save_path, yolo_model, BOX_TRESHOLD = BOX_TRESHOLD, output_coord_in_ratio=True, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,iou_threshold=0.3,prompt=prompt)
+    dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_save_path, yolo_model, BOX_THRESHOLD = BOX_THRESHOLD, output_coord_in_ratio=True, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,iou_threshold=0.3,prompt=prompt)
     image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
     print('finish processing')
     parsed_content_list = '\n'.join(parsed_content_list)
diff --git a/omniparser.py b/omniparser.py
index 37b7780..2ad8c67 100644
--- a/omniparser.py
+++ b/omniparser.py
@@ -17,7 +17,7 @@
         'text_padding': 3,
         'thickness': 3,
     },
-    'BOX_TRESHOLD': 0.05
+    'BOX_THRESHOLD': 0.05
 }
 
 
@@ -35,11 +35,11 @@ def parse(self, image_path: str):
         text, ocr_bbox = ocr_bbox_rslt
 
         draw_bbox_config = self.config['draw_bbox_config']
-        BOX_TRESHOLD = self.config['BOX_TRESHOLD']
-        dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, self.som_model, BOX_TRESHOLD = BOX_TRESHOLD, output_coord_in_ratio=False, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=None, ocr_text=text,use_local_semantics=False)
+        BOX_THRESHOLD = self.config['BOX_THRESHOLD']
+        dino_labeled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, self.som_model, BOX_THRESHOLD = BOX_THRESHOLD, output_coord_in_ratio=False, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=None, ocr_text=text,use_local_semantics=False)
 
-        image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
-        # formating output
+        image = Image.open(io.BytesIO(base64.b64decode(dino_labeled_img)))
+        # formatting output
         return_list = [{'from': 'omniparser', 'shape': {'x':coord[0], 'y':coord[1], 'width':coord[2], 'height':coord[3]},
                         'text': parsed_content_list[i].split(': ')[1], 'type':'text'} for i, (k, coord) in enumerate(label_coordinates.items()) if i < len(parsed_content_list)]
         return_list.extend(
diff --git a/utils.py b/utils.py
index 2fc180d..1ba2d15 100755
--- a/utils.py
+++ b/utils.py
@@ -278,19 +278,19 @@ def predict_yolo(model, image_path, box_threshold):
     return boxes, conf, phrases
 
 
-def get_som_labeled_img(img_path, model=None, BOX_TRESHOLD = 0.01, output_coord_in_ratio=False, ocr_bbox=None, text_scale=0.4, text_padding=5, draw_bbox_config=None, caption_model_processor=None, ocr_text=[], use_local_semantics=True, iou_threshold=0.9,prompt=None):
+def get_som_labeled_img(img_path, model=None, BOX_THRESHOLD = 0.01, output_coord_in_ratio=False, ocr_bbox=None, text_scale=0.4, text_padding=5, draw_bbox_config=None, caption_model_processor=None, ocr_text=[], use_local_semantics=True, iou_threshold=0.9,prompt=None):
     """ ocr_bbox: list of xyxy format bbox
     """
     TEXT_PROMPT = "clickable buttons on the screen"
-    # BOX_TRESHOLD = 0.02 # 0.05/0.02 for web and 0.1 for mobile
-    TEXT_TRESHOLD = 0.01 # 0.9 # 0.01
+    # BOX_THRESHOLD = 0.02 # 0.05/0.02 for web and 0.1 for mobile
+    TEXT_THRESHOLD = 0.01 # 0.9 # 0.01
     image_source = Image.open(img_path).convert("RGB")
     w, h = image_source.size
     # import pdb; pdb.set_trace()
     if False: # TODO
-        xyxy, logits, phrases = predict(model=model, image=image_source, caption=TEXT_PROMPT, box_threshold=BOX_TRESHOLD, text_threshold=TEXT_TRESHOLD)
+        xyxy, logits, phrases = predict(model=model, image=image_source, caption=TEXT_PROMPT, box_threshold=BOX_THRESHOLD, text_threshold=TEXT_THRESHOLD)
     else:
-        xyxy, logits, phrases = predict_yolo(model=model, image_path=img_path, box_threshold=BOX_TRESHOLD)
+        xyxy, logits, phrases = predict_yolo(model=model, image_path=img_path, box_threshold=BOX_THRESHOLD)
     xyxy = xyxy / torch.Tensor([w, h, w, h]).to(xyxy.device)
     image_source = np.asarray(image_source)
     phrases = [str(i) for i in range(len(phrases))]
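-- 
Usage note (below the signature separator, so git am will not apply it): since
this patch renames get_som_labeled_img's keyword from BOX_TRESHOLD to
BOX_THRESHOLD, any out-of-tree caller must switch to the new name. A minimal
sketch of an updated call site, assuming the patched utils.py is importable and
that the detector is a YOLO model loaded via ultralytics; the weights path and
image path are placeholders, and caption_model_processor=None follows the
omniparser.py call above:

    from ultralytics import YOLO
    from utils import check_ocr_box, get_som_labeled_img

    som_model = YOLO('weights/icon_detect/best.pt')  # placeholder weights path
    image_path = 'screenshot.png'                    # placeholder input image

    # OCR pass: check_ocr_box returns ((text, ocr_bbox), is_goal_filtered)
    ocr_bbox_rslt, _ = check_ocr_box(image_path, display_img=False,
                                     output_bb_format='xyxy', goal_filtering=None,
                                     easyocr_args={'paragraph': False, 'text_threshold': 0.9})
    text, ocr_bbox = ocr_bbox_rslt

    # Renamed keyword: BOX_TRESHOLD -> BOX_THRESHOLD
    dino_labeled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
        image_path, som_model, BOX_THRESHOLD=0.05, output_coord_in_ratio=False,
        ocr_bbox=ocr_bbox, caption_model_processor=None, ocr_text=text,
        use_local_semantics=False)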