microsoft · muttonchop · Oct 25, 2024
diff --git a/demo.ipynb b/demo.ipynb
@@ -458,30 +458,30 @@
     "        'text_padding': 3,\n",
     "        'thickness': 3,\n",
     "    }\n",
-    "    BOX_TRESHOLD = 0.05\n",
+    "    BOX_THRESHOLD = 0.05\n",
     "elif platform == 'web':\n",
     "    draw_bbox_config = {\n",
     "        'text_scale': 0.8,\n",
     "        'text_thickness': 2,\n",
     "        'text_padding': 3,\n",
     "        'thickness': 3,\n",
     "    }\n",
-    "    BOX_TRESHOLD = 0.05\n",
+    "    BOX_THRESHOLD = 0.05\n",
     "elif platform == 'mobile':\n",
     "    draw_bbox_config = {\n",
     "        'text_scale': 0.8,\n",
     "        'text_thickness': 2,\n",
     "        'text_padding': 3,\n",
     "        'thickness': 3,\n",
     "    }\n",
-    "    BOX_TRESHOLD = 0.05\n",
+    "    BOX_THRESHOLD = 0.05\n",
     "image = Image.open(image_path)\n",
     "image_rgb = image.convert('RGB')\n",
     "\n",
     "ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9})\n",
     "text, ocr_bbox = ocr_bbox_rslt\n",
     "\n",
-    "dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, som_model, BOX_TRESHOLD = BOX_TRESHOLD, output_coord_in_ratio=False, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,use_local_semantics=False)\n",
+    "dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, som_model, BOX_THRESHOLD = BOX_THRESHOLD, output_coord_in_ratio=False, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,use_local_semantics=False)\n",
     "\n",
     "\n"
    ]

diff --git a/gradio_demo.py b/gradio_demo.py
@@ -22,23 +22,23 @@
         'text_padding': 2,
         'thickness': 2,
     }
-    BOX_TRESHOLD = 0.05
+    BOX_THRESHOLD = 0.05
 elif platform == 'web':
     draw_bbox_config = {
         'text_scale': 0.8,
         'text_thickness': 2,
         'text_padding': 3,
         'thickness': 3,
     }
-    BOX_TRESHOLD = 0.05
+    BOX_THRESHOLD = 0.05
 elif platform == 'mobile':
     draw_bbox_config = {
         'text_scale': 0.8,
         'text_thickness': 2,
         'text_padding': 3,
         'thickness': 3,
     }
-    BOX_TRESHOLD = 0.05
+    BOX_THRESHOLD = 0.05
 
 
 
@@ -70,7 +70,7 @@ def process(
     ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_save_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9})
     text, ocr_bbox = ocr_bbox_rslt
     print('prompt:', prompt)
-    dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_save_path, yolo_model, BOX_TRESHOLD = BOX_TRESHOLD, output_coord_in_ratio=True, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,iou_threshold=0.3,prompt=prompt)
+    dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_save_path, yolo_model, BOX_THRESHOLD = BOX_THRESHOLD, output_coord_in_ratio=True, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,iou_threshold=0.3,prompt=prompt)
     image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
     print('finish processing')
     parsed_content_list = '\n'.join(parsed_content_list)

diff --git a/omniparser.py b/omniparser.py
@@ -17,7 +17,7 @@
         'text_padding': 3,
         'thickness': 3,
     },
-    'BOX_TRESHOLD': 0.05
+    'BOX_THRESHOLD': 0.05
 }
 
 
@@ -35,11 +35,11 @@ def parse(self, image_path: str):
         text, ocr_bbox = ocr_bbox_rslt
 
         draw_bbox_config = self.config['draw_bbox_config']
-        BOX_TRESHOLD = self.config['BOX_TRESHOLD']
-        dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, self.som_model, BOX_TRESHOLD = BOX_TRESHOLD, output_coord_in_ratio=False, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=None, ocr_text=text,use_local_semantics=False)
+        BOX_THRESHOLD = self.config['BOX_THRESHOLD']
+        dino_labeled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_path, self.som_model, BOX_THRESHOLD = BOX_THRESHOLD, output_coord_in_ratio=False, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=None, ocr_text=text,use_local_semantics=False)
 
-        image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
-        # formating output
+        image = Image.open(io.BytesIO(base64.b64decode(dino_labeled_img)))
+        # formatting output
         return_list = [{'from': 'omniparser', 'shape': {'x':coord[0], 'y':coord[1], 'width':coord[2], 'height':coord[3]},
                         'text': parsed_content_list[i].split(': ')[1], 'type':'text'} for i, (k, coord) in enumerate(label_coordinates.items()) if i < len(parsed_content_list)]
         return_list.extend(

diff --git a/utils.py b/utils.py
@@ -278,19 +278,19 @@ def predict_yolo(model, image_path, box_threshold):
     return boxes, conf, phrases
 
 
-def get_som_labeled_img(img_path, model=None, BOX_TRESHOLD = 0.01, output_coord_in_ratio=False, ocr_bbox=None, text_scale=0.4, text_padding=5, draw_bbox_config=None, caption_model_processor=None, ocr_text=[], use_local_semantics=True, iou_threshold=0.9,prompt=None):
+def get_som_labeled_img(img_path, model=None, BOX_THRESHOLD = 0.01, output_coord_in_ratio=False, ocr_bbox=None, text_scale=0.4, text_padding=5, draw_bbox_config=None, caption_model_processor=None, ocr_text=[], use_local_semantics=True, iou_threshold=0.9,prompt=None):
     """ ocr_bbox: list of xyxy format bbox
     """
     TEXT_PROMPT = "clickable buttons on the screen"
-    # BOX_TRESHOLD = 0.02 # 0.05/0.02 for web and 0.1 for mobile
-    TEXT_TRESHOLD = 0.01 # 0.9 # 0.01
+    # BOX_THRESHOLD = 0.02 # 0.05/0.02 for web and 0.1 for mobile
+    TEXT_THRESHOLD = 0.01 # 0.9 # 0.01
     image_source = Image.open(img_path).convert("RGB")
     w, h = image_source.size
     # import pdb; pdb.set_trace()
     if False: # TODO
-        xyxy, logits, phrases = predict(model=model, image=image_source, caption=TEXT_PROMPT, box_threshold=BOX_TRESHOLD, text_threshold=TEXT_TRESHOLD)
+        xyxy, logits, phrases = predict(model=model, image=image_source, caption=TEXT_PROMPT, box_threshold=BOX_THRESHOLD, text_threshold=TEXT_THRESHOLD)
     else:
-        xyxy, logits, phrases = predict_yolo(model=model, image_path=img_path, box_threshold=BOX_TRESHOLD)
+        xyxy, logits, phrases = predict_yolo(model=model, image_path=img_path, box_threshold=BOX_THRESHOLD)
     xyxy = xyxy / torch.Tensor([w, h, w, h]).to(xyxy.device)
     image_source = np.asarray(image_source)
     phrases = [str(i) for i in range(len(phrases))]