Add Dockerfile and client.py; deploy to EC2 on AWS via GitHub Actions #52

Status: Open. abrichr wants to merge 37 commits into base branch master.

Commits (37):

- `f8f21f6` Add Dockerfile; client.py; deploy.py; deploy_requirements.txt; docker… (abrichr, Oct 29, 2024)
- `0657932` documentation (abrichr, Oct 29, 2024)
- `7b8d20c` add workflow file (abrichr, Oct 29, 2024)
- `71892c3` improve logging (abrichr, Oct 29, 2024)
- `4fdd738` feat(deploy): Add Dockerfile, client.py, deploy.py (abrichr, Oct 29, 2024)
- `31c6289` remove autogenerated .github/workflows/docker-build-ec2.yml (abrichr, Oct 29, 2024)
- `6c55805` restore requirements.txt (abrichr, Oct 29, 2024)
- `f31f8ae` add workflow file (abrichr, Oct 29, 2024)
- `1e6fa93` append timestamps to filenames; return bboxes (abrichr, Oct 29, 2024)
- `b439503` add missing json import (abrichr, Oct 29, 2024)
- `be7866f` return json with label_coordinates and parsed_content_list (abrichr, Oct 29, 2024)
- `faa075e` remove conflicting container if it exists (abrichr, Oct 29, 2024)
- `db9b1ee` NumpyEncoder (abrichr, Oct 29, 2024)
- `706f521` save raw result data (abrichr, Oct 29, 2024)
- `a1b85c5` improve documentation (abrichr, Oct 29, 2024)
- `803d0bf` feat(client): save bounding boxes (abrichr, Oct 29, 2024)
- `88f7246` Add functionality to save bounding boxes (abrichr, Oct 29, 2024)
- `754d0b7` update README (abrichr, Oct 29, 2024)
- `af8c9da` improve README (abrichr, Oct 29, 2024)
- `4fdb813` add deploy section to README (abrichr, Oct 29, 2024)
- `3520928` improve documentation (abrichr, Oct 30, 2024)
- `64bdbaa` add usage to Dockerfile documentation (abrichr, Oct 30, 2024)
- `9cce7d7` Improve deploy.py documentation (abrichr, Oct 30, 2024)
- `54b8b47` add client.predict and documentation (abrichr, Oct 30, 2024)
- `169dd20` Merge branch 'master' into feat/deploy (abrichr, Nov 1, 2024)
- `b8b952c` undo changes to gradio_demo.py (abrichr, Nov 1, 2024)
- `201af0f` Add JSON output formatting to process function; return label_coordinates (abrichr, Nov 1, 2024)
- `a411848` Update Dockerfile documentation (abrichr, Nov 1, 2024)
- `b706744` Merge branch 'master' into feat/deploy (abrichr, Nov 1, 2024)
- `9ad451a` remove superfluous print (abrichr, Nov 1, 2024)
- `9f2dc91` more terse (abrichr, Nov 1, 2024)
- `76d6110` parsed_content (abrichr, Nov 1, 2024)
- `16a4d7a` simplify dependencies; update AMI; ssh non_interactive; replace nvidi… (abrichr, Nov 12, 2024)
- `10b08e6` add workflow file (abrichr, Nov 12, 2024)
- `2b91690` Revert "add workflow file" (abrichr, Nov 12, 2024)
- `fcb23b8` update documentation (abrichr, Nov 12, 2024)
- `cd69e3a` Update README.md: replace nvidia-docker with docker (abrichr, Nov 13, 2024)
7 changes: 6 additions & 1 deletion .gitignore
@@ -2,4 +2,9 @@ weights/icon_caption_blip2
weights/icon_caption_florence
weights/icon_detect/
.gradio
__pycache__
*.swp
.env
.env.*
venv/
*.pem
85 changes: 85 additions & 0 deletions Dockerfile
@@ -0,0 +1,85 @@
# Dockerfile for OmniParser with GPU and OpenGL support.
#
# Base: nvidia/cuda:12.3.1-devel-ubuntu22.04
# Features:
# - Python 3.12 with Miniconda environment.
# - Git LFS for large file support.
# - Required libraries: OpenCV, Hugging Face, Gradio, OpenGL.
# - Gradio server on port 7861.
#
# 1. Build the image with CUDA support.
# ```
# sudo docker build -t omniparser .
# ```
#
# 2. Run the Docker container with GPU access and port mapping for Gradio.
# ```bash
# sudo docker run -d -p 7861:7861 --gpus all --name omniparser-container omniparser
# ```
#
# Author: Richard Abrich ([email protected])

FROM nvidia/cuda:12.3.1-devel-ubuntu22.04

# Install system dependencies with explicit OpenGL libraries
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
git-lfs \
wget \
libgl1 \
libglib2.0-0 \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* \
&& git lfs install

# Install Miniconda for Python 3.12
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
bash miniconda.sh -b -p /opt/conda && \
rm miniconda.sh
ENV PATH="/opt/conda/bin:$PATH"

# Create and activate Conda environment with Python 3.12, and set it as the default
RUN conda create -n omni python=3.12 && \
echo "source activate omni" > ~/.bashrc
ENV CONDA_DEFAULT_ENV=omni
ENV PATH="/opt/conda/envs/omni/bin:$PATH"

# Set the working directory in the container
WORKDIR /usr/src/app

# Copy project files and requirements
COPY . .
COPY requirements.txt /usr/src/app/requirements.txt

# Initialize Git LFS and pull LFS files
RUN git lfs install && \
git lfs pull

# Install dependencies from requirements.txt with specific opencv-python-headless version
RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
pip uninstall -y opencv-python opencv-python-headless && \
pip install --no-cache-dir opencv-python-headless==4.8.1.78 && \
pip install -r requirements.txt && \
pip install huggingface_hub

# Run download.py to fetch model weights and convert safetensors to .pt format
RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
python download.py && \
echo "Contents of weights directory:" && \
ls -lR weights && \
python weights/convert_safetensor_to_pt.py

# Expose the default Gradio port
EXPOSE 7861

# Configure Gradio to be accessible externally
ENV GRADIO_SERVER_NAME="0.0.0.0"

# Copy and set permissions for entrypoint script
COPY entrypoint.sh /usr/src/app/entrypoint.sh
RUN chmod +x /usr/src/app/entrypoint.sh

# To debug, keep the container running
# CMD ["tail", "-f", "/dev/null"]

# Set the entrypoint
ENTRYPOINT ["/usr/src/app/entrypoint.sh"]
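
Once the container is running, you can confirm that the Gradio app inside it is serving and inspect its endpoints. A minimal sketch using the same `gradio_client` package that `client.py` (below) depends on; it assumes the `docker run` command from step 2 above has been executed locally:

```python
# Sketch: verify the containerized Gradio server is reachable.
# Assumes the container from step 2 above is running on localhost:7861.
from gradio_client import Client

client = Client("http://localhost:7861")
client.view_api()  # prints the exposed endpoints, e.g. /process
```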
26 changes: 25 additions & 1 deletion README.md
@@ -17,6 +17,19 @@
- [2024/10] Both Interactive Region Detection Model and Icon functional description model are released! [Huggingface models](https://huggingface.co/microsoft/OmniParser)
- [2024/09] OmniParser achieves the best performance on [Windows Agent Arena](https://microsoft.github.io/WindowsAgentArena/)!

### :rocket: Docker Quick Start

Prerequisites:
- CUDA-enabled GPU
- NVIDIA Container Toolkit installed (https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)
```
# Build the image (requires CUDA)
sudo docker build -t omniparser .

# Run the image
sudo docker run -d -p 7861:7861 --gpus all --name omniparser-container omniparser
```
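
Once the container is up, you can exercise the server with the client added in this PR. A minimal sketch; it assumes `client.py` from this repository is in the working directory and the server is reachable on `localhost:7861` (the image path is illustrative):

```python
# Minimal smoke test using client.py from this PR.
from client import predict

result = predict("http://localhost:7861", "path/to/image.jpg")
print(result["result_data"]["parsed_content_list"])
```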

## Install
Install environment:
```python
@@ -25,8 +38,12 @@ conda activate omni
pip install -r requirements.txt
```

Download and convert the model ckpt files from https://huggingface.co/microsoft/OmniParser:
```python
python download.py
```

Or, download the model ckpt files from https://huggingface.co/microsoft/OmniParser and put them under weights/; the default folder structure is weights/icon_detect, weights/icon_caption_florence, weights/icon_caption_blip2.
Finally, convert the safetensors file to .pt format.
```python
python weights/convert_safetensor_to_pt.py
@@ -41,6 +58,13 @@ To run gradio demo, simply run:
python gradio_demo.py
```

## Deploy to AWS

To deploy OmniParser to EC2 on AWS via GitHub Actions:

1. Fork this repository and clone your fork to your local machine.
2. Follow the instructions at the top of [`deploy.py`](https://github.com/microsoft/OmniParser/blob/main/deploy.py).

## Model Weights License
For the model checkpoints on the Hugging Face model hub, please note that the icon_detect model is under the AGPL license, since it is inherited from the original YOLO model, while icon_caption_blip2 and icon_caption_florence are under the MIT license. Please refer to the LICENSE file in the folder of each model: https://huggingface.co/microsoft/OmniParser.

132 changes: 132 additions & 0 deletions client.py
@@ -0,0 +1,132 @@
"""
This module provides a command-line interface and programmatic API to interact with the OmniParser Gradio server.

Command-line usage:
python client.py "http://<server_ip>:7861" "path/to/image.jpg"

View results:
JSON: cat result_data_<timestamp>.json
Image:
macOS: open output_image_<timestamp>.png
Windows: start output_image_<timestamp>.png
Linux: xdg-open output_image_<timestamp>.png

Programmatic usage:
from client import predict
result = predict("http://<server_ip>:7861", "path/to/image.jpg")

Result data format:
{
"label_coordinates": {
"0": [x1, y1, width, height], // Normalized coordinates for each bounding box
"1": [x1, y1, width, height],
...
},
"parsed_content_list": [
"Text Box ID 0: [content]",
"Text Box ID 1: [content]",
...,
"Icon Box ID X: [description]",
...
]
}

Note: The parsed_content_list includes both text box contents and icon descriptions.
"""

import fire
from gradio_client import Client
from loguru import logger
import base64
import os
import shutil
import json
from datetime import datetime

# Define constants for default thresholds
DEFAULT_BOX_THRESHOLD = 0.05
DEFAULT_IOU_THRESHOLD = 0.1

def predict(server_url: str, image_path: str, box_threshold: float = DEFAULT_BOX_THRESHOLD, iou_threshold: float = DEFAULT_IOU_THRESHOLD):
"""
Makes a prediction using the OmniParser Gradio client with the provided server URL and image.
Args:
server_url (str): The URL of the OmniParser Gradio server.
image_path (str): Path to the image file to be processed.
box_threshold (float): Box threshold value (default: 0.05).
iou_threshold (float): IOU threshold value (default: 0.1).
Returns:
dict: Parsed result data containing label coordinates and parsed content list.
"""
client = Client(server_url)

# Load and encode the image
image_path = os.path.expanduser(image_path)
with open(image_path, "rb") as image_file:
encoded_image = base64.b64encode(image_file.read()).decode("utf-8")

# Prepare the image input in the format expected by the server
image_input = {
"path": None,
"url": f"data:image/png;base64,{encoded_image}",
"size": None,
"orig_name": image_path,
"mime_type": "image/png",
"is_stream": False,
"meta": {}
}

# Make the prediction
result = client.predict(
image_input,
box_threshold,
iou_threshold,
api_name="/process"
)

# Process and return the result
output_image, result_json = result
result_data = json.loads(result_json)

return {"output_image": output_image, "result_data": result_data}


def predict_and_save(server_url: str, image_path: str, box_threshold: float = DEFAULT_BOX_THRESHOLD, iou_threshold: float = DEFAULT_IOU_THRESHOLD):
"""
Makes a prediction and saves the results to files, including logs and image outputs.
Args:
server_url (str): The URL of the OmniParser Gradio server.
image_path (str): Path to the image file to be processed.
box_threshold (float): Box threshold value (default: 0.05).
iou_threshold (float): IOU threshold value (default: 0.1).
"""
# Generate a timestamp for unique file naming
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Call the predict function to get prediction data
try:
result = predict(server_url, image_path, box_threshold, iou_threshold)
output_image = result["output_image"]
result_data = result["result_data"]

# Save result data to JSON file
result_data_path = f"result_data_{timestamp}.json"
with open(result_data_path, "w") as json_file:
json.dump(result_data, json_file, indent=4)
logger.info(f"Parsed content saved to: {result_data_path}")

# Save the output image
output_image_path = f"output_image_{timestamp}.png"
if isinstance(output_image, str) and os.path.exists(output_image):
shutil.copy(output_image, output_image_path)
logger.info(f"Output image saved to: {output_image_path}")
else:
logger.warning(f"Unexpected output_image format or file not found: {output_image}")

except Exception as e:
logger.error(f"An error occurred: {str(e)}")
logger.exception("Traceback:")


if __name__ == "__main__":
fire.Fire(predict_and_save)
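
Because `label_coordinates` holds normalized `[x, y, width, height]` values (per the module docstring above), mapping a box back to pixel space only requires the source image's dimensions. A minimal sketch, assuming Pillow is installed and that normalization is relative to the image size; the URL and paths are illustrative:

```python
# Sketch: convert normalized [x, y, width, height] boxes from
# label_coordinates back to pixel coordinates using Pillow.
from PIL import Image

from client import predict

result = predict("http://<server_ip>:7861", "path/to/image.jpg")
coords = result["result_data"]["label_coordinates"]

with Image.open("path/to/image.jpg") as img:
    for box_id, (x, y, w, h) in coords.items():
        print(f"Box {box_id}: ({x * img.width}, {y * img.height}, "
              f"{w * img.width}, {h * img.height})")
```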