Commit
* Alternative find_objects implementation
* Dask docs suggest bag.fold is more efficient than bag.reduction
* Merge dataframes instead of using combine
* Make find_objects output obviously a dask dataframe
* A clearer way to use dask delayed
* Try to clarify dataframe column naming convention
* Additional clarifying comment
* Default step size for slices is None
* Use Marvin's suggestion so we don't have to call compute twice on result
* Revert delayed changes
* Be consistent with slice step (use default value)
* Remove redundant import statement
* Avoid user having to call compute twice on result
* Fix delayed so we know output is a dask dataframe
* Use functools partial to pass in array dimension information to _find_objects
* Improve docstrings in _find_objects.py
* Add check for integer array dtype in find_objects
* Re-trigger CI
* Improve find_objects docstring re return value
* find_objects _merge_bounding_boxes, clarify comment
* Fix find_objects bug where chunk has no non-zero labels
* Add test for find_objects with empty chunk in array
1 parent 9b4ff96 · commit f1fc159
Showing 3 changed files with 209 additions and 0 deletions.
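As a quick orientation before the diffs, here is a minimal usage sketch (not part of the commit itself) assembled from the test fixture and expected values added below; it shows what the new find_objects call returns:

import dask.array as da
import numpy as np

import dask_image.ndmeasure

# Label image borrowed from the label_image test fixture in this commit.
label_image = np.zeros((5, 10)).astype(int)
label_image[1:3, 0:2] = 111
label_image[3, 3:-2] = 222
label_image[0:2, -3:] = 333
label_image = da.from_array(label_image, chunks=(5, 5))

boxes = dask_image.ndmeasure.find_objects(label_image)  # a dask DataFrame
result = boxes.compute()                                 # a pandas DataFrame
# Per the tests below: one row per label, one column per array dimension,
# e.g. label 111 -> slice(1, 3) in dimension 0 and slice(0, 2) in dimension 1.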
_find_objects.py
@@ -0,0 +1,75 @@
import numpy as np
import pandas as pd
from dask.delayed import Delayed
import dask.dataframe as dd
def _array_chunk_location(block_id, chunks):
    """Pixel coordinate of top left corner of the array chunk."""
    array_location = []
    for idx, chunk in zip(block_id, chunks):
        array_location.append(sum(chunk[:idx]))
    return tuple(array_location)
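For illustration only (not part of the commit), the block index is turned into a global pixel offset by summing the sizes of the chunks that precede the block along each axis:

# Illustration: a (5, 10) array chunked as (5, 5) has chunks == ((5,), (5, 5)).
# The block to the right of the origin has block_id == (0, 1) and starts at column 5.
assert _array_chunk_location((0, 0), ((5,), (5, 5))) == (0, 0)
assert _array_chunk_location((0, 1), ((5,), (5, 5))) == (0, 5)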
def _find_bounding_boxes(x, array_location):
    """An alternative to scipy.ndimage.find_objects.

    We use this alternative because scipy.ndimage.find_objects
    returns a tuple of length N, where N is the largest integer label.
    This is not ideal for distributed labels, where there might be only
    one or two objects in an image chunk labelled with very large integers.
    This alternative function returns a pandas dataframe,
    with one row per object found in the image chunk.
    """
    unique_vals = np.unique(x)
    unique_vals = unique_vals[unique_vals != 0]
    result = {}
    for val in unique_vals:
        positions = np.where(x == val)
        slices = tuple(
            slice(np.min(pos) + array_location[i], np.max(pos) + 1 + array_location[i])
            for i, pos in enumerate(positions)
        )
        result[val] = slices
    column_names = [i for i in range(x.ndim)]  # column names are: 0, 1, ... nD
    return pd.DataFrame.from_dict(result, orient='index', columns=column_names)
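A small worked example (illustration only): the array_location offset shifts each per-chunk slice into global coordinates, and the returned dataframe is indexed by label value:

# Illustration: a hypothetical chunk whose top left corner sits at global position (0, 3).
chunk = np.array([[0, 0, 5],
                  [9, 0, 5]])
df = _find_bounding_boxes(chunk, array_location=(0, 3))
assert df.loc[5, 0] == slice(0, 2) and df.loc[5, 1] == slice(5, 6)
assert df.loc[9, 0] == slice(1, 2) and df.loc[9, 1] == slice(3, 4)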
def _combine_slices(slices):
    """Return the union of all slices."""
    if len(slices) == 1:
        return slices[0]
    else:
        start = min([sl.start for sl in slices])
        stop = max([sl.stop for sl in slices])
        return slice(start, stop)
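Illustration only: the union is the smallest slice covering every input slice along one dimension.

assert _combine_slices([slice(0, 2)]) == slice(0, 2)
assert _combine_slices([slice(0, 2), slice(5, 8)]) == slice(0, 8)  # gaps are bridged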
def _merge_bounding_boxes(x, ndim):
    """Merge the bounding boxes describing objects over multiple image chunks."""
    x = x.dropna()
    data = {}
    # For each dimension in the array,
    # pick out the slice values belonging to that dimension
    # and combine the slices
    # (i.e. find the union; the slice expanded to cover all input slices).
    for i in range(ndim):
        # Array dimensions are labelled by a number followed by an underscore,
        # i.e. column labels are: 0_x, 1_x, 2_x, ... 0_y, 1_y, 2_y, ...
        # (x and y represent the pair of chunks the label slices are merged from)
        slices = [x[ii] for ii in x.index if str(ii).startswith(str(i))]
        combined_slices = _combine_slices(slices)
        data[i] = combined_slices
    result = pd.Series(data=data, index=[i for i in range(ndim)], name=x.name)
    return result
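Illustration only: after the outer merge performed in _find_objects, a label found in both input dataframes has suffixed columns (0_x, 1_x from one side, 0_y, 1_y from the other), which this function folds back into a single slice per dimension:

# Illustration: a hand-built merged row for label 7 spanning two chunks.
row = pd.Series(
    {"0_x": slice(0, 2), "1_x": slice(0, 2),   # bounding box from one chunk
     "0_y": slice(0, 1), "1_y": slice(2, 3)},  # bounding box from the other chunk
    name=7,
)
merged = _merge_bounding_boxes(row, ndim=2)
assert merged[0] == slice(0, 2) and merged[1] == slice(0, 3)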
def _find_objects(ndim, df1, df2):
    """Main utility function for find_objects."""
    meta = dd.utils.make_meta([(i, object) for i in range(ndim)])
    if isinstance(df1, Delayed):
        df1 = dd.from_delayed(df1, meta=meta)
    if isinstance(df2, Delayed):
        df2 = dd.from_delayed(df2, meta=meta)
    ddf = dd.merge(df1, df2, how="outer", left_index=True, right_index=True)
    result = ddf.apply(_merge_bounding_boxes, ndim=ndim, axis=1, meta=meta)
    return result
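Illustration only (the public dask_image.ndmeasure.find_objects wrapper lives in a changed file not shown in this section): combining the bounding boxes of two chunks, here passing dask dataframes rather than Delayed objects:

# Illustration: label 7 straddles two hypothetical chunks of a 2 x 4 array.
left = _find_bounding_boxes(np.array([[0, 7], [7, 7]]), array_location=(0, 0))
right = _find_bounding_boxes(np.array([[7, 0], [0, 0]]), array_location=(0, 2))
combined = _find_objects(
    2,
    dd.from_pandas(left, npartitions=1),
    dd.from_pandas(right, npartitions=1),
)
boxes = combined.compute()
# Label 7 covers rows 0-1 and columns 0-2 of the full array.
assert boxes.loc[7, 0] == slice(0, 2) and boxes.loc[7, 1] == slice(0, 3)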
Tests for find_objects:
@@ -0,0 +1,87 @@
from dask_image.ndmeasure._utils import _labeled_comprehension_delayed
import dask.array as da
import dask.dataframe as dd
import numpy as np
import pandas as pd
import pytest

import dask_image.ndmeasure
@pytest.fixture
def label_image():
    """Return small label image for tests.

    dask.array<array, shape=(5, 10), dtype=int64, chunksize=(5, 5), chunktype=numpy.ndarray>
    array([[  0,   0,   0,   0,   0,   0,   0, 333, 333, 333],
           [111, 111,   0,   0,   0,   0,   0, 333, 333, 333],
           [111, 111,   0,   0,   0,   0,   0,   0,   0,   0],
           [  0,   0,   0, 222, 222, 222, 222, 222,   0,   0],
           [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])
    """
    label_image = np.zeros((5, 10)).astype(int)
    label_image[1:3, 0:2] = 111
    label_image[3, 3:-2] = 222
    label_image[0:2, -3:] = 333
    label_image = da.from_array(label_image, chunks=(5, 5))
    return label_image
@pytest.fixture
def label_image_with_empty_chunk():
    """Return small label image with an empty chunk for tests.

    dask.array<array, shape=(6, 6), dtype=int64, chunksize=(3, 3), chunktype=numpy.ndarray>
    array([[  0,   0,   0,   0,   0,   0],
           [111, 111,   0,   0,   0,   0],
           [111, 111,   0,   0,   0,   0],
           [  0,   0,   0,   0,   0,   0],
           [  0,   0,   0, 222, 222, 222],
           [  0,   0,   0,   0,   0,   0]])
    """
    label_image = np.zeros((6, 6)).astype(int)
    label_image[1:3, 0:2] = 111
    label_image[4, 3:] = 222
    label_image = da.from_array(label_image, chunks=(3, 3))
    return label_image
def test_find_objects(label_image):
    result = dask_image.ndmeasure.find_objects(label_image)
    assert isinstance(result, dd.DataFrame)
    computed_result = result.compute()
    assert isinstance(computed_result, pd.DataFrame)
    expected = pd.DataFrame.from_dict(
        {0: {111: slice(1, 3), 222: slice(3, 4), 333: slice(0, 2)},
         1: {111: slice(0, 2), 222: slice(3, 8), 333: slice(7, 10)}}
    )
    assert computed_result.equals(expected)
def test_3d_find_objects(label_image):
    label_image = da.stack([label_image, label_image], axis=2)
    result = dask_image.ndmeasure.find_objects(label_image)
    assert isinstance(result, dd.DataFrame)
    computed_result = result.compute()
    assert isinstance(computed_result, pd.DataFrame)
    expected = pd.DataFrame.from_dict(
        {0: {111: slice(1, 3), 222: slice(3, 4), 333: slice(0, 2)},
         1: {111: slice(0, 2), 222: slice(3, 8), 333: slice(7, 10)},
         2: {111: slice(0, 2), 222: slice(0, 2), 333: slice(0, 2)}}
    )
    assert computed_result.equals(expected)
def test_find_objects_with_empty_chunks(label_image_with_empty_chunk):
    result = dask_image.ndmeasure.find_objects(label_image_with_empty_chunk)
    assert isinstance(result, dd.DataFrame)
    computed_result = result.compute()
    assert isinstance(computed_result, pd.DataFrame)
    expected = pd.DataFrame.from_dict(
        {0: {111: slice(1, 3, None), 222: slice(4, 5, None)},
         1: {111: slice(0, 2, None), 222: slice(3, 6, None)}}
    )
    assert computed_result.equals(expected)
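One side note (not from the commit): the last test writes out the slice step explicitly while the earlier tests omit it. The two spellings compare equal in Python, so all three expectations follow the "default step is None" convention mentioned in the commit message:

assert slice(1, 3) == slice(1, 3, None)  # an omitted step defaults to None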