Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Find objects bounding boxes #240

Merged
merged 24 commits into from
Dec 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
9ac92a8
Alternative find_objects implementation
GenevieveBuckley Jul 22, 2021
195edb5
Dask docs suggest bag.fold is more efficient than bag.reduction
GenevieveBuckley Jul 22, 2021
ab90271
Merge dataframes instead of using combine
GenevieveBuckley Jul 23, 2021
64d2d87
Make find_objects output obviously a dask dataframe
GenevieveBuckley Jul 23, 2021
7cf8d7a
A clearer way to use dask delayed
GenevieveBuckley Jul 29, 2021
4ceeec3
Try to clarify dataframe column naming convention
GenevieveBuckley Jul 29, 2021
c301e2a
Additional clarifying comment
GenevieveBuckley Jul 29, 2021
dcc78ee
Default step size for slices is None
GenevieveBuckley Jul 29, 2021
2bdb70a
Use Marvin's suggestion so we don't have to call compute twice on result
GenevieveBuckley Jul 29, 2021
ce3e17f
Revert delayed changes
GenevieveBuckley Jul 29, 2021
f120067
Be consistent with slice step (use default value)
GenevieveBuckley Jul 29, 2021
299a9eb
Remove redundant import statement
GenevieveBuckley Jul 29, 2021
02a86be
Avoid user having to call compute twice on result
GenevieveBuckley Jul 29, 2021
a7479f0
Fix delayed so we know output is a dask dataframe
GenevieveBuckley Jul 29, 2021
9f812fc
Use functools partial to pass in array dimension information to _find…
GenevieveBuckley Jul 29, 2021
ec20255
Improve docstrings in _find_objects.py
GenevieveBuckley Jul 29, 2021
7192c00
Fix merge conflicts
GenevieveBuckley Jul 29, 2021
07ccbd8
Add check for integer array dtype in find_objects
GenevieveBuckley Jul 29, 2021
cf5636e
Merge branch 'main' into find-objects
GenevieveBuckley Jul 30, 2021
cc10a93
Re-trigger CI
GenevieveBuckley Dec 16, 2021
eba33ee
Improve find_objects docstring re return value
GenevieveBuckley Dec 17, 2021
3205108
find_objects _merge_bounding_boxes, clarify comment
GenevieveBuckley Dec 17, 2021
d696202
Fix find_objects bug where chunk has no non-zero labels
GenevieveBuckley Dec 17, 2021
42b95da
Add test for find_objects with empty chunk in array
GenevieveBuckley Dec 17, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions dask_image/ndmeasure/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@
import functools
import operator
import warnings
from dask import compute, delayed

import dask.array as da
import dask.bag as db
import dask.dataframe as dd
import numpy as np

from . import _utils
from ._utils import _label
from ._utils._find_objects import _array_chunk_location, _find_bounding_boxes, _find_objects

__all__ = [
"area",
Expand Down Expand Up @@ -202,6 +206,49 @@ def extrema(image, label_image=None, index=None):
return result


def find_objects(label_image):
    """Return bounding box slices for each object labelled by integers.

    Parameters
    ----------
    label_image : dask array
        Image whose features are marked by integer labels. The array must
        be chunked (its ``chunks`` and ``numblocks`` attributes are used)
        and must have an integer dtype.

    Returns
    -------
    Dask dataframe
        Each row represents an individual integer label. Columns contain
        the slice information for the object boundaries in each dimension
        (columns are named: 0, 1, ..., ndim - 1).

    Raises
    ------
    ValueError
        If ``label_image`` does not have an integer dtype.

    Notes
    -----
    You must have the optional dependency ``dask[dataframe]`` installed
    to use the ``find_objects`` function.
    """
    if label_image.dtype.char not in np.typecodes['AllInteger']:
        raise ValueError("find_objects only accepts integer dtype arrays")

    ndim = label_image.ndim
    chunks = label_image.chunks

    # Walk the chunk grid: each block index is paired with the slices that
    # select that block out of the full array.
    block_ids = np.ndindex(*label_image.numblocks)
    block_slices = da.core.slices_from_chunks(chunks)

    # One delayed per-chunk dataframe of bounding boxes, expressed in the
    # coordinates of the full array via the chunk's offset.
    per_block_boxes = [
        delayed(_find_bounding_boxes)(
            label_image[block_slice],
            _array_chunk_location(block_id, chunks),
        )
        for block_id, block_slice in zip(block_ids, block_slices)
    ]

    # Pairwise-merge the per-chunk results into a single set of boxes.
    merge_pair = functools.partial(_find_objects, ndim)
    folded = db.from_sequence(per_block_boxes).fold(
        merge_pair, split_every=2
    ).to_delayed()

    meta = dd.utils.make_meta([(dim, object) for dim in range(ndim)])
    # Wrap the extra compute in a delayed call so the user does not have to
    # call compute twice on the result.
    combined = delayed(compute)(folded)[0]
    return dd.from_delayed(
        combined, meta=meta, prefix="find-objects-", verify_meta=False
    )


def histogram(image,
min,
max,
Expand Down
75 changes: 75 additions & 0 deletions dask_image/ndmeasure/_utils/_find_objects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import numpy as np
import pandas as pd
from dask.delayed import Delayed
import dask.dataframe as dd


def _array_chunk_location(block_id, chunks):
"""Pixel coordinate of top left corner of the array chunk."""
array_location = []
for idx, chunk in zip(block_id, chunks):
array_location.append(sum(chunk[:idx]))
return tuple(array_location)


def _find_bounding_boxes(x, array_location):
"""An alternative to scipy.ndimage.find_objects.

We use this alternative because scipy.ndimage.find_objects
returns a tuple of length N, where N is the largest integer label.
This is not ideal for distributed labels, where there might be only
one or two objects in an image chunk labelled with very large integers.

This alternative function returns a pandas dataframe,
with one row per object found in the image chunk.
"""
unique_vals = np.unique(x)
unique_vals = unique_vals[unique_vals != 0]
GenevieveBuckley marked this conversation as resolved.
Show resolved Hide resolved
result = {}
for val in unique_vals:
positions = np.where(x == val)
slices = tuple(slice(np.min(pos) + array_location[i], np.max(pos) + 1 + array_location[i]) for i, pos in enumerate(positions))
result[val] = slices
column_names = [i for i in range(x.ndim)] # column names are: 0, 1, ... nD
return pd.DataFrame.from_dict(result, orient='index', columns=column_names)


def _combine_slices(slices):
"Return the union of all slices."
if len(slices) == 1:
return slices[0]
else:
start = min([sl.start for sl in slices])
stop = max([sl.stop for sl in slices])
return slice(start, stop)


def _merge_bounding_boxes(x, ndim):
"""Merge the bounding boxes describing objects over multiple image chunks."""
x = x.dropna()
data = {}
# For each dimension in the array,
# pick out the slice values belonging to that dimension
# and combine the slices
# (i.e. find the union; the slice expanded to all input slices).
for i in range(ndim):
# Array dimensions are labelled by a number followed by an underscroe
GenevieveBuckley marked this conversation as resolved.
Show resolved Hide resolved
# i.e. column labels are: 0_x, 1_x, 2_x, ... 0_y, 1_y, 2_y, ...
# (x and y represent the pair of chunks label slices are merged from)
slices = [x[ii] for ii in x.index if str(ii).startswith(str(i))]
GenevieveBuckley marked this conversation as resolved.
Show resolved Hide resolved
combined_slices = _combine_slices(slices)
data[i] = combined_slices
result = pd.Series(data=data, index=[i for i in range(ndim)], name=x.name)
return result


def _find_objects(ndim, df1, df2):
    """Merge two dataframes of bounding boxes into one.

    Main utility function for ``find_objects``.

    Parameters
    ----------
    ndim : int
        Number of array dimensions (one dataframe column per dimension).
    df1, df2 : dataframe or Delayed
        Bounding-box dataframes to merge. ``Delayed`` inputs are first
        wrapped into dask dataframes.

    Returns
    -------
    Dask dataframe
        The outer join of both inputs on their index, with the slices of
        each row merged per dimension by ``_merge_bounding_boxes``.
    """
    meta = dd.utils.make_meta([(dim, object) for dim in range(ndim)])

    def _as_dataframe(frame):
        # Delayed inputs need wrapping before they can be merged.
        if isinstance(frame, Delayed):
            return dd.from_delayed(frame, meta=meta)
        return frame

    left = _as_dataframe(df1)
    right = _as_dataframe(df2)
    merged = dd.merge(
        left, right, how="outer", left_index=True, right_index=True
    )
    return merged.apply(_merge_bounding_boxes, ndim=ndim, axis=1, meta=meta)
87 changes: 87 additions & 0 deletions tests/test_dask_image/test_ndmeasure/test_find_objects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from dask_image.ndmeasure._utils import _labeled_comprehension_delayed
import dask.array as da
import dask.dataframe as dd
import numpy as np
import pandas as pd
import pytest

import dask_image.ndmeasure


@pytest.fixture
def label_image():
    """Provide a small, two-chunk label image for the tests.

    dask.array<array, shape=(5, 10), dtype=int64, chunksize=(5, 5), chunktype=numpy.ndarray>

    array([[  0,   0,   0,   0,   0,   0,   0, 333, 333, 333],
           [111, 111,   0,   0,   0,   0,   0, 333, 333, 333],
           [111, 111,   0,   0,   0,   0,   0,   0,   0,   0],
           [  0,   0,   0, 222, 222, 222, 222, 222, 222,   0],
           [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])

    """
    labels = np.zeros((5, 10), dtype=int)
    labels[1:3, 0:2] = 111
    labels[3, 3:-2] = 222  # this object spans both chunks
    labels[0:2, -3:] = 333
    return da.from_array(labels, chunks=(5, 5))


@pytest.fixture
def label_image_with_empty_chunk():
    """Provide a small label image in which some chunks hold no labels.

    dask.array<array, shape=(6, 6), dtype=int64, chunksize=(3, 3), chunktype=numpy.ndarray>

    array([[  0,   0,   0,   0,   0,   0],
           [111, 111,   0,   0,   0,   0],
           [111, 111,   0,   0,   0,   0],
           [  0,   0,   0,   0,   0,   0],
           [  0,   0,   0, 222, 222, 222],
           [  0,   0,   0,   0,   0,   0]])
    """
    labels = np.zeros((6, 6), dtype=int)
    labels[1:3, 0:2] = 111
    labels[4, 3:] = 222
    return da.from_array(labels, chunks=(3, 3))


def test_find_objects(label_image):
    """find_objects returns the expected bounding boxes for a 2D image."""
    result = dask_image.ndmeasure.find_objects(label_image)
    assert isinstance(result, dd.DataFrame)

    computed_result = result.compute()
    assert isinstance(computed_result, pd.DataFrame)

    # Expected slices per dimension, keyed by label.
    dim0 = {111: slice(1, 3), 222: slice(3, 4), 333: slice(0, 2)}
    dim1 = {111: slice(0, 2), 222: slice(3, 8), 333: slice(7, 10)}
    expected = pd.DataFrame.from_dict({0: dim0, 1: dim1})
    assert computed_result.equals(expected)


def test_3d_find_objects(label_image):
    """find_objects returns the expected bounding boxes for a 3D image."""
    # Stack two copies of the 2D fixture along a new last axis.
    label_image = da.stack([label_image, label_image], axis=2)

    result = dask_image.ndmeasure.find_objects(label_image)
    assert isinstance(result, dd.DataFrame)

    computed_result = result.compute()
    assert isinstance(computed_result, pd.DataFrame)

    # Expected slices per dimension, keyed by label.
    dim0 = {111: slice(1, 3), 222: slice(3, 4), 333: slice(0, 2)}
    dim1 = {111: slice(0, 2), 222: slice(3, 8), 333: slice(7, 10)}
    dim2 = {111: slice(0, 2), 222: slice(0, 2), 333: slice(0, 2)}
    expected = pd.DataFrame.from_dict({0: dim0, 1: dim1, 2: dim2})
    assert computed_result.equals(expected)


def test_find_objects_with_empty_chunks(label_image_with_empty_chunk):
    """find_objects copes with chunks that contain no non-zero labels."""
    result = dask_image.ndmeasure.find_objects(label_image_with_empty_chunk)
    assert isinstance(result, dd.DataFrame)

    computed_result = result.compute()
    assert isinstance(computed_result, pd.DataFrame)

    # Expected slices per dimension, keyed by label.
    dim0 = {111: slice(1, 3, None), 222: slice(4, 5, None)}
    dim1 = {111: slice(0, 2, None), 222: slice(3, 6, None)}
    expected = pd.DataFrame.from_dict({0: dim0, 1: dim1})
    assert computed_result.equals(expected)