From 9ac92a8b3f9e51e6194c5d9e4f71b65ecad68f64 Mon Sep 17 00:00:00 2001 From: GenevieveBuckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Thu, 22 Jul 2021 17:34:00 +1000 Subject: [PATCH 01/22] Alternative find_objects implementation --- dask_image/ndmeasure/__init__.py | 24 +++++++ dask_image/ndmeasure/_utils/_find_objects.py | 63 +++++++++++++++++++ .../test_ndmeasure/test_find_objects.py | 36 +++++++++++ 3 files changed, 123 insertions(+) create mode 100644 dask_image/ndmeasure/_utils/_find_objects.py create mode 100644 tests/test_dask_image/test_ndmeasure/test_find_objects.py diff --git a/dask_image/ndmeasure/__init__.py b/dask_image/ndmeasure/__init__.py index 9213f16f..e8d39c88 100644 --- a/dask_image/ndmeasure/__init__.py +++ b/dask_image/ndmeasure/__init__.py @@ -6,10 +6,12 @@ import warnings import dask.array as da +import dask.bag as db import numpy as np from . import _utils from ._utils import _label +from ._utils._find_objects import _array_chunk_location, _find_bounding_boxes, _merge_bounding_boxes __all__ = [ "area", @@ -202,6 +204,28 @@ def extrema(image, label_image=None, index=None): return result +def find_objects(label_image): + """Return bounding box slices for each object labelled by integers. + + Parameters + ---------- + label_image : ndarray + Image features noted by integers. 
+ """ + block_iter = zip( + np.ndindex(*label_image.numblocks), + map(functools.partial(operator.getitem, label_image), + da.core.slices_from_chunks(label_image.chunks)) + ) + arrays = [] + for block_id, block in block_iter: + array_location = _array_chunk_location(block_id, label_image.chunks) + arrays.append(_find_bounding_boxes(block, array_location)) + bag = db.from_sequence(arrays) + result = bag.reduction(_merge_bounding_boxes, _merge_bounding_boxes, split_every=2) + return result + + def histogram(image, min, max, diff --git a/dask_image/ndmeasure/_utils/_find_objects.py b/dask_image/ndmeasure/_utils/_find_objects.py new file mode 100644 index 00000000..6e6d1a8e --- /dev/null +++ b/dask_image/ndmeasure/_utils/_find_objects.py @@ -0,0 +1,63 @@ +import numpy as np +import pandas as pd +from dask.delayed import delayed +import dask.dataframe as dd + + +def _array_chunk_location(block_id, chunks): + """Pixel coordinate of top left corner of the array chunk.""" + array_location = [] + for idx, chunk in zip(block_id, chunks): + array_location.append(sum(chunk[:idx])) + return tuple(array_location) + + +@delayed +def _find_bounding_boxes(x, array_location): + """An alternative to scipy.ndi.find_objects""" + unique_vals = np.unique(x) + unique_vals = unique_vals[unique_vals != 0] + result = {} + for val in unique_vals: + positions = np.where(x == val) + slices = tuple(slice(np.min(pos) + array_location[i], np.max(pos) + 1 + array_location[i], 1) for i, pos in enumerate(positions)) + result[val] = slices + return pd.DataFrame.from_dict(result, orient='index') + + +def isnan(value): + try: + if np.isnan(value): + return True + except Exception: + if value is np.nan: + return True + else: + return False + + +def _combine_series(a, b): + if isnan(a): + return b + elif isnan(b): + return a + else: + start = min(a.start, b.start) + stop = max(a.stop, b.stop) + return slice(start, stop, 1) + + +def _combine_dataframes(s1, s2): + combined = s1.combine(s2, _combine_series) + 
return combined + + +def _merge_bounding_boxes(iterable): + iterable = list(iterable) + if len(iterable) == 1: + df1 = iterable[0] + return df1 + else: + df1, df2 = iterable + result = df1.combine(df2, _combine_dataframes) + return result diff --git a/tests/test_dask_image/test_ndmeasure/test_find_objects.py b/tests/test_dask_image/test_ndmeasure/test_find_objects.py new file mode 100644 index 00000000..1a088807 --- /dev/null +++ b/tests/test_dask_image/test_ndmeasure/test_find_objects.py @@ -0,0 +1,36 @@ +import dask.array as da +import numpy as np +import pandas as pd +import pytest + +import dask_image.ndmeasure + + +@pytest.fixture +def label_image(): + """Return small label image for tests. + + dask.array + + array([[ 0, 0, 0, 0, 0, 0, 0, 333, 333, 333], + [111, 111, 0, 0, 0, 0, 0, 333, 333, 333], + [111, 111, 0, 0, 0, 0, 0, 0, 0, 0], + [ 0, 0, 0, 222, 222, 222, 222, 222, 222, 0], + [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) + + """ + label_image = np.zeros((5, 10)).astype(int) + label_image[1:3,0:2] = 111 + label_image[3,3:-2] = 222 + label_image[0:2,-3:] = 333 + label_image = da.from_array(label_image, chunks=(5, 5)) + return label_image + + +def test_bounding_boxes(label_image): + result = dask_image.ndmeasure.find_objects(label_image).compute().compute() + expected = pd.DataFrame.from_dict( + {0: {111: slice(1, 3, 1), 222: slice(3, 4, 1), 333: slice(0, 2, 1)}, + 1: {111: slice(0, 2, 1), 222: slice(3, 8, 1), 333: slice(7, 10, 1)}} + ) + assert result.equals(expected) From 195edb5e123104a468d5dfca37c4e00d68797aa0 Mon Sep 17 00:00:00 2001 From: GenevieveBuckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Thu, 22 Jul 2021 17:52:03 +1000 Subject: [PATCH 02/22] Dask docs suggest bag.fold is more efficient than bag.reduction --- dask_image/ndmeasure/__init__.py | 3 ++- dask_image/ndmeasure/_utils/_find_objects.py | 7 ++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/dask_image/ndmeasure/__init__.py b/dask_image/ndmeasure/__init__.py 
index e8d39c88..d83fa66a 100644 --- a/dask_image/ndmeasure/__init__.py +++ b/dask_image/ndmeasure/__init__.py @@ -222,7 +222,8 @@ def find_objects(label_image): array_location = _array_chunk_location(block_id, label_image.chunks) arrays.append(_find_bounding_boxes(block, array_location)) bag = db.from_sequence(arrays) - result = bag.reduction(_merge_bounding_boxes, _merge_bounding_boxes, split_every=2) + meta = dd.utils.make_meta([('x', np.int64), ('y', np.int64)]) + result = bag.fold(_merge_bounding_boxes, split_every=2) return result diff --git a/dask_image/ndmeasure/_utils/_find_objects.py b/dask_image/ndmeasure/_utils/_find_objects.py index 6e6d1a8e..614203a9 100644 --- a/dask_image/ndmeasure/_utils/_find_objects.py +++ b/dask_image/ndmeasure/_utils/_find_objects.py @@ -52,12 +52,9 @@ def _combine_dataframes(s1, s2): return combined -def _merge_bounding_boxes(iterable): - iterable = list(iterable) - if len(iterable) == 1: - df1 = iterable[0] +def _merge_bounding_boxes(df1, df2=None): + if df2 is None: return df1 else: - df1, df2 = iterable result = df1.combine(df2, _combine_dataframes) return result From ab90271ba34d6fa20f2980514e38e839c4fa3e42 Mon Sep 17 00:00:00 2001 From: GenevieveBuckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Fri, 23 Jul 2021 17:12:38 +1000 Subject: [PATCH 03/22] Merge dataframes instead of using combine --- dask_image/ndmeasure/__init__.py | 5 +-- dask_image/ndmeasure/_utils/_find_objects.py | 47 +++++++++----------- 2 files changed, 22 insertions(+), 30 deletions(-) diff --git a/dask_image/ndmeasure/__init__.py b/dask_image/ndmeasure/__init__.py index d83fa66a..6ea40d40 100644 --- a/dask_image/ndmeasure/__init__.py +++ b/dask_image/ndmeasure/__init__.py @@ -11,7 +11,7 @@ from . 
import _utils from ._utils import _label -from ._utils._find_objects import _array_chunk_location, _find_bounding_boxes, _merge_bounding_boxes +from ._utils._find_objects import _array_chunk_location, _find_bounding_boxes, _find_objects __all__ = [ "area", @@ -222,8 +222,7 @@ def find_objects(label_image): array_location = _array_chunk_location(block_id, label_image.chunks) arrays.append(_find_bounding_boxes(block, array_location)) bag = db.from_sequence(arrays) - meta = dd.utils.make_meta([('x', np.int64), ('y', np.int64)]) - result = bag.fold(_merge_bounding_boxes, split_every=2) + result = bag.fold(_find_objects, split_every=2) return result diff --git a/dask_image/ndmeasure/_utils/_find_objects.py b/dask_image/ndmeasure/_utils/_find_objects.py index 614203a9..8387b455 100644 --- a/dask_image/ndmeasure/_utils/_find_objects.py +++ b/dask_image/ndmeasure/_utils/_find_objects.py @@ -25,36 +25,29 @@ def _find_bounding_boxes(x, array_location): return pd.DataFrame.from_dict(result, orient='index') -def isnan(value): - try: - if np.isnan(value): - return True - except Exception: - if value is np.nan: - return True +def _combine_slices(slices): + "Return the union of all slices." 
+ if len(slices) == 1: + return slices[0] else: - return False - - -def _combine_series(a, b): - if isnan(a): - return b - elif isnan(b): - return a - else: - start = min(a.start, b.start) - stop = max(a.stop, b.stop) + start = min([sl.start for sl in slices]) + stop = max([sl.stop for sl in slices]) return slice(start, stop, 1) -def _combine_dataframes(s1, s2): - combined = s1.combine(s2, _combine_series) - return combined +def _merge_bounding_boxes(x, ndim): + x = x.dropna() + data = {} + for i in range(ndim): + slices = [x[ii] for ii in x.index if str(ii).startswith(str(i))] + combined_slices = _combine_slices(slices) + data[i] = combined_slices + result = pd.Series(data=data, index=[i for i in range(ndim)], name=x.name) + return result -def _merge_bounding_boxes(df1, df2=None): - if df2 is None: - return df1 - else: - result = df1.combine(df2, _combine_dataframes) - return result +def _find_objects(df1, df2, ndim=2): + ddf = dd.merge(df1, df2, how="outer", left_index=True, right_index=True) + meta = dd.utils.make_meta([(i, object) for i in range(ndim)]) + result = ddf.apply(_merge_bounding_boxes, ndim=ndim, axis=1, meta=meta) + return result From 64d2d87fcfc739c6a84597ccbf541de3bd34a9a2 Mon Sep 17 00:00:00 2001 From: GenevieveBuckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Fri, 23 Jul 2021 20:22:06 +1000 Subject: [PATCH 04/22] Make find_objects output obviously a dask dataframe --- dask_image/ndmeasure/__init__.py | 5 ++++- dask_image/ndmeasure/_utils/_find_objects.py | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/dask_image/ndmeasure/__init__.py b/dask_image/ndmeasure/__init__.py index 6ea40d40..f215a41b 100644 --- a/dask_image/ndmeasure/__init__.py +++ b/dask_image/ndmeasure/__init__.py @@ -7,6 +7,7 @@ import dask.array as da import dask.bag as db +import dask.dataframe as dd import numpy as np from . 
import _utils @@ -222,7 +223,9 @@ def find_objects(label_image): array_location = _array_chunk_location(block_id, label_image.chunks) arrays.append(_find_bounding_boxes(block, array_location)) bag = db.from_sequence(arrays) - result = bag.fold(_find_objects, split_every=2) + result = bag.fold(_find_objects, split_every=2).to_delayed() + meta = dd.utils.make_meta([(i, object) for i in range(label_image.ndim)]) + result = dd.from_delayed(result, meta=meta, prefix="find-objects-", verify_meta=False) return result diff --git a/dask_image/ndmeasure/_utils/_find_objects.py b/dask_image/ndmeasure/_utils/_find_objects.py index 8387b455..2d285b10 100644 --- a/dask_image/ndmeasure/_utils/_find_objects.py +++ b/dask_image/ndmeasure/_utils/_find_objects.py @@ -1,6 +1,6 @@ import numpy as np import pandas as pd -from dask.delayed import delayed +from dask.delayed import delayed, Delayed import dask.dataframe as dd @@ -47,7 +47,11 @@ def _merge_bounding_boxes(x, ndim): def _find_objects(df1, df2, ndim=2): - ddf = dd.merge(df1, df2, how="outer", left_index=True, right_index=True) meta = dd.utils.make_meta([(i, object) for i in range(ndim)]) + if isinstance(df1, Delayed): + df1 = dd.from_delayed(df1, meta=meta) + if isinstance(df2, Delayed): + df2 = dd.from_delayed(df2, meta=meta) + ddf = dd.merge(df1, df2, how="outer", left_index=True, right_index=True) result = ddf.apply(_merge_bounding_boxes, ndim=ndim, axis=1, meta=meta) return result From 7cf8d7a25ee19e24e0af3e8b7d6c632a3dc6fe03 Mon Sep 17 00:00:00 2001 From: GenevieveBuckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Thu, 29 Jul 2021 15:11:06 +1000 Subject: [PATCH 05/22] A clearer way to use dask delayed --- dask_image/ndmeasure/__init__.py | 4 +++- dask_image/ndmeasure/_utils/_find_objects.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dask_image/ndmeasure/__init__.py b/dask_image/ndmeasure/__init__.py index f215a41b..f202fa09 100644 --- a/dask_image/ndmeasure/__init__.py +++ 
b/dask_image/ndmeasure/__init__.py @@ -4,10 +4,12 @@ import functools import operator import warnings +from dask import delayed import dask.array as da import dask.bag as db import dask.dataframe as dd +from dask.delayed import delayed import numpy as np from . import _utils @@ -221,7 +223,7 @@ def find_objects(label_image): arrays = [] for block_id, block in block_iter: array_location = _array_chunk_location(block_id, label_image.chunks) - arrays.append(_find_bounding_boxes(block, array_location)) + arrays.append(delayed(_find_bounding_boxes)(block, array_location)) bag = db.from_sequence(arrays) result = bag.fold(_find_objects, split_every=2).to_delayed() meta = dd.utils.make_meta([(i, object) for i in range(label_image.ndim)]) diff --git a/dask_image/ndmeasure/_utils/_find_objects.py b/dask_image/ndmeasure/_utils/_find_objects.py index 2d285b10..bde0981d 100644 --- a/dask_image/ndmeasure/_utils/_find_objects.py +++ b/dask_image/ndmeasure/_utils/_find_objects.py @@ -1,6 +1,6 @@ import numpy as np import pandas as pd -from dask.delayed import delayed, Delayed +from dask.delayed import Delayed import dask.dataframe as dd @@ -12,7 +12,6 @@ def _array_chunk_location(block_id, chunks): return tuple(array_location) -@delayed def _find_bounding_boxes(x, array_location): """An alternative to scipy.ndi.find_objects""" unique_vals = np.unique(x) From 4ceeec3e4e851600326a8ad5d67d75f33599ce17 Mon Sep 17 00:00:00 2001 From: GenevieveBuckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Thu, 29 Jul 2021 15:19:12 +1000 Subject: [PATCH 06/22] Try to clarify dataframe column naming convention --- dask_image/ndmeasure/_utils/_find_objects.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dask_image/ndmeasure/_utils/_find_objects.py b/dask_image/ndmeasure/_utils/_find_objects.py index bde0981d..03fab899 100644 --- a/dask_image/ndmeasure/_utils/_find_objects.py +++ b/dask_image/ndmeasure/_utils/_find_objects.py @@ -38,6 +38,8 @@ def _merge_bounding_boxes(x, ndim): 
x = x.dropna() data = {} for i in range(ndim): + # Array dimensions are labelled by a number followed by an underscore + # i.e. column labels are: 0_x, 1_x, 2_x, ... 0_y, 1_y, 2_y, ... slices = [x[ii] for ii in x.index if str(ii).startswith(str(i))] combined_slices = _combine_slices(slices) data[i] = combined_slices From c301e2a0cc60550d39489267911f528fc5655cf2 Mon Sep 17 00:00:00 2001 From: GenevieveBuckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Thu, 29 Jul 2021 15:32:07 +1000 Subject: [PATCH 07/22] Additional clarifying comment --- dask_image/ndmeasure/_utils/_find_objects.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dask_image/ndmeasure/_utils/_find_objects.py b/dask_image/ndmeasure/_utils/_find_objects.py index 03fab899..02489679 100644 --- a/dask_image/ndmeasure/_utils/_find_objects.py +++ b/dask_image/ndmeasure/_utils/_find_objects.py @@ -37,6 +37,9 @@ def _combine_slices(slices): def _merge_bounding_boxes(x, ndim): x = x.dropna() data = {} + # For each dimension in the array, + # go through every integer label and pick out the values belonging to that dimension + # and combine those slices (find the union; the slice expanded to all input slices). for i in range(ndim): # Array dimensions are labelled by a number followed by an underscore # i.e. column labels are: 0_x, 1_x, 2_x, ... 0_y, 1_y, 2_y, ...
From dcc78ee3e040eea87836701bded4f33d192287cb Mon Sep 17 00:00:00 2001 From: GenevieveBuckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Thu, 29 Jul 2021 15:39:29 +1000 Subject: [PATCH 08/22] Default step size for slices is None --- dask_image/ndmeasure/_utils/_find_objects.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_image/ndmeasure/_utils/_find_objects.py b/dask_image/ndmeasure/_utils/_find_objects.py index 02489679..0dd09425 100644 --- a/dask_image/ndmeasure/_utils/_find_objects.py +++ b/dask_image/ndmeasure/_utils/_find_objects.py @@ -31,7 +31,7 @@ def _combine_slices(slices): else: start = min([sl.start for sl in slices]) stop = max([sl.stop for sl in slices]) - return slice(start, stop, 1) + return slice(start, stop) def _merge_bounding_boxes(x, ndim): From 2bdb70adb3b9c5efeed53459ed3e294d2c875102 Mon Sep 17 00:00:00 2001 From: GenevieveBuckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Thu, 29 Jul 2021 16:29:44 +1000 Subject: [PATCH 09/22] Use Marvin's suggestion so we don't have to call compute twice on result --- dask_image/ndmeasure/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dask_image/ndmeasure/__init__.py b/dask_image/ndmeasure/__init__.py index f202fa09..3f0f0101 100644 --- a/dask_image/ndmeasure/__init__.py +++ b/dask_image/ndmeasure/__init__.py @@ -10,6 +10,7 @@ import dask.bag as db import dask.dataframe as dd from dask.delayed import delayed +from dask import compute import numpy as np from . 
import _utils @@ -224,7 +225,7 @@ def find_objects(label_image): for block_id, block in block_iter: array_location = _array_chunk_location(block_id, label_image.chunks) arrays.append(delayed(_find_bounding_boxes)(block, array_location)) - bag = db.from_sequence(arrays) + bag = db.from_delayed([delayed(compute)(array) for array in arrays]) result = bag.fold(_find_objects, split_every=2).to_delayed() meta = dd.utils.make_meta([(i, object) for i in range(label_image.ndim)]) result = dd.from_delayed(result, meta=meta, prefix="find-objects-", verify_meta=False) From ce3e17fa7884d108eb6263d184b0283b2637d5c0 Mon Sep 17 00:00:00 2001 From: GenevieveBuckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Thu, 29 Jul 2021 16:38:44 +1000 Subject: [PATCH 10/22] Revert delayed changes --- dask_image/ndmeasure/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dask_image/ndmeasure/__init__.py b/dask_image/ndmeasure/__init__.py index 3f0f0101..f202fa09 100644 --- a/dask_image/ndmeasure/__init__.py +++ b/dask_image/ndmeasure/__init__.py @@ -10,7 +10,6 @@ import dask.bag as db import dask.dataframe as dd from dask.delayed import delayed -from dask import compute import numpy as np from . 
import _utils @@ -225,7 +224,7 @@ def find_objects(label_image): for block_id, block in block_iter: array_location = _array_chunk_location(block_id, label_image.chunks) arrays.append(delayed(_find_bounding_boxes)(block, array_location)) - bag = db.from_delayed([delayed(compute)(array) for array in arrays]) + bag = db.from_sequence(arrays) result = bag.fold(_find_objects, split_every=2).to_delayed() meta = dd.utils.make_meta([(i, object) for i in range(label_image.ndim)]) result = dd.from_delayed(result, meta=meta, prefix="find-objects-", verify_meta=False) From f120067b5c04b7f5899f0971743dfc91d6c0009e Mon Sep 17 00:00:00 2001 From: GenevieveBuckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Thu, 29 Jul 2021 16:39:38 +1000 Subject: [PATCH 11/22] Be consistent with slice step (use default value) --- dask_image/ndmeasure/_utils/_find_objects.py | 2 +- tests/test_dask_image/test_ndmeasure/test_find_objects.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dask_image/ndmeasure/_utils/_find_objects.py b/dask_image/ndmeasure/_utils/_find_objects.py index 0dd09425..45a1bc31 100644 --- a/dask_image/ndmeasure/_utils/_find_objects.py +++ b/dask_image/ndmeasure/_utils/_find_objects.py @@ -19,7 +19,7 @@ def _find_bounding_boxes(x, array_location): result = {} for val in unique_vals: positions = np.where(x == val) - slices = tuple(slice(np.min(pos) + array_location[i], np.max(pos) + 1 + array_location[i], 1) for i, pos in enumerate(positions)) + slices = tuple(slice(np.min(pos) + array_location[i], np.max(pos) + 1 + array_location[i]) for i, pos in enumerate(positions)) result[val] = slices return pd.DataFrame.from_dict(result, orient='index') diff --git a/tests/test_dask_image/test_ndmeasure/test_find_objects.py b/tests/test_dask_image/test_ndmeasure/test_find_objects.py index 1a088807..4a79e19d 100644 --- a/tests/test_dask_image/test_ndmeasure/test_find_objects.py +++ b/tests/test_dask_image/test_ndmeasure/test_find_objects.py @@ -30,7 
+30,7 @@ def label_image(): def test_bounding_boxes(label_image): result = dask_image.ndmeasure.find_objects(label_image).compute().compute() expected = pd.DataFrame.from_dict( - {0: {111: slice(1, 3, 1), 222: slice(3, 4, 1), 333: slice(0, 2, 1)}, - 1: {111: slice(0, 2, 1), 222: slice(3, 8, 1), 333: slice(7, 10, 1)}} + {0: {111: slice(1, 3), 222: slice(3, 4), 333: slice(0, 2)}, + 1: {111: slice(0, 2), 222: slice(3, 8), 333: slice(7, 10)}} ) assert result.equals(expected) From 299a9ebc760efebed3c8f116e8f37d69703b9ae8 Mon Sep 17 00:00:00 2001 From: GenevieveBuckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Thu, 29 Jul 2021 16:40:49 +1000 Subject: [PATCH 12/22] Remove redundant import statement --- dask_image/ndmeasure/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dask_image/ndmeasure/__init__.py b/dask_image/ndmeasure/__init__.py index f202fa09..654414af 100644 --- a/dask_image/ndmeasure/__init__.py +++ b/dask_image/ndmeasure/__init__.py @@ -9,7 +9,6 @@ import dask.array as da import dask.bag as db import dask.dataframe as dd -from dask.delayed import delayed import numpy as np from . 
import _utils From 02a86be43f898f04aae47071f49d46340cea2e39 Mon Sep 17 00:00:00 2001 From: GenevieveBuckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Thu, 29 Jul 2021 16:45:04 +1000 Subject: [PATCH 13/22] Avoid user having to call compute twice on result --- dask_image/ndmeasure/__init__.py | 3 ++- tests/test_dask_image/test_ndmeasure/test_find_objects.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dask_image/ndmeasure/__init__.py b/dask_image/ndmeasure/__init__.py index 654414af..c169ffca 100644 --- a/dask_image/ndmeasure/__init__.py +++ b/dask_image/ndmeasure/__init__.py @@ -4,7 +4,7 @@ import functools import operator import warnings -from dask import delayed +from dask import compute, delayed import dask.array as da import dask.bag as db @@ -227,6 +227,7 @@ def find_objects(label_image): result = bag.fold(_find_objects, split_every=2).to_delayed() meta = dd.utils.make_meta([(i, object) for i in range(label_image.ndim)]) result = dd.from_delayed(result, meta=meta, prefix="find-objects-", verify_meta=False) + result = delayed(compute)(result)[0] # avoid the user having to call compute twice on result return result diff --git a/tests/test_dask_image/test_ndmeasure/test_find_objects.py b/tests/test_dask_image/test_ndmeasure/test_find_objects.py index 4a79e19d..0bc90d1b 100644 --- a/tests/test_dask_image/test_ndmeasure/test_find_objects.py +++ b/tests/test_dask_image/test_ndmeasure/test_find_objects.py @@ -28,7 +28,7 @@ def label_image(): def test_bounding_boxes(label_image): - result = dask_image.ndmeasure.find_objects(label_image).compute().compute() + result = dask_image.ndmeasure.find_objects(label_image).compute() expected = pd.DataFrame.from_dict( {0: {111: slice(1, 3), 222: slice(3, 4), 333: slice(0, 2)}, 1: {111: slice(0, 2), 222: slice(3, 8), 333: slice(7, 10)}} From a7479f00452793f1922f27e899bec91b1cfa810f Mon Sep 17 00:00:00 2001 From: GenevieveBuckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: 
Thu, 29 Jul 2021 16:52:48 +1000 Subject: [PATCH 14/22] Fix delayed so we know output is a dask dataframe --- dask_image/ndmeasure/__init__.py | 2 +- tests/test_dask_image/test_ndmeasure/test_find_objects.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/dask_image/ndmeasure/__init__.py b/dask_image/ndmeasure/__init__.py index c169ffca..c69e0e54 100644 --- a/dask_image/ndmeasure/__init__.py +++ b/dask_image/ndmeasure/__init__.py @@ -226,8 +226,8 @@ def find_objects(label_image): bag = db.from_sequence(arrays) result = bag.fold(_find_objects, split_every=2).to_delayed() meta = dd.utils.make_meta([(i, object) for i in range(label_image.ndim)]) - result = dd.from_delayed(result, meta=meta, prefix="find-objects-", verify_meta=False) result = delayed(compute)(result)[0] # avoid the user having to call compute twice on result + result = dd.from_delayed(result, meta=meta, prefix="find-objects-", verify_meta=False) return result diff --git a/tests/test_dask_image/test_ndmeasure/test_find_objects.py b/tests/test_dask_image/test_ndmeasure/test_find_objects.py index 0bc90d1b..578e69b7 100644 --- a/tests/test_dask_image/test_ndmeasure/test_find_objects.py +++ b/tests/test_dask_image/test_ndmeasure/test_find_objects.py @@ -1,4 +1,5 @@ import dask.array as da +import dask.dataframe as dd import numpy as np import pandas as pd import pytest @@ -28,9 +29,12 @@ def label_image(): def test_bounding_boxes(label_image): - result = dask_image.ndmeasure.find_objects(label_image).compute() + result = dask_image.ndmeasure.find_objects(label_image) + assert isinstance(result, dd.DataFrame) + computed_result = result.compute() + assert isinstance(computed_result, pd.DataFrame) expected = pd.DataFrame.from_dict( {0: {111: slice(1, 3), 222: slice(3, 4), 333: slice(0, 2)}, 1: {111: slice(0, 2), 222: slice(3, 8), 333: slice(7, 10)}} ) - assert result.equals(expected) + assert computed_result.equals(expected) From 9f812fc2748c9b579dc086ad213ff6e6e0a649b4 Mon Sep 17 
00:00:00 2001 From: GenevieveBuckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Thu, 29 Jul 2021 18:30:34 +1000 Subject: [PATCH 15/22] Use functools partial to pass in array dimension information to _find_objects --- dask_image/ndmeasure/__init__.py | 2 +- dask_image/ndmeasure/_utils/_find_objects.py | 2 +- .../test_ndmeasure/test_find_objects.py | 17 ++++++++++++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/dask_image/ndmeasure/__init__.py b/dask_image/ndmeasure/__init__.py index c69e0e54..cf79fa97 100644 --- a/dask_image/ndmeasure/__init__.py +++ b/dask_image/ndmeasure/__init__.py @@ -224,7 +224,7 @@ def find_objects(label_image): array_location = _array_chunk_location(block_id, label_image.chunks) arrays.append(delayed(_find_bounding_boxes)(block, array_location)) bag = db.from_sequence(arrays) - result = bag.fold(_find_objects, split_every=2).to_delayed() + result = bag.fold(functools.partial(_find_objects, label_image.ndim), split_every=2).to_delayed() meta = dd.utils.make_meta([(i, object) for i in range(label_image.ndim)]) result = delayed(compute)(result)[0] # avoid the user having to call compute twice on result result = dd.from_delayed(result, meta=meta, prefix="find-objects-", verify_meta=False) diff --git a/dask_image/ndmeasure/_utils/_find_objects.py b/dask_image/ndmeasure/_utils/_find_objects.py index 45a1bc31..88a16f45 100644 --- a/dask_image/ndmeasure/_utils/_find_objects.py +++ b/dask_image/ndmeasure/_utils/_find_objects.py @@ -50,7 +50,7 @@ def _merge_bounding_boxes(x, ndim): return result -def _find_objects(df1, df2, ndim=2): +def _find_objects(ndim, df1, df2): meta = dd.utils.make_meta([(i, object) for i in range(ndim)]) if isinstance(df1, Delayed): df1 = dd.from_delayed(df1, meta=meta) diff --git a/tests/test_dask_image/test_ndmeasure/test_find_objects.py b/tests/test_dask_image/test_ndmeasure/test_find_objects.py index 578e69b7..58824c30 100644 --- 
a/tests/test_dask_image/test_ndmeasure/test_find_objects.py +++ b/tests/test_dask_image/test_ndmeasure/test_find_objects.py @@ -1,3 +1,4 @@ +from dask_image.ndmeasure._utils import _labeled_comprehension_delayed import dask.array as da import dask.dataframe as dd import numpy as np @@ -28,7 +29,7 @@ def label_image(): return label_image -def test_bounding_boxes(label_image): +def test_find_objects(label_image): result = dask_image.ndmeasure.find_objects(label_image) assert isinstance(result, dd.DataFrame) computed_result = result.compute() @@ -38,3 +39,17 @@ def test_bounding_boxes(label_image): 1: {111: slice(0, 2), 222: slice(3, 8), 333: slice(7, 10)}} ) assert computed_result.equals(expected) + + +def test_3d_find_objects(label_image): + label_image = da.stack([label_image, label_image], axis=2) + result = dask_image.ndmeasure.find_objects(label_image) + assert isinstance(result, dd.DataFrame) + computed_result = result.compute() + assert isinstance(computed_result, pd.DataFrame) + expected = pd.DataFrame.from_dict( + {0: {111: slice(1, 3), 222: slice(3, 4), 333: slice(0, 2)}, + 1: {111: slice(0, 2), 222: slice(3, 8), 333: slice(7, 10)}, + 2: {111: slice(0, 2), 222: slice(0, 2), 333: slice(0, 2)}} + ) + assert computed_result.equals(expected) From ec20255df999d1f3354fbe7a425ecb0056c1510a Mon Sep 17 00:00:00 2001 From: GenevieveBuckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Thu, 29 Jul 2021 18:55:54 +1000 Subject: [PATCH 16/22] Improve docstrings in _find_objects.py --- dask_image/ndmeasure/_utils/_find_objects.py | 72 ++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 dask_image/ndmeasure/_utils/_find_objects.py diff --git a/dask_image/ndmeasure/_utils/_find_objects.py b/dask_image/ndmeasure/_utils/_find_objects.py new file mode 100644 index 00000000..81c0e7dc --- /dev/null +++ b/dask_image/ndmeasure/_utils/_find_objects.py @@ -0,0 +1,72 @@ +import numpy as np +import pandas as pd +from dask.delayed import Delayed 
+import dask.dataframe as dd + + +def _array_chunk_location(block_id, chunks): + """Pixel coordinate of top left corner of the array chunk.""" + array_location = [] + for idx, chunk in zip(block_id, chunks): + array_location.append(sum(chunk[:idx])) + return tuple(array_location) + + +def _find_bounding_boxes(x, array_location): + """An alternative to scipy.ndimage.find_objects. + + We use this alternative because scipy.ndimage.find_objects + returns a tuple of length N, where N is the largest integer label. + This is not ideal for distributed labels, where there might be only + one or two objects in an image chunk labelled with very large integers. + + This alternative function returns a pandas dataframe, + with one row per object found in the image chunk. + """ + unique_vals = np.unique(x) + unique_vals = unique_vals[unique_vals != 0] + result = {} + for val in unique_vals: + positions = np.where(x == val) + slices = tuple(slice(np.min(pos) + array_location[i], np.max(pos) + 1 + array_location[i]) for i, pos in enumerate(positions)) + result[val] = slices + return pd.DataFrame.from_dict(result, orient='index') + + +def _combine_slices(slices): + "Return the union of all slices." + if len(slices) == 1: + return slices[0] + else: + start = min([sl.start for sl in slices]) + stop = max([sl.stop for sl in slices]) + return slice(start, stop) + + +def _merge_bounding_boxes(x, ndim): + """Merge the bounding boxes describing objects over multiple image chunks.""" + x = x.dropna() + data = {} + # For each dimension in the array, + # pick out the slice values belonging to that dimension + # and combine those slices (find the union; the slice expanded to all input slices). + for i in range(ndim): + # Array dimensions are labelled by a number followed by an underscroe + # i.e. column labels are: 0_x, 1_x, 2_x, ... 0_y, 1_y, 2_y, ... 
+ slices = [x[ii] for ii in x.index if str(ii).startswith(str(i))] + combined_slices = _combine_slices(slices) + data[i] = combined_slices + result = pd.Series(data=data, index=[i for i in range(ndim)], name=x.name) + return result + + +def _find_objects(ndim, df1, df2): + """Main utility function for find_objects.""" + meta = dd.utils.make_meta([(i, object) for i in range(ndim)]) + if isinstance(df1, Delayed): + df1 = dd.from_delayed(df1, meta=meta) + if isinstance(df2, Delayed): + df2 = dd.from_delayed(df2, meta=meta) + ddf = dd.merge(df1, df2, how="outer", left_index=True, right_index=True) + result = ddf.apply(_merge_bounding_boxes, ndim=ndim, axis=1, meta=meta) + return result From 07ccbd8157b0dc63881c74ac6314d03d06479052 Mon Sep 17 00:00:00 2001 From: GenevieveBuckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Thu, 29 Jul 2021 20:06:36 +1000 Subject: [PATCH 17/22] Add check for integer array dtype in find_objects --- dask_image/ndmeasure/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dask_image/ndmeasure/__init__.py b/dask_image/ndmeasure/__init__.py index cf79fa97..4943ad2b 100644 --- a/dask_image/ndmeasure/__init__.py +++ b/dask_image/ndmeasure/__init__.py @@ -214,20 +214,26 @@ def find_objects(label_image): label_image : ndarray Image features noted by integers. 
""" + if label_image.dtype.char not in np.typecodes['AllInteger']: + raise ValueError("find_objects only accepts integer dtype arrays") + block_iter = zip( np.ndindex(*label_image.numblocks), map(functools.partial(operator.getitem, label_image), da.core.slices_from_chunks(label_image.chunks)) ) + arrays = [] for block_id, block in block_iter: array_location = _array_chunk_location(block_id, label_image.chunks) arrays.append(delayed(_find_bounding_boxes)(block, array_location)) + bag = db.from_sequence(arrays) result = bag.fold(functools.partial(_find_objects, label_image.ndim), split_every=2).to_delayed() meta = dd.utils.make_meta([(i, object) for i in range(label_image.ndim)]) result = delayed(compute)(result)[0] # avoid the user having to call compute twice on result result = dd.from_delayed(result, meta=meta, prefix="find-objects-", verify_meta=False) + return result From cc10a937ad78da42e34ee551b5f666b1e894a2d8 Mon Sep 17 00:00:00 2001 From: Genevieve Buckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Thu, 16 Dec 2021 15:13:47 +1100 Subject: [PATCH 18/22] Re-trigger CI From eba33eed47df734bd33dc2accf1b68bff92e71df Mon Sep 17 00:00:00 2001 From: Genevieve Buckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Fri, 17 Dec 2021 17:39:08 +1100 Subject: [PATCH 19/22] Improve find_objects docstring re return value --- dask_image/ndmeasure/__init__.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/dask_image/ndmeasure/__init__.py b/dask_image/ndmeasure/__init__.py index 4943ad2b..505603de 100644 --- a/dask_image/ndmeasure/__init__.py +++ b/dask_image/ndmeasure/__init__.py @@ -213,6 +213,18 @@ def find_objects(label_image): ---------- label_image : ndarray Image features noted by integers. + + Returns + ------- + Dask dataframe + Each row respresents an indivdual integrer label. Columns contain the + slice information for the object boundaries in each dimension + (dimensions are named: 0, 1, ..., nd). 
+ + Notes + ----- + You must have the optional dependency ``dask[dataframe]`` installed + to use the ``find_objects`` function. """ if label_image.dtype.char not in np.typecodes['AllInteger']: raise ValueError("find_objects only accepts integer dtype arrays") From 3205108dc19304e0e794f8079efba6f815e6aa5c Mon Sep 17 00:00:00 2001 From: Genevieve Buckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Fri, 17 Dec 2021 17:44:29 +1100 Subject: [PATCH 20/22] find_objects _merge_bounding_boxes, clarify comment --- dask_image/ndmeasure/_utils/_find_objects.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dask_image/ndmeasure/_utils/_find_objects.py b/dask_image/ndmeasure/_utils/_find_objects.py index 81c0e7dc..b73315cb 100644 --- a/dask_image/ndmeasure/_utils/_find_objects.py +++ b/dask_image/ndmeasure/_utils/_find_objects.py @@ -49,10 +49,12 @@ def _merge_bounding_boxes(x, ndim): data = {} # For each dimension in the array, # pick out the slice values belonging to that dimension - # and combine those slices (find the union; the slice expanded to all input slices). + # and combine the slices + # (i.e. find the union; the slice expanded to all input slices). for i in range(ndim): # Array dimensions are labelled by a number followed by an underscroe # i.e. column labels are: 0_x, 1_x, 2_x, ... 0_y, 1_y, 2_y, ... 
+ # (x and y represent the pair of chunks label slices are merged from) slices = [x[ii] for ii in x.index if str(ii).startswith(str(i))] combined_slices = _combine_slices(slices) data[i] = combined_slices From d696202a1adc495d0433984bb2f9c5397113c323 Mon Sep 17 00:00:00 2001 From: Genevieve Buckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Fri, 17 Dec 2021 18:33:20 +1100 Subject: [PATCH 21/22] Fix find_objects bug where chunk has no non-zero labels --- dask_image/ndmeasure/_utils/_find_objects.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dask_image/ndmeasure/_utils/_find_objects.py b/dask_image/ndmeasure/_utils/_find_objects.py index b73315cb..e9a24ab9 100644 --- a/dask_image/ndmeasure/_utils/_find_objects.py +++ b/dask_image/ndmeasure/_utils/_find_objects.py @@ -30,7 +30,8 @@ def _find_bounding_boxes(x, array_location): positions = np.where(x == val) slices = tuple(slice(np.min(pos) + array_location[i], np.max(pos) + 1 + array_location[i]) for i, pos in enumerate(positions)) result[val] = slices - return pd.DataFrame.from_dict(result, orient='index') + column_names = [i for i in range(x.ndim)] # column names are: 0, 1, ... 
nD + return pd.DataFrame.from_dict(result, orient='index', columns=column_names) def _combine_slices(slices): From 42b95da527afc36fe9a9bf31f5f078968088d72a Mon Sep 17 00:00:00 2001 From: Genevieve Buckley <30920819+GenevieveBuckley@users.noreply.github.com> Date: Fri, 17 Dec 2021 18:47:56 +1100 Subject: [PATCH 22/22] Add test for find_objects with empty chunk in array --- .../test_ndmeasure/test_find_objects.py | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tests/test_dask_image/test_ndmeasure/test_find_objects.py b/tests/test_dask_image/test_ndmeasure/test_find_objects.py index 58824c30..4f78d598 100644 --- a/tests/test_dask_image/test_ndmeasure/test_find_objects.py +++ b/tests/test_dask_image/test_ndmeasure/test_find_objects.py @@ -14,7 +14,7 @@ def label_image(): dask.array - array([[ 0, 0, 0, 0, 0, 0, 0, 333, 333, 333], + array([[ 0, 0, 0, 0, 0, 0, 0, 333, 333, 333], [111, 111, 0, 0, 0, 0, 0, 333, 333, 333], [111, 111, 0, 0, 0, 0, 0, 0, 0, 0], [ 0, 0, 0, 222, 222, 222, 222, 222, 222, 0], @@ -29,6 +29,26 @@ def label_image(): return label_image +@pytest.fixture +def label_image_with_empty_chunk(): + """Return small label image with an empty chunk for tests. 
+ + dask.array + + array([[ 0, 0, 0, 0, 0, 0], + [111, 111, 0, 0, 0, 0], + [111, 111, 0, 0, 0, 0], + [ 0, 0, 0, 0, 0, 0], + [ 0, 0, 0, 222, 222, 222], + [ 0, 0, 0, 0, 0, 0]]) + """ + label_image = np.zeros((6, 6)).astype(int) + label_image[1:3,0:2] = 111 + label_image[4,3:] = 222 + label_image = da.from_array(label_image, chunks=(3, 3)) + return label_image + + def test_find_objects(label_image): result = dask_image.ndmeasure.find_objects(label_image) assert isinstance(result, dd.DataFrame) @@ -53,3 +73,15 @@ def test_3d_find_objects(label_image): 2: {111: slice(0, 2), 222: slice(0, 2), 333: slice(0, 2)}} ) assert computed_result.equals(expected) + + +def test_find_objects_with_empty_chunks(label_image_with_empty_chunk): + result = dask_image.ndmeasure.find_objects(label_image_with_empty_chunk) + assert isinstance(result, dd.DataFrame) + computed_result = result.compute() + assert isinstance(computed_result, pd.DataFrame) + expected = pd.DataFrame.from_dict( + {0: {111: slice(1, 3, None), 222: slice(4, 5, None)}, + 1: {111: slice(0, 2, None), 222: slice(3, 6, None)}} + ) + assert computed_result.equals(expected)