Improving planning and preprocessing
* replacing multiple calls to np.percentile with a single call.
ancestor-mithril committed Mar 28, 2024
1 parent c7f85b7 commit 2af3c75
Showing 4 changed files with 32 additions and 20 deletions.
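
The change summarized above relies on np.percentile accepting a sequence of percentiles, so one pass over the foreground intensities replaces three separate calls. A minimal standalone sketch of the idea, using synthetic data rather than anything from the repository:

    import numpy as np

    # np.percentile accepts a sequence of q values and returns them all from one call
    rng = np.random.default_rng(0)
    foreground_pixels = rng.normal(size=10_000)   # stand-in for real foreground intensities

    # before: three separate scans of the same array
    p00_5_old = np.percentile(foreground_pixels, 0.5)
    median_old = np.percentile(foreground_pixels, 50.0)
    p99_5_old = np.percentile(foreground_pixels, 99.5)

    # after: a single call returns all three values at once
    p00_5, median, p99_5 = np.percentile(foreground_pixels, (0.5, 50.0, 99.5))

    assert np.allclose([p00_5, median, p99_5], [p00_5_old, median_old, p99_5_old])

Note that np.median(x) and np.percentile(x, 50) agree under the default linear interpolation, which is why the median can be folded into the same call.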
@@ -44,9 +44,7 @@ def collect_foreground_intensities(segmentation: np.ndarray, images: np.ndarray,
"""
images=image with multiple channels = shape (c, x, y(, z))
"""
assert images.ndim == 4
assert segmentation.ndim == 4

assert images.ndim == 4 and segmentation.ndim == 4
assert not np.any(np.isnan(segmentation)), "Segmentation contains NaN values. grrrr.... :-("
assert not np.any(np.isnan(images)), "Images contains NaN values. grrrr.... :-("

@@ -58,6 +56,7 @@ def collect_foreground_intensities(segmentation: np.ndarray, images: np.ndarray,

# segmentation is 4d: 1,x,y,z. We need to remove the empty dimension for the following code to work
foreground_mask = segmentation[0] > 0
percentiles = np.array((0.5, 50.0, 99.5))

for i in range(len(images)):
foreground_pixels = images[i][foreground_mask]
@@ -67,13 +66,21 @@ def collect_foreground_intensities(segmentation: np.ndarray, images: np.ndarray,
# training cases to be underrepresented
intensities_per_channel.append(
rs.choice(foreground_pixels, num_samples, replace=True) if num_fg > 0 else [])

mean, median, mini, maxi, percentile_99_5, percentile_00_5 = np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
if num_fg > 0:
percentile_00_5, median, percentile_99_5 = np.percentile(foreground_pixels, percentiles)
mean = np.mean(foreground_pixels)
mini = np.min(foreground_pixels)
maxi = np.max(foreground_pixels)

intensity_statistics_per_channel.append({
'mean': np.mean(foreground_pixels) if num_fg > 0 else np.nan,
'median': np.median(foreground_pixels) if num_fg > 0 else np.nan,
'min': np.min(foreground_pixels) if num_fg > 0 else np.nan,
'max': np.max(foreground_pixels) if num_fg > 0 else np.nan,
'percentile_99_5': np.percentile(foreground_pixels, 99.5) if num_fg > 0 else np.nan,
'percentile_00_5': np.percentile(foreground_pixels, 0.5) if num_fg > 0 else np.nan,
'mean': mean,
'median': median,
'min': mini,
'max': maxi,
'percentile_99_5': percentile_99_5,
'percentile_00_5': percentile_00_5,

})
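
Condensing the hunk above into a standalone sketch: the statistics default to NaN and are filled in only when the channel actually contains foreground voxels, with a single np.percentile call per channel. The empty array below is a hypothetical edge case, not data from the repository.

    import numpy as np

    percentiles = np.array((0.5, 50.0, 99.5))
    foreground_pixels = np.array([])   # hypothetical channel with no foreground

    mean = median = mini = maxi = percentile_99_5 = percentile_00_5 = np.nan
    if foreground_pixels.size > 0:
        percentile_00_5, median, percentile_99_5 = np.percentile(foreground_pixels, percentiles)
        mean = np.mean(foreground_pixels)
        mini = np.min(foreground_pixels)
        maxi = np.max(foreground_pixels)

    stats = {'mean': mean, 'median': median, 'min': mini, 'max': maxi,
             'percentile_99_5': percentile_99_5, 'percentile_00_5': percentile_00_5}
    print(stats)   # every entry stays NaN for the empty channel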

@@ -157,6 +164,7 @@ def run(self, overwrite_existing: bool = False) -> dict:
spacings = [r[1] for r in results]
foreground_intensities_per_channel = [np.concatenate([r[2][i] for r in results]) for i in
range(len(results[0][2]))]
foreground_intensities_per_channel = np.array(foreground_intensities_per_channel)
# we drop this so that the json file is somewhat human readable
# foreground_intensity_stats_by_case_and_modality = [r[3] for r in results]
median_relative_size_after_cropping = np.median([r[4] for r in results], 0)
@@ -165,15 +173,18 @@ def run(self, overwrite_existing: bool = False) -> dict:
if 'channel_names' in self.dataset_json.keys()
else self.dataset_json['modality'].keys())
intensity_statistics_per_channel = {}
percentiles = np.array((0.5, 50.0, 99.5))
for i in range(num_channels):
percentile_00_5, median, percentile_99_5 = np.percentile(foreground_intensities_per_channel[i],
percentiles)
intensity_statistics_per_channel[i] = {
'mean': float(np.mean(foreground_intensities_per_channel[i])),
'median': float(np.median(foreground_intensities_per_channel[i])),
'median': float(median),
'std': float(np.std(foreground_intensities_per_channel[i])),
'min': float(np.min(foreground_intensities_per_channel[i])),
'max': float(np.max(foreground_intensities_per_channel[i])),
'percentile_99_5': float(np.percentile(foreground_intensities_per_channel[i], 99.5)),
'percentile_00_5': float(np.percentile(foreground_intensities_per_channel[i], 0.5)),
'percentile_99_5': float(percentile_99_5),
'percentile_00_5': float(percentile_00_5),
}

fingerprint = {
@@ -165,10 +165,10 @@ def determine_fullres_target_spacing(self) -> np.ndarray:
if self.overwrite_target_spacing is not None:
return np.array(self.overwrite_target_spacing)

spacings = self.dataset_fingerprint['spacings']
spacings = np.vstack(self.dataset_fingerprint['spacings'])
sizes = self.dataset_fingerprint['shapes_after_crop']

target = np.percentile(np.vstack(spacings), 50, 0)
target = np.percentile(spacings, 50, 0)

# todo sizes_after_resampling = [compute_new_shape(j, i, target) for i, j in zip(spacings, sizes)]

@@ -187,7 +187,7 @@ def determine_fullres_target_spacing(self) -> np.ndarray:
has_aniso_voxels = target_size[worst_spacing_axis] * self.anisotropy_threshold < min(other_sizes)

if has_aniso_spacing and has_aniso_voxels:
spacings_of_that_axis = np.vstack(spacings)[:, worst_spacing_axis]
spacings_of_that_axis = spacings[:, worst_spacing_axis]
target_spacing_of_that_axis = np.percentile(spacings_of_that_axis, 10)
# don't let the spacing of that axis get higher than the other axes
if target_spacing_of_that_axis < max(other_spacings):
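
A hedged sketch of the spacing change: the per-case spacings are stacked into an (n_cases, 3) array once, and both the per-axis median and the later per-axis column lookup reuse that array instead of calling np.vstack twice. The numbers are made up, and the choice of the worst axis is simplified here to the axis with the largest target spacing.

    import numpy as np

    spacings = np.vstack([(3.0, 0.7, 0.7), (2.5, 0.8, 0.8), (3.0, 0.75, 0.75)])

    target = np.percentile(spacings, 50, 0)            # per-axis median spacing
    worst_spacing_axis = int(np.argmax(target))        # simplified anisotropy check
    spacings_of_that_axis = spacings[:, worst_spacing_axis]
    target_spacing_of_that_axis = np.percentile(spacings_of_that_axis, 10)

    print(target, target_spacing_of_that_axis)         # -> [3.   0.75 0.75] 2.6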
@@ -79,7 +79,7 @@ def _keygen(patch_size, strides):
# clip initial patch size to median_shape. It makes little sense to have it be larger than that. Note that
# this is different from how nnU-Net v1 does it!
# todo patch size can still get too large because we pad the patch size to a multiple of 2**n
initial_patch_size = np.array([min(i, j) for i, j in zip(initial_patch_size, median_shape[:len(spacing)])])
initial_patch_size = np.minimum(initial_patch_size, median_shape[:len(spacing)])

# use that to get the network topology. Note that this changes the patch_size depending on the number of
# pooling operations (must be divisible by 2**num_pool in each axis)
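
The one-line change above swaps a Python-level comprehension for an element-wise NumPy call. A small sketch with made-up shapes, just to show the two forms are equivalent:

    import numpy as np

    spacing = (3.0, 0.7, 0.7)
    initial_patch_size = np.array([128, 192, 192])
    median_shape = np.array([90, 320, 320])            # hypothetical median shape after cropping

    old = np.array([min(i, j) for i, j in zip(initial_patch_size, median_shape[:len(spacing)])])
    new = np.minimum(initial_patch_size, median_shape[:len(spacing)])

    assert np.array_equal(old, new)
    print(new)                                         # -> [ 90 192 192]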
9 changes: 5 additions & 4 deletions nnunetv2/training/dataloading/data_loader_3d.py
@@ -29,8 +29,8 @@ def generate_train_batch(self):
# bbox that actually lies within the data. This will result in a smaller array which is then faster to pad.
# valid_bbox is just the coord that lied within the data cube. It will be padded to match the patch size
# later
valid_bbox_lbs = [max(0, bbox_lbs[i]) for i in range(dim)]
valid_bbox_ubs = [min(shape[i], bbox_ubs[i]) for i in range(dim)]
valid_bbox_lbs = np.clip(bbox_lbs, a_min=0, a_max=None)
valid_bbox_ubs = np.minimum(shape, bbox_ubs)

# At this point you might ask yourself why we would treat seg differently from seg_from_previous_stage.
# Why not just concatenate them here and forget about the if statements? Well that's because seg needs to
@@ -43,8 +43,9 @@ def generate_train_batch(self):
seg = seg[this_slice]

padding = [(-min(0, bbox_lbs[i]), max(bbox_ubs[i] - shape[i], 0)) for i in range(dim)]
data_all[j] = np.pad(data, ((0, 0), *padding), 'constant', constant_values=0)
seg_all[j] = np.pad(seg, ((0, 0), *padding), 'constant', constant_values=-1)
padding = ((0, 0), *padding)
data_all[j] = np.pad(data, padding, 'constant', constant_values=0)
seg_all[j] = np.pad(seg, padding, 'constant', constant_values=-1)

return {'data': data_all, 'seg': seg_all, 'properties': case_properties, 'keys': selected_keys}

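
To wrap up, a standalone sketch of the data-loader change: the lower bounds are clamped to zero and the upper bounds to the image shape with vectorized calls, and the ((0, 0), *padding) tuple (the channel axis is never padded) is built once and shared by the data and segmentation pads. Shapes and bounding boxes are invented for the example.

    import numpy as np

    shape = np.array([40, 256, 256])                   # hypothetical spatial shape of one case
    bbox_lbs = np.array([-8, 100, 240])                # requested patch sticks out of the image
    bbox_ubs = bbox_lbs + 56                           # 56**3 patch

    valid_bbox_lbs = np.clip(bbox_lbs, a_min=0, a_max=None)
    valid_bbox_ubs = np.minimum(shape, bbox_ubs)

    dim = len(shape)
    padding = [(-min(0, bbox_lbs[i]), max(bbox_ubs[i] - shape[i], 0)) for i in range(dim)]
    padding = ((0, 0), *padding)                       # built once, reused for data and seg

    data = np.zeros((1, *(valid_bbox_ubs - valid_bbox_lbs)))
    print(np.pad(data, padding, 'constant', constant_values=0).shape)   # -> (1, 56, 56, 56)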
