From 97fcaa1674b172625c25e763dc4b4a44a34e0ca6 Mon Sep 17 00:00:00 2001 From: atomprobe-tc Date: Wed, 6 Dec 2023 18:27:06 +0100 Subject: [PATCH] Started the refactoring to discretize always all point cloud data which are not collected on a square grid that is smaller than the maximum possible extent supported by h5web, tested with use case 207_2081.edaxh5 resulting ROI map is a square likely due to improper handling of HexGrid, next steps: i) fix this bug for 207_2081, ii) replace xmap in ebsd map twod by discretized grid, iii) test with examples from all other tech partners, iv) run against all datasets --- .../readers/em/examples/ebsd_database.py | 3 + .../readers/em/subparsers/hfive_apex.py | 61 ++++++-- .../readers/em/subparsers/hfive_bruker.py | 11 +- .../em/subparsers/hfive_dreamthreed.py | 8 +- .../readers/em/subparsers/hfive_ebsd.py | 25 ++- .../readers/em/subparsers/hfive_edax.py | 21 +-- .../readers/em/subparsers/hfive_oxford.py | 8 + .../readers/em/subparsers/nxs_pyxem.py | 95 ++++++++---- .../readers/em/utils/get_scan_points.py | 80 ++++++++++ .../readers/em/utils/get_sqr_grid.py | 145 ++++++++++++++++++ .../readers/em/utils/hfive_utils.py | 5 + 11 files changed, 391 insertions(+), 71 deletions(-) create mode 100644 pynxtools/dataconverter/readers/em/utils/get_scan_points.py create mode 100644 pynxtools/dataconverter/readers/em/utils/get_sqr_grid.py diff --git a/pynxtools/dataconverter/readers/em/examples/ebsd_database.py b/pynxtools/dataconverter/readers/em/examples/ebsd_database.py index 620284f23..34e79debd 100644 --- a/pynxtools/dataconverter/readers/em/examples/ebsd_database.py +++ b/pynxtools/dataconverter/readers/em/examples/ebsd_database.py @@ -28,6 +28,9 @@ # is recoverable when there is no common agreement about the phases used and their # exact atomic configuration +HEXAGONAL_GRID = "hexagonal_grid" +SQUARE_GRID = "square_grid" + FreeTextToUniquePhase = {"Actinolite": "Actinolite", "al": "Al", diff --git a/pynxtools/dataconverter/readers/em/subparsers/hfive_apex.py b/pynxtools/dataconverter/readers/em/subparsers/hfive_apex.py index 26008e21b..47c339a91 100644 --- a/pynxtools/dataconverter/readers/em/subparsers/hfive_apex.py +++ b/pynxtools/dataconverter/readers/em/subparsers/hfive_apex.py @@ -37,7 +37,9 @@ from pynxtools.dataconverter.readers.em.utils.hfive_utils import \ read_strings_from_dataset from pynxtools.dataconverter.readers.em.examples.ebsd_database import \ - ASSUME_PHASE_NAME_TO_SPACE_GROUP + ASSUME_PHASE_NAME_TO_SPACE_GROUP, HEXAGONAL_GRID, SQUARE_GRID +from pynxtools.dataconverter.readers.em.utils.get_scan_points import \ + get_scan_point_coords class HdfFiveEdaxApexReader(HdfFiveBaseParser): @@ -106,7 +108,6 @@ def parse_and_normalize_group_ebsd_header(self, fp, ckey: str): if f"{self.prfx}/EBSD/ANG/DATA/DATA" not in fp: raise ValueError(f"Unable to parse {self.prfx}/EBSD/ANG/DATA/DATA !") - grid_type = None # for a regular tiling of R^2 with perfect hexagons n_pts = 0 # their vertical center of mass distance is smaller than the horizontal @@ -118,10 +119,14 @@ def parse_and_normalize_group_ebsd_header(self, fp, ckey: str): if f"{self.prfx}/Sample/{req_field}" not in fp: raise ValueError(f"Unable to parse {self.prfx}/Sample/{req_field} !") + self.tmp[ckey]["dimensionality"] = 2 grid_type = read_strings_from_dataset(fp[f"{self.prfx}/Sample/Grid Type"][()]) - if grid_type not in ["HexGrid", "SqrGrid"]: - raise ValueError(f"Grid Type {grid_type} is currently not supported !") - self.tmp[ckey]["grid_type"] = grid_type + if grid_type == "HexGrid": + 
self.tmp[ckey]["grid_type"] = HEXAGONAL_GRID + elif grid_type == "SqrGrid": + self.tmp[ckey]["grid_type"] = SQUARE_GRID + else: + raise ValueError(f"Unable to parse {self.prfx}/Sample/Grid Type !") self.tmp[ckey]["s_x"] = fp[f"{self.prfx}/Sample/Step X"][0] self.tmp[ckey]["s_unit"] = "um" # "µm" # TODO::always micron? self.tmp[ckey]["n_x"] = fp[f"{self.prfx}/Sample/Number Of Columns"][0] @@ -226,12 +231,40 @@ def parse_and_normalize_group_ebsd_data(self, fp, ckey: str): # TODO::currently assuming s_x and s_y are already the correct center of mass # distances for hexagonal or square tiling of R^2 # self.tmp[ckey]["grid_type"] in ["HexGrid", "SqrGrid"]: - self.tmp[ckey]["scan_point_x"] = np.asarray( - np.linspace(0, self.tmp[ckey]["n_x"] - 1, - num=self.tmp[ckey]["n_x"], - endpoint=True) * self.tmp[ckey]["s_x"], np.float32) - - self.tmp[ckey]["scan_point_y"] = np.asarray( - np.linspace(0, self.tmp[ckey]["n_y"] - 1, - num=self.tmp[ckey]["n_y"], - endpoint=True) * self.tmp[ckey]["s_y"], np.float32) + # if just SQUARE_GRID there is no point to explicitly compute the scan_point + # coordinates here (for every subparser) especially not when the respective + # quantity from the tech partner is just a pixel index i.e. zeroth, first px ... + # however, ideally the tech partners would use the scan_point fields to report + # calibrated absolute scan point positions in the local reference frame of the + # sample surface in which case these could indeed not just scaled positions + # having the correct x and y spacing but eventually even the absolute coordinate + # where the scan was performed on the sample surface whereby one could conclude + # more precisely where the scanned area was located, in practice though this precision + # is usually not needed because scientists assume that the ROI is representative for + # the material which they typically never scan (time, interest, costs, instrument + # availability) completely! + if self.tmp[ckey]["grid_type"] != SQUARE_GRID: + print(f"WARNING: {self.tmp[ckey]['grid_type']}: check carefully the " \ + f"correct interpretation of scan_point coords!") + # the case of EDAX APEX shows the key problem with implicit assumptions + # edaxh5 file not necessarily store the scan_point_{dim} positions + # therefore the following code is deprecated as the axes coordinates anyway + # have to be recomputed based on whether results are rediscretized on a coarser + # grid or not ! + # mind also that the code below anyway would give only the NeXus dim axis but + # not the array of pairs of x, y coordinates for each scan point + # TODO::also keep in mind that the order in which the scan points are stored + # i.e. which index on self.tmp[ckey]["euler"] belongs to which scan point + # depends not only on the scan grid but also the flight plan i.e. 
how the grid + # gets visited + # only because of the fact that in most cases people seem to accept that + # scanning snake like first a line along +x and then +y meandering over the + # scan area from the top left corner to the bottom right corner is JUST an + # assumption for a random or dynamically adaptive scan strategy the scan positions + # have to be reported anyway, TODO::tech partners should be convinced to export + # scaled and calibrated scan positions as they are not necessarily redundant information + # that can be stripped to improve performance of their commercial product, I mean + # we talk typically <5k pattern per second demanding to store 5k * 2 * 8B, indeed + # this is the non-harmonized content one is facing in the field of EBSD despite + # almost two decades of commercialization of the technique now + get_scan_point_coords(self.tmp[ckey]) diff --git a/pynxtools/dataconverter/readers/em/subparsers/hfive_bruker.py b/pynxtools/dataconverter/readers/em/subparsers/hfive_bruker.py index 4af1cd5e0..9457ec46d 100644 --- a/pynxtools/dataconverter/readers/em/subparsers/hfive_bruker.py +++ b/pynxtools/dataconverter/readers/em/subparsers/hfive_bruker.py @@ -39,7 +39,7 @@ from pynxtools.dataconverter.readers.em.utils.hfive_utils import \ EBSD_MAP_SPACEGROUP, read_strings_from_dataset, all_equal, format_euler_parameterization from pynxtools.dataconverter.readers.em.examples.ebsd_database import \ - ASSUME_PHASE_NAME_TO_SPACE_GROUP + ASSUME_PHASE_NAME_TO_SPACE_GROUP, HEXAGONAL_GRID, SQUARE_GRID class HdfFiveBrukerEspritReader(HdfFiveBaseParser): @@ -107,6 +107,12 @@ def parse_and_normalize_group_ebsd_header(self, fp, ckey: str): if f"{grp_name}" not in fp: raise ValueError(f"Unable to parse {grp_name} !") + self.tmp[ckey]["dimensionality"] = 2 # TODO::QUBE can also yield 3D datasets + if read_strings_from_dataset(fp[f"{grp_name}/Grid Type"]) == "isometric": + self.tmp[ckey]["grid_type"] = SQUARE_GRID + else: + raise ValueError(f"Unable to parse {grp_name}/Grid Type !") + req_fields = ["NCOLS", "NROWS", "XSTEP", "YSTEP"] for req_field in req_fields: if f"{grp_name}/{req_field}" not in fp: @@ -221,6 +227,9 @@ def parse_and_normalize_group_ebsd_data(self, fp, ckey: str): # there is X SAMPLE and Y SAMPLE but these are not defined somewhere instead # here adding x and y assuming that we scan first lines along positive x and then # moving downwards along +y + # TODO::calculation below x/y only valid if self.tmp[ckey]["grid_type"] == SQUARE_GRID + if self.tmp[ckey]["grid_type"] != SQUARE_GRID: + print(f"WARNING: Check carefully correct interpretation of scan_point coords!") self.tmp[ckey]["scan_point_x"] \ = np.asarray(np.tile(np.linspace(0., self.tmp[ckey]["n_x"] - 1., diff --git a/pynxtools/dataconverter/readers/em/subparsers/hfive_dreamthreed.py b/pynxtools/dataconverter/readers/em/subparsers/hfive_dreamthreed.py index bdb739ff0..248be5452 100644 --- a/pynxtools/dataconverter/readers/em/subparsers/hfive_dreamthreed.py +++ b/pynxtools/dataconverter/readers/em/subparsers/hfive_dreamthreed.py @@ -38,7 +38,7 @@ from pynxtools.dataconverter.readers.em.utils.hfive_utils import \ EBSD_MAP_SPACEGROUP, read_strings_from_dataset, all_equal, format_euler_parameterization from pynxtools.dataconverter.readers.em.examples.ebsd_database import \ - ASSUME_PHASE_NAME_TO_SPACE_GROUP + ASSUME_PHASE_NAME_TO_SPACE_GROUP, HEXAGONAL_GRID, SQUARE_GRID # DREAM3D implements essentially a data analysis workflow with individual steps # in the DREAM3D jargon each step is referred to as a filter, filters have 
well-defined @@ -312,6 +312,10 @@ def parse_and_normalize_ebsd_header(self, ckey: str): spc = h5r[f"{self.path_registry['group_geometry']}" \ f"/_SIMPL_GEOMETRY/SPACING"][:].flatten() idx = 0 + + # TODO::is it correct an assumption that DREAM3D regrids using square voxel + self.tmp[ckey]["dimensionality"] = 3 + self.tmp[ckey]["grid_type"] = SQUARE_GRID for dim in ["x", "y", "z"]: self.tmp[ckey][f"n_{dim}"] = dims[idx] self.tmp[ckey][f"s_{dim}"] = spc[idx] @@ -388,6 +392,8 @@ def parse_and_normalize_ebsd_data(self, ckey: str): # in effect, the phase_id == 0 rightly so marks position indexed with the null-model # normalize pixel coordinates to physical positions even though the origin can still dangle somewhere + if self.tmp[ckey]["grid_type"] != SQUARE_GRID: + print(f"WARNING: Check carefully correct interpretation of scan_point coords!") for dim in ["x", "y", "z"]: self.tmp[ckey][f"scan_point_{dim}"] \ = np.asarray(np.linspace(0, self.tmp[ckey][f"n_{dim}"] - 1, diff --git a/pynxtools/dataconverter/readers/em/subparsers/hfive_ebsd.py b/pynxtools/dataconverter/readers/em/subparsers/hfive_ebsd.py index 3a11eddec..8bb2bbeb1 100644 --- a/pynxtools/dataconverter/readers/em/subparsers/hfive_ebsd.py +++ b/pynxtools/dataconverter/readers/em/subparsers/hfive_ebsd.py @@ -38,7 +38,7 @@ from pynxtools.dataconverter.readers.em.utils.hfive_utils import \ EBSD_MAP_SPACEGROUP, read_strings_from_dataset, all_equal, format_euler_parameterization from pynxtools.dataconverter.readers.em.examples.ebsd_database import \ - ASSUME_PHASE_NAME_TO_SPACE_GROUP + ASSUME_PHASE_NAME_TO_SPACE_GROUP, HEXAGONAL_GRID, SQUARE_GRID class HdfFiveCommunityReader(HdfFiveBaseParser): @@ -108,6 +108,12 @@ def parse_and_normalize_group_ebsd_header(self, fp, ckey: str): if f"{grp_name}" not in fp: raise ValueError(f"Unable to parse {grp_name} !") + self.tmp[ckey]["dimensionality"] = 2 + if read_strings_from_dataset(fp[f"{grp_name}/Grid Type"][()]) == "isometric": + self.tmp[ckey]["grid_type"] = SQUARE_GRID + else: + raise ValueError(f"Unable to parse {grp_name}/Grid Type !") + req_fields = ["NCOLS", "NROWS", "XSTEP", "YSTEP"] for req_field in req_fields: if f"{grp_name}/{req_field}" not in fp: @@ -223,7 +229,10 @@ def parse_and_normalize_group_ebsd_data(self, fp, ckey: str): # X and Y # there exist X SAMPLE and Y SAMPLE which give indeed calibrated coordinates # relative to the sample coordinate system, ignore this for now an - # and TOD::just calibrate on image dimension + # and TODO::just calibrate on image dimension + # TODO::calculation below x/y only valid if self.tmp[ckey]["grid_type"] == SQUARE_GRID + if self.tmp[ckey]["grid_type"] != SQUARE_GRID: + print(f"WARNING: Check carefully correct interpretation of scan_point coords!") self.tmp[ckey]["scan_point_x"] \ = np.asarray(np.tile(np.linspace(0., self.tmp[ckey]["n_x"] - 1., @@ -236,17 +245,7 @@ def parse_and_normalize_group_ebsd_data(self, fp, ckey: str): num=self.tmp[ckey]["n_y"], endpoint=True) * self.tmp[ckey]["s_y"], self.tmp[ckey]["n_x"]), np.float32) - - # if np.shape(fp[f"{grp_name}/X SAMPLE"][:])[0] == n_pts: - # self.tmp[ckey]["scan_point_x"] \ - # = np.asarray(fp[f"{grp_name}/X SAMPLE"][:], np.float32) - # else: - # raise ValueError(f"{grp_name}/X SAMPLE has unexpected shape !") - # if np.shape(fp[f"{grp_name}/Y SAMPLE"][:])[0] == n_pts: - # self.tmp[ckey]["scan_point_y"] \ - # = np.asarray(fp[f"{grp_name}/Y SAMPLE"], np.float32) - # else: - # raise ValueError(f"{grp_name}/Y SAMPLE has unexpected shape !") + # X SAMPLE and Y SAMPLE seem to be something different! 
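# Illustrative sketch (hypothetical values, not part of the patch): the
# linspace-based construction of scan_point_x/scan_point_y used in these
# subparsers assumes a SQUARE_GRID rastered line by line along +x, stepping
# along +y per line, mirroring get_scan_point_coords in get_scan_points.py:
import numpy as np

n_x, n_y, s_x, s_y = 3, 2, 0.5, 0.5  # toy grid, step sizes in micron
scan_point_x = np.asarray(
    np.tile(np.linspace(0., n_x - 1., num=n_x, endpoint=True) * s_x, n_y), np.float32)
scan_point_y = np.asarray(
    np.repeat(np.linspace(0., n_y - 1., num=n_y, endpoint=True) * s_y, n_x), np.float32)
print(scan_point_x)  # [0.  0.5 1.  0.  0.5 1. ]  x cycles fastest
print(scan_point_y)  # [0.  0.  0.  0.5 0.5 0.5]  one y value per scan line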
# Band Contrast is not stored in Bruker but Radon Quality or MAD # but this is s.th. different as it is the mean angular deviation between diff --git a/pynxtools/dataconverter/readers/em/subparsers/hfive_edax.py b/pynxtools/dataconverter/readers/em/subparsers/hfive_edax.py index 586179a51..8e3fc1164 100644 --- a/pynxtools/dataconverter/readers/em/subparsers/hfive_edax.py +++ b/pynxtools/dataconverter/readers/em/subparsers/hfive_edax.py @@ -39,7 +39,7 @@ from pynxtools.dataconverter.readers.em.utils.hfive_utils import EULER_SPACE_SYMMETRY, \ read_strings_from_dataset, read_first_scalar, format_euler_parameterization from pynxtools.dataconverter.readers.em.examples.ebsd_database import \ - ASSUME_PHASE_NAME_TO_SPACE_GROUP + ASSUME_PHASE_NAME_TO_SPACE_GROUP, HEXAGONAL_GRID, SQUARE_GRID class HdfFiveEdaxOimAnalysisReader(HdfFiveBaseParser): @@ -110,17 +110,20 @@ def parse_and_normalize_group_ebsd_header(self, fp, ckey: str): if f"{grp_name}" not in fp: raise ValueError(f"Unable to parse {grp_name} !") - grid_type = None n_pts = 0 req_fields = ["Grid Type", "Step X", "Step Y", "nColumns", "nRows"] for req_field in req_fields: if f"{grp_name}/{req_field}" not in fp: raise ValueError(f"Unable to parse {grp_name}/{req_field} !") + self.tmp[ckey]["dimensionality"] = 2 grid_type = read_strings_from_dataset(fp[f"{grp_name}/Grid Type"][()]) - if grid_type not in ["HexGrid", "SqrGrid"]: - raise ValueError(f"Grid Type {grid_type} is currently not supported !") - self.tmp[ckey]["grid_type"] = grid_type + if grid_type == "HexGrid": + self.tmp[ckey]["grid_type"] = HEXAGONAL_GRID + elif grid_type == "SqrGrid": + self.tmp[ckey]["grid_type"] = SQUARE_GRID + else: + raise ValueError(f"Unable to parse {grp_name}/Grid Type !") self.tmp[ckey]["s_x"] = read_first_scalar(fp[f"{grp_name}/Step X"]) self.tmp[ckey]["s_unit"] = "um" # "µm" # TODO::always micron? self.tmp[ckey]["n_x"] = read_first_scalar(fp[f"{grp_name}/nColumns"]) @@ -248,17 +251,17 @@ def parse_and_normalize_group_ebsd_data(self, fp, ckey: str): # as the step size has already been accounted for by the tech partner when writing! 
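# Minimal sketch of the version-dependent calibration handled just below
# (hypothetical helper, assuming X/Y Position are either already in micron
# or raw pixel indices that still need scaling by the step size):
import numpy as np

def normalize_scan_positions(raw_positions, step_size, already_calibrated):
    """Return scan point positions as float32, scaled by step size if needed."""
    pos = np.asarray(raw_positions, np.float32)
    return pos if already_calibrated else pos * np.float32(step_size)

# e.g. normalize_scan_positions(fp[f"{grp_name}/X Position"][:], s_x, False)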
if self.version["schema_version"] in ["OIM Analysis 8.5.1002 x64 [07-17-20]"]: print(f"{self.version['schema_version']}, tech partner accounted for calibration") + if self.tmp[ckey]["grid_type"] != SQUARE_GRID: + print(f"WARNING: Check carefully correct interpretation of scan_point coords!") self.tmp[ckey]["scan_point_x"] \ = np.asarray(fp[f"{grp_name}/X Position"][:], np.float32) self.tmp[ckey]["scan_point_y"] \ = np.asarray(fp[f"{grp_name}/Y Position"][:], np.float32) else: print(f"{self.version['schema_version']}, parser has to do the calibration") + if self.tmp[ckey]["grid_type"] != SQUARE_GRID: + print(f"WARNING: Check carefully correct interpretation of scan_point coords!") self.tmp[ckey]["scan_point_x"] = np.asarray( fp[f"{grp_name}/X Position"][:] * self.tmp[ckey]["s_x"], np.float32) self.tmp[ckey]["scan_point_y"] = np.asarray( fp[f"{grp_name}/Y Position"][:] * self.tmp[ckey]["s_y"], np.float32) - print(f"xmin {np.min(self.tmp[ckey]['scan_point_x'])}," \ - f"xmax {np.max(self.tmp[ckey]['scan_point_x'])}," \ - f"ymin {np.min(self.tmp[ckey]['scan_point_y'])}," \ - f"ymax {np.max(self.tmp[ckey]['scan_point_y'])}") diff --git a/pynxtools/dataconverter/readers/em/subparsers/hfive_oxford.py b/pynxtools/dataconverter/readers/em/subparsers/hfive_oxford.py index e05d7f4d6..2f6d6d3d7 100644 --- a/pynxtools/dataconverter/readers/em/subparsers/hfive_oxford.py +++ b/pynxtools/dataconverter/readers/em/subparsers/hfive_oxford.py @@ -38,6 +38,8 @@ from pynxtools.dataconverter.readers.em.subparsers.hfive_base import HdfFiveBaseParser from pynxtools.dataconverter.readers.em.utils.hfive_utils import \ read_strings_from_dataset, format_euler_parameterization +from pynxtools.dataconverter.readers.em.examples.ebsd_database import \ + HEXAGONAL_GRID, SQUARE_GRID class HdfFiveOxfordReader(HdfFiveBaseParser): @@ -118,6 +120,10 @@ def parse_and_normalize_slice_ebsd_header(self, fp, ckey: str): if f"{grp_name}" not in fp: raise ValueError(f"Unable to parse {grp_name} !") + # TODO::check if Oxford always uses SquareGrid like assumed here + self.tmp[ckey]["dimensionality"] = 2 + self.tmp[ckey]["grid_type"] = SQUARE_GRID + req_fields = ["X Cells", "Y Cells", "X Step", "Y Step"] for req_field in req_fields: if f"{grp_name}/{req_field}" not in fp: @@ -231,6 +237,8 @@ def parse_and_normalize_slice_ebsd_data(self, fp, ckey: str): # expected is order on x is first all possible x values while y == 0 # followed by as many copies of this linear sequence for each y increment # no action needed Oxford reports already the pixel coordinate multiplied by step + if self.tmp[ckey]["grid_type"] != SQUARE_GRID: + print(f"WARNING: Check carefully correct interpretation of scan_point coords!") # X, no, H5T_NATIVE_FLOAT, (size, 1), X position of each pixel in micrometers (origin: top left corner) self.tmp[ckey]["scan_point_x"] = np.asarray(fp[f"{grp_name}/X"], np.float32) # inconsistency f32 in file although specification states float diff --git a/pynxtools/dataconverter/readers/em/subparsers/nxs_pyxem.py b/pynxtools/dataconverter/readers/em/subparsers/nxs_pyxem.py index 1709cd865..d9139e59a 100644 --- a/pynxtools/dataconverter/readers/em/subparsers/nxs_pyxem.py +++ b/pynxtools/dataconverter/readers/em/subparsers/nxs_pyxem.py @@ -48,7 +48,6 @@ from orix.quaternion import Rotation from orix.quaternion.symmetry import get_point_group from orix.vector import Vector3d -from scipy.spatial import KDTree import matplotlib.pyplot as plt @@ -58,13 +57,16 @@ from pynxtools.dataconverter.readers.em.utils.hfive_web_utils \ import 
hfive_web_decorate_nxdata from pynxtools.dataconverter.readers.em.utils.image_processing import thumbnail +from pynxtools.dataconverter.readers.em.utils.get_sqr_grid import \ + get_scan_points_with_mark_data_discretized_on_sqr_grid +from pynxtools.dataconverter.readers.em.utils.get_scan_points import \ + get_scan_point_axis_values, get_scan_point_coords PROJECTION_VECTORS = [Vector3d.xvector(), Vector3d.yvector(), Vector3d.zvector()] PROJECTION_DIRECTIONS = [("X", Vector3d.xvector().data.flatten()), ("Y", Vector3d.yvector().data.flatten()), ("Z", Vector3d.zvector().data.flatten())] - from pynxtools.dataconverter.readers.em.subparsers.hfive_oxford import HdfFiveOxfordReader from pynxtools.dataconverter.readers.em.subparsers.hfive_bruker import HdfFiveBrukerEspritReader from pynxtools.dataconverter.readers.em.subparsers.hfive_edax import HdfFiveEdaxOimAnalysisReader @@ -114,17 +116,33 @@ def parse(self, template: dict) -> dict: # copying over all data and content within tech partner files into NeXus makes # not much sense as the data exists and we would like to motivate that # tech partners and community members write NeXus content directly - # therefore currently in this example we carry over the EBSD map and some - # metadata to motivate that there is indeed value wrt to interoperability - # when such data are harmonized exactly this is the point we would like to - # make with this example for NeXus and NOMAD OASIS within the FAIRmat project + # therefore, in this example we carry over the EBSD map and some metadata + # to motivate that there is indeed value wrt to interoperability when such data + # are harmonized upon injection in the RDMS - exactly this is the point + # we would like to make with this comprehensive example of data harmonization + # within the field of EBSD as one method in the field of electron diffraction + # we use NeXus, NOMAD OASIS within the FAIRmat project # it is practically beyond our resources to implement a mapping for all cases - # and corner cases of the vendor files + # and corner cases of vendor files # ideally concept mapping would be applied to just point to pieces of information - # in the HDF5 file that is written by the tech partners however because of the - # fact that currently these pieces of information are formatted very differently - # it is non-trivial to establish this mapping and only because of this we - # map over manually + # in (HDF5) files based on which semantically understood pieces of information + # are then interpreted and injected into the RDMS + # currently the fact that the documentation by tech partners is incomplete + # and the fact that conceptually similar or even the same concepts as instances + # with their pieces of information are formatted very differently, it is + # non-trivial to establish this mapping and only because of this we + # map over using hardcoding of concept names and symbols + + # a collection of different tech-partner-specific subparser follows + # these subparsers already extract specific information and perform a first + # step of harmonization. The subparsers specifically store e.g. 
EBSD maps in a + # tmp dictionary, which is + # TODO: scan point positions (irrespective on which grid type (sqr, hex) these + # were probed, in some cases the grid may have a two large extent along a dim + # so that a sub-sampling is performed, here only for the purpose of using + # h5web to show the IPF color maps but deal with the fact that h5web has so far + # not been designed to deal with images as large as several thousand pixels along + # either dimension if hfive_parser_type == "oxford": oina = HdfFiveOxfordReader(self.file_path) oina.parse_and_normalize() @@ -199,6 +217,8 @@ def process_into_template(self, inp: dict, template: dict) -> dict: return template def get_named_axis(self, inp: dict, dim_name: str): + """"Return scaled but not offset-calibrated scan point coordinates along dim.""" + # TODO::remove! return np.asarray(np.linspace(0, inp[f"n_{dim_name}"] - 1, num=inp[f"n_{dim_name}"], @@ -217,38 +237,47 @@ def process_roi_overview_ebsd_based(self, roi_id: str, template: dict) -> dict: print("Parse ROI default plot...") + # tech partner specific subparsers have just extracted the per scan point information + # in the sequence they were (which is often how they were scanned) + # however that can be a square, a hexagonal or some random grid + # a consuming visualization tool (like h5web) may however not be able to + # represent the data as point cloud but only visualizes a grid of square pixels + # therefore in general the scan_point_x and scan_point_y arrays and their associated + # data arrays such as euler should always be interpolated on a specific grid + # Here, the square_grid supported by h5web with a specific maximum extent + # which may represent a downsampled representation of the actual ROI + # only in the case that indeed the grid is a square grid this interpolation is + # obsolete but also only when the grid does not exceed the technical limitation + # of here h5web + # TODO::implement rediscretization using a kdtree take n_x, n_y, and n_z as guides + + trg_grid \ + = get_scan_points_with_mark_data_discretized_on_sqr_grid(inp, + HFIVE_WEB_MAXIMUM_ROI) + contrast_modes = [(None, "n/a"), ("bc", "normalized_band_contrast"), ("ci", "normalized_confidence_index"), ("mad", "normalized_mean_angular_deviation")] contrast_mode = None for mode in contrast_modes: - if mode[0] in inp.keys() and contrast_mode is None: + if mode[0] in trg_grid.keys() and contrast_mode is None: contrast_mode = mode break if contrast_mode is None: print(f"{__name__} unable to generate plot for entry{self.entry_id}, roi{roi_id} !") return template - is_threed = False - if "n_z" in inp.keys(): - is_threed = True - if np.max((inp["n_x"], inp["n_y"], inp["n_z"])) > HFIVE_WEB_MAXIMUM_ROI: - raise ValueError(f"Plotting 3D roi_overviews larger than " \ - f"{HFIVE_WEB_MAXIMUM_ROI} is not supported !") - else: - if np.max((inp["n_x"], inp["n_y"])) > HFIVE_WEB_MAXIMUM_ROI: - raise ValueError(f"Plotting 2D roi_overviews larger than " \ - f"{HFIVE_WEB_MAXIMUM_ROI} is not supported !") - - template[f"/ENTRY[entry{self.entry_id}]/ROI[roi{roi_id}]/@NX_class"] = "NXroi" # TODO::writer should decorate automatically! - template[f"/ENTRY[entry{self.entry_id}]/ROI[roi{roi_id}]/ebsd/indexing/@NX_class"] = "NXprocess" # TODO::writer should decorate automatically! + template[f"/ENTRY[entry{self.entry_id}]/ROI[roi{roi_id}]/@NX_class"] = "NXroi" + # TODO::writer should decorate automatically! 
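# Sketch (toy values, not part of the patch) unpacking the dense uint8
# conversion used for the "data" field written further below: scale the
# contrast signal to [0, 255], cast, and reshape row-major into an
# n_y x n_x image that h5web can display.
import numpy as np

signal = np.asarray([0.2, 0.4, 0.6, 0.8], np.float32)  # stand-in for trg_grid["bc"]
n_x, n_y = 2, 2
scaled = signal / np.max(signal) * 255.          # normalize to [0, 255]
img = np.reshape(np.asarray(np.asarray(scaled, np.uint32), np.uint8),
                 (n_y, n_x), order="C")          # rows along y, columns along x
print(img)  # [[ 63 127] [191 255]]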
+ template[f"/ENTRY[entry{self.entry_id}]/ROI[roi{roi_id}]/ebsd/indexing/@NX_class"] = "NXprocess" + # TODO::writer should decorate automatically! trg = f"/ENTRY[entry{self.entry_id}]/ROI[roi{roi_id}]/ebsd/indexing/DATA[roi]" template[f"{trg}/title"] = f"Region-of-interest overview image" template[f"{trg}/@NX_class"] = f"NXdata" # TODO::writer should decorate automatically! template[f"{trg}/@signal"] = "data" dims = ["x", "y"] - if is_threed is True: + if trg_grid["dimensionality"] == 3: dims.append("z") idx = 0 for dim in dims: @@ -258,22 +287,22 @@ def process_roi_overview_ebsd_based(self, for dim in dims[::-1]: template[f"{trg}/@axes"].append(f"axis_{dim}") - if is_threed is True: - template[f"{trg}/data"] = {"compress": np.squeeze(np.asarray(np.asarray((inp[contrast_mode[0]] / np.max(inp[contrast_mode[0]], axis=None) * 255.), np.uint32), np.uint8), axis=3), "strength": 1} + if trg_grid["dimensionality"] == 3: + template[f"{trg}/data"] = {"compress": np.squeeze(np.asarray(np.asarray((trg_grid[contrast_mode[0]] / np.max(trg_grid[contrast_mode[0]], axis=None) * 255.), np.uint32), np.uint8), axis=3), "strength": 1} else: - template[f"{trg}/data"] = {"compress": np.reshape(np.asarray(np.asarray((inp[contrast_mode[0]] / np.max(inp[contrast_mode[0]]) * 255.), np.uint32), np.uint8), (inp["n_y"], inp["n_x"]), order="C"), "strength": 1} + template[f"{trg}/data"] = {"compress": np.reshape(np.asarray(np.asarray((trg_grid[contrast_mode[0]] / np.max(trg_grid[contrast_mode[0]]) * 255.), np.uint32), np.uint8), (trg_grid["n_y"], trg_grid["n_x"]), order="C"), "strength": 1} template[f"{trg}/descriptor"] = contrast_mode[1] # 0 is y while 1 is x for 2d, 0 is z, 1 is y, while 2 is x for 3d template[f"{trg}/data/@long_name"] = f"Signal" hfive_web_decorate_nxdata(f"{trg}/data", template) - scan_unit = inp["s_unit"] + scan_unit = trg_grid["s_unit"] if scan_unit == "um": scan_unit = "µm" for dim in dims: template[f"{trg}/AXISNAME[axis_{dim}]"] \ - = {"compress": self.get_named_axis(inp, dim), "strength": 1} + = {"compress": self.get_named_axis(trg_grid, dim), "strength": 1} template[f"{trg}/AXISNAME[axis_{dim}]/@long_name"] \ = f"Coordinate along {dim}-axis ({scan_unit})" template[f"{trg}/AXISNAME[axis_{dim}]/@units"] = f"{scan_unit}" @@ -287,7 +316,7 @@ def process_roi_ebsd_maps(self, inp: dict, template: dict) -> dict: if "n_z" not in inp[ckey].keys(): self.prepare_roi_ipfs_phases_twod(inp[ckey], roi_id, template) self.process_roi_ipfs_phases_twod(inp[ckey], roi_id, template) - # self.onthefly_process_roi_ipfs_phases_threed(inp[ckey], roi_id, template) + # self.onthefly_process_roi_ipfs_phases_two(inp[ckey], roi_id, template) else: self.onthefly_process_roi_ipfs_phases_threed(inp[ckey], roi_id, template) return template @@ -612,7 +641,7 @@ def process_roi_phase_ipfs_threed(self, inp: dict, roi_id: int, pyxem_phase_id: # TODO: I have not seen any dataset yet where is limit is exhausted, the largest # dataset is a 3D SEM/FIB study from a UK project this is likely because to # get an EBSD map as large one already scans quite long for one section as making - # a ompromise is required and thus such hypothetical large serial-sectioning + # a compromise is required and thus such hypothetical large serial-sectioning # studies would block the microscope for a very long time # however I have seen examples from Hadi Pirgazi with L. 
Kestens from Leuven # where indeed large but thin 3d slabs were characterized diff --git a/pynxtools/dataconverter/readers/em/utils/get_scan_points.py b/pynxtools/dataconverter/readers/em/utils/get_scan_points.py new file mode 100644 index 000000000..9bfbe5ab1 --- /dev/null +++ b/pynxtools/dataconverter/readers/em/utils/get_scan_points.py @@ -0,0 +1,80 @@ +# +# Copyright The NOMAD Authors. +# +# This file is part of NOMAD. See https://nomad-lab.eu for further info. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Identify likely scan_point_positions for specific EBSD grid types.""" + +# pylint: disable=no-member + +import numpy as np + +from pynxtools.dataconverter.readers.em.examples.ebsd_database import \ + HEXAGONAL_GRID, SQUARE_GRID + + +def get_scan_point_axis_values(inp: dict, dim_name: str): + is_threed = False + if "dimensionality" in inp.keys(): + if inp["dimensionality"] == 3: + is_threed = True + req_keys = ["grid_type", f"n_{dim_name}", f"s_{dim_name}"] + for key in req_keys: + if key not in inp.keys(): + raise ValueError(f"Unable to find required key {key} in inp !") + + if inp["grid_type"] in [HEXAGONAL_GRID, SQUARE_GRID]: + return np.asarray(np.linspace(0, + inp[f"n_{dim_name}"] - 1, + num=inp[f"n_{dim_name}"], + endpoint=True) * inp[f"s_{dim_name}"], np.float32) + else: + return None + + +def get_scan_point_coords(inp: dict) -> dict: + """Add scan_point_dim array assuming top-left to bottom-right snake style scanning.""" + is_threed = False + if "dimensionality" in inp.keys(): + if inp["dimensionality"] == 3: + is_threed = True + + req_keys = ["grid_type"] + dims = ["x", "y"] + if is_threed is True: + dims.append("z") + for dim in dims: + req_keys.append(f"n_{dim}") + req_keys.append(f"s_{dim}") + + for key in req_keys: + if key not in inp.keys(): + raise ValueError(f"Unable to find required key {key} in inp !") + + if is_threed is False: + if inp["grid_type"] in [SQUARE_GRID, HEXAGONAL_GRID]: + # TODO::check that below code is correct as well for hexagonal grid ! + for dim in dims: + if "scan_point_{dim}" in inp.keys(): + print("WARNING::Overwriting scan_point_{dim} !") + inp["scan_point_x"] = np.tile( + np.linspace(0, inp["n_x"] - 1, num=inp["n_x"], endpoint=True) * inp["s_x"], inp["n_y"]) + inp["scan_point_y"] = np.repeat( + np.linspace(0, inp["n_y"] - 1, num=inp["n_y"], endpoint=True) * inp["s_y"], inp["n_x"]) + else: + print("WARNING::{__name__} facing an unknown scan strategy !") + else: + print("WARNING::{__name__} not implemented for 3D case !") + return inp diff --git a/pynxtools/dataconverter/readers/em/utils/get_sqr_grid.py b/pynxtools/dataconverter/readers/em/utils/get_sqr_grid.py new file mode 100644 index 000000000..ec9a78eb3 --- /dev/null +++ b/pynxtools/dataconverter/readers/em/utils/get_sqr_grid.py @@ -0,0 +1,145 @@ +# +# Copyright The NOMAD Authors. +# +# This file is part of NOMAD. See https://nomad-lab.eu for further info. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Discretize point cloud in R^d (d=2, 3) with mark data to square/cube voxel grid.""" + +# pylint: disable=no-member + +import numpy as np +from scipy.spatial import KDTree + +from pynxtools.dataconverter.readers.em.examples.ebsd_database import SQUARE_GRID + + +def get_scan_points_with_mark_data_discretized_on_sqr_grid(src_grid: dict, + max_edge_length: int) -> dict: + """Inspect grid_type, dimensionality, point locations, and mark src_grida, map then.""" + is_threed = False + if "dimensionality" in src_grid.keys(): + if src_grid["dimensionality"] == 3: + is_threed = True + + req_keys = ["grid_type"] + dims = ["x", "y"] + if is_threed is True: + dims.append("z") + for dim in dims: + req_keys.append(f"scan_point_{dim}") + req_keys.append(f"n_{dim}") + req_keys.append(f"s_{dim}") + + trg_grid = {} + for key in req_keys: + if key not in src_grid.keys(): + raise ValueError(f"Unable to find required key {key} in src_grid !") + + # take discretization of the source grid as a guide for the target_grid + # optimization possible if square grid and matching maximum_extent + + max_extent = None + if is_threed is False: + max_extent = np.max((src_grid["n_x"], src_grid["n_y"])) + else: + max_extent = np.max((src_grid["n_x"], src_grid["n_y"], src_grid["n_z"])) + + if src_grid["grid_type"] == SQUARE_GRID: + if max_extent <= max_edge_length: + return src_grid + else: + max_extent = max_edge_length # cap the maximum extent + + # all non-square grids or too large square grids will be + # discretized onto a regular grid with square or cubic pixel/voxel + aabb = [] + for dim in dims: + aabb.append(np.min(src_grid[f"scan_point_{dim}"])) # - 0.5 * src_grid[f"s_{dim}"])) + aabb.append(np.max(src_grid[f"scan_point_{dim}"])) # + 0.5 * src_grid[f"s_{dim}"])) + print(f"{aabb}") + + if is_threed is False: + if aabb[1] - aabb[0] >= aabb[3] - aabb[2]: + sxy = (aabb[1] - aabb[0]) / max_extent + nxy = [max_extent, int(np.ceil((aabb[3] - aabb[2]) / sxy))] + else: + sxy = (aabb[3] - aabb[2]) / max_extent + nxy = [int(np.ceil((aabb[1] - aabb[0]) / sxy)), max_extent] + print(f"H5Web default plot generation, scaling nxy0 {[src_grid['n_x'], src_grid['n_y']]}, nxy {nxy}") + # the above estimate is not exactly correct (may create a slight real space shift) + # of the EBSD map TODO:: regrid the real world axis-aligned bounding box aabb with + # a regular tiling of squares or hexagons + # https://stackoverflow.com/questions/18982650/differences-between-matlab-and-numpy-and-pythons-round-function + # MTex/Matlab round not exactly the same as numpy round but reasonably close + + # scan point positions were normalized by tech partner subparsers such that they + # always build on pixel coordinates calibrated for step size not by giving absolute positions + # in the sample surface frame of reference as this is typically not yet consistently documented + # because we assume in addition that we always start at the top left corner the zeroth/first + # coordinate is always 0., 0. ! 
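+    # nearest-neighbour resampling: build the target pixel centres xy in
+    # row-major order (x cycling fastest, matching the scan layout), then
+    # query a KDTree of the source scan points so that each target pixel
+    # inherits the mark data (euler, phase_id, bc, ci, mad) of its closest
+    # measured point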
+ xy = np.column_stack( + (np.tile(np.linspace(0, nxy[0] - 1, num=nxy[0], endpoint=True) * sxy, nxy[1]), + np.repeat(np.linspace(0, nxy[1] - 1, num=nxy[1], endpoint=True) * sxy, nxy[0]))) + # TODO:: if scan_point_{dim} are calibrated this approach + # here would shift the origin to 0, 0 implicitly which may not be desired + print(f"xy {xy}, shape {np.shape(xy)}") + tree = KDTree(np.column_stack((src_grid["scan_point_x"], src_grid["scan_point_y"]))) + d, idx = tree.query(xy, k=1) + if np.sum(idx == tree.n) > 0: + raise ValueError(f"kdtree query left some query points without a neighbor!") + del d + del tree + + # rebuild src_grid container with only the relevant src_grida selected from src_grid + for key in src_grid.keys(): + if key == "euler": + trg_grid[key] = np.zeros((np.shape(xy)[0], 3), np.float32) + trg_grid[key] = np.nan + trg_grid[key] = src_grid["euler"][idx, :] + if np.isnan(trg_grid[key]).any() is True: + raise ValueError(f"Downsampling of the point cloud left " \ + f"pixels without mark data {key} !") + print(f"final np.shape(trg_grid[{key}]) {np.shape(trg_grid[key])}") + elif key == "phase_id" or key == "bc": + trg_grid[key] = np.zeros((np.shape(xy)[0],), np.int32) - 2 + # pyxem_id is at least -1, bc is typically positive + trg_grid[key] = src_grid[key][idx] + if np.sum(trg_grid[key] == -2) > 0: + raise ValueError(f"Downsampling of the point cloud left " \ + f"pixels without mark data {key} !") + print(f"final np.shape(trg_grid[{key}]) {np.shape(trg_grid[key])}") + elif key == "ci" or key == "mad": + trg_grid[key] = np.zeros((np.shape(xy)[0],), np.float32) + trg_grid[key] = np.nan + trg_grid[key] = src_grid[key][idx] + print(f"final np.shape(trg_grid[{key}]) {np.shape(trg_grid[key])}") + if np.isnan(trg_grid[key]).any() is True: + raise ValueError(f"Downsampling of the point cloud left " \ + f"pixels without mark data {key} !") + elif key not in ["n_x", "n_y", "n_z", "s_x", "s_y", "s_z"]: + print(f"WARNING:: src_grid[{key}] is mapped as is on trg_grid[{key}] !") + trg_grid[key] = src_grid[key] + print(f"final np.shape(trg_grid[{key}]) {np.shape(trg_grid[key])}") + else: + print(f"WARNING:: src_grid[{key}] is not yet mapped on trg_grid[{key}] !") + trg_grid["n_x"] = nxy[0] + trg_grid["n_y"] = nxy[1] + trg_grid["s_x"] = sxy + trg_grid["s_y"] = sxy + # TODO::need to update scan_point_{dim} + return trg_grid + else: + raise ValueError(f"The 3D discretization is currently not implemented because " \ + f"we do not know of any large enough dataset the test it !") diff --git a/pynxtools/dataconverter/readers/em/utils/hfive_utils.py b/pynxtools/dataconverter/readers/em/utils/hfive_utils.py index bf1e7af10..3b2320c4e 100644 --- a/pynxtools/dataconverter/readers/em/utils/hfive_utils.py +++ b/pynxtools/dataconverter/readers/em/utils/hfive_utils.py @@ -101,3 +101,8 @@ def read_first_scalar(obj): def all_equal(iterable): g = groupby(iterable) return next(g, True) and not next(g, False) + + +# for dim in ["x", "y"]: +# print(f"{dim}min {np.min(self.tmp[ckey][f'scan_point_{dim}'])}") +# print(f"{dim}max {np.max(self.tmp[ckey][f'scan_point_{dim}'])}")
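A minimal, self-contained sketch of the nearest-neighbour regridding idea that get_sqr_grid.py implements, on toy data (all names and values here are illustrative, not the reader's actual API): scan points from a hexagonal grid are resampled onto a square grid whose edge length is capped, and each target pixel inherits the mark data of its closest measured point.

import numpy as np
from scipy.spatial import KDTree

# toy hexagonal-grid scan points: every other row offset by half a step
n_x, n_y, s = 6, 4, 1.0
src_x = np.concatenate(
    [np.arange(n_x) * s + (0.5 * s if j % 2 else 0.0) for j in range(n_y)])
src_y = np.repeat(np.arange(n_y) * s * np.sqrt(3.) / 2., n_x)  # hex row spacing
src_xy = np.column_stack((src_x, src_y))
src_mark = np.arange(src_xy.shape[0], dtype=np.float32)  # one mark value per point

# target: square pixels over the axis-aligned bounding box, capped in extent
max_edge_length = 8  # stand-in for HFIVE_WEB_MAXIMUM_ROI
aabb = [src_x.min(), src_x.max(), src_y.min(), src_y.max()]
sxy = max(aabb[1] - aabb[0], aabb[3] - aabb[2]) / max_edge_length
nx = int(np.ceil((aabb[1] - aabb[0]) / sxy))
ny = int(np.ceil((aabb[3] - aabb[2]) / sxy))
trg_xy = np.column_stack((
    np.tile(np.linspace(0, nx - 1, num=nx, endpoint=True) * sxy, ny),
    np.repeat(np.linspace(0, ny - 1, num=ny, endpoint=True) * sxy, nx)))

# nearest-neighbour lookup: every target pixel inherits its closest point's mark
d, idx = KDTree(src_xy).query(trg_xy, k=1)
trg_mark = src_mark[idx]
print(trg_mark.reshape((ny, nx)))  # the downsampled square-grid image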