Skip to content

Commit

Permalink
Started the refactoring to discretize always all point cloud data whi…
Browse files Browse the repository at this point in the history
…ch are not collected on a square grid that is smaller than the maximum possible extent supported by h5web, tested with use case 207_2081.edaxh5 resulting ROI map is a square likely due to improper handling of HexGrid, next steps: i) fix this bug for 207_2081, ii) replace xmap in ebsd map twod by discretized grid, iii) test with examples from all other tech partners, iv) run against all datasets
  • Loading branch information
atomprobe-tc committed Dec 6, 2023
1 parent 71a6a56 commit 97fcaa1
Show file tree
Hide file tree
Showing 11 changed files with 391 additions and 71 deletions.
3 changes: 3 additions & 0 deletions pynxtools/dataconverter/readers/em/examples/ebsd_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
# is recoverable when there is no common agreement about the phases used and their
# exact atomic configuration

# Controlled vocabulary for the type of scan-point grid on which EBSD data
# were collected. Subparsers normalize the tech-partner-specific grid-type
# strings (e.g. "HexGrid"/"SqrGrid" for EDAX, "isometric" for Bruker) to one
# of these values and store it under self.tmp[ckey]["grid_type"].
HEXAGONAL_GRID = "hexagonal_grid"
SQUARE_GRID = "square_grid"


FreeTextToUniquePhase = {"Actinolite": "Actinolite",
"al": "Al",
Expand Down
61 changes: 47 additions & 14 deletions pynxtools/dataconverter/readers/em/subparsers/hfive_apex.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@
from pynxtools.dataconverter.readers.em.utils.hfive_utils import \
read_strings_from_dataset
from pynxtools.dataconverter.readers.em.examples.ebsd_database import \
ASSUME_PHASE_NAME_TO_SPACE_GROUP
ASSUME_PHASE_NAME_TO_SPACE_GROUP, HEXAGONAL_GRID, SQUARE_GRID
from pynxtools.dataconverter.readers.em.utils.get_scan_points import \
get_scan_point_coords


class HdfFiveEdaxApexReader(HdfFiveBaseParser):
Expand Down Expand Up @@ -106,7 +108,6 @@ def parse_and_normalize_group_ebsd_header(self, fp, ckey: str):
if f"{self.prfx}/EBSD/ANG/DATA/DATA" not in fp:
raise ValueError(f"Unable to parse {self.prfx}/EBSD/ANG/DATA/DATA !")

grid_type = None
# for a regular tiling of R^2 with perfect hexagons
n_pts = 0
# their vertical center of mass distance is smaller than the horizontal
Expand All @@ -118,10 +119,14 @@ def parse_and_normalize_group_ebsd_header(self, fp, ckey: str):
if f"{self.prfx}/Sample/{req_field}" not in fp:
raise ValueError(f"Unable to parse {self.prfx}/Sample/{req_field} !")

self.tmp[ckey]["dimensionality"] = 2
grid_type = read_strings_from_dataset(fp[f"{self.prfx}/Sample/Grid Type"][()])
if grid_type not in ["HexGrid", "SqrGrid"]:
raise ValueError(f"Grid Type {grid_type} is currently not supported !")
self.tmp[ckey]["grid_type"] = grid_type
if grid_type == "HexGrid":
self.tmp[ckey]["grid_type"] = HEXAGONAL_GRID
elif grid_type == "SqrGrid":
self.tmp[ckey]["grid_type"] = SQUARE_GRID
else:
raise ValueError(f"Unable to parse {self.prfx}/Sample/Grid Type !")
self.tmp[ckey]["s_x"] = fp[f"{self.prfx}/Sample/Step X"][0]
self.tmp[ckey]["s_unit"] = "um" # "µm" # TODO::always micron?
self.tmp[ckey]["n_x"] = fp[f"{self.prfx}/Sample/Number Of Columns"][0]
Expand Down Expand Up @@ -226,12 +231,40 @@ def parse_and_normalize_group_ebsd_data(self, fp, ckey: str):
# TODO::currently assuming s_x and s_y are already the correct center of mass
# distances for hexagonal or square tiling of R^2
# self.tmp[ckey]["grid_type"] in ["HexGrid", "SqrGrid"]:
self.tmp[ckey]["scan_point_x"] = np.asarray(
np.linspace(0, self.tmp[ckey]["n_x"] - 1,
num=self.tmp[ckey]["n_x"],
endpoint=True) * self.tmp[ckey]["s_x"], np.float32)

self.tmp[ckey]["scan_point_y"] = np.asarray(
np.linspace(0, self.tmp[ckey]["n_y"] - 1,
num=self.tmp[ckey]["n_y"],
endpoint=True) * self.tmp[ckey]["s_y"], np.float32)
# if just SQUARE_GRID there is no point to explicitly compute the scan_point
# coordinates here (for every subparser) especially not when the respective
# quantity from the tech partner is just a pixel index i.e. zeroth, first px ...
# however, ideally the tech partners would use the scan_point fields to report
# calibrated absolute scan point positions in the local reference frame of the
# sample surface, in which case these could indeed be not just scaled positions
# having the correct x and y spacing but eventually even the absolute coordinate
# where the scan was performed on the sample surface whereby one could conclude
# more precisely where the scanned area was located, in practice though this precision
# is usually not needed because scientists assume that the ROI is representative for
# the material which they typically never scan (time, interest, costs, instrument
# availability) completely!
if self.tmp[ckey]["grid_type"] != SQUARE_GRID:
print(f"WARNING: {self.tmp[ckey]['grid_type']}: check carefully the " \
f"correct interpretation of scan_point coords!")
# the case of EDAX APEX shows the key problem with implicit assumptions
# edaxh5 file not necessarily store the scan_point_{dim} positions
# therefore the following code is deprecated as the axes coordinates anyway
# have to be recomputed based on whether results are rediscretized on a coarser
# grid or not !
# mind also that the code below anyway would give only the NeXus dim axis but
# not the array of pairs of x, y coordinates for each scan point
# TODO::also keep in mind that the order in which the scan points are stored
# i.e. which index on self.tmp[ckey]["euler"] belongs to which scan point
# depends not only on the scan grid but also the flight plan i.e. how the grid
# gets visited
# only because of the fact that in most cases people seem to accept that
# scanning snake like first a line along +x and then +y meandering over the
# scan area from the top left corner to the bottom right corner is JUST an
# assumption for a random or dynamically adaptive scan strategy the scan positions
# have to be reported anyway, TODO::tech partners should be convinced to export
# scaled and calibrated scan positions as they are not necessarily redundant information
# that can be stripped to improve performance of their commercial product, I mean
# we talk typically <5k pattern per second demanding to store 5k * 2 * 8B, indeed
# this is the non-harmonized content one is facing in the field of EBSD despite
# almost two decades of commercialization of the technique now
get_scan_point_coords(self.tmp[ckey])
11 changes: 10 additions & 1 deletion pynxtools/dataconverter/readers/em/subparsers/hfive_bruker.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from pynxtools.dataconverter.readers.em.utils.hfive_utils import \
EBSD_MAP_SPACEGROUP, read_strings_from_dataset, all_equal, format_euler_parameterization
from pynxtools.dataconverter.readers.em.examples.ebsd_database import \
ASSUME_PHASE_NAME_TO_SPACE_GROUP
ASSUME_PHASE_NAME_TO_SPACE_GROUP, HEXAGONAL_GRID, SQUARE_GRID


class HdfFiveBrukerEspritReader(HdfFiveBaseParser):
Expand Down Expand Up @@ -107,6 +107,12 @@ def parse_and_normalize_group_ebsd_header(self, fp, ckey: str):
if f"{grp_name}" not in fp:
raise ValueError(f"Unable to parse {grp_name} !")

self.tmp[ckey]["dimensionality"] = 2 # TODO::QUBE can also yield 3D datasets
if read_strings_from_dataset(fp[f"{grp_name}/Grid Type"]) == "isometric":
self.tmp[ckey]["grid_type"] = SQUARE_GRID
else:
raise ValueError(f"Unable to parse {grp_name}/Grid Type !")

req_fields = ["NCOLS", "NROWS", "XSTEP", "YSTEP"]
for req_field in req_fields:
if f"{grp_name}/{req_field}" not in fp:
Expand Down Expand Up @@ -221,6 +227,9 @@ def parse_and_normalize_group_ebsd_data(self, fp, ckey: str):
# there is X SAMPLE and Y SAMPLE but these are not defined somewhere instead
# here adding x and y assuming that we scan first lines along positive x and then
# moving downwards along +y
# TODO::calculation below x/y only valid if self.tmp[ckey]["grid_type"] == SQUARE_GRID
if self.tmp[ckey]["grid_type"] != SQUARE_GRID:
print(f"WARNING: Check carefully correct interpretation of scan_point coords!")
self.tmp[ckey]["scan_point_x"] \
= np.asarray(np.tile(np.linspace(0.,
self.tmp[ckey]["n_x"] - 1.,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
from pynxtools.dataconverter.readers.em.utils.hfive_utils import \
EBSD_MAP_SPACEGROUP, read_strings_from_dataset, all_equal, format_euler_parameterization
from pynxtools.dataconverter.readers.em.examples.ebsd_database import \
ASSUME_PHASE_NAME_TO_SPACE_GROUP
ASSUME_PHASE_NAME_TO_SPACE_GROUP, HEXAGONAL_GRID, SQUARE_GRID

# DREAM3D implements essentially a data analysis workflow with individual steps
# in the DREAM3D jargon each step is referred to as a filter, filters have well-defined
Expand Down Expand Up @@ -312,6 +312,10 @@ def parse_and_normalize_ebsd_header(self, ckey: str):
spc = h5r[f"{self.path_registry['group_geometry']}" \
f"/_SIMPL_GEOMETRY/SPACING"][:].flatten()
idx = 0

# TODO::is it a correct assumption that DREAM3D regrids onto square voxels?
self.tmp[ckey]["dimensionality"] = 3
self.tmp[ckey]["grid_type"] = SQUARE_GRID
for dim in ["x", "y", "z"]:
self.tmp[ckey][f"n_{dim}"] = dims[idx]
self.tmp[ckey][f"s_{dim}"] = spc[idx]
Expand Down Expand Up @@ -388,6 +392,8 @@ def parse_and_normalize_ebsd_data(self, ckey: str):
# in effect, the phase_id == 0 rightly so marks position indexed with the null-model

# normalize pixel coordinates to physical positions even though the origin can still dangle somewhere
if self.tmp[ckey]["grid_type"] != SQUARE_GRID:
print(f"WARNING: Check carefully correct interpretation of scan_point coords!")
for dim in ["x", "y", "z"]:
self.tmp[ckey][f"scan_point_{dim}"] \
= np.asarray(np.linspace(0, self.tmp[ckey][f"n_{dim}"] - 1,
Expand Down
25 changes: 12 additions & 13 deletions pynxtools/dataconverter/readers/em/subparsers/hfive_ebsd.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
from pynxtools.dataconverter.readers.em.utils.hfive_utils import \
EBSD_MAP_SPACEGROUP, read_strings_from_dataset, all_equal, format_euler_parameterization
from pynxtools.dataconverter.readers.em.examples.ebsd_database import \
ASSUME_PHASE_NAME_TO_SPACE_GROUP
ASSUME_PHASE_NAME_TO_SPACE_GROUP, HEXAGONAL_GRID, SQUARE_GRID


class HdfFiveCommunityReader(HdfFiveBaseParser):
Expand Down Expand Up @@ -108,6 +108,12 @@ def parse_and_normalize_group_ebsd_header(self, fp, ckey: str):
if f"{grp_name}" not in fp:
raise ValueError(f"Unable to parse {grp_name} !")

self.tmp[ckey]["dimensionality"] = 2
if read_strings_from_dataset(fp[f"{grp_name}/Grid Type"][()]) == "isometric":
self.tmp[ckey]["grid_type"] = SQUARE_GRID
else:
raise ValueError(f"Unable to parse {grp_name}/Grid Type !")

req_fields = ["NCOLS", "NROWS", "XSTEP", "YSTEP"]
for req_field in req_fields:
if f"{grp_name}/{req_field}" not in fp:
Expand Down Expand Up @@ -223,7 +229,10 @@ def parse_and_normalize_group_ebsd_data(self, fp, ckey: str):
# X and Y
# there exist X SAMPLE and Y SAMPLE which give indeed calibrated coordinates
# relative to the sample coordinate system, ignore this for now an
# and TOD::just calibrate on image dimension
# and TODO::just calibrate on image dimension
# TODO::calculation below x/y only valid if self.tmp[ckey]["grid_type"] == SQUARE_GRID
if self.tmp[ckey]["grid_type"] != SQUARE_GRID:
print(f"WARNING: Check carefully correct interpretation of scan_point coords!")
self.tmp[ckey]["scan_point_x"] \
= np.asarray(np.tile(np.linspace(0.,
self.tmp[ckey]["n_x"] - 1.,
Expand All @@ -236,17 +245,7 @@ def parse_and_normalize_group_ebsd_data(self, fp, ckey: str):
num=self.tmp[ckey]["n_y"],
endpoint=True) * self.tmp[ckey]["s_y"],
self.tmp[ckey]["n_x"]), np.float32)

# if np.shape(fp[f"{grp_name}/X SAMPLE"][:])[0] == n_pts:
# self.tmp[ckey]["scan_point_x"] \
# = np.asarray(fp[f"{grp_name}/X SAMPLE"][:], np.float32)
# else:
# raise ValueError(f"{grp_name}/X SAMPLE has unexpected shape !")
# if np.shape(fp[f"{grp_name}/Y SAMPLE"][:])[0] == n_pts:
# self.tmp[ckey]["scan_point_y"] \
# = np.asarray(fp[f"{grp_name}/Y SAMPLE"], np.float32)
# else:
# raise ValueError(f"{grp_name}/Y SAMPLE has unexpected shape !")
# X SAMPLE and Y SAMPLE seem to be something different!

# Band Contrast is not stored in Bruker but Radon Quality or MAD
# but this is s.th. different as it is the mean angular deviation between
Expand Down
21 changes: 12 additions & 9 deletions pynxtools/dataconverter/readers/em/subparsers/hfive_edax.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from pynxtools.dataconverter.readers.em.utils.hfive_utils import EULER_SPACE_SYMMETRY, \
read_strings_from_dataset, read_first_scalar, format_euler_parameterization
from pynxtools.dataconverter.readers.em.examples.ebsd_database import \
ASSUME_PHASE_NAME_TO_SPACE_GROUP
ASSUME_PHASE_NAME_TO_SPACE_GROUP, HEXAGONAL_GRID, SQUARE_GRID


class HdfFiveEdaxOimAnalysisReader(HdfFiveBaseParser):
Expand Down Expand Up @@ -110,17 +110,20 @@ def parse_and_normalize_group_ebsd_header(self, fp, ckey: str):
if f"{grp_name}" not in fp:
raise ValueError(f"Unable to parse {grp_name} !")

grid_type = None
n_pts = 0
req_fields = ["Grid Type", "Step X", "Step Y", "nColumns", "nRows"]
for req_field in req_fields:
if f"{grp_name}/{req_field}" not in fp:
raise ValueError(f"Unable to parse {grp_name}/{req_field} !")

self.tmp[ckey]["dimensionality"] = 2
grid_type = read_strings_from_dataset(fp[f"{grp_name}/Grid Type"][()])
if grid_type not in ["HexGrid", "SqrGrid"]:
raise ValueError(f"Grid Type {grid_type} is currently not supported !")
self.tmp[ckey]["grid_type"] = grid_type
if grid_type == "HexGrid":
self.tmp[ckey]["grid_type"] = HEXAGONAL_GRID
elif grid_type == "SqrGrid":
self.tmp[ckey]["grid_type"] = SQUARE_GRID
else:
raise ValueError(f"Unable to parse {grp_name}/Grid Type !")
self.tmp[ckey]["s_x"] = read_first_scalar(fp[f"{grp_name}/Step X"])
self.tmp[ckey]["s_unit"] = "um" # "µm" # TODO::always micron?
self.tmp[ckey]["n_x"] = read_first_scalar(fp[f"{grp_name}/nColumns"])
Expand Down Expand Up @@ -248,17 +251,17 @@ def parse_and_normalize_group_ebsd_data(self, fp, ckey: str):
# as the step size has already been accounted for by the tech partner when writing!
if self.version["schema_version"] in ["OIM Analysis 8.5.1002 x64 [07-17-20]"]:
print(f"{self.version['schema_version']}, tech partner accounted for calibration")
if self.tmp[ckey]["grid_type"] != SQUARE_GRID:
print(f"WARNING: Check carefully correct interpretation of scan_point coords!")
self.tmp[ckey]["scan_point_x"] \
= np.asarray(fp[f"{grp_name}/X Position"][:], np.float32)
self.tmp[ckey]["scan_point_y"] \
= np.asarray(fp[f"{grp_name}/Y Position"][:], np.float32)
else:
print(f"{self.version['schema_version']}, parser has to do the calibration")
if self.tmp[ckey]["grid_type"] != SQUARE_GRID:
print(f"WARNING: Check carefully correct interpretation of scan_point coords!")
self.tmp[ckey]["scan_point_x"] = np.asarray(
fp[f"{grp_name}/X Position"][:] * self.tmp[ckey]["s_x"], np.float32)
self.tmp[ckey]["scan_point_y"] = np.asarray(
fp[f"{grp_name}/Y Position"][:] * self.tmp[ckey]["s_y"], np.float32)
print(f"xmin {np.min(self.tmp[ckey]['scan_point_x'])}," \
f"xmax {np.max(self.tmp[ckey]['scan_point_x'])}," \
f"ymin {np.min(self.tmp[ckey]['scan_point_y'])}," \
f"ymax {np.max(self.tmp[ckey]['scan_point_y'])}")
8 changes: 8 additions & 0 deletions pynxtools/dataconverter/readers/em/subparsers/hfive_oxford.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
from pynxtools.dataconverter.readers.em.subparsers.hfive_base import HdfFiveBaseParser
from pynxtools.dataconverter.readers.em.utils.hfive_utils import \
read_strings_from_dataset, format_euler_parameterization
from pynxtools.dataconverter.readers.em.examples.ebsd_database import \
HEXAGONAL_GRID, SQUARE_GRID


class HdfFiveOxfordReader(HdfFiveBaseParser):
Expand Down Expand Up @@ -118,6 +120,10 @@ def parse_and_normalize_slice_ebsd_header(self, fp, ckey: str):
if f"{grp_name}" not in fp:
raise ValueError(f"Unable to parse {grp_name} !")

# TODO::check if Oxford always uses SquareGrid like assumed here
self.tmp[ckey]["dimensionality"] = 2
self.tmp[ckey]["grid_type"] = SQUARE_GRID

req_fields = ["X Cells", "Y Cells", "X Step", "Y Step"]
for req_field in req_fields:
if f"{grp_name}/{req_field}" not in fp:
Expand Down Expand Up @@ -231,6 +237,8 @@ def parse_and_normalize_slice_ebsd_data(self, fp, ckey: str):
# expected is order on x is first all possible x values while y == 0
# followed by as many copies of this linear sequence for each y increment
# no action needed Oxford reports already the pixel coordinate multiplied by step
if self.tmp[ckey]["grid_type"] != SQUARE_GRID:
print(f"WARNING: Check carefully correct interpretation of scan_point coords!")
# X, no, H5T_NATIVE_FLOAT, (size, 1), X position of each pixel in micrometers (origin: top left corner)
self.tmp[ckey]["scan_point_x"] = np.asarray(fp[f"{grp_name}/X"], np.float32)
# inconsistency f32 in file although specification states float
Expand Down
Loading

0 comments on commit 97fcaa1

Please sign in to comment.