From 97fcaa1674b172625c25e763dc4b4a44a34e0ca6 Mon Sep 17 00:00:00 2001 From: atomprobe-tc Date: Wed, 6 Dec 2023 18:27:06 +0100 Subject: [PATCH] Started the refactoring to discretize always all point cloud data which are not collected on a square grid that is smaller than the maximum possible extent supported by h5web, tested with use case 207_2081.edaxh5 resulting ROI map is a square likely due to improper handling of HexGrid, next steps: i) fix this bug for 207_2081, ii) replace xmap in ebsd map twod by discretized grid, iii) test with examples from all other tech partners, iv) run against all datasets --- .../readers/em/examples/ebsd_database.py | 3 + .../readers/em/subparsers/hfive_apex.py | 61 ++++++-- .../readers/em/subparsers/hfive_bruker.py | 11 +- .../em/subparsers/hfive_dreamthreed.py | 8 +- .../readers/em/subparsers/hfive_ebsd.py | 25 ++- .../readers/em/subparsers/hfive_edax.py | 21 +-- .../readers/em/subparsers/hfive_oxford.py | 8 + .../readers/em/subparsers/nxs_pyxem.py | 95 ++++++++---- .../readers/em/utils/get_scan_points.py | 80 ++++++++++ .../readers/em/utils/get_sqr_grid.py | 145 ++++++++++++++++++ .../readers/em/utils/hfive_utils.py | 5 + 11 files changed, 391 insertions(+), 71 deletions(-) create mode 100644 pynxtools/dataconverter/readers/em/utils/get_scan_points.py create mode 100644 pynxtools/dataconverter/readers/em/utils/get_sqr_grid.py diff --git a/pynxtools/dataconverter/readers/em/examples/ebsd_database.py b/pynxtools/dataconverter/readers/em/examples/ebsd_database.py index 620284f23..34e79debd 100644 --- a/pynxtools/dataconverter/readers/em/examples/ebsd_database.py +++ b/pynxtools/dataconverter/readers/em/examples/ebsd_database.py @@ -28,6 +28,9 @@ # is recoverable when there is no common agreement about the phases used and their # exact atomic configuration +HEXAGONAL_GRID = "hexagonal_grid" +SQUARE_GRID = "square_grid" + FreeTextToUniquePhase = {"Actinolite": "Actinolite", "al": "Al", diff --git a/pynxtools/dataconverter/readers/em/subparsers/hfive_apex.py b/pynxtools/dataconverter/readers/em/subparsers/hfive_apex.py index 26008e21b..47c339a91 100644 --- a/pynxtools/dataconverter/readers/em/subparsers/hfive_apex.py +++ b/pynxtools/dataconverter/readers/em/subparsers/hfive_apex.py @@ -37,7 +37,9 @@ from pynxtools.dataconverter.readers.em.utils.hfive_utils import \ read_strings_from_dataset from pynxtools.dataconverter.readers.em.examples.ebsd_database import \ - ASSUME_PHASE_NAME_TO_SPACE_GROUP + ASSUME_PHASE_NAME_TO_SPACE_GROUP, HEXAGONAL_GRID, SQUARE_GRID +from pynxtools.dataconverter.readers.em.utils.get_scan_points import \ + get_scan_point_coords class HdfFiveEdaxApexReader(HdfFiveBaseParser): @@ -106,7 +108,6 @@ def parse_and_normalize_group_ebsd_header(self, fp, ckey: str): if f"{self.prfx}/EBSD/ANG/DATA/DATA" not in fp: raise ValueError(f"Unable to parse {self.prfx}/EBSD/ANG/DATA/DATA !") - grid_type = None # for a regular tiling of R^2 with perfect hexagons n_pts = 0 # their vertical center of mass distance is smaller than the horizontal @@ -118,10 +119,14 @@ def parse_and_normalize_group_ebsd_header(self, fp, ckey: str): if f"{self.prfx}/Sample/{req_field}" not in fp: raise ValueError(f"Unable to parse {self.prfx}/Sample/{req_field} !") + self.tmp[ckey]["dimensionality"] = 2 grid_type = read_strings_from_dataset(fp[f"{self.prfx}/Sample/Grid Type"][()]) - if grid_type not in ["HexGrid", "SqrGrid"]: - raise ValueError(f"Grid Type {grid_type} is currently not supported !") - self.tmp[ckey]["grid_type"] = grid_type + if grid_type == "HexGrid": + 
self.tmp[ckey]["grid_type"] = HEXAGONAL_GRID + elif grid_type == "SqrGrid": + self.tmp[ckey]["grid_type"] = SQUARE_GRID + else: + raise ValueError(f"Unable to parse {self.prfx}/Sample/Grid Type !") self.tmp[ckey]["s_x"] = fp[f"{self.prfx}/Sample/Step X"][0] self.tmp[ckey]["s_unit"] = "um" # "µm" # TODO::always micron? self.tmp[ckey]["n_x"] = fp[f"{self.prfx}/Sample/Number Of Columns"][0] @@ -226,12 +231,40 @@ def parse_and_normalize_group_ebsd_data(self, fp, ckey: str): # TODO::currently assuming s_x and s_y are already the correct center of mass # distances for hexagonal or square tiling of R^2 # self.tmp[ckey]["grid_type"] in ["HexGrid", "SqrGrid"]: - self.tmp[ckey]["scan_point_x"] = np.asarray( - np.linspace(0, self.tmp[ckey]["n_x"] - 1, - num=self.tmp[ckey]["n_x"], - endpoint=True) * self.tmp[ckey]["s_x"], np.float32) - - self.tmp[ckey]["scan_point_y"] = np.asarray( - np.linspace(0, self.tmp[ckey]["n_y"] - 1, - num=self.tmp[ckey]["n_y"], - endpoint=True) * self.tmp[ckey]["s_y"], np.float32) + # if just SQUARE_GRID there is no point to explicitly compute the scan_point + # coordinates here (for every subparser) especially not when the respective + # quantity from the tech partner is just a pixel index i.e. zeroth, first px ... + # however, ideally the tech partners would use the scan_point fields to report + # calibrated absolute scan point positions in the local reference frame of the + # sample surface in which case these could indeed not just scaled positions + # having the correct x and y spacing but eventually even the absolute coordinate + # where the scan was performed on the sample surface whereby one could conclude + # more precisely where the scanned area was located, in practice though this precision + # is usually not needed because scientists assume that the ROI is representative for + # the material which they typically never scan (time, interest, costs, instrument + # availability) completely! + if self.tmp[ckey]["grid_type"] != SQUARE_GRID: + print(f"WARNING: {self.tmp[ckey]['grid_type']}: check carefully the " \ + f"correct interpretation of scan_point coords!") + # the case of EDAX APEX shows the key problem with implicit assumptions + # edaxh5 file not necessarily store the scan_point_{dim} positions + # therefore the following code is deprecated as the axes coordinates anyway + # have to be recomputed based on whether results are rediscretized on a coarser + # grid or not ! + # mind also that the code below anyway would give only the NeXus dim axis but + # not the array of pairs of x, y coordinates for each scan point + # TODO::also keep in mind that the order in which the scan points are stored + # i.e. which index on self.tmp[ckey]["euler"] belongs to which scan point + # depends not only on the scan grid but also the flight plan i.e. 
how the grid + # gets visited + # only because of the fact that in most cases people seem to accept that + # scanning snake like first a line along +x and then +y meandering over the + # scan area from the top left corner to the bottom right corner is JUST an + # assumption for a random or dynamically adaptive scan strategy the scan positions + # have to be reported anyway, TODO::tech partners should be convinced to export + # scaled and calibrated scan positions as they are not necessarily redundant information + # that can be stripped to improve performance of their commercial product, I mean + # we talk typically <5k pattern per second demanding to store 5k * 2 * 8B, indeed + # this is the non-harmonized content one is facing in the field of EBSD despite + # almost two decades of commercialization of the technique now + get_scan_point_coords(self.tmp[ckey]) diff --git a/pynxtools/dataconverter/readers/em/subparsers/hfive_bruker.py b/pynxtools/dataconverter/readers/em/subparsers/hfive_bruker.py index 4af1cd5e0..9457ec46d 100644 --- a/pynxtools/dataconverter/readers/em/subparsers/hfive_bruker.py +++ b/pynxtools/dataconverter/readers/em/subparsers/hfive_bruker.py @@ -39,7 +39,7 @@ from pynxtools.dataconverter.readers.em.utils.hfive_utils import \ EBSD_MAP_SPACEGROUP, read_strings_from_dataset, all_equal, format_euler_parameterization from pynxtools.dataconverter.readers.em.examples.ebsd_database import \ - ASSUME_PHASE_NAME_TO_SPACE_GROUP + ASSUME_PHASE_NAME_TO_SPACE_GROUP, HEXAGONAL_GRID, SQUARE_GRID class HdfFiveBrukerEspritReader(HdfFiveBaseParser): @@ -107,6 +107,12 @@ def parse_and_normalize_group_ebsd_header(self, fp, ckey: str): if f"{grp_name}" not in fp: raise ValueError(f"Unable to parse {grp_name} !") + self.tmp[ckey]["dimensionality"] = 2 # TODO::QUBE can also yield 3D datasets + if read_strings_from_dataset(fp[f"{grp_name}/Grid Type"]) == "isometric": + self.tmp[ckey]["grid_type"] = SQUARE_GRID + else: + raise ValueError(f"Unable to parse {grp_name}/Grid Type !") + req_fields = ["NCOLS", "NROWS", "XSTEP", "YSTEP"] for req_field in req_fields: if f"{grp_name}/{req_field}" not in fp: @@ -221,6 +227,9 @@ def parse_and_normalize_group_ebsd_data(self, fp, ckey: str): # there is X SAMPLE and Y SAMPLE but these are not defined somewhere instead # here adding x and y assuming that we scan first lines along positive x and then # moving downwards along +y + # TODO::calculation below x/y only valid if self.tmp[ckey]["grid_type"] == SQUARE_GRID + if self.tmp[ckey]["grid_type"] != SQUARE_GRID: + print(f"WARNING: Check carefully correct interpretation of scan_point coords!") self.tmp[ckey]["scan_point_x"] \ = np.asarray(np.tile(np.linspace(0., self.tmp[ckey]["n_x"] - 1., diff --git a/pynxtools/dataconverter/readers/em/subparsers/hfive_dreamthreed.py b/pynxtools/dataconverter/readers/em/subparsers/hfive_dreamthreed.py index bdb739ff0..248be5452 100644 --- a/pynxtools/dataconverter/readers/em/subparsers/hfive_dreamthreed.py +++ b/pynxtools/dataconverter/readers/em/subparsers/hfive_dreamthreed.py @@ -38,7 +38,7 @@ from pynxtools.dataconverter.readers.em.utils.hfive_utils import \ EBSD_MAP_SPACEGROUP, read_strings_from_dataset, all_equal, format_euler_parameterization from pynxtools.dataconverter.readers.em.examples.ebsd_database import \ - ASSUME_PHASE_NAME_TO_SPACE_GROUP + ASSUME_PHASE_NAME_TO_SPACE_GROUP, HEXAGONAL_GRID, SQUARE_GRID # DREAM3D implements essentially a data analysis workflow with individual steps # in the DREAM3D jargon each step is referred to as a filter, filters have 
well-defined @@ -312,6 +312,10 @@ def parse_and_normalize_ebsd_header(self, ckey: str): spc = h5r[f"{self.path_registry['group_geometry']}" \ f"/_SIMPL_GEOMETRY/SPACING"][:].flatten() idx = 0 + + # TODO::is it correct an assumption that DREAM3D regrids using square voxel + self.tmp[ckey]["dimensionality"] = 3 + self.tmp[ckey]["grid_type"] = SQUARE_GRID for dim in ["x", "y", "z"]: self.tmp[ckey][f"n_{dim}"] = dims[idx] self.tmp[ckey][f"s_{dim}"] = spc[idx] @@ -388,6 +392,8 @@ def parse_and_normalize_ebsd_data(self, ckey: str): # in effect, the phase_id == 0 rightly so marks position indexed with the null-model # normalize pixel coordinates to physical positions even though the origin can still dangle somewhere + if self.tmp[ckey]["grid_type"] != SQUARE_GRID: + print(f"WARNING: Check carefully correct interpretation of scan_point coords!") for dim in ["x", "y", "z"]: self.tmp[ckey][f"scan_point_{dim}"] \ = np.asarray(np.linspace(0, self.tmp[ckey][f"n_{dim}"] - 1, diff --git a/pynxtools/dataconverter/readers/em/subparsers/hfive_ebsd.py b/pynxtools/dataconverter/readers/em/subparsers/hfive_ebsd.py index 3a11eddec..8bb2bbeb1 100644 --- a/pynxtools/dataconverter/readers/em/subparsers/hfive_ebsd.py +++ b/pynxtools/dataconverter/readers/em/subparsers/hfive_ebsd.py @@ -38,7 +38,7 @@ from pynxtools.dataconverter.readers.em.utils.hfive_utils import \ EBSD_MAP_SPACEGROUP, read_strings_from_dataset, all_equal, format_euler_parameterization from pynxtools.dataconverter.readers.em.examples.ebsd_database import \ - ASSUME_PHASE_NAME_TO_SPACE_GROUP + ASSUME_PHASE_NAME_TO_SPACE_GROUP, HEXAGONAL_GRID, SQUARE_GRID class HdfFiveCommunityReader(HdfFiveBaseParser): @@ -108,6 +108,12 @@ def parse_and_normalize_group_ebsd_header(self, fp, ckey: str): if f"{grp_name}" not in fp: raise ValueError(f"Unable to parse {grp_name} !") + self.tmp[ckey]["dimensionality"] = 2 + if read_strings_from_dataset(fp[f"{grp_name}/Grid Type"][()]) == "isometric": + self.tmp[ckey]["grid_type"] = SQUARE_GRID + else: + raise ValueError(f"Unable to parse {grp_name}/Grid Type !") + req_fields = ["NCOLS", "NROWS", "XSTEP", "YSTEP"] for req_field in req_fields: if f"{grp_name}/{req_field}" not in fp: @@ -223,7 +229,10 @@ def parse_and_normalize_group_ebsd_data(self, fp, ckey: str): # X and Y # there exist X SAMPLE and Y SAMPLE which give indeed calibrated coordinates # relative to the sample coordinate system, ignore this for now an - # and TOD::just calibrate on image dimension + # and TODO::just calibrate on image dimension + # TODO::calculation below x/y only valid if self.tmp[ckey]["grid_type"] == SQUARE_GRID + if self.tmp[ckey]["grid_type"] != SQUARE_GRID: + print(f"WARNING: Check carefully correct interpretation of scan_point coords!") self.tmp[ckey]["scan_point_x"] \ = np.asarray(np.tile(np.linspace(0., self.tmp[ckey]["n_x"] - 1., @@ -236,17 +245,7 @@ def parse_and_normalize_group_ebsd_data(self, fp, ckey: str): num=self.tmp[ckey]["n_y"], endpoint=True) * self.tmp[ckey]["s_y"], self.tmp[ckey]["n_x"]), np.float32) - - # if np.shape(fp[f"{grp_name}/X SAMPLE"][:])[0] == n_pts: - # self.tmp[ckey]["scan_point_x"] \ - # = np.asarray(fp[f"{grp_name}/X SAMPLE"][:], np.float32) - # else: - # raise ValueError(f"{grp_name}/X SAMPLE has unexpected shape !") - # if np.shape(fp[f"{grp_name}/Y SAMPLE"][:])[0] == n_pts: - # self.tmp[ckey]["scan_point_y"] \ - # = np.asarray(fp[f"{grp_name}/Y SAMPLE"], np.float32) - # else: - # raise ValueError(f"{grp_name}/Y SAMPLE has unexpected shape !") + # X SAMPLE and Y SAMPLE seem to be something different! 
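# Illustrative sketch (hypothetical values, not part of the patch): the
# linspace-based construction of scan_point_x/scan_point_y used in these
# subparsers assumes a SQUARE_GRID rastered line by line along +x, stepping
# along +y per line, mirroring get_scan_point_coords in get_scan_points.py:
import numpy as np

n_x, n_y, s_x, s_y = 3, 2, 0.5, 0.5  # toy grid, step sizes in micron
scan_point_x = np.asarray(
    np.tile(np.linspace(0., n_x - 1., num=n_x, endpoint=True) * s_x, n_y), np.float32)
scan_point_y = np.asarray(
    np.repeat(np.linspace(0., n_y - 1., num=n_y, endpoint=True) * s_y, n_x), np.float32)
print(scan_point_x)  # [0.  0.5 1.  0.  0.5 1. ]  x cycles fastest
print(scan_point_y)  # [0.  0.  0.  0.5 0.5 0.5]  one y value per scan line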
# Band Contrast is not stored in Bruker but Radon Quality or MAD # but this is s.th. different as it is the mean angular deviation between diff --git a/pynxtools/dataconverter/readers/em/subparsers/hfive_edax.py b/pynxtools/dataconverter/readers/em/subparsers/hfive_edax.py index 586179a51..8e3fc1164 100644 --- a/pynxtools/dataconverter/readers/em/subparsers/hfive_edax.py +++ b/pynxtools/dataconverter/readers/em/subparsers/hfive_edax.py @@ -39,7 +39,7 @@ from pynxtools.dataconverter.readers.em.utils.hfive_utils import EULER_SPACE_SYMMETRY, \ read_strings_from_dataset, read_first_scalar, format_euler_parameterization from pynxtools.dataconverter.readers.em.examples.ebsd_database import \ - ASSUME_PHASE_NAME_TO_SPACE_GROUP + ASSUME_PHASE_NAME_TO_SPACE_GROUP, HEXAGONAL_GRID, SQUARE_GRID class HdfFiveEdaxOimAnalysisReader(HdfFiveBaseParser): @@ -110,17 +110,20 @@ def parse_and_normalize_group_ebsd_header(self, fp, ckey: str): if f"{grp_name}" not in fp: raise ValueError(f"Unable to parse {grp_name} !") - grid_type = None n_pts = 0 req_fields = ["Grid Type", "Step X", "Step Y", "nColumns", "nRows"] for req_field in req_fields: if f"{grp_name}/{req_field}" not in fp: raise ValueError(f"Unable to parse {grp_name}/{req_field} !") + self.tmp[ckey]["dimensionality"] = 2 grid_type = read_strings_from_dataset(fp[f"{grp_name}/Grid Type"][()]) - if grid_type not in ["HexGrid", "SqrGrid"]: - raise ValueError(f"Grid Type {grid_type} is currently not supported !") - self.tmp[ckey]["grid_type"] = grid_type + if grid_type == "HexGrid": + self.tmp[ckey]["grid_type"] = HEXAGONAL_GRID + elif grid_type == "SqrGrid": + self.tmp[ckey]["grid_type"] = SQUARE_GRID + else: + raise ValueError(f"Unable to parse {grp_name}/Grid Type !") self.tmp[ckey]["s_x"] = read_first_scalar(fp[f"{grp_name}/Step X"]) self.tmp[ckey]["s_unit"] = "um" # "µm" # TODO::always micron? self.tmp[ckey]["n_x"] = read_first_scalar(fp[f"{grp_name}/nColumns"]) @@ -248,17 +251,17 @@ def parse_and_normalize_group_ebsd_data(self, fp, ckey: str): # as the step size has already been accounted for by the tech partner when writing! 
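# Minimal sketch of the version-dependent calibration handled just below
# (hypothetical helper, assuming X/Y Position are either already in micron
# or raw pixel indices that still need scaling by the step size):
import numpy as np

def normalize_scan_positions(raw_positions, step_size, already_calibrated):
    """Return scan point positions as float32, scaled by step size if needed."""
    pos = np.asarray(raw_positions, np.float32)
    return pos if already_calibrated else pos * np.float32(step_size)

# e.g. normalize_scan_positions(fp[f"{grp_name}/X Position"][:], s_x, False)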
if self.version["schema_version"] in ["OIM Analysis 8.5.1002 x64 [07-17-20]"]: print(f"{self.version['schema_version']}, tech partner accounted for calibration") + if self.tmp[ckey]["grid_type"] != SQUARE_GRID: + print(f"WARNING: Check carefully correct interpretation of scan_point coords!") self.tmp[ckey]["scan_point_x"] \ = np.asarray(fp[f"{grp_name}/X Position"][:], np.float32) self.tmp[ckey]["scan_point_y"] \ = np.asarray(fp[f"{grp_name}/Y Position"][:], np.float32) else: print(f"{self.version['schema_version']}, parser has to do the calibration") + if self.tmp[ckey]["grid_type"] != SQUARE_GRID: + print(f"WARNING: Check carefully correct interpretation of scan_point coords!") self.tmp[ckey]["scan_point_x"] = np.asarray( fp[f"{grp_name}/X Position"][:] * self.tmp[ckey]["s_x"], np.float32) self.tmp[ckey]["scan_point_y"] = np.asarray( fp[f"{grp_name}/Y Position"][:] * self.tmp[ckey]["s_y"], np.float32) - print(f"xmin {np.min(self.tmp[ckey]['scan_point_x'])}," \ - f"xmax {np.max(self.tmp[ckey]['scan_point_x'])}," \ - f"ymin {np.min(self.tmp[ckey]['scan_point_y'])}," \ - f"ymax {np.max(self.tmp[ckey]['scan_point_y'])}") diff --git a/pynxtools/dataconverter/readers/em/subparsers/hfive_oxford.py b/pynxtools/dataconverter/readers/em/subparsers/hfive_oxford.py index e05d7f4d6..2f6d6d3d7 100644 --- a/pynxtools/dataconverter/readers/em/subparsers/hfive_oxford.py +++ b/pynxtools/dataconverter/readers/em/subparsers/hfive_oxford.py @@ -38,6 +38,8 @@ from pynxtools.dataconverter.readers.em.subparsers.hfive_base import HdfFiveBaseParser from pynxtools.dataconverter.readers.em.utils.hfive_utils import \ read_strings_from_dataset, format_euler_parameterization +from pynxtools.dataconverter.readers.em.examples.ebsd_database import \ + HEXAGONAL_GRID, SQUARE_GRID class HdfFiveOxfordReader(HdfFiveBaseParser): @@ -118,6 +120,10 @@ def parse_and_normalize_slice_ebsd_header(self, fp, ckey: str): if f"{grp_name}" not in fp: raise ValueError(f"Unable to parse {grp_name} !") + # TODO::check if Oxford always uses SquareGrid like assumed here + self.tmp[ckey]["dimensionality"] = 2 + self.tmp[ckey]["grid_type"] = SQUARE_GRID + req_fields = ["X Cells", "Y Cells", "X Step", "Y Step"] for req_field in req_fields: if f"{grp_name}/{req_field}" not in fp: @@ -231,6 +237,8 @@ def parse_and_normalize_slice_ebsd_data(self, fp, ckey: str): # expected is order on x is first all possible x values while y == 0 # followed by as many copies of this linear sequence for each y increment # no action needed Oxford reports already the pixel coordinate multiplied by step + if self.tmp[ckey]["grid_type"] != SQUARE_GRID: + print(f"WARNING: Check carefully correct interpretation of scan_point coords!") # X, no, H5T_NATIVE_FLOAT, (size, 1), X position of each pixel in micrometers (origin: top left corner) self.tmp[ckey]["scan_point_x"] = np.asarray(fp[f"{grp_name}/X"], np.float32) # inconsistency f32 in file although specification states float diff --git a/pynxtools/dataconverter/readers/em/subparsers/nxs_pyxem.py b/pynxtools/dataconverter/readers/em/subparsers/nxs_pyxem.py index 1709cd865..d9139e59a 100644 --- a/pynxtools/dataconverter/readers/em/subparsers/nxs_pyxem.py +++ b/pynxtools/dataconverter/readers/em/subparsers/nxs_pyxem.py @@ -48,7 +48,6 @@ from orix.quaternion import Rotation from orix.quaternion.symmetry import get_point_group from orix.vector import Vector3d -from scipy.spatial import KDTree import matplotlib.pyplot as plt @@ -58,13 +57,16 @@ from pynxtools.dataconverter.readers.em.utils.hfive_web_utils \ import 
hfive_web_decorate_nxdata from pynxtools.dataconverter.readers.em.utils.image_processing import thumbnail +from pynxtools.dataconverter.readers.em.utils.get_sqr_grid import \ + get_scan_points_with_mark_data_discretized_on_sqr_grid +from pynxtools.dataconverter.readers.em.utils.get_scan_points import \ + get_scan_point_axis_values, get_scan_point_coords PROJECTION_VECTORS = [Vector3d.xvector(), Vector3d.yvector(), Vector3d.zvector()] PROJECTION_DIRECTIONS = [("X", Vector3d.xvector().data.flatten()), ("Y", Vector3d.yvector().data.flatten()), ("Z", Vector3d.zvector().data.flatten())] - from pynxtools.dataconverter.readers.em.subparsers.hfive_oxford import HdfFiveOxfordReader from pynxtools.dataconverter.readers.em.subparsers.hfive_bruker import HdfFiveBrukerEspritReader from pynxtools.dataconverter.readers.em.subparsers.hfive_edax import HdfFiveEdaxOimAnalysisReader @@ -114,17 +116,33 @@ def parse(self, template: dict) -> dict: # copying over all data and content within tech partner files into NeXus makes # not much sense as the data exists and we would like to motivate that # tech partners and community members write NeXus content directly - # therefore currently in this example we carry over the EBSD map and some - # metadata to motivate that there is indeed value wrt to interoperability - # when such data are harmonized exactly this is the point we would like to - # make with this example for NeXus and NOMAD OASIS within the FAIRmat project + # therefore, in this example we carry over the EBSD map and some metadata + # to motivate that there is indeed value wrt to interoperability when such data + # are harmonized upon injection in the RDMS - exactly this is the point + # we would like to make with this comprehensive example of data harmonization + # within the field of EBSD as one method in the field of electron diffraction + # we use NeXus, NOMAD OASIS within the FAIRmat project # it is practically beyond our resources to implement a mapping for all cases - # and corner cases of the vendor files + # and corner cases of vendor files # ideally concept mapping would be applied to just point to pieces of information - # in the HDF5 file that is written by the tech partners however because of the - # fact that currently these pieces of information are formatted very differently - # it is non-trivial to establish this mapping and only because of this we - # map over manually + # in (HDF5) files based on which semantically understood pieces of information + # are then interpreted and injected into the RDMS + # currently the fact that the documentation by tech partners is incomplete + # and the fact that conceptually similar or even the same concepts as instances + # with their pieces of information are formatted very differently, it is + # non-trivial to establish this mapping and only because of this we + # map over using hardcoding of concept names and symbols + + # a collection of different tech-partner-specific subparser follows + # these subparsers already extract specific information and perform a first + # step of harmonization. The subparsers specifically store e.g. 
EBSD maps in a + # tmp dictionary, which is + # TODO: scan point positions (irrespective on which grid type (sqr, hex) these + # were probed, in some cases the grid may have a two large extent along a dim + # so that a sub-sampling is performed, here only for the purpose of using + # h5web to show the IPF color maps but deal with the fact that h5web has so far + # not been designed to deal with images as large as several thousand pixels along + # either dimension if hfive_parser_type == "oxford": oina = HdfFiveOxfordReader(self.file_path) oina.parse_and_normalize() @@ -199,6 +217,8 @@ def process_into_template(self, inp: dict, template: dict) -> dict: return template def get_named_axis(self, inp: dict, dim_name: str): + """"Return scaled but not offset-calibrated scan point coordinates along dim.""" + # TODO::remove! return np.asarray(np.linspace(0, inp[f"n_{dim_name}"] - 1, num=inp[f"n_{dim_name}"], @@ -217,38 +237,47 @@ def process_roi_overview_ebsd_based(self, roi_id: str, template: dict) -> dict: print("Parse ROI default plot...") + # tech partner specific subparsers have just extracted the per scan point information + # in the sequence they were (which is often how they were scanned) + # however that can be a square, a hexagonal or some random grid + # a consuming visualization tool (like h5web) may however not be able to + # represent the data as point cloud but only visualizes a grid of square pixels + # therefore in general the scan_point_x and scan_point_y arrays and their associated + # data arrays such as euler should always be interpolated on a specific grid + # Here, the square_grid supported by h5web with a specific maximum extent + # which may represent a downsampled representation of the actual ROI + # only in the case that indeed the grid is a square grid this interpolation is + # obsolete but also only when the grid does not exceed the technical limitation + # of here h5web + # TODO::implement rediscretization using a kdtree take n_x, n_y, and n_z as guides + + trg_grid \ + = get_scan_points_with_mark_data_discretized_on_sqr_grid(inp, + HFIVE_WEB_MAXIMUM_ROI) + contrast_modes = [(None, "n/a"), ("bc", "normalized_band_contrast"), ("ci", "normalized_confidence_index"), ("mad", "normalized_mean_angular_deviation")] contrast_mode = None for mode in contrast_modes: - if mode[0] in inp.keys() and contrast_mode is None: + if mode[0] in trg_grid.keys() and contrast_mode is None: contrast_mode = mode break if contrast_mode is None: print(f"{__name__} unable to generate plot for entry{self.entry_id}, roi{roi_id} !") return template - is_threed = False - if "n_z" in inp.keys(): - is_threed = True - if np.max((inp["n_x"], inp["n_y"], inp["n_z"])) > HFIVE_WEB_MAXIMUM_ROI: - raise ValueError(f"Plotting 3D roi_overviews larger than " \ - f"{HFIVE_WEB_MAXIMUM_ROI} is not supported !") - else: - if np.max((inp["n_x"], inp["n_y"])) > HFIVE_WEB_MAXIMUM_ROI: - raise ValueError(f"Plotting 2D roi_overviews larger than " \ - f"{HFIVE_WEB_MAXIMUM_ROI} is not supported !") - - template[f"/ENTRY[entry{self.entry_id}]/ROI[roi{roi_id}]/@NX_class"] = "NXroi" # TODO::writer should decorate automatically! - template[f"/ENTRY[entry{self.entry_id}]/ROI[roi{roi_id}]/ebsd/indexing/@NX_class"] = "NXprocess" # TODO::writer should decorate automatically! + template[f"/ENTRY[entry{self.entry_id}]/ROI[roi{roi_id}]/@NX_class"] = "NXroi" + # TODO::writer should decorate automatically! 
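# Sketch (toy values, not part of the patch) unpacking the dense uint8
# conversion used for the "data" field written further below: scale the
# contrast signal to [0, 255], cast, and reshape row-major into an
# n_y x n_x image that h5web can display.
import numpy as np

signal = np.asarray([0.2, 0.4, 0.6, 0.8], np.float32)  # stand-in for trg_grid["bc"]
n_x, n_y = 2, 2
scaled = signal / np.max(signal) * 255.          # normalize to [0, 255]
img = np.reshape(np.asarray(np.asarray(scaled, np.uint32), np.uint8),
                 (n_y, n_x), order="C")          # rows along y, columns along x
print(img)  # [[ 63 127] [191 255]]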
+ template[f"/ENTRY[entry{self.entry_id}]/ROI[roi{roi_id}]/ebsd/indexing/@NX_class"] = "NXprocess" + # TODO::writer should decorate automatically! trg = f"/ENTRY[entry{self.entry_id}]/ROI[roi{roi_id}]/ebsd/indexing/DATA[roi]" template[f"{trg}/title"] = f"Region-of-interest overview image" template[f"{trg}/@NX_class"] = f"NXdata" # TODO::writer should decorate automatically! template[f"{trg}/@signal"] = "data" dims = ["x", "y"] - if is_threed is True: + if trg_grid["dimensionality"] == 3: dims.append("z") idx = 0 for dim in dims: @@ -258,22 +287,22 @@ def process_roi_overview_ebsd_based(self, for dim in dims[::-1]: template[f"{trg}/@axes"].append(f"axis_{dim}") - if is_threed is True: - template[f"{trg}/data"] = {"compress": np.squeeze(np.asarray(np.asarray((inp[contrast_mode[0]] / np.max(inp[contrast_mode[0]], axis=None) * 255.), np.uint32), np.uint8), axis=3), "strength": 1} + if trg_grid["dimensionality"] == 3: + template[f"{trg}/data"] = {"compress": np.squeeze(np.asarray(np.asarray((trg_grid[contrast_mode[0]] / np.max(trg_grid[contrast_mode[0]], axis=None) * 255.), np.uint32), np.uint8), axis=3), "strength": 1} else: - template[f"{trg}/data"] = {"compress": np.reshape(np.asarray(np.asarray((inp[contrast_mode[0]] / np.max(inp[contrast_mode[0]]) * 255.), np.uint32), np.uint8), (inp["n_y"], inp["n_x"]), order="C"), "strength": 1} + template[f"{trg}/data"] = {"compress": np.reshape(np.asarray(np.asarray((trg_grid[contrast_mode[0]] / np.max(trg_grid[contrast_mode[0]]) * 255.), np.uint32), np.uint8), (trg_grid["n_y"], trg_grid["n_x"]), order="C"), "strength": 1} template[f"{trg}/descriptor"] = contrast_mode[1] # 0 is y while 1 is x for 2d, 0 is z, 1 is y, while 2 is x for 3d template[f"{trg}/data/@long_name"] = f"Signal" hfive_web_decorate_nxdata(f"{trg}/data", template) - scan_unit = inp["s_unit"] + scan_unit = trg_grid["s_unit"] if scan_unit == "um": scan_unit = "µm" for dim in dims: template[f"{trg}/AXISNAME[axis_{dim}]"] \ - = {"compress": self.get_named_axis(inp, dim), "strength": 1} + = {"compress": self.get_named_axis(trg_grid, dim), "strength": 1} template[f"{trg}/AXISNAME[axis_{dim}]/@long_name"] \ = f"Coordinate along {dim}-axis ({scan_unit})" template[f"{trg}/AXISNAME[axis_{dim}]/@units"] = f"{scan_unit}" @@ -287,7 +316,7 @@ def process_roi_ebsd_maps(self, inp: dict, template: dict) -> dict: if "n_z" not in inp[ckey].keys(): self.prepare_roi_ipfs_phases_twod(inp[ckey], roi_id, template) self.process_roi_ipfs_phases_twod(inp[ckey], roi_id, template) - # self.onthefly_process_roi_ipfs_phases_threed(inp[ckey], roi_id, template) + # self.onthefly_process_roi_ipfs_phases_two(inp[ckey], roi_id, template) else: self.onthefly_process_roi_ipfs_phases_threed(inp[ckey], roi_id, template) return template @@ -612,7 +641,7 @@ def process_roi_phase_ipfs_threed(self, inp: dict, roi_id: int, pyxem_phase_id: # TODO: I have not seen any dataset yet where is limit is exhausted, the largest # dataset is a 3D SEM/FIB study from a UK project this is likely because to # get an EBSD map as large one already scans quite long for one section as making - # a ompromise is required and thus such hypothetical large serial-sectioning + # a compromise is required and thus such hypothetical large serial-sectioning # studies would block the microscope for a very long time # however I have seen examples from Hadi Pirgazi with L. 
Kestens from Leuven # where indeed large but thin 3d slabs were characterized diff --git a/pynxtools/dataconverter/readers/em/utils/get_scan_points.py b/pynxtools/dataconverter/readers/em/utils/get_scan_points.py new file mode 100644 index 000000000..9bfbe5ab1 --- /dev/null +++ b/pynxtools/dataconverter/readers/em/utils/get_scan_points.py @@ -0,0 +1,80 @@ +# +# Copyright The NOMAD Authors. +# +# This file is part of NOMAD. See https://nomad-lab.eu for further info. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Identify likely scan_point_positions for specific EBSD grid types.""" + +# pylint: disable=no-member + +import numpy as np + +from pynxtools.dataconverter.readers.em.examples.ebsd_database import \ + HEXAGONAL_GRID, SQUARE_GRID + + +def get_scan_point_axis_values(inp: dict, dim_name: str): + is_threed = False + if "dimensionality" in inp.keys(): + if inp["dimensionality"] == 3: + is_threed = True + req_keys = ["grid_type", f"n_{dim_name}", f"s_{dim_name}"] + for key in req_keys: + if key not in inp.keys(): + raise ValueError(f"Unable to find required key {key} in inp !") + + if inp["grid_type"] in [HEXAGONAL_GRID, SQUARE_GRID]: + return np.asarray(np.linspace(0, + inp[f"n_{dim_name}"] - 1, + num=inp[f"n_{dim_name}"], + endpoint=True) * inp[f"s_{dim_name}"], np.float32) + else: + return None + + +def get_scan_point_coords(inp: dict) -> dict: + """Add scan_point_dim array assuming top-left to bottom-right snake style scanning.""" + is_threed = False + if "dimensionality" in inp.keys(): + if inp["dimensionality"] == 3: + is_threed = True + + req_keys = ["grid_type"] + dims = ["x", "y"] + if is_threed is True: + dims.append("z") + for dim in dims: + req_keys.append(f"n_{dim}") + req_keys.append(f"s_{dim}") + + for key in req_keys: + if key not in inp.keys(): + raise ValueError(f"Unable to find required key {key} in inp !") + + if is_threed is False: + if inp["grid_type"] in [SQUARE_GRID, HEXAGONAL_GRID]: + # TODO::check that below code is correct as well for hexagonal grid ! + for dim in dims: + if "scan_point_{dim}" in inp.keys(): + print("WARNING::Overwriting scan_point_{dim} !") + inp["scan_point_x"] = np.tile( + np.linspace(0, inp["n_x"] - 1, num=inp["n_x"], endpoint=True) * inp["s_x"], inp["n_y"]) + inp["scan_point_y"] = np.repeat( + np.linspace(0, inp["n_y"] - 1, num=inp["n_y"], endpoint=True) * inp["s_y"], inp["n_x"]) + else: + print("WARNING::{__name__} facing an unknown scan strategy !") + else: + print("WARNING::{__name__} not implemented for 3D case !") + return inp diff --git a/pynxtools/dataconverter/readers/em/utils/get_sqr_grid.py b/pynxtools/dataconverter/readers/em/utils/get_sqr_grid.py new file mode 100644 index 000000000..ec9a78eb3 --- /dev/null +++ b/pynxtools/dataconverter/readers/em/utils/get_sqr_grid.py @@ -0,0 +1,145 @@ +# +# Copyright The NOMAD Authors. +# +# This file is part of NOMAD. See https://nomad-lab.eu for further info. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Discretize point cloud in R^d (d=2, 3) with mark data to square/cube voxel grid.""" + +# pylint: disable=no-member + +import numpy as np +from scipy.spatial import KDTree + +from pynxtools.dataconverter.readers.em.examples.ebsd_database import SQUARE_GRID + + +def get_scan_points_with_mark_data_discretized_on_sqr_grid(src_grid: dict, + max_edge_length: int) -> dict: + """Inspect grid_type, dimensionality, point locations, and mark src_grida, map then.""" + is_threed = False + if "dimensionality" in src_grid.keys(): + if src_grid["dimensionality"] == 3: + is_threed = True + + req_keys = ["grid_type"] + dims = ["x", "y"] + if is_threed is True: + dims.append("z") + for dim in dims: + req_keys.append(f"scan_point_{dim}") + req_keys.append(f"n_{dim}") + req_keys.append(f"s_{dim}") + + trg_grid = {} + for key in req_keys: + if key not in src_grid.keys(): + raise ValueError(f"Unable to find required key {key} in src_grid !") + + # take discretization of the source grid as a guide for the target_grid + # optimization possible if square grid and matching maximum_extent + + max_extent = None + if is_threed is False: + max_extent = np.max((src_grid["n_x"], src_grid["n_y"])) + else: + max_extent = np.max((src_grid["n_x"], src_grid["n_y"], src_grid["n_z"])) + + if src_grid["grid_type"] == SQUARE_GRID: + if max_extent <= max_edge_length: + return src_grid + else: + max_extent = max_edge_length # cap the maximum extent + + # all non-square grids or too large square grids will be + # discretized onto a regular grid with square or cubic pixel/voxel + aabb = [] + for dim in dims: + aabb.append(np.min(src_grid[f"scan_point_{dim}"])) # - 0.5 * src_grid[f"s_{dim}"])) + aabb.append(np.max(src_grid[f"scan_point_{dim}"])) # + 0.5 * src_grid[f"s_{dim}"])) + print(f"{aabb}") + + if is_threed is False: + if aabb[1] - aabb[0] >= aabb[3] - aabb[2]: + sxy = (aabb[1] - aabb[0]) / max_extent + nxy = [max_extent, int(np.ceil((aabb[3] - aabb[2]) / sxy))] + else: + sxy = (aabb[3] - aabb[2]) / max_extent + nxy = [int(np.ceil((aabb[1] - aabb[0]) / sxy)), max_extent] + print(f"H5Web default plot generation, scaling nxy0 {[src_grid['n_x'], src_grid['n_y']]}, nxy {nxy}") + # the above estimate is not exactly correct (may create a slight real space shift) + # of the EBSD map TODO:: regrid the real world axis-aligned bounding box aabb with + # a regular tiling of squares or hexagons + # https://stackoverflow.com/questions/18982650/differences-between-matlab-and-numpy-and-pythons-round-function + # MTex/Matlab round not exactly the same as numpy round but reasonably close + + # scan point positions were normalized by tech partner subparsers such that they + # always build on pixel coordinates calibrated for step size not by giving absolute positions + # in the sample surface frame of reference as this is typically not yet consistently documented + # because we assume in addition that we always start at the top left corner the zeroth/first + # coordinate is always 0., 0. ! 
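+    # nearest-neighbour resampling: build the target pixel centres xy in
+    # row-major order (x cycling fastest, matching the scan layout), then
+    # query a KDTree of the source scan points so that each target pixel
+    # inherits the mark data (euler, phase_id, bc, ci, mad) of its closest
+    # measured point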
+ xy = np.column_stack( + (np.tile(np.linspace(0, nxy[0] - 1, num=nxy[0], endpoint=True) * sxy, nxy[1]), + np.repeat(np.linspace(0, nxy[1] - 1, num=nxy[1], endpoint=True) * sxy, nxy[0]))) + # TODO:: if scan_point_{dim} are calibrated this approach + # here would shift the origin to 0, 0 implicitly which may not be desired + print(f"xy {xy}, shape {np.shape(xy)}") + tree = KDTree(np.column_stack((src_grid["scan_point_x"], src_grid["scan_point_y"]))) + d, idx = tree.query(xy, k=1) + if np.sum(idx == tree.n) > 0: + raise ValueError(f"kdtree query left some query points without a neighbor!") + del d + del tree + + # rebuild src_grid container with only the relevant src_grida selected from src_grid + for key in src_grid.keys(): + if key == "euler": + trg_grid[key] = np.zeros((np.shape(xy)[0], 3), np.float32) + trg_grid[key] = np.nan + trg_grid[key] = src_grid["euler"][idx, :] + if np.isnan(trg_grid[key]).any() is True: + raise ValueError(f"Downsampling of the point cloud left " \ + f"pixels without mark data {key} !") + print(f"final np.shape(trg_grid[{key}]) {np.shape(trg_grid[key])}") + elif key == "phase_id" or key == "bc": + trg_grid[key] = np.zeros((np.shape(xy)[0],), np.int32) - 2 + # pyxem_id is at least -1, bc is typically positive + trg_grid[key] = src_grid[key][idx] + if np.sum(trg_grid[key] == -2) > 0: + raise ValueError(f"Downsampling of the point cloud left " \ + f"pixels without mark data {key} !") + print(f"final np.shape(trg_grid[{key}]) {np.shape(trg_grid[key])}") + elif key == "ci" or key == "mad": + trg_grid[key] = np.zeros((np.shape(xy)[0],), np.float32) + trg_grid[key] = np.nan + trg_grid[key] = src_grid[key][idx] + print(f"final np.shape(trg_grid[{key}]) {np.shape(trg_grid[key])}") + if np.isnan(trg_grid[key]).any() is True: + raise ValueError(f"Downsampling of the point cloud left " \ + f"pixels without mark data {key} !") + elif key not in ["n_x", "n_y", "n_z", "s_x", "s_y", "s_z"]: + print(f"WARNING:: src_grid[{key}] is mapped as is on trg_grid[{key}] !") + trg_grid[key] = src_grid[key] + print(f"final np.shape(trg_grid[{key}]) {np.shape(trg_grid[key])}") + else: + print(f"WARNING:: src_grid[{key}] is not yet mapped on trg_grid[{key}] !") + trg_grid["n_x"] = nxy[0] + trg_grid["n_y"] = nxy[1] + trg_grid["s_x"] = sxy + trg_grid["s_y"] = sxy + # TODO::need to update scan_point_{dim} + return trg_grid + else: + raise ValueError(f"The 3D discretization is currently not implemented because " \ + f"we do not know of any large enough dataset the test it !") diff --git a/pynxtools/dataconverter/readers/em/utils/hfive_utils.py b/pynxtools/dataconverter/readers/em/utils/hfive_utils.py index bf1e7af10..3b2320c4e 100644 --- a/pynxtools/dataconverter/readers/em/utils/hfive_utils.py +++ b/pynxtools/dataconverter/readers/em/utils/hfive_utils.py @@ -101,3 +101,8 @@ def read_first_scalar(obj): def all_equal(iterable): g = groupby(iterable) return next(g, True) and not next(g, False) + + +# for dim in ["x", "y"]: +# print(f"{dim}min {np.min(self.tmp[ckey][f'scan_point_{dim}'])}") +# print(f"{dim}max {np.max(self.tmp[ckey][f'scan_point_{dim}'])}")
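A minimal, self-contained sketch of the nearest-neighbour regridding idea that get_sqr_grid.py implements, on toy data (all names and values here are illustrative, not the reader's actual API): scan points from a hexagonal grid are resampled onto a square grid whose edge length is capped, and each target pixel inherits the mark data of its closest measured point.

import numpy as np
from scipy.spatial import KDTree

# toy hexagonal-grid scan points: every other row offset by half a step
n_x, n_y, s = 6, 4, 1.0
src_x = np.concatenate(
    [np.arange(n_x) * s + (0.5 * s if j % 2 else 0.0) for j in range(n_y)])
src_y = np.repeat(np.arange(n_y) * s * np.sqrt(3.) / 2., n_x)  # hex row spacing
src_xy = np.column_stack((src_x, src_y))
src_mark = np.arange(src_xy.shape[0], dtype=np.float32)  # one mark value per point

# target: square pixels over the axis-aligned bounding box, capped in extent
max_edge_length = 8  # stand-in for HFIVE_WEB_MAXIMUM_ROI
aabb = [src_x.min(), src_x.max(), src_y.min(), src_y.max()]
sxy = max(aabb[1] - aabb[0], aabb[3] - aabb[2]) / max_edge_length
nx = int(np.ceil((aabb[1] - aabb[0]) / sxy))
ny = int(np.ceil((aabb[3] - aabb[2]) / sxy))
trg_xy = np.column_stack((
    np.tile(np.linspace(0, nx - 1, num=nx, endpoint=True) * sxy, ny),
    np.repeat(np.linspace(0, ny - 1, num=ny, endpoint=True) * sxy, nx)))

# nearest-neighbour lookup: every target pixel inherits its closest point's mark
d, idx = KDTree(src_xy).query(trg_xy, k=1)
trg_mark = src_mark[idx]
print(trg_mark.reshape((ny, nx)))  # the downsampled square-grid image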