Implemented xmap mapping, runthrough tests with all datasets in HDF5 family files, bugfixing

markus.kuehbach committed Oct 23, 2023
1 parent 4076096 commit f947f9d
Showing 15 changed files with 178 additions and 61 deletions.
3 changes: 3 additions & 0 deletions pynxtools/dataconverter/readers/em/concepts/README.md
@@ -0,0 +1,3 @@
# Context

Mapping pieces of information from source concepts onto NeXus concepts.
4 changes: 4 additions & 0 deletions pynxtools/dataconverter/readers/em/examples/README.md
@@ -0,0 +1,4 @@
# Context

Specific code that is relevant only for the implementation of examples for
this em parser and the NOMAD OASIS research data management system.
3 changes: 3 additions & 0 deletions pynxtools/dataconverter/readers/em/examples/ebsd_database.py
@@ -303,3 +303,6 @@
"ZrO": "Zr;O"}

ProjectIdToCitation = {"Forsterite.ctf.nxs.mtex": {"data": "someurl", "paper": "someurl"}}

+ AssumePhaseNameToSpaceGroup = {"Silver": 225,
+ "Copper": 225}
4 changes: 4 additions & 0 deletions pynxtools/dataconverter/readers/em/geometry/README.md
@@ -0,0 +1,4 @@
# Context

Utility code for handling coordinate systems and geometrical entities
used or normalized by the em parser or its components.
11 changes: 1 addition & 10 deletions pynxtools/dataconverter/readers/em/reader.py
@@ -134,15 +134,6 @@ def read(self,
# for dat_instance in case.dat_parser_type:
# print(f"Process pieces of information in {dat_instance} tech partner file...")
# continue
# if case.dat_parser_type == "orix":
# orix_parser = NxEmOmOrixEbsdParser(case.dat[0], entry_id)
# # h5oina parser evaluating content and plotting with orix on the fly
# orix_parser.parse(template)
# elif case.dat_parser_type == "mtex":
# mtex_parser = NxEmOmMtexEbsdParser(case.dat[0], entry_id)
# # ebsd parser because concept suggested for MTex by M. Kühbach
# # would include different HDF5 dumps for different MTex classes
# mtex_parser.parse(template)
# elif case.dat_parser_type == "zip":
# zip_parser = NxEmOmZipEbsdParser(case.dat[0], entry_id)
# zip_parser.parse(template)
@@ -167,7 +158,7 @@ def read(self,
if resolved_path != "":
nxs_plt.annotate_default_plot(template, resolved_path)

- debugging = True
+ debugging = False
if debugging is True:
print("Reporting state of template before passing to HDF5 writing...")
for keyword in template.keys():
3 changes: 3 additions & 0 deletions pynxtools/dataconverter/readers/em/subparsers/README.md
@@ -0,0 +1,3 @@
# Context

Specific parsers for metadata and data stored in HDF5 files from technology partners.
8 changes: 4 additions & 4 deletions pynxtools/dataconverter/readers/em/subparsers/hfive_apex.py
@@ -80,7 +80,7 @@ def check_if_supported(self):
def parse_and_normalize(self):
"""Read and normalize away EDAX/APEX-specific formatting with an equivalent in NXem."""
with h5py.File(f"{self.file_path}", "r") as h5r:
- cache_id = 0
+ cache_id = 1
grp_nms = list(h5r["/"])
for grp_nm in grp_nms:
sub_grp_nms = list(h5r[grp_nm])
@@ -169,7 +169,7 @@ def parse_and_normalize_group_ebsd_phases(self, fp, ckey: str):
# problematic because mapping is not bijective!
# if you know the space group we know laue and point group and symmetry
# but the opposite direction leaves room for ambiguities
space_group = "n/a"
space_group = None
self.tmp[ckey]["phases"][int(phase_id)]["space_group"] = space_group

if len(self.tmp[ckey]["space_group"]) > 0:
@@ -210,8 +210,8 @@ def parse_and_normalize_group_ebsd_data(self, fp, ckey: str):
# check shape of internal virtual chunked number array
r = Rotation.from_matrix([np.reshape(dat[i][0], (3, 3))])
self.tmp[ckey]["euler"][i, :] = r.to_euler(degrees=False)
self.tmp[ckey]["phase_id"][i] = dat[i][2]
self.tmp[ckey]["ci"][i] = dat[i][3]
self.tmp[ckey]["ci"][i] = dat[i][2]
self.tmp[ckey]["phase_id"][i] = dat[i][3]

# TODO::convert orientation matrix to Euler angles via om_eu but what are conventions !
# orix based transformation ends up in positive half space and with degrees=False
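The two-line swap above corrects which columns of the chunked record array feed the confidence index and the phase identifier. On the open TODO about Euler conventions, a minimal sketch, assuming Bunge (intrinsic ZXZ) angles and using scipy as a stand-in for the rotation helper the parser imports:

    import numpy as np
    from scipy.spatial.transform import Rotation  # assumption: stand-in library

    om = np.eye(3)  # placeholder for the 3x3 orientation matrix in dat[i][0]
    # whether om maps lab->crystal or crystal->lab (passive vs active rotation)
    # is exactly the convention question the TODO raises; uppercase "ZXZ" means intrinsic
    phi1, Phi, phi2 = Rotation.from_matrix(om).as_euler("ZXZ", degrees=False)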
5 changes: 3 additions & 2 deletions pynxtools/dataconverter/readers/em/subparsers/hfive_bruker.py
@@ -88,7 +88,7 @@ def check_if_supported(self):
def parse_and_normalize(self):
"""Read and normalize away Bruker-specific formatting with an equivalent in NXem."""
with h5py.File(f"{self.file_path}", "r") as h5r:
- cache_id = 0
+ cache_id = 1
grp_names = list(h5r["/"])
for grp_name in grp_names:
if grp_name not in ["Version", "Manufacturer"]:
@@ -159,8 +159,9 @@ def parse_and_normalize_group_ebsd_phases(self, fp, ckey: str):
self.tmp[ckey]["phases"][int(phase_id)]["space_group"] = space_group
else:
raise ValueError(f"Unable to decode improperly formatted space group {spc_grp} !")

- # formatting is a nightmare F m#ovl3m for F m 3bar m...
+ # TODO::in some cases a group IT (likely International Tables for Crystallography)
+ # entry is present instead, so parse that rather than the mapping table used above
if len(self.tmp[ckey]["space_group"]) > 0:
self.tmp[ckey]["space_group"].append(space_group)
else:
@@ -89,7 +89,7 @@ def check_if_supported(self):
def parse_and_normalize(self):
"""Read and normalize away community-specific formatting with an equivalent in NXem."""
with h5py.File(f"{self.file_path}", "r") as h5r:
- cache_id = 0
+ cache_id = 1
grp_names = list(h5r["/"])
for grp_name in grp_names:
if grp_name not in ["Version", "Manufacturer"]:
26 changes: 18 additions & 8 deletions pynxtools/dataconverter/readers/em/subparsers/hfive_edax.py
@@ -37,7 +37,7 @@

from pynxtools.dataconverter.readers.em.subparsers.hfive_base import HdfFiveBaseParser
from pynxtools.dataconverter.readers.em.utils.hfive_utils import \
- read_strings_from_dataset, format_euler_parameterization
+ read_strings_from_dataset, read_first_scalar, format_euler_parameterization


class HdfFiveEdaxOimAnalysisReader(HdfFiveBaseParser):
@@ -91,7 +91,7 @@ def check_if_supported(self):
def parse_and_normalize(self):
"""Read and normalize away EDAX-specific formatting with an equivalent in NXem."""
with h5py.File(f"{self.file_path}", "r") as h5r:
- cache_id = 0
+ cache_id = 1
grp_names = list(h5r["/"])
for grp_name in grp_names:
if grp_name not in ["Version", "Manufacturer"]:
@@ -119,11 +119,14 @@ def parse_and_normalize_group_ebsd_header(self, fp, ckey: str):
if grid_type not in ["HexGrid", "SqrGrid"]:
raise ValueError(f"Grid Type {grid_type} is currently not supported !")
self.tmp[ckey]["grid_type"] = grid_type
self.tmp[ckey]["s_x"] = fp[f"{grp_name}/Step X"][()]
self.tmp[ckey]["s_x"] = read_first_scalar(fp[f"{grp_name}/Step X"])
self.tmp[ckey]["s_unit"] = "µm" # TODO::always micron?
self.tmp[ckey]["n_x"] = fp[f"{grp_name}/nColumns"][()]
self.tmp[ckey]["s_y"] = fp[f"{grp_name}/Step Y"][()]
self.tmp[ckey]["n_y"] = fp[f"{grp_name}/nRows"][()]
self.tmp[ckey]["n_x"] = read_first_scalar(fp[f"{grp_name}/nColumns"])
self.tmp[ckey]["s_y"] = read_first_scalar(fp[f"{grp_name}/Step Y"])
self.tmp[ckey]["n_y"] = read_first_scalar(fp[f"{grp_name}/nRows"])
# TODO::different version store the same concept with the same path name with different shape
# the read_first_scalar is not an optimal solution, in the future all reads from
# HDF5 should check for the shape instead
# TODO::check that all data are consistent

def parse_and_normalize_group_ebsd_phases(self, fp, ckey: str):
@@ -165,10 +168,11 @@ def parse_and_normalize_group_ebsd_phases(self, fp, ckey: str):
= np.asarray(angles, np.float32)

# Space Group not stored, only laue group, point group and symmetry
+ # https://doi.org/10.1107/S1600576718012724 is a relevant read here
# problematic because mapping is not bijective!
# if you know the space group we know laue and point group and symmetry
# but the opposite direction leaves room for ambiguities
space_group = "n/a"
space_group = None
self.tmp[ckey]["phases"][int(phase_id)]["space_group"] = space_group

if len(self.tmp[ckey]["space_group"]) > 0:
@@ -207,7 +211,13 @@ def parse_and_normalize_group_ebsd_data(self, fp, ckey: str):
# TODO::seems to be the situation in the example but there is no documentation
self.tmp[ckey]["euler"] = format_euler_parameterization(self.tmp[ckey]["euler"])

self.tmp[ckey]["phase_id"] = np.asarray(fp[f"{grp_name}/Phase"][:], np.int32)
# given no official EDAX OimAnalysis spec we cannot define for sure if
# phase_id == 0 means just all was indexed with the first/zeroth phase or nothing
# was indexed, TODO::assuming it means all indexed:
if np.all(fp[f"{grp_name}/Phase"][:] == 0):
self.tmp[ckey]["phase_id"] = np.zeros(n_pts, np.int32) + 1
else:
self.tmp[ckey]["phase_id"] = np.asarray(fp[f"{grp_name}/Phase"][:], np.int32)
# promoting int8 to int32 no problem
self.tmp[ckey]["ci"] = np.asarray(fp[f"{grp_name}/CI"][:], np.float32)
self.tmp[ckey]["scan_point_x"] = np.asarray(
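The new branch guards a convention mismatch: NeXus reserves phase_id 0 for non-indexed scan points, whereas the example EDAX files appear to use 0 for the first phase. A self-contained sketch of the same normalization, with assumed input data:

    import numpy as np

    raw = np.zeros(4, np.int8)  # assumed vendor phase column where every entry is 0
    if np.all(raw == 0):
        phase_id = np.zeros(raw.size, np.int32) + 1  # treat as all indexed with phase 1
    else:
        phase_id = np.asarray(raw, np.int32)  # keep the vendor identifiers as they are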
6 changes: 3 additions & 3 deletions pynxtools/dataconverter/readers/em/subparsers/hfive_oxford.py
@@ -99,10 +99,10 @@ def check_if_supported(self):
def parse_and_normalize(self):
"""Read and normalize away Oxford-specific formatting with an equivalent in NXem."""
with h5py.File(f"{self.file_path}", "r") as h5r:
- cache_id = 0
+ cache_id = 1
slice_ids = sorted(list(h5r["/"]))
for slice_id in slice_ids:
- if slice_id.isdigit() is True and slice_id == "1":
+ if slice_id.isdigit() is True and slice_id == "1" and f"/{slice_id}/EBSD" in h5r:
# non-negative int, parse only the first slice for now
self.prfx = f"/{slice_id}"
ckey = self.init_named_cache(f"ebsd{cache_id}") # name of the cache to use
@@ -241,6 +241,6 @@ def parse_and_normalize_slice_ebsd_data(self, fp, ckey: str):
# inconsistency f32 in file although specification states float

# Band Contrast, no, H5T_NATIVE_INT32, (size, 1)
self.tmp[ckey]["band_contrast"] = np.asarray(fp[f"{grp_name}/Band Contrast"], np.int32)
self.tmp[ckey]["bc"] = np.asarray(fp[f"{grp_name}/Band Contrast"], np.int32)
# inconsistency uint8 in file although specification states should be int32
# promoting uint8 to int32 no problem
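The added f"/{slice_id}/EBSD" guard relies on h5py's membership test on an open file, which avoids a KeyError for slices without EBSD content; a minimal sketch with an assumed file name:

    import h5py

    with h5py.File("map.h5oina", "r") as h5r:  # assumed file name
        if "/1/EBSD" in h5r:  # True only if the group exists in this file
            ebsd_group = h5r["/1/EBSD"]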
123 changes: 91 additions & 32 deletions pynxtools/dataconverter/readers/em/subparsers/nxs_hfive.py
@@ -121,15 +121,6 @@ def parse(self, template: dict) -> dict:
return template
else: # none or something unsupported
return template

- for key, val in self.cache.items():
- print(f"{key}, type: {type(val)}, shape: {np.shape(val)}")
-
- if self.cache["is_filled"] is True:
- self.process_roi_overview(template)
- self.process_roi_xmap(template)
- self.process_roi_phases(template)
- self.process_roi_inverse_pole_figures(template)
return template

def identify_hfive_type(self):
@@ -157,50 +148,118 @@ def identify_hfive_type(self):
return None

def process_into_template(self, inp: dict, template: dict) -> dict:
- for key, val in inp.items():
- if isinstance(val, dict):
- for ckey, cval in val.items():
- print(f"{ckey}, {cval}")
- else:
- print(f"{key}, {val}")
+ debugging = False
+ if debugging is True:
+ for key, val in inp.items():
+ if isinstance(val, dict):
+ for ckey, cval in val.items():
+ print(f"{ckey}, {cval}")
+ else:
+ print(f"{key}, {val}")

+ self.process_roi_overview(inp, template)
+ self.process_roi_ebsd_maps(inp, template)
return template

+ def process_roi_overview(self, inp: dict, template: dict) -> dict:
+ for ckey in inp.keys():
+ if ckey.startswith("ebsd"):
+ self.process_roi_overview_ebsd_based(
+ inp[ckey], ckey.replace("ebsd", ""), template)
+ break # only one roi for now
+ return template
# super().process_ebsd_cache(self.tmp, template)
# return template

- def process_roi_overview(inp: dict, template: dict) -> dict:
+ def process_roi_overview_ebsd_based(self,
+ inp: dict,
+ roi_id: str,
+ template: dict) -> dict:
print("Parse ROI default plot...")
# prfx = f"/ENTRY[entry{self.entry_id}]/experiment/indexing/region_of_interest/roi{roi_id}"
prfx = f"/roi{roi_id}"
trg = f"{prfx}"
template[f"{trg}/title"] = str("Region-of-interest overview image")
template[f"{trg}/@signal"] = "data"
template[f"{trg}/@axes"] = ["axis_y", "axis_x"]
template[f"{trg}/@AXISNAME_indices[axis_x_indices]"] = np.uint32(0)
template[f"{trg}/@AXISNAME_indices[axis_y_indices]"] = np.uint32(1)
trg = f"{prfx}/data"
contrast_modes = [(None, "n/a"),
("bc", "normalized_band_contrast"),
("ci", "normalized_confidence_index"),
("mad", "normalized_mean_angular_deviation")]
success = False
for contrast_mode in contrast_modes:
if contrast_mode[0] in inp.keys() and success is False:
template[f"{trg}"] = {"compress": np.reshape(np.asarray(np.asarray((inp[contrast_mode[0]] / np.max(inp[contrast_mode[0]]) * 255.), np.uint32), np.uint8), (inp["n_y"], inp["n_x"]), order="C"), "strength": 1}
template[f"{prfx}/descriptor"] = contrast_mode[1]
success = True
if success is False:
raise ValueError(f"{__name__} unable to generate plot for {prfx} !")
# 0 is y while 1 is x !
template[f"{trg}/@long_name"] = "Signal"
template[f"{trg}/@CLASS"] = "IMAGE" # required by H5Web to plot RGB maps
template[f"{trg}/@IMAGE_VERSION"] = "1.2"
template[f"{trg}/@SUBCLASS_VERSION"] = np.int64(15)

trg = f"{prfx}/axis_x"
template[f"{trg}"] = {"compress": np.asarray(inp["scan_point_x"], np.float32), "strength": 1}
template[f"{trg}/@long_name"] = f"Coordinate along x-axis ({inp['s_unit']})"
template[f"{trg}/@units"] = f"{inp['s_unit']}"
trg = f"{prfx}/axis_y"
template[f"{trg}"] = {"compress": np.asarray(inp["scan_point_y"], np.float32), "strength": 1}
template[f"{trg}/@long_name"] = f"Coordinate along y-axis ({inp['s_unit']})"
template[f"{trg}/@units"] = f"{inp['s_unit']}"
return template
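The one-line data assignment above packs scaling, casting, and reshaping into a single expression; an equivalent, simplified sketch (the original's intermediate np.uint32 cast is dropped here, which does not change the result for non-negative signals):

    import numpy as np

    def normalize_to_uint8(signal, n_y, n_x):
        """Scale a contrast signal to 0..255 and reshape it row-major onto the map grid."""
        scaled = np.asarray(signal, np.float64) / np.max(signal) * 255.
        return np.reshape(np.asarray(scaled, np.uint8), (n_y, n_x), order="C")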

+ def process_roi_ebsd_maps(self, inp: dict, template: dict) -> dict:
+ for ckey in inp.keys():
+ if ckey.startswith("ebsd"):
+ roi_identifier = ckey.replace("ebsd", "")
+ self.process_roi_xmap(
+ inp[ckey], roi_identifier, template)
+ # self.process_roi_phases(
+ # inp[ckey], roi_identifier, template)
+ # self.process_roi_inverse_pole_figures(
+ # inp[ckey], roi_identifier, template)
+ break # only one roi for now
+ return template

- def process_roi_xmap(inp: dict) -> dict:
- """Process standardized IPF orientation map using pyxem from normalized orientation data."""
- # for NeXus would like to create a default
- '''
- if np.max(inp["n_x"], inp["n_y"]) < HFIVE_WEB_MAXIMUM_RGB:
+ def process_roi_xmap(self, inp: dict, roi_id: str, template: dict) -> dict:
+ """Process crystal orientation map from normalized orientation data."""
+ # for NeXus to create a default representation of the EBSD map to explore
+ if np.max((inp["n_x"], inp["n_y"])) < HFIVE_WEB_MAXIMUM_RGB:
# can use the map discretization as is
coordinates, _ = create_coordinate_arrays(
(inp["n_x"], inp["n_y"]), (inp["s_x"], inp["s_y"]))
xaxis = coordinates["x"]
yaxis = coordinates["y"]
del coordinates
- # else:
+ else:
+ raise ValueError(f"Downsampling for too large EBSD maps is currently not supported !")
# need to regrid to downsample too large maps
# TODO::implement 1NN-based downsampling approach
# build grid
# tree-based 1NN
# proceed as usual

pyxem_phase_identifier = inp["phase_identifier"] \
- (np.min(inp["phase_identifier"]) - (-1)) # pyxem, non-indexed has to be -1
print(np.unique(pyxem_phase_identifier))
pyxem_phase_identifier = inp["phase_id"] - 1
# inp["phase_id"] - (np.min(inp["phase_id"]) - (-1))
# for pyxem the non-indexed has to be -1 instead of 0 which is what NeXus uses
# -1 always because content of inp["phase_id"] is normalized
# to NeXus NXem_ebsd_crystal_structure concept already!
print(f"Unique pyxem_phase_identifier {np.unique(pyxem_phase_identifier)}")

- self.xmap = CrystalMap(rotations=inp["rotation"],
- x=self.xaxis, y=self.yaxis,
+ self.xmap = CrystalMap(rotations=Rotation.from_euler(euler=inp["euler"],
+ direction='lab2crystal',
+ degrees=False),
+ x=xaxis, y=yaxis,
phase_id=pyxem_phase_identifier,
phase_list=PhaseList(space_groups=inp["space_group"],
structures=inp["phase"]),
prop={"bc": inp["band_contrast"]},
scan_unit=inp["s_unit"])
prop={})
# "bc": inp["band_contrast"]}, scan_unit=inp["s_unit"])
print(self.xmap)
- '''
return template
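For orientation, a minimal, self-contained sketch of building a CrystalMap the way process_roi_xmap does, with placeholder data and an assumed single phase of space group 225; note that orix's create_coordinate_arrays takes the shape as (rows, columns):

    import numpy as np
    from orix.crystal_map import CrystalMap, PhaseList, create_coordinate_arrays
    from orix.quaternion import Rotation

    n_y, n_x = 3, 4  # placeholder map extent
    coordinates, _ = create_coordinate_arrays((n_y, n_x), (0.5, 0.5))  # assumed step sizes
    euler = np.zeros((n_y * n_x, 3), np.float32)  # placeholder Bunge-Euler triplets in rad
    xmap = CrystalMap(rotations=Rotation.from_euler(euler),
                      x=coordinates["x"], y=coordinates["y"],
                      phase_id=np.zeros(n_y * n_x, np.int32),  # every point indexed as phase 0
                      phase_list=PhaseList(space_groups=[225]))
    print(xmap)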

def process_roi_phases(self, template: dict) -> dict:
return template
3 changes: 3 additions & 0 deletions pynxtools/dataconverter/readers/em/utils/README.md
@@ -0,0 +1,3 @@
# Context

Utility code potentially used by multiple (sub)parsers of the em parser.
23 changes: 22 additions & 1 deletion pynxtools/dataconverter/readers/em/utils/hfive_utils.py
@@ -25,8 +25,17 @@
from itertools import groupby


EBSD_MAP_SPACEGROUP = {"F m#ovl3m": 225,
EBSD_MAP_SPACEGROUP = {"P 6#sub3mc": 186,
"P 6/mmm": 191,
"P 6#sub3/mmc": 194,
"F #ovl43m": 216,
"P m#ovl3m": 221,
"F m#ovl3m": 225,
"Fd#ovl3m(*)": 227,
"I m#ovl3m": 229}
# see here for typical examples http://img.chem.ucl.ac.uk/sgp/large/186az1.htm

+ DIRTY_FIX_SPACEGROUP = {}
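A sketch of how these tables might be consulted together (decode_space_group is a hypothetical helper, not part of this commit), falling back to the phase-name assumption from examples/ebsd_database.py when the vendor string is not in the map:

    from pynxtools.dataconverter.readers.em.examples.ebsd_database \
        import AssumePhaseNameToSpaceGroup

    def decode_space_group(spc_grp: str, phase_name: str = "") -> int:
        """Hypothetical: map a vendor Hermann-Mauguin string to a space-group number."""
        if spc_grp in EBSD_MAP_SPACEGROUP:
            return EBSD_MAP_SPACEGROUP[spc_grp]
        if phase_name in AssumePhaseNameToSpaceGroup:
            return AssumePhaseNameToSpaceGroup[phase_name]
        raise ValueError(f"Unable to decode improperly formatted space group {spc_grp} !")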

def format_euler_parameterization(triplet_set):
"""Transform degrees to radiant and apply orientation space symmetry"""
@@ -81,6 +90,18 @@ def read_strings_from_dataset(obj):
# raise ValueError("Neither np.ndarray, nor bytes, nor str !")


+ def read_first_scalar(obj):
+ if hasattr(obj, "shape"):
+ if obj.shape == ():
+ return obj[()]
+ elif obj.shape == (1,):
+ return obj[0]
+ else:
+ raise ValueError(f"Unexpected shape found in {__name__} from object {obj} !")
+ else:
+ raise ValueError(f"Unexpected input passed to {__name__} with object {obj} !")


def all_equal(iterable):
g = groupby(iterable)
return next(g, True) and not next(g, False)
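The groupby idiom collapses consecutive runs of equal items, so an iterable is uniform exactly when at most one run exists:

    all_equal([1, 1, 1])  # True, a single run of equal items
    all_equal("aab")      # False, two runs "aa" and "b"
    all_equal([])         # True, vacuously true for an empty iterable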
