Skip to content

Commit

Permalink
Refactoring and implementation for normalization oxford, bruker, apex…
Browse files Browse the repository at this point in the history
… and tested
  • Loading branch information
markus.kuehbach committed Oct 22, 2023
1 parent d5ae63f commit 9c708da
Show file tree
Hide file tree
Showing 11 changed files with 1,152 additions and 713 deletions.
13 changes: 10 additions & 3 deletions pynxtools/dataconverter/readers/em/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@

from pynxtools.dataconverter.readers.em.subparsers.nxs_mtex import NxEmNxsMTexSubParser

from pynxtools.dataconverter.readers.em.subparsers.nxs_hfive import NxEmNxsHfiveSubParser

from pynxtools.dataconverter.readers.em.utils.default_plots import NxEmDefaultPlotResolver

from pynxtools.dataconverter.readers.em.geometry.convention_mapper \
Expand Down Expand Up @@ -118,11 +120,16 @@ def read(self,
conventions.parse(template)

print("Parse and map pieces of information within files from tech partners...")
sub_parser = "nxs_mtex"
subparser = NxEmNxsMTexSubParser(entry_id, file_paths[0])
subparser.parse(template)
# sub_parser = "nxs_mtex"
# subparser = NxEmNxsMTexSubParser(entry_id, file_paths[0])
# subparser.parse(template)

# add further with resolving cases
# if file_path is an HDF5 will use hfive parser
sub_parser = "nxs_hfive"
subparser = NxEmNxsHfiveSubParser(entry_id, file_paths[0])
subparser.parse(template)
exit(1)

# for dat_instance in case.dat_parser_type:
# print(f"Process pieces of information in {dat_instance} tech partner file...")
Expand Down
237 changes: 202 additions & 35 deletions pynxtools/dataconverter/readers/em/subparsers/hfive_apex.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,50 +19,217 @@

import numpy as np
import h5py
from pynxtools.dataconverter.readers.em.subparsers.hfive import HdfFiveGenericReader
from itertools import groupby
# import imageio.v3 as iio
from PIL import Image as pil

import diffsims
import orix
from diffpy.structure import Lattice, Structure
from orix import plot
from orix.crystal_map import create_coordinate_arrays, CrystalMap, PhaseList
from orix.quaternion import Rotation
from orix.vector import Vector3d

class HdfFiveEdaxApexReader(HdfFiveGenericReader):
import matplotlib.pyplot as plt

from pynxtools.dataconverter.readers.em.subparsers.hfive_base import HdfFiveBaseParser
from pynxtools.dataconverter.readers.em.utils.hfive_utils import read_strings_from_dataset


def om_eu(inp):
    """Return the first two elements of *inp* (orientation-matrix -> Euler placeholder)."""
    head = inp[:2]
    return head


class HdfFiveEdaxApexReader(HdfFiveBaseParser):
"""Read APEX edaxh5"""
def __init__(self, file_name: str = ""):
super().__init__(file_name)
# this specialized reader implements reading capabilities for the following formats
    def __init__(self, file_path: str = ""):
        """Initialize parser state and probe whether *file_path* is a supported APEX file."""
        super().__init__(file_path)
        self.prfx = None  # HDF5 path prefix of the group currently being parsed
        self.tmp = {}  # named caches with normalized results, filled by parse_and_normalize
        self.supported_version = {}  # constraints a file has to match, filled by init_support
        self.version = {}  # version metadata found in the concrete file instance
        self.init_support()
        self.supported = False  # set to True by check_if_supported when all constraints match
        self.check_if_supported()

    def init_support(self):
        """Init supported versions, i.e. the constraints a file must match to be accepted."""
        self.supported_version["tech_partner"] = ["EDAX, LLC"]
        self.supported_version["schema_name"] = ["EDAXH5"]
        self.supported_version["schema_version"] = ["2.5.1001.0001"]
        self.supported_version["writer_name"] = ["APEX"]
        self.supported_version["writer_version"] = ["2.5.1001.0001"]
        # NOTE(review): this assignment is overwritten immediately after this
        # call in __init__ (supported = False before check_if_supported runs);
        # possibly a stale diff remnant — confirm against version control
        self.supported = True
# check if instance to process matches any of these constraints
h5r = h5py.File(self.file_name, "r")
# parse Company and PRODUCT_VERSION attribute values from the first group below / but these are not scalar but single value lists
# so much about interoperability
# but hehe for the APEX example from Sebastian and Sabine there is again no Company but PRODUCT_VERSION, 2 files, 2 "formats"
if "/Manufacturer" in h5r:
self.version["tech_partner"] \
= super().read_strings_from_dataset(h5r["/Manufacturer"][()])
if self.version["tech_partner"] not in self.supported_version["tech_partner"]:
# print(f"{self.version['tech_partner']} is not {self.supported_version['tech_partner']} !")
self.supported = False

def check_if_supported(self):
"""Check if instance matches all constraints to qualify as supported H5OINA"""
self.supported = True # try to falsify
with h5py.File(self.file_path, "r") as h5r:
# parse Company and PRODUCT_VERSION attribute values from the first group below / but these are not scalar but single value lists
# so much about interoperability
# but hehe for the APEX example from Sebastian and Sabine there is again no Company but PRODUCT_VERSION, 2 files, 2 "formats"
grp_names = list(h5r["/"])
if len(grp_names) == 1:
if read_strings_from_dataset(h5r[grp_names[0]].attrs["Company"][0]) \
not in self.supported_version["tech_partner"]:
self.supported = False
if read_strings_from_dataset(h5r[grp_names[0]].attrs["PRODUCT_VERSION"][0]) \
not in self.supported_version["schema_version"]:
self.supported = False
if self.supported is True:
self.version = self.supported_version.copy()

def parse_and_normalize(self):
"""Read and normalize away EDAX/APEX-specific formatting with an equivalent in NXem."""
with h5py.File(f"{self.file_path}", "r") as h5r:
cache_id = 0
grp_nms = list(h5r["/"])
for grp_nm in grp_nms:
sub_grp_nms = list(h5r[grp_nm])
for sub_grp_nm in sub_grp_nms:
sub_sub_grp_nms = list(h5r[f"/{grp_nm}/{sub_grp_nm}"])
for sub_sub_grp_nm in sub_sub_grp_nms:
if sub_sub_grp_nm.startswith("Area"):
area_grp_nms = list(h5r[f"/{grp_nm}/{sub_grp_nm}/{sub_sub_grp_nm}"])
for area_grp_nm in area_grp_nms:
if area_grp_nm.startswith("OIM Map"):
self.prfx = f"/{grp_nm}/{sub_grp_nm}/{sub_sub_grp_nm}/{area_grp_nm}"
print(f"Parsing {self.prfx}")
ckey = self.init_named_cache(f"ebsd{cache_id}")
self.parse_and_normalize_group_ebsd_header(h5r, ckey)
self.parse_and_normalize_group_ebsd_phases(h5r, ckey)
self.parse_and_normalize_group_ebsd_data(h5r, ckey)
cache_id += 1

def parse_and_normalize_group_ebsd_header(self, fp, ckey: str):
# no official documentation yet from EDAX/APEX, deeply nested, chunking, virtual ds
if f"{self.prfx}/EBSD/ANG/DATA/DATA" not in fp:
raise ValueError(f"Unable to parse {self.prfx}/EBSD/ANG/DATA/DATA !")

grid_type = None
# for a regular tiling of R^2 with perfect hexagons
n_pts = 0
# their vertical center of mass distance is smaller than the horizontal
# center of mass distance (x cols, y rows)
req_fields = ["Grid Type",
"Step X", "Step Y",
"Number Of Rows", "Number Of Columns"]
for req_field in req_fields:
if f"{self.prfx}/Sample/{req_field}" not in fp:
raise ValueError(f"Unable to parse {self.prfx}/Sample/{req_field} !")

grid_type = read_strings_from_dataset(fp[f"{self.prfx}/Sample/Grid Type"][()])
if grid_type != "HexGrid":
raise ValueError(f"Grid Type {grid_type} is currently not supported !")
self.tmp[ckey]["s_x"] = fp[f"{self.prfx}/Sample/Step X"][0]
self.tmp[ckey]["s_unit"] = "µm" # TODO::always micron?
self.tmp[ckey]["n_x"] = fp[f"{self.prfx}/Sample/Number Of Columns"][0]
self.tmp[ckey]["s_y"] = fp[f"{self.prfx}/Sample/Step Y"][0]
self.tmp[ckey]["n_y"] = fp[f"{self.prfx}/Sample/Number Of Rows"][0]

    def parse_and_normalize_group_ebsd_phases(self, fp, ckey: str):
        """Parse per-phase metadata (name, lattice constants) into self.tmp[ckey]."""
        grp_name = f"{self.prfx}/EBSD/ANG/HEADER/Phase"
        # Phases, contains a subgroup for each phase where the name
        # of each subgroup is the index of the phase starting at 1.
        if f"{grp_name}" in fp:
            phase_ids = sorted(list(fp[f"{grp_name}"]), key=int)
            self.tmp[ckey]["phase"] = []
            self.tmp[ckey]["space_group"] = []
            self.tmp[ckey]["phases"] = {}
            for phase_id in phase_ids:
                if phase_id.isdigit() is True:
                    self.tmp[ckey]["phases"][int(phase_id)] = {}
                    sub_grp_name = f"{grp_name}/{phase_id}"
                    # Name
                    if f"{sub_grp_name}/Material Name" in fp:
                        phase_name = read_strings_from_dataset(fp[f"{sub_grp_name}/Material Name"][0])
                        self.tmp[ckey]["phases"][int(phase_id)]["phase_name"] = phase_name
                    else:
                        raise ValueError(f"Unable to parse {sub_grp_name}/Material Name !")

                    # Reference not available only Info but this can be empty
                    self.tmp[ckey]["phases"][int(phase_id)]["reference"] = "n/a"

                    # all six lattice constants are mandatory
                    req_fields = ["A", "B", "C", "Alpha", "Beta", "Gamma"]
                    for req_field in req_fields:
                        if f"{sub_grp_name}/Lattice Constant {req_field}" not in fp:
                            raise ValueError(f"Unable to parse ../Lattice Constant {req_field} !")
                    a_b_c = [fp[f"{sub_grp_name}/Lattice Constant A"][0],
                             fp[f"{sub_grp_name}/Lattice Constant B"][0],
                             fp[f"{sub_grp_name}/Lattice Constant C"][0]]
                    angles = [fp[f"{sub_grp_name}/Lattice Constant Alpha"][0],
                              fp[f"{sub_grp_name}/Lattice Constant Beta"][0],
                              fp[f"{sub_grp_name}/Lattice Constant Gamma"][0]]
                    # NOTE(review): * 0.1 presumably converts angstrom to nm — confirm units in APEX
                    self.tmp[ckey]["phases"][int(phase_id)]["a_b_c"] \
                        = np.asarray(a_b_c, np.float32) * 0.1
                    self.tmp[ckey]["phases"][int(phase_id)]["alpha_beta_gamma"] \
                        = np.asarray(angles, np.float32)

                    # Space Group not stored, only laue group, point group and symmetry
                    # problematic because mapping is not bijective!
                    # if you know the space group we know laue and point group and symmetry
                    # but the opposite direction leaves room for ambiguities
                    space_group = "n/a"
                    self.tmp[ckey]["phases"][int(phase_id)]["space_group"] = space_group

                    if len(self.tmp[ckey]["space_group"]) > 0:
                        self.tmp[ckey]["space_group"].append(space_group)
                    else:
                        self.tmp[ckey]["space_group"] = [space_group]

                    if len(self.tmp[ckey]["phase"]) > 0:
                        self.tmp[ckey]["phase"].append(
                            Structure(title=phase_name, atoms=None,
                                      lattice=Lattice(a_b_c[0], a_b_c[1], a_b_c[2],
                                                      angles[0], angles[1], angles[2])))
                    else:
                        self.tmp[ckey]["phase"] \
                            = [Structure(title=phase_name, atoms=None,
                                         lattice=Lattice(a_b_c[0], a_b_c[1], a_b_c[2],
                                                         angles[0], angles[1], angles[2]))]
        else:
            self.supported = False
            # NOTE(review): the lines below down to the final raise reference
            # `h5r`, which is not in scope in this method; they look like stale
            # remnants of an older reader version (diff artifact) — confirm
            # against version control before relying on this branch.
            if "/Version" in h5r:
                self.version["schema_version"] \
                    = super().read_strings_from_dataset(h5r["/Version"][()])
                if self.version["schema_version"] not in self.supported_version["schema_version"]:
                    # print(f"{self.version['schema_version']} is not any of {self.supported_version['schema_version']} !")
                    self.supported = False
            raise ValueError(f"Unable to parse {grp_name} !")

    def parse_and_normalize_group_ebsd_data(self, fp, ckey: str):
        """Parse per-scan-point orientation, phase id, and confidence index arrays."""
        grp_name = f"{self.prfx}/EBSD/ANG/DATA/DATA"
        # expected number of scan points, from the header parsed beforehand
        n_pts = self.tmp[ckey]["n_x"] * self.tmp[ckey]["n_y"]
        if f"{grp_name}" in fp:
            if np.shape(fp[f"{grp_name}"]) != (n_pts,) and n_pts > 0:
                raise ValueError(f"Unexpected shape of {grp_name} !")

            dat = fp[f"{grp_name}"]
            self.tmp[ckey]["euler"] = np.zeros((n_pts, 3), np.float32)
            # index of phase, 0 if not indexed
            # # no normalization needed, also in NXem_ebsd the null model notIndexed is phase_identifier 0
            self.tmp[ckey]["phase_id"] = np.zeros((n_pts,), np.int32)
            self.tmp[ckey]["ci"] = np.zeros((n_pts,), np.float32)

            for i in np.arange(0, n_pts):
                # check shape of internal virtual chunked number array
                # NOTE(review): assumes dat[i][0] holds 9 values forming a 3x3
                # orientation matrix — confirm against APEX record layout
                r = Rotation.from_matrix([np.reshape(dat[i][0], (3, 3))])
                self.tmp[ckey]["euler"][i, :] = r.to_euler(degrees=False)
                self.tmp[ckey]["phase_id"][i] = dat[i][2]
                self.tmp[ckey]["ci"][i] = dat[i][3]

            # TODO::convert orientation matrix to Euler angles via om_eu but what are conventions !
            # orix based transformation ends up in positive half space and with degrees=False
            # as radians but the from_matrix command above might miss one rotation

            # inconsistency f32 in file although specification states float
            # Rotation.from_euler(euler=fp[f"{grp_name}/Euler"],
            #                     direction='lab2crystal',
            #                     degrees=is_degrees)

            # compute explicit hexagon grid cells center of mass pixel positions
            # TODO::currently assuming HexGrid
            self.tmp[ckey]["scan_point_x"] = np.asarray(
                np.linspace(0, self.tmp[ckey]["n_x"] - 1,
                            num=self.tmp[ckey]["n_x"],
                            endpoint=True) * self.tmp[ckey]["s_x"] + 0., np.float32)

            self.tmp[ckey]["scan_point_y"] = np.asarray(
                np.linspace(0, self.tmp[ckey]["n_y"] - 1,
                            num=self.tmp[ckey]["n_y"],
                            endpoint=True) * self.tmp[ckey]["s_y"] + 0., np.float32)
        else:
            self.supported = False
            # NOTE(review): the lines below down to the final raise reference
            # `h5r`/`self.file_name` version bookkeeping that does not belong in
            # this method; they look like stale remnants of an older reader
            # version (diff artifact) — confirm against version control.
            h5r.close()

            if self.supported is True:
                # print(f"Reading {self.file_name} is supported")
                self.version["schema_name"] = self.supported_version["schema_name"]
                self.version["writer_name"] = self.supported_version["writer_name"]
                self.version["writer_version"] = self.supported_version["writer_version"]
                # print(f"{self.version['schema_name']}, {self.supported_version['schema_version']}, {self.supported_version['writer_name']}, {self.supported_version['writer_version']}")
            # else:
            #     print(f"Reading {self.file_name} is not supported!")
            raise ValueError(f"Unable to parse {grp_name} !")
Loading

0 comments on commit 9c708da

Please sign in to comment.