Added first version of an imaging mode case distinction logic, tested …

…imgs, adf, and ceta imaging modes. NeXus files were generated successfully, but a weird h5web display error (removechild) comes up within ipynb. Tested whether spaces in the filenames cause this, but they do not; in hdfviewer the file shows without any issues.
FAIRmat-NFDI · Jan 17, 2024 · 3892c70 · 3892c70
1 parent 4a9de81
commit 3892c70
Show file tree

Hide file tree

Showing 4 changed files with 282 additions and 19 deletions.
diff --git a/debug/spctrscpy.batch.sh b/debug/spctrscpy.batch.sh
@@ -2,7 +2,7 @@
 
 datasource="../../../../../paper_paper_paper/scidat_nomad_ebsd/bb_analysis/data/development_spctrscpy/pdi/"
 datasource="../../../../../paper_paper_paper/scidat_nomad_ebsd/bb_analysis/data/development_spctrscpy/ikz/"
-
+datasource="../../../../../paper_paper_paper/scidat_nomad_ebsd/bb_analysis/data/development_spctrscpy/fhi/"
 
 # apex examples ikz, pdi
 # examples="ikz/VInP_108_L2.h5 ikz/GeSn_13.h5 pynx/46_ES-LP_L1_brg.bcf pynx/1613_Si_HAADF_610_kx.emd pynx/EELS_map_2_ROI_1_location_4.dm3 pynx/H5OINA_examples_Specimen_1_Map_EDS_+_EBSD_Map_Data_2.h5oina"
@@ -11,8 +11,11 @@ examples="AlGaO.nxs"
 examples="GeSi.nxs"
 examples="GeSn_13.nxs"
 # examples="VInP_108_L2.h5"
+examples="CG71113 1513 HAADF-DF4-DF2-BF 1.2 Mx STEM.emd"
+examples="CG71113 1138 Ceta 660 mm Camera.emd"
+examples="CG71113 1125 Ceta 1.1 Mx Camera.emd"
 
-for example in $examples; do
+for example in "$examples"; do
 	echo $example
-	dataconverter --reader em --nxdl NXroot --input-file $datasource$example --output debug.$example.nxs 1>stdout.$example.nxs.txt 2>stderr.$example.nxs.txt
+	dataconverter --reader em --nxdl NXroot --input-file "$datasource$example" --output "debug.$example.nxs" 1>"stdout.$example.nxs.txt" 2>"stderr.$example.nxs.txt"
 done
diff --git a/pynxtools/dataconverter/readers/em/reader.py b/pynxtools/dataconverter/readers/em/reader.py
@@ -27,7 +27,7 @@
 from pynxtools.dataconverter.readers.em.subparsers.nxs_pyxem import NxEmNxsPyxemSubParser
 # from pynxtools.dataconverter.readers.em.subparsers.nxs_imgs import NxEmImagesSubParser
 # from pynxtools.dataconverter.readers.em.subparsers.nxs_nion import NxEmZippedNionProjectSubParser
-# from pynxtools.dataconverter.readers.em.subparsers.rsciio_velox import RsciioVeloxSubParser
+from pynxtools.dataconverter.readers.em.subparsers.rsciio_velox import RsciioVeloxSubParser
 from pynxtools.dataconverter.readers.em.utils.default_plots import NxEmDefaultPlotResolver
 # from pynxtools.dataconverter.readers.em.geometry.convention_mapper import NxEmConventionMapper
 
@@ -121,21 +121,23 @@ def read(self,
         # add further with resolving cases
         # if file_path is an HDF5 will use hfive parser
         # sub_parser = "nxs_pyxem"
-        subparser = NxEmNxsPyxemSubParser(entry_id, file_paths[0])
-        subparser.parse(template)
+        # subparser = NxEmNxsPyxemSubParser(entry_id, file_paths[0])
+        # subparser.parse(template)
         # TODO::check correct loop through!
 
         # sub_parser = "image_tiff"
         # subparser = NxEmImagesSubParser(entry_id, file_paths[0])
         # subparser.parse(template)
+        # TODO::check correct loop through!
 
         # sub_parser = "zipped_nion_project"
         # subparser = NxEmZippedNionProjectSubParser(entry_id, file_paths[0])
         # subparser.parse(template, verbose=True)
+        # TODO::check correct loop through!
 
         # sub_parser = "velox_emd"
-        # subparser = RsciioVeloxSubParser(entry_id, file_paths[0])
-        # subparser.parse(template, verbose=True)
+        subparser = RsciioVeloxSubParser(entry_id, file_paths[0])
+        subparser.parse(template, verbose=True)
 
         # for dat_instance in case.dat_parser_type:
         #     print(f"Process pieces of information in {dat_instance} tech partner file...")

diff --git a/pynxtools/dataconverter/readers/em/subparsers/rsciio_velox.py b/pynxtools/dataconverter/readers/em/subparsers/rsciio_velox.py
@@ -17,10 +17,17 @@
 #
 """(Sub-)parser for reading content from ThermoFisher Velox *.emd (HDF5) via rosettasciio."""
 
+import flatdict as fd
+import numpy as np
+
 from typing import Dict, List
 from rsciio import emd
 
 from pynxtools.dataconverter.readers.em.subparsers.rsciio_base import RsciioBaseParser
+from pynxtools.dataconverter.readers.em.utils.rsciio_hyperspy_utils \
+    import get_named_axis, get_axes_dims, get_axes_units
+from pynxtools.dataconverter.readers.shared.shared_utils \
+    import get_sha256_of_file_content
 
 
 class RsciioVeloxSubParser(RsciioBaseParser):
@@ -31,8 +38,11 @@ def __init__(self, entry_id: int = 1, file_path: str = ""):
             self.entry_id = entry_id
         else:
             self.entry_id = 1
-        self.id_mgn: Dict = {}
-        self.prfx = None
+        self.id_mgn: Dict = {"event": 1,
+                             "event_img": 1,
+                             "event_spc": 1,
+                             "roi": 1}
+        self.file_path_sha256 = None
         self.tmp: Dict = {}
         self.supported_version: Dict = {}
         self.version: Dict = {}
@@ -47,11 +57,15 @@ def check_if_supported(self):
             # only the collection of the concepts without the actual instance data
             # based on this one could then plan how much memory has to be reserved
             # in the template and stream out accordingly
+            with open(self.file_path, "rb", 0) as fp:
+                self.file_path_sha256 = get_sha256_of_file_content(fp)
+
+            print(f"Parsing {self.file_path} with SHA256 {self.file_path_sha256} ...")
             self.supported = True
         except IOError:
             print(f"Loading {self.file_path} using {self.__name__} is not supported !")
 
-    def parse_and_normalize_and_process_into_template(self, template: dict) -> dict:
+    def parse(self, template: dict, verbose=False) -> dict:
         """Perform actual parsing filling cache self.tmp."""
         if self.supported is True:
             self.tech_partner_to_nexus_normalization(template)
@@ -62,20 +76,196 @@ def parse_and_normalize_and_process_into_template(self, template: dict) -> dict:
 
     def tech_partner_to_nexus_normalization(self, template: dict) -> dict:
         """Translate tech partner concepts to NeXus concepts."""
-        self.normalize_bfdf_content(template)  # conventional bright/dark field
-        self.normalize_adf_content(template)  # (high-angle) annular dark field
-        self.normalize_edxs_content(template)  # EDS in the TEM
-        self.normalize_eels_content(template)  # electron energy loss spectroscopy
+        # Walks over self.objs, resolves the content type of each entry via
+        # self.content_resolver and hands the entry to the matching
+        # normalize_*_content method; template is filled in place and returned.
+        # Entries which are not a dict or lack any of the required keys
+        # are skipped silently.
+        reqs = ("data", "axes", "metadata", "original_metadata", "mapping")
+        for idx, obj in enumerate(self.objs):
+            if not isinstance(obj, dict):
+                continue
+            if not all(req in obj for req in reqs):
+                continue
+
+            content_type = self.content_resolver(obj)
+            print(f"Parsing {idx}-th object in {self.file_path} content type is {content_type}")
+            if content_type == "imgs":
+                self.normalize_imgs_content(obj, template)  # generic imaging modes
+                # TODO:: could later make an own one for bright/dark field, but
+                # currently no distinction in hyperspy
+            elif content_type == "adf":
+                self.normalize_adf_content(obj, template)  # (high-angle) annular dark field
+            elif content_type == "diff":  # diffraction image in reciprocal space
+                self.normalize_diff_content(obj, template)  # diffraction images
+            elif content_type == "eds":
+                self.normalize_eds_content(obj, template)  # ED(X)S in the TEM
+            elif content_type == "eels":
+                self.normalize_eels_content(obj, template)  # electron energy loss spectroscopy
+            else:  # == "n/a"
+                print(f"WARNING::Unable to resolve content of {idx}-th object in {self.file_path}!")
         return template
 
-    def normalize_bfdf_content(self, template: dict) -> dict:
+    def content_resolver(self, obj: dict) -> str:
+        """Try to identify which content the obj describes best.
+
+        Args:
+            obj: dict with at least the keys "metadata" and "axes".
+
+        Returns:
+            "imgs" when the title is "BF" or starts with "DF", or when no
+            axis unit starts with "1 /"; "adf" when the title is "HAADF";
+            "diff" when every axis unit starts with "1 /"; "n/a" when there
+            is no "General/title" entry or the axis units are mixed.
+        """
+        # assume rosettasciio-specific formatting of the emd parser
+        # i.e. a dictionary with the following keys:
+        # "data", "axes", "metadata", "original_metadata", "mapping"
+        meta = fd.FlatDict(obj["metadata"], "/")
+        units = get_axes_units(obj["axes"])
+        if "General/title" not in meta.keys():
+            return "n/a"
+        title = meta["General/title"]
+        # compare for equality: ("BF") is not a tuple but the plain string "BF",
+        # so using `in` would be a substring test that also accepts "B" or ""
+        if title == "BF" or title.startswith("DF"):
+            # TODO::the problem with using here the explicit name DF4 is that this may only
+            # work for a particular microscope:
+            # Core/MetadataDefinitionVersion: 7.9, Core/MetadataSchemaVersion: v1/2013/07
+            # Instrument/ControlSoftwareVersion: 1.15.4, Instrument/Manufacturer: FEI Company
+            # Instrument/InstrumentId: 6338, Instrument/InstrumentModel: Talos F200X
+            # instead there should be a logic added which resolves which concept
+            # the data in this obj are best described by when asking a community-wide
+            # glossary but not the FEI-specific glossary
+            # all that logic is unneeded and thereby the data more interoperable
+            # if FEI would harmonize their obvious company metadata standard with the
+            # electron microscopy community!
+            return "imgs"
+        if title == "HAADF":
+            return "adf"
+        # count the axes whose unit indicates reciprocal space; all remaining
+        # axes are taken to be in real space
+        n_reciprocal = sum(1 for unit in units if unit.startswith("1 /"))
+        if n_reciprocal == 0:
+            return "imgs"
+        if n_reciprocal == len(units):
+            return "diff"
+        return "n/a"
+
+    def normalize_imgs_content(self, obj: dict, template: dict) -> dict:
+        """Map generic scanned images (e.g. BF/DF) to NeXus.
+
+        Writes source-file provenance and an image_twod group (axes, title,
+        intensity) below an IMAGE_R_SET into template, then increments the
+        "event_img" and "event" counters in self.id_mgn.
+
+        Args:
+            obj: dict with at least the keys "metadata", "axes", and "data".
+            template: mapping that is filled in place.
+
+        Returns:
+            The template that was passed in.
+        """
+        meta = fd.FlatDict(obj["metadata"], "/")
+        dims = get_axes_dims(obj["axes"])
+        trg = f"/ENTRY[entry{self.entry_id}]/measurement/event_data_em_set/" \
+              f"EVENT_DATA_EM[event_data_em{self.id_mgn['event']}]/" \
+              f"IMAGE_R_SET[image_r_set{self.id_mgn['event_img']}]"
+        template[f"{trg}/PROCESS[process]/source/type"] = "file"
+        template[f"{trg}/PROCESS[process]/source/path"] = self.file_path
+        template[f"{trg}/PROCESS[process]/source/checksum"] = self.file_path_sha256
+        template[f"{trg}/PROCESS[process]/source/algorithm"] = "SHA256"
+        template[f"{trg}/PROCESS[process]/detector_identifier"] = meta["General/title"]
+        template[f"{trg}/image_twod/@NX_class"] = "NXdata"  # TODO::writer should do!
+        template[f"{trg}/image_twod/@signal"] = "intensity"
+        template[f"{trg}/image_twod/@axes"] = []
+        for dim in dims:
+            template[f"{trg}/image_twod/@axes"].append(f"axis_{dim[0]}")
+            template[f"{trg}/image_twod/@AXISNAME_indices[axis_{dim[0]}]"] \
+                = np.uint32(dim[1])
+            # the helper may hand back None instead of a (support, unit) pair,
+            # unpacking it unconditionally would raise a TypeError
+            named_axis = get_named_axis(obj["axes"], dim[0])
+            if named_axis is None:
+                continue
+            support, unit = named_axis
+            if support is not None and unit is not None:
+                template[f"{trg}/image_twod/axis_{dim[0]}"] \
+                    = {"compress": support, "strength": 1}
+                template[f"{trg}/image_twod/axis_{dim[0]}/@long_name"] \
+                    = f"{dim[0]}-axis position ({unit})"
+        template[f"{trg}/image_twod/title"] = meta["General/title"]
+        template[f"{trg}/image_twod/intensity"] \
+            = {"compress": np.asarray(obj["data"]), "strength": 1}
+        # template[f"{trg}/image_twod/intensity/@units"]
+        # TODO::add metadata
+        self.id_mgn["event_img"] += 1
+        self.id_mgn["event"] += 1
         return template
 
-    def normalize_adf_content(self, template: dict) -> dict:
+    def normalize_adf_content(self, obj: dict, template: dict) -> dict:
+        """Map relevant (high-angle) annular dark field images to NeXus.
+
+        Writes source-file provenance and an image_twod group (axes, title,
+        intensity) below an IMAGE_R_SET into template, then increments the
+        "event_img" and "event" counters in self.id_mgn.
+
+        Args:
+            obj: dict with at least the keys "metadata", "axes", and "data".
+            template: mapping that is filled in place.
+
+        Returns:
+            The template that was passed in.
+        """
+        meta = fd.FlatDict(obj["metadata"], "/")
+        dims = get_axes_dims(obj["axes"])
+        trg = f"/ENTRY[entry{self.entry_id}]/measurement/event_data_em_set/" \
+              f"EVENT_DATA_EM[event_data_em{self.id_mgn['event']}]/" \
+              f"IMAGE_R_SET[image_r_set{self.id_mgn['event_img']}]"
+        template[f"{trg}/PROCESS[process]/source/type"] = "file"
+        template[f"{trg}/PROCESS[process]/source/path"] = self.file_path
+        template[f"{trg}/PROCESS[process]/source/checksum"] = self.file_path_sha256
+        template[f"{trg}/PROCESS[process]/source/algorithm"] = "SHA256"
+        template[f"{trg}/PROCESS[process]/detector_identifier"] = meta["General/title"]
+        template[f"{trg}/image_twod/@NX_class"] = "NXdata"  # TODO::writer should do!
+        template[f"{trg}/image_twod/@signal"] = "intensity"
+        template[f"{trg}/image_twod/@axes"] = []
+        for dim in dims:
+            template[f"{trg}/image_twod/@axes"].append(f"axis_{dim[0]}")
+            template[f"{trg}/image_twod/@AXISNAME_indices[axis_{dim[0]}]"] \
+                = np.uint32(dim[1])
+            # the helper may hand back None instead of a (support, unit) pair,
+            # unpacking it unconditionally would raise a TypeError
+            named_axis = get_named_axis(obj["axes"], dim[0])
+            if named_axis is None:
+                continue
+            support, unit = named_axis
+            if support is not None and unit is not None:
+                template[f"{trg}/image_twod/axis_{dim[0]}"] \
+                    = {"compress": support, "strength": 1}
+                template[f"{trg}/image_twod/axis_{dim[0]}/@long_name"] \
+                    = f"{dim[0]}-axis position ({unit})"
+        template[f"{trg}/image_twod/title"] = meta["General/title"]
+        template[f"{trg}/image_twod/intensity"] \
+            = {"compress": np.asarray(obj["data"]), "strength": 1}
+        # template[f"{trg}/image_twod/intensity/@units"]
+        # TODO::coll. angles given in original_metadata map to half_angle_interval
+        # TODO::add metadata
+        self.id_mgn["event_img"] += 1
+        self.id_mgn["event"] += 1
+        return template
+
+    def normalize_diff_content(self, obj: dict, template: dict) -> dict:
+        """Map relevant diffraction images to NeXus.
+
+        Writes source-file provenance and an image_twod group (axes, title,
+        magnitude) below an IMAGE_C_SET into template, then increments the
+        "event_img" and "event" counters in self.id_mgn.
+
+        Args:
+            obj: dict with at least the keys "metadata", "axes", and "data".
+            template: mapping that is filled in place.
+
+        Returns:
+            The template that was passed in.
+
+        Raises:
+            ValueError: if an axis is named anything other than "y" or "x".
+        """
+        # TODO::the above-mentioned constraint is not general enough
+        # this can work only for cases where we know that we not only have a
+        # Ceta camera but also use it for taking diffraction pattern
+        # TODO::this is an example that more logic is needed to identify whether
+        # the information inside obj really has a similarity with the concept of
+        # somebody having taken a diffraction image
+        # one can compare the situation with the following:
+        # assume you wish to take pictures of apples and have an NXapple_picture
+        # but all you get is an image from a digital camera where the dataset is
+        # named maybe DCIM, without a logic one cannot make the mapping robustly!
+        # can one map y, x, on j, i indices
+        idx_map = {"y": "j", "x": "i"}
+        # meta is needed below for the detector identifier and the title
+        meta = fd.FlatDict(obj["metadata"], "/")
+        dims = get_axes_dims(obj["axes"])
+        for dim in dims:
+            if dim[0] not in idx_map:
+                raise ValueError(f"Unable to map index {dim[0]} on something!")
+
+        trg = f"/ENTRY[entry{self.entry_id}]/measurement/event_data_em_set/" \
+              f"EVENT_DATA_EM[event_data_em{self.id_mgn['event']}]/" \
+              f"IMAGE_C_SET[image_c_set{self.id_mgn['event_img']}]"
+        template[f"{trg}/PROCESS[process]/source/type"] = "file"
+        template[f"{trg}/PROCESS[process]/source/path"] = self.file_path
+        template[f"{trg}/PROCESS[process]/source/checksum"] = self.file_path_sha256
+        template[f"{trg}/PROCESS[process]/source/algorithm"] = "SHA256"
+        template[f"{trg}/PROCESS[process]/detector_identifier"] = meta["General/title"]
+        template[f"{trg}/image_twod/@NX_class"] = "NXdata"  # TODO::writer should do!
+        template[f"{trg}/image_twod/@signal"] = "magnitude"
+        template[f"{trg}/image_twod/@axes"] = []
+        for dim in dims:
+            axis_name = idx_map[dim[0]]
+            template[f"{trg}/image_twod/@axes"].append(f"axis_{axis_name}")
+            template[f"{trg}/image_twod/@AXISNAME_indices[axis_{axis_name}]"] \
+                = np.uint32(dim[1])
+            # the helper may hand back None instead of a (support, unit) pair,
+            # unpacking it unconditionally would raise a TypeError
+            named_axis = get_named_axis(obj["axes"], dim[0])
+            if named_axis is None:
+                continue
+            support, unit = named_axis
+            if support is not None and unit is not None and unit.startswith("1 /"):
+                template[f"{trg}/image_twod/axis_{axis_name}"] \
+                    = {"compress": support, "strength": 1}
+                template[f"{trg}/image_twod/axis_{axis_name}/@long_name"] \
+                    = f"{axis_name}-axis position ({unit})"
+        template[f"{trg}/image_twod/title"] = meta["General/title"]
+        template[f"{trg}/image_twod/magnitude"] \
+            = {"compress": np.asarray(obj["data"]), "strength": 1}
+        # template[f"{trg}/image_twod/magnitude/@units"]
+        # TODO::add metadata
+        self.id_mgn["event_img"] += 1
+        self.id_mgn["event"] += 1
         return template
 
-    def normalize_edxs_content(self, template: dict) -> dict:
+    def normalize_eds_content(self, obj: dict, template: dict) -> dict:
+        """Placeholder for ED(X)S content, currently returns template unchanged."""
         return template
 
-    def normalize_eels_content(self, template: dict) -> dict:
+    def normalize_eels_content(self, obj: dict, template: dict) -> dict:
+        """Placeholder for EELS content, currently returns template unchanged."""
         return template
diff --git a/pynxtools/dataconverter/readers/em/utils/rsciio_hyperspy_utils.py b/pynxtools/dataconverter/readers/em/utils/rsciio_hyperspy_utils.py
@@ -0,0 +1,68 @@
+#
+# Copyright The NOMAD Authors.
+#
+# This file is part of NOMAD. See https://nomad-lab.eu for further info.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Utility functions to interpret data from hyperspy-project-specific representation."""
+
+import numpy as np
+
+
+def get_named_axis(axes_metadata, dim_name):
+    """Return tuple (axis positions, unit) for the axis named dim_name.
+
+    The positions are offset + i * scale for i = 0, ..., size - 1 as a
+    numpy float64 array. When several axes carry the same name the last
+    one wins.
+
+    Args:
+        axes_metadata: sequence of axis descriptions, entries which are not
+            a dict or have no "name" key are ignored; may be None or empty.
+        dim_name: name of the axis to look up.
+
+    Returns:
+        Tuple (positions, units) or (None, None) if no axis is named
+        dim_name, so that callers can always unpack two values.
+
+    Raises:
+        ValueError: if the matching axis lacks one of the required keys.
+    """
+    retval = (None, None)
+    if axes_metadata is None or len(axes_metadata) == 0:
+        return retval
+    reqs = ("index_in_array", "offset", "scale", "size", "units", "navigate")  # "name"
+    for axis in axes_metadata:
+        if not isinstance(axis, dict):
+            continue
+        if "name" not in axis or axis["name"] != dim_name:
+            continue
+        for req in reqs:
+            if req not in axis:
+                raise ValueError(f"{req} not in {axis}!")
+        steps = np.linspace(0.,
+                            axis["size"] - 1.,
+                            num=int(axis["size"]),
+                            endpoint=True)
+        retval = (np.asarray(axis["offset"] + steps * axis["scale"], np.float64),
+                  axis["units"])
+    return retval
+
+
+def get_axes_dims(axes_metadata):
+    """Collect (name, index_in_array) tuples of all axes defining both keys."""
+    if len(axes_metadata) < 1:
+        return []
+    # TODO::it seems that hyperspy sorts this by index_in_array
+    return [(entry["name"], entry["index_in_array"])
+            for entry in axes_metadata
+            if isinstance(entry, dict)
+            and ("name" in entry) and ("index_in_array" in entry)]
+
+
+def get_axes_units(axes_metadata):
+    """Collect the "units" value of every axis which defines one."""
+    if len(axes_metadata) < 1:
+        return []
+    # TODO::it seems that hyperspy sorts this by index_in_array
+    return [entry["units"]
+            for entry in axes_metadata
+            if isinstance(entry, dict) and "units" in entry]