Skip to content

Commit

Permalink
Fixed bug, refactored get_metadata, added and tested successfully has…
Browse files Browse the repository at this point in the history
…hing of files inside zip, linting, styling, mypy typing
  • Loading branch information
atomprobe-tc committed Dec 19, 2023
1 parent a065978 commit 97c7f8a
Show file tree
Hide file tree
Showing 11 changed files with 366 additions and 320 deletions.
6 changes: 3 additions & 3 deletions imgs.batch.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

datasource="../../../../paper_paper_paper/scidat_nomad_ebsd/bb_analysis/data/development_imgs/ikz_robert/"
datasource="../../../../paper_paper_paper/scidat_nomad_ebsd/bb_analysis/data/development_imgs/axon/"

# comments is detector mode
examples="kit/FeMoOx_AntiA_04_1k5x_CN.tif"
Expand All @@ -9,9 +9,9 @@ examples="ikz_martin/ALN_baoh_021.tif" # T2
examples="ikz_robert/T3_image.tif"
examples="ikz_robert/ETD_image.tif" # ETD
examples="ikz_martin/NavCam_normal_vis_light_ccd.tif" # NavCam

examples="0c8nA_3deg_003_AplusB_test.tif ALN_baoh_021.tif T3_image.tif ETD_image.tif NavCam_normal_vis_light_ccd.tif"

examples="axon/20210426T224437.049Raw0.png" #axon
examples="ReductionOfFeOx.zip"

for example in $examples; do
echo $example
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ def __init__(self):
self.tmp: Dict = {}
for entry in NX_IMAGE_REAL_SPACE_SET_HDF_PATH:
if entry.endswith("-field") is True:
self.tmp[entry[0:len(entry)-len("-field")]] = NxObject(eqv_hdf="dataset")
self.tmp[entry[0:len(entry) - len("-field")]] = NxObject(eqv_hdf="dataset")
elif entry.endswith("-attribute") is True:
self.tmp[entry[0:len(entry)-len("-attribute")]] = NxObject(eqv_hdf="attribute")
self.tmp[entry[0:len(entry) - len("-attribute")]] = NxObject(eqv_hdf="attribute")
else:
self.tmp[entry[0:len(entry)-len("-group")]] = NxObject(eqv_hdf="group")
self.tmp[entry[0:len(entry) - len("-group")]] = NxObject(eqv_hdf="group")
2 changes: 1 addition & 1 deletion pynxtools/dataconverter/readers/em/concepts/nxs_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def __init__(self,
if (unit is not None) and (unit == ""):
raise ValueError(f"Value for argument unit needs to be a non-empty string !")
if (dtype is not None) and isinstance(dtype, type) is False:
raise ValueError(f"Value of argument dtype must not be None " \
raise ValueError(f"Value of argument dtype must not be None "
f" and a valid, ideally a numpy datatype !")
# self.doc = None # docstring
self.name = name # name of the field
Expand Down
10 changes: 5 additions & 5 deletions pynxtools/dataconverter/readers/em/concepts/nxs_spectrum_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@
# pylint: disable=no-member,too-few-public-methods


from typing import Dict
from typing import Dict, List

from pynxtools.dataconverter.readers.em.concepts.nxs_object import NxObject


NX_SPECTRUM_SET_HDF_PATH = []
NX_SPECTRUM_SET_HDF_PATH: List = []
# this one needs an update !


Expand All @@ -34,8 +34,8 @@ def __init__(self):
self.tmp: Dict = {}
for entry in NX_SPECTRUM_SET_HDF_PATH:
if entry.endswith("-field") is True:
self.tmp[entry[0:len(entry)-len("-field")]] = NxObject(eqv_hdf="dataset")
self.tmp[entry[0:len(entry) - len("-field")]] = NxObject(eqv_hdf="dataset")
elif entry.endswith("-attribute") is True:
self.tmp[entry[0:len(entry)-len("-attribute")]] = NxObject(eqv_hdf="attribute")
self.tmp[entry[0:len(entry) - len("-attribute")]] = NxObject(eqv_hdf="attribute")
else:
self.tmp[entry[0:len(entry)-len("-group")]] = NxObject(eqv_hdf="group")
self.tmp[entry[0:len(entry) - len("-group")]] = NxObject(eqv_hdf="group")
3 changes: 2 additions & 1 deletion pynxtools/dataconverter/readers/em/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,8 @@ def read(self,
# print("Create NeXus default plottable data...")
# em_default_plot_generator(template, 1)

if True is False:
run_block = False
if run_block is True:
nxs_plt = NxEmDefaultPlotResolver()
# if nxs_mtex is the sub-parser
resolved_path = nxs_plt.nxs_mtex_get_nxpath_to_default_plot(
Expand Down
4 changes: 2 additions & 2 deletions pynxtools/dataconverter/readers/em/subparsers/hfive_apex.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ def parse_and_normalize_eds_spd(self, fp, src: str, ckey: str):
for req in reqs:
if req not in fp[f"{src}/SPD"].attrs.keys(): # also check for shape
raise ValueError(f"Required attribute named {req} not found in {src}/SPD !")

nyxe = {"y": fp[f"{src}/SPD"].attrs["NumberOfLines"][0],
"x": fp[f"{src}/SPD"].attrs["NumberOfPoints"][0],
"e": fp[f"{src}/SPD"].attrs["NumberofChannels"][0]}
Expand All @@ -334,7 +334,7 @@ def parse_and_normalize_eds_spd(self, fp, src: str, ckey: str):
# thereby these EDAX energy count arrays are just some payload inside a set of compressed chunks
# without some extra logic to resolve the third (energy) dimension reading them can be super inefficient
# so let's read chunk-by-chunk to reuse chunk cache, hopefully...
chk_bnds = {"x": [], "y": []}
chk_bnds: Dict = {"x": [], "y": []}
chk_info = {"ny": nyxe["y"], "cy": fp[f"{src}/SPD"].chunks[0],
"nx": nyxe["x"], "cx": fp[f"{src}/SPD"].chunks[1]}
for dim in ["y", "x"]:
Expand Down
127 changes: 60 additions & 67 deletions pynxtools/dataconverter/readers/em/subparsers/image_png_protochips.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
from typing import Dict
from PIL import Image
from zipfile import ZipFile
from collections import OrderedDict

from pynxtools.dataconverter.readers.em.subparsers.image_png_protochips_concepts import \
get_protochips_variadic_concept
Expand All @@ -34,28 +33,9 @@
import variadic_path_to_specific_path
from pynxtools.dataconverter.readers.em.subparsers.image_png_protochips_modifier import \
get_nexus_value
from pynxtools.dataconverter.readers.em.subparsers.image_base import \
ImgsBaseParser


def flatten_xml_to_dict(xml_content) -> dict:
    """Flatten a nested dict/list structure (e.g. from xmltodict) into a
    single-level ordered mapping with dot-delimited composite keys.

    Nested dict keys are joined as ``parent.child``; list elements are
    addressed as ``parent.[index].child``.  Leaf values are kept as-is.
    """
    # https://codereview.stackexchange.com/a/21035
    # https://stackoverflow.com/questions/38852822/how-to-flatten-xml-file-in-python
    flat = OrderedDict()
    for key, value in xml_content.items():
        if isinstance(value, dict):
            # nested subtree: recurse, then prefix every child key
            for subkey, subvalue in flatten_xml_to_dict(value).items():
                flat[f"{key}.{subkey}"] = subvalue
        elif isinstance(value, list):
            # nested list: encode the element index into the key
            for idx, element in enumerate(value):
                for subkey, subvalue in flatten_xml_to_dict(element).items():
                    flat[f"{key}.[{idx}].{subkey}"] = subvalue
        else:
            # everything else (only leafs should remain)
            flat[key] = value
    return flat
from pynxtools.dataconverter.readers.em.subparsers.image_base import ImgsBaseParser
from pynxtools.dataconverter.readers.em.utils.xml_utils import flatten_xml_to_dict
from pynxtools.dataconverter.readers.shared.shared_utils import get_sha256_of_file_content


class ProtochipsPngSetSubParser(ImgsBaseParser):
Expand Down Expand Up @@ -99,8 +79,8 @@ def check_if_zipped_png_protochips(self):
try:
nparr = np.array(png)
self.png_info[file] = np.shape(nparr)
except:
raise ValueError(f"Loading image data in-place from {self.file_path}:{file} failed !")
except IOError:
print(f"Loading image data in-place from {self.file_path}:{file} failed !")
if method == "smart": # knowing where to hunt width and height in PNG metadata
# https://dev.exiv2.org/projects/exiv2/wiki/The_Metadata_in_PNG_files
magic = fp.read(8)
Expand All @@ -125,6 +105,53 @@ def check_if_zipped_png_protochips(self):
print("All tests passed successfully")
self.supported = True

def get_xml_metadata(self, file, fp):
    """Extract and flatten Protochips XML metadata embedded in one PNG of the zip.

    Reads the PNG from file handle *fp* (a member of the zip archive), looks for
    the "MicroscopeControlImage" XML chunk in the PNG info, flattens it to
    dot-delimited concept keys, and caches resolved key/value pairs in
    self.tmp["meta"][file].
    """
    try:
        fp.seek(0)  # fp may have been read already (e.g. by the shape check); rewind
        with Image.open(fp) as png:
            png.load()
            if "MicroscopeControlImage" in png.info.keys():
                meta = flatten_xml_to_dict(
                    xmltodict.parse(png.info["MicroscopeControlImage"]))
                # first phase analyse the collection of Protochips metadata concept instance symbols and reduce to unique concepts
                grpnm_lookup = {}
                for concept, value in meta.items():
                    # not every key is allowed to define a concept
                    # print(f"{concept}: {value}")
                    # NOTE(review): the '.' in this pattern are unescaped and match any
                    # character; works for keys like "Foo.[0].Bar" but is looser than
                    # r"\.\[[0-9]+\]\." — confirm intended
                    idxs = re.finditer(r".\[[0-9]+\].", concept)
                    # counting via sum(...) consumes the finditer iterator; only the
                    # count is needed here
                    if (sum(1 for _ in idxs) > 0):  # is_variadic
                        markers = [".Name", ".PositionerName"]
                        for marker in markers:
                            if concept.endswith(marker):
                                # strip the marker suffix: the remaining prefix names the group
                                grpnm_lookup[f"{concept[0:len(concept)-len(marker)]}"] = value
                    else:
                        grpnm_lookup[concept] = value
                # second phase, evaluate each concept instance symbol wrt to its prefix coming from the unique concept
                self.tmp["meta"][file] = {}
                for k, v in meta.items():
                    grpnms = None
                    idxs = re.finditer(r".\[[0-9]+\].", k)
                    if (sum(1 for _ in idxs) > 0):  # is variadic
                        # prefix up to and including the last "]" of the last index token
                        search_argument = k[0:k.rfind("].") + 1]
                        for parent_grpnm, child_grpnm in grpnm_lookup.items():
                            if parent_grpnm.startswith(search_argument):
                                grpnms = (parent_grpnm, child_grpnm)
                                break
                        if grpnms is not None:
                            if len(grpnms) == 2:
                                # positioner settings keep their trailing field name appended
                                if "PositionerSettings" in k and k.endswith(".PositionerName") is False:
                                    self.tmp["meta"][file][f"{grpnms[0]}.{grpnms[1]}{k[k.rfind('.') + 1:]}"] = v
                                # plain ".Value" entries map directly onto the resolved group name
                                if k.endswith(".Value"):
                                    self.tmp["meta"][file][f"{grpnms[0]}.{grpnms[1]}"] = v
                    else:
                        self.tmp["meta"][file][f"{k}"] = v
                # TODO::simplify and check that metadata end up correctly in self.tmp["meta"][file]
    except ValueError:
        print(f"Flattening XML metadata content {self.file_path}:{file} failed !")

def get_file_hash(self, file, fp):
    """Compute and cache the SHA256 checksum of *file*'s content read from *fp*."""
    checksum = get_sha256_of_file_content(fp)
    self.tmp["meta"][file]["sha256"] = checksum

def parse_and_normalize(self):
"""Perform actual parsing filling cache self.tmp."""
if self.supported is True:
Expand All @@ -133,55 +160,21 @@ def parse_and_normalize(self):
with ZipFile(self.file_path) as zip_file_hdl:
for file in self.png_info.keys():
with zip_file_hdl.open(file) as fp:
try:
with Image.open(fp) as png:
png.load()
if "MicroscopeControlImage" in png.info.keys():
meta = flatten_xml_to_dict(
xmltodict.parse(png.info["MicroscopeControlImage"]))
# first phase analyse the collection of Protochips metadata concept instance symbols and reduce to unique concepts
self.tmp["meta"][file] = {}
for concept, value in meta.items():
# not every key is allowed to define a concept
# print(f"{concept}: {value}")
idxs = re.finditer(".\[[0-9]+\].", concept)
if (sum(1 for _ in idxs) > 0): # is_variadic
markers = [".Name", ".PositionerName"]
for marker in markers:
if concept.endswith(marker):
self.tmp["meta"][file][f"{concept[0:len(concept)-len(marker)]}"] = value
else:
self.tmp["meta"][file][concept] = value
# print(f"First phase of metadata parsing {self.file_path}:{file} successful")
# second phase, evaluate each concept instance symbol wrt to its prefix coming from the unique concept
for k, v in meta.items():
grpnms = None
idxs = re.finditer(".\[[0-9]+\].", k)
if (sum(1 for _ in idxs) > 0): # is variadic
search_argument = k[0:k.rfind("].")+1]
for parent_grpnm, child_grpnm in self.tmp["meta"][file].items():
if parent_grpnm.startswith(search_argument):
grpnms = (parent_grpnm, child_grpnm)
break
if grpnms is not None:
if len(grpnms) == 2:
if "PositionerSettings" in k and k.endswith(".PositionerName") is False:
print(f"vv: {grpnms[0]}.{grpnms[1]}{k[k.rfind('.') + 1:]}: {v}")
if k.endswith(".Value"):
print(f"vv: {grpnms[0]}.{grpnms[1]}: {v}")
else:
print(f"nv: {k}: {v}")
# TODO::simplify and check that metadata end up correctly in self.tmp["meta"][file]
except:
raise ValueError(f"Flattening XML metadata content {self.file_path}:{file} failed !")
self.get_xml_metadata(file, fp)
self.get_file_hash(file, fp)
# print(f"Debugging self.tmp.file.items {file}")
# for k, v in self.tmp["meta"][file].items():
# print(f"{k}: {v}")
print(f"{self.file_path} metadata within PNG collection processed "
f"successfully ({len(self.tmp['meta'].keys())} PNGs evaluated).")
else:
print(f"{self.file_path} is not a Protochips-specific "
f"PNG file that this parser can process !")

def process_into_template(self, template: dict) -> dict:
    """Copy cached metadata (and eventually image data) into the NeXus template.

    Only acts when the earlier format checks set self.supported; always
    returns the (possibly updated) template so calls can be chained.
    """
    if self.supported is True:
        self.process_event_data_em_metadata(template)
        # NOTE(review): the active call below and the commented duplicate look
        # like removed/added lines of a diff — confirm whether the data-
        # processing step is meant to be enabled or disabled here
        self.process_event_data_em_data(template)
        # self.process_event_data_em_data(template)
    return template

def process_event_data_em_metadata(self, template: dict) -> dict:
Expand Down
Loading

0 comments on commit 97c7f8a

Please sign in to comment.