Merge branch 'master' into refactor-xps-reader

FAIRmat-NFDI · Dec 4, 2023 · fabe938 · fabe938
2 parents afe46e0 + 8620c4e
commit fabe938
Show file tree

Hide file tree

Showing 19 changed files with 2,051 additions and 162 deletions.
diff --git a/dev-requirements.txt b/dev-requirements.txt
@@ -4,10 +4,6 @@
 #
 #    pip-compile --extra=dev --output-file=dev-requirements.txt pyproject.toml
 #
-appnope==0.1.3
-    # via
-    #   ipykernel
-    #   ipython
 asciitree==0.3.3
     # via zarr
 ase==3.22.1
@@ -48,16 +44,13 @@ comm==0.2.0
 contourpy==1.1.1
     # via matplotlib
 coverage[toml]==7.3.2
-    # via
-    #   coverage
-    #   pytest-cov
+    # via pytest-cov
 cycler==0.12.1
     # via matplotlib
 cython==3.0.6
     # via tables
 dask[array]==2023.5.0
     # via
-    #   dask
     #   hyperspy
     #   kikuchipy
     #   orix

diff --git a/pynxtools/dataconverter/convert.py b/pynxtools/dataconverter/convert.py
@@ -22,7 +22,7 @@
 import logging
 import os
 import sys
-from typing import List, Tuple
+from typing import List, Tuple, Optional
 import xml.etree.ElementTree as ET
 
 import click
@@ -80,60 +80,146 @@ def get_names_of_all_readers() -> List[str]:
     return all_readers + plugins
 
 
-# pylint: disable=too-many-arguments,too-many-locals
-def convert(input_file: Tuple[str, ...],
-            reader: str,
-            nxdl: str,
-            output: str,
-            generate_template: bool = False,
-            fair: bool = False,
-            undocumented: bool = False,
-            **kwargs):
-    """The conversion routine that takes the input parameters and calls the necessary functions."""
+def get_nxdl_root_and_path(nxdl: str):
+    """Get xml root element and file path from nxdl name e.g. NXapm.
+
+    Parameters
+    ----------
+    nxdl: str
+        Name of nxdl file e.g. NXapm from NXapm.nxdl.xml.
+
+    Returns
+    -------
+    ET.root
+        Root element of nxdl file.
+    str
+        Path of nxdl file.
+
+    Raises
+    ------
+    FileNotFoundError
+        Error if no file with the given nxdl name is found.
+    """
     # Reading in the NXDL and generating a template
     definitions_path = nexus.get_nexus_definitions_path()
     if nxdl == "NXtest":
-        nxdl_path = os.path.join(
+        nxdl_f_path = os.path.join(
             f"{os.path.abspath(os.path.dirname(__file__))}/../../",
             "tests", "data", "dataconverter", "NXtest.nxdl.xml")
     elif nxdl == "NXroot":
-        nxdl_path = os.path.join(definitions_path, "base_classes", "NXroot.nxdl.xml")
+        nxdl_f_path = os.path.join(definitions_path, "base_classes", "NXroot.nxdl.xml")
     else:
-        nxdl_path = os.path.join(definitions_path, "contributed_definitions", f"{nxdl}.nxdl.xml")
-        if not os.path.exists(nxdl_path):
-            nxdl_path = os.path.join(definitions_path, "applications", f"{nxdl}.nxdl.xml")
-        if not os.path.exists(nxdl_path):
+        nxdl_f_path = os.path.join(definitions_path, "contributed_definitions", f"{nxdl}.nxdl.xml")
+        if not os.path.exists(nxdl_f_path):
+            nxdl_f_path = os.path.join(definitions_path, "applications", f"{nxdl}.nxdl.xml")
+        if not os.path.exists(nxdl_f_path):
+            nxdl_f_path = os.path.join(definitions_path, "base_classes", f"{nxdl}.nxdl.xml")
+        if not os.path.exists(nxdl_f_path):
             raise FileNotFoundError(f"The nxdl file, {nxdl}, was not found.")
 
-    nxdl_root = ET.parse(nxdl_path).getroot()
+    return ET.parse(nxdl_f_path).getroot(), nxdl_f_path
 
-    if undocumented:
-        logger.setLevel(UNDOCUMENTED)
+
+def transfer_data_into_template(input_file,
+                                reader, nxdl_name,
+                                nxdl_root: Optional[ET.Element] = None,
+                                **kwargs):
+    """Transfer parse and merged data from input experimental file, config file and eln.
+
+    Experimental and eln files will be parsed and finally will be merged into template.
+    Before returning the template validate the template data.
+
+    Parameters
+    ----------
+    input_file : Union[tuple[str], str]
+        Tuple of files or file
+    reader: str
+        Name of reader such as xps
+    nxdl_name : str
+        Root name of nxdl file, e.g. NXmpes from NXmpes.nxdl.xml
+    nxdl_root : ET.element
+        Root element of nxdl file, otherwise provide nxdl_name
+
+    Returns
+    -------
+    Template
+        Template filled with data from raw file and eln file.
+
+    """
+    if nxdl_root is None:
+        nxdl_root, _ = get_nxdl_root_and_path(nxdl=nxdl_name)
 
     template = Template()
     helpers.generate_template_from_nxdl(nxdl_root, template)
-    if generate_template:
-        logger.info(template)
-        return
 
-    # Setting up all the input data
     if isinstance(input_file, str):
         input_file = (input_file,)
+
     bulletpoint = "\n\u2022 "
     logger.info("Using %s reader to convert the given files: %s ",
                 reader,
                 bulletpoint.join((" ", *input_file)))
 
     data_reader = get_reader(reader)
-    if not (nxdl in data_reader.supported_nxdls or "*" in data_reader.supported_nxdls):
+    if not (nxdl_name in data_reader.supported_nxdls or "*" in data_reader.supported_nxdls):
         raise NotImplementedError("The chosen NXDL isn't supported by the selected reader.")
 
     data = data_reader().read(  # type: ignore[operator]
         template=Template(template),
         file_paths=input_file,
-        **kwargs,
+        **kwargs
     )
     helpers.validate_data_dict(template, data, nxdl_root)
+    return data
+
+
+# pylint: disable=too-many-arguments,too-many-locals
+def convert(input_file: Tuple[str, ...],
+            reader: str,
+            nxdl: str,
+            output: str,
+            generate_template: bool = False,
+            fair: bool = False,
+            undocumented: bool = False,
+            **kwargs):
+    """The conversion routine that takes the input parameters and calls the necessary functions.
+
+    Parameters
+    ----------
+    input_file : Tuple[str]
+        Tuple of files or file
+    reader: str
+        Name of reader such as xps
+    nxdl : str
+        Root name of nxdl file, e.g. NXmpes for NXmpes.nxdl.xml
+    output : str
+        Output file name.
+    generate_template : bool, default False
+        True if user wants template in logger info.
+    fair : bool, default False
+        If True, a warning is given that there are undocumented paths
+        in the template.
+    undocumented : bool, default False
+        If True, an undocumented warning is given.
+
+    Returns
+    -------
+    None.
+    """
+
+    nxdl_root, nxdl_f_path = get_nxdl_root_and_path(nxdl)
+
+    if generate_template:
+        template = Template()
+        helpers.generate_template_from_nxdl(nxdl_root, template)
+        logger.info(template)
+        return
+
+    data = transfer_data_into_template(input_file=input_file, reader=reader,
+                                       nxdl_name=nxdl, nxdl_root=nxdl_root,
+                                       **kwargs)
+    if undocumented:
+        logger.setLevel(UNDOCUMENTED)
     if fair and data.undocumented.keys():
         logger.warning("There are undocumented paths in the template. This is not acceptable!")
         return
@@ -147,7 +233,7 @@ def convert(input_file: Tuple[str, ...],
             path
         )
     helpers.add_default_root_attributes(data=data, filename=os.path.basename(output))
-    Writer(data=data, nxdl_path=nxdl_path, output_path=output).write()
+    Writer(data=data, nxdl_f_path=nxdl_f_path, output_path=output).write()
 
     logger.info("The output file generated: %s", output)
 

diff --git a/pynxtools/dataconverter/helpers.py b/pynxtools/dataconverter/helpers.py
@@ -17,12 +17,13 @@
 #
 """Helper functions commonly used by the convert routine."""
 
-from typing import List
+from typing import List, Optional, Any
 from typing import Tuple, Callable, Union
 import re
 import xml.etree.ElementTree as ET
 from datetime import datetime, timezone
 import logging
+import json
 
 import numpy as np
 from ase.data import chemical_symbols
@@ -650,3 +651,77 @@ def extract_atom_types(formula, mode='hill'):
         return convert_to_hill(atom_types)
 
     return atom_types
+
+
+# pylint: disable=too-many-branches
+def transform_to_intended_dt(str_value: Any) -> Optional[Any]:
+    """Transform string to the intended data type, if not then return str_value.
+
+    E.g '2.5E-2' will be transfor into 2.5E-2
+    tested with: '2.4E-23', '28', '45.98', 'test', ['59', '3.00005', '498E-34'],
+                 '23 34 444 5000', None
+    with result: 2.4e-23, 28, 45.98, test, [5.90000e+01 3.00005e+00 4.98000e-32],
+                 np.array([23 34 444 5000]), None
+    NOTE: add another arg in this func for giving 'hint' what kind of data like
+        numpy array or list
+    Parameters
+    ----------
+    str_value : str
+        Data from other format that comes as string e.g. string of list.
+
+    Returns
+    -------
+    Union[str, int, float, np.ndarray]
+        Converted data type
+    """
+
+    symbol_list_for_data_seperation = [';', ' ']
+    transformed: Any = None
+
+    if isinstance(str_value, list):
+        try:
+            transformed = np.array(str_value, dtype=np.float64)
+            return transformed
+        except ValueError:
+            pass
+
+    elif isinstance(str_value, np.ndarray):
+        return str_value
+    elif isinstance(str_value, str):
+        try:
+            transformed = int(str_value)
+        except ValueError:
+            try:
+                transformed = float(str_value)
+            except ValueError:
+                if '[' in str_value and ']' in str_value:
+                    transformed = json.loads(str_value)
+        if transformed is not None:
+            return transformed
+        for sym in symbol_list_for_data_seperation:
+            if sym in str_value:
+                parts = str_value.split(sym)
+                modified_parts: List = []
+                for part in parts:
+                    part = transform_to_intended_dt(part)
+                    if isinstance(part, (int, float)):
+                        modified_parts.append(part)
+                    else:
+                        return str_value
+                return transform_to_intended_dt(modified_parts)
+
+    return str_value
+
+
+def nested_dict_to_slash_separated_path(nested_dict: dict,
+                                        flattened_dict: dict,
+                                        parent_path=''):
+    """Convert nested dict into slash separeted path upto certain level."""
+    sep = '/'
+
+    for key, val in nested_dict.items():
+        path = parent_path + sep + key
+        if isinstance(val, dict):
+            nested_dict_to_slash_separated_path(val, flattened_dict, path)
+        else:
+            flattened_dict[path] = val
diff --git a/pynxtools/dataconverter/readers/xrd/README.md b/pynxtools/dataconverter/readers/xrd/README.md
@@ -0,0 +1,40 @@
+# XRD Reader
+With the XRD reader, data from X-ray diffraction experiments can be read and written into a NeXus file (an HDF5 file with the extension .nxs) according to the NXxrd_pan application definition in [NeXus](https://github.com/FAIRmat-NFDI/nexus_definitions). There are a few different methods of measuring XRD: 1. θ:2θ instruments (e.g. Rigaku H3R), and 2. θ:θ instruments (e.g. PANalytical X’Pert Pro). The goal of this reader is to support both of these methods.
+
+**NOTE: This reader is still under development. As of now, the reader can only handle files with the extension `.xrdml`, obtained with PANalytical X’Pert Pro version 1.5 (method 2 described above). We are currently working to include more file types and file versions.**
+
+## Contact Person in FAIRmat
+In principle, you can reach out to any member of Area B of the FAIRmat consortium, but Rubel Mozumder is the most suitable contact for a quick response.
+
+## Parsers
+Strictly speaking, in computer science a parser is a process that breaks code into smaller parts (called tokens) and arranges the relations among the tokens in a tree diagram. This process helps the compiler understand the token relationships in the source code.
+
+The XRD reader calls a program or class (called a parser) that reads the experimental input file and re-organises the different physical/experimental concepts or properties into a certain structure which is defined by the developer.
+
+### class pynxtools.dataconverter.readers.xrd.xrd_parser.XRDMLParser
+
+    **inputs:**
+        file_path: Full path of the input file.
+
+    **Important method:**
+        get_slash_separated_xrd_dict() -> dict
+
+        This method can be used to check whether all the data from the input file have been read; it returns the data as a slash-separated dict.
+
+
+### Other Parsers
+    **Coming Soon!!**
+
+### How To
+The reader can be run from Jupyter-notebook or Jupyter-lab with the following command:
+
+```sh
+ ! dataconverter \
+--reader xrd \
+--nxdl NXxrd_pan \
+--input-file $<xrd-file location> \
+--input-file $<eln-file location> \
+--output <output-file location>.nxs
+```
+
+An example file can be found on GitLab in [nomad-remote-tools-hub](https://gitlab.mpcdf.mpg.de/nomad-lab/nomad-remote-tools-hub/-/tree/develop/docker/xrd); feel free to visit and try out the reader.
diff --git a/pynxtools/dataconverter/readers/xrd/__init__.py b/pynxtools/dataconverter/readers/xrd/__init__.py
@@ -0,0 +1,15 @@
+# Copyright The NOMAD Authors.
+#
+# This file is part of NOMAD. See https://nomad-lab.eu for further info.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.