From cb6fe0b6d2596e81969a60ae394298f3e439c852 Mon Sep 17 00:00:00 2001
From: atomprobe-tc <markus.kuehbach@physik.hu-berlin.de>
Date: Tue, 28 Nov 2023 15:40:32 +0100
Subject: [PATCH] Added support for pyccapt file formats and tested
 successfully with Mehrpad's example

---
 dev-requirements.txt                          |  18 ++
 ifes_apt_tc_data_modeling/ato/ato_reader.py   |   1 -
 .../pyccapt/pyccapt_reader.py                 | 213 ++++++++++++++++++
 pyproject.toml                                |   3 +-
 tests/TestsForDevelopers.ipynb                |  71 ++++++
 5 files changed, 304 insertions(+), 2 deletions(-)
 create mode 100644 ifes_apt_tc_data_modeling/pyccapt/pyccapt_reader.py

diff --git a/dev-requirements.txt b/dev-requirements.txt
index 0947a56..c34a224 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -29,6 +29,8 @@ bleach==6.0.0
     # via
     #   nbconvert
     #   readme-renderer
+blosc2==2.3.1
+    # via tables
 certifi==2022.12.7
     # via requests
 cffi==1.15.1
@@ -167,6 +169,8 @@ more-itertools==9.1.0
     # via jaraco-classes
 mpmath==1.3.0
     # via sympy
+msgpack==1.0.7
+    # via blosc2
 nbclassic==0.5.6
     # via
     #   jupyterlab
@@ -185,6 +189,8 @@ nbformat==5.8.0
     #   nbclient
     #   nbconvert
     #   notebook
+ndindex==1.7
+    # via blosc2
 nest-asyncio==1.5.6
     # via
     #   ipykernel
@@ -196,17 +202,22 @@ notebook==6.5.4
     # via jupyterlab
 notebook-shim==0.2.3
     # via nbclassic
+numexpr==2.8.7
+    # via tables
 numpy==1.24.3
     # via
     #   ase
+    #   blosc2
     #   contourpy
     #   h5grove
     #   h5py
     #   ifes-apt-tc-data-modeling (pyproject.toml)
     #   matplotlib
+    #   numexpr
     #   pandas
     #   radioactivedecay
     #   scipy
+    #   tables
     #   tifffile
 orjson==3.8.11
     # via h5grove
@@ -218,6 +229,7 @@ packaging==23.1
     #   jupyterlab-server
     #   matplotlib
     #   nbconvert
+    #   tables
 pandas==2.0.1
     # via ifes-apt-tc-data-modeling (pyproject.toml)
 pandocfilters==1.5.0
@@ -249,6 +261,10 @@ ptyprocess==0.7.0
     #   terminado
 pure-eval==0.2.2
     # via stack-data
+py-cpuinfo==9.0.0
+    # via
+    #   blosc2
+    #   tables
 pycparser==2.21
     # via cffi
 pygments==2.15.1
@@ -314,6 +330,8 @@ stack-data==0.6.2
     # via ipython
 sympy==1.11.1
     # via radioactivedecay
+tables==3.9.2
+    # via ifes-apt-tc-data-modeling (pyproject.toml)
 terminado==0.17.1
     # via
     #   jupyter-server
diff --git a/ifes_apt_tc_data_modeling/ato/ato_reader.py b/ifes_apt_tc_data_modeling/ato/ato_reader.py
index d9a75e1..a5ebce2 100644
--- a/ifes_apt_tc_data_modeling/ato/ato_reader.py
+++ b/ifes_apt_tc_data_modeling/ato/ato_reader.py
@@ -24,7 +24,6 @@
 import numpy as np
 
 from ifes_apt_tc_data_modeling.nexus.nx_field import NxField
-
 from ifes_apt_tc_data_modeling.utils.mmapped_io import get_memory_mapped_data
 
 
diff --git a/ifes_apt_tc_data_modeling/pyccapt/pyccapt_reader.py b/ifes_apt_tc_data_modeling/pyccapt/pyccapt_reader.py
new file mode 100644
index 0000000..5b793d5
--- /dev/null
+++ b/ifes_apt_tc_data_modeling/pyccapt/pyccapt_reader.py
@@ -0,0 +1,213 @@
+# Reader for the pyccapt HDF5 file formats used by atom probe microscopists.
+#
+# Copyright The NOMAD Authors.
+#
+# This file is part of NOMAD. See https://nomad-lab.eu for further info.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# pylint: disable=no-member,duplicate-code
+
+import os
+
+import h5py
+
+import numpy as np
+
+import pandas as pd
+
+from ase.data import atomic_numbers, chemical_symbols
+from ifes_apt_tc_data_modeling.nexus.nx_ion import NxIon
+from ifes_apt_tc_data_modeling.nexus.nx_field import NxField
+from ifes_apt_tc_data_modeling.utils.utils import \
+    isotope_to_hash, isotope_vector_to_nuclid_list, MAX_NUMBER_OF_ATOMS_PER_ION
+
+# this implementation focuses on the following state of the pyccapt repository
+# https://github.com/mmonajem/pyccapt/commit/e955beb4f2627befb8b4d26f2e74e4c52e00394e
+
+# during the course of an atom probe measurement and analysis with FAU/Erlangen's Oxcart instrument
+# several HDF5 files are generated with essentially two software tools. One is pyccapt which has
+# a control module, a calibration module (where the voltage/bowl calibration and reconstruction is performed),
+# and a module/functionalities to document ranging, i.e. the ion type identification made.
+# The other software typically used by the FAU/Erlangen atom probe group is Atom Probe Toolbox;
+# implemented as a set of Matlab live scripts this toolbox offers data analysis functionalities,
+# results are stored via an HDF5 file
+
+# specific comments 
+# pyccapt/control
+# an HDF5 file keeping relevant quantities 
+
+# pyccapt/calibration
+# unfortunately the generated HDF5 file has internally no provenance information
+# with which pyccapt version it was generated, therefore developers of pyccapt should
+# rather write the content of the HDF5 file explicitly dset by dset e.g. using h5py instead
+# of the pandas HDF5 dump convenience functionality
+# of course pandas stores its own version but that is not conclusive enough to infer with
+# which pyccapt version and most importantly from which other context the file was generated
+# this is an aspect of the FAIR RDM principles which the pyccapt approach currently ignores
+
+
+class ReadPyccaptControlFileFormat():
+    """Read FAU/Erlangen pyccapt (control module) HDF5 file format."""
+
+    def __init__(self, filename: str):
+        assert len(filename) > 2, "H5 file incorrect filename ending!"  # NOTE: asserts are skipped under python -O
+        assert filename.lower().endswith(".h5") or filename.lower().endswith(".hdf5"), \
+            "HDF5 file incorrect file type!"
+        self.filename = filename
+
+        self.filesize = os.path.getsize(self.filename)  # raises OSError when the file is not accessible
+        self.number_of_events = None  # never set by this class
+        self.version = "e955beb4f2627befb8b4d26f2e74e4c52e00394e"  # pyccapt commit this reader targets
+
+        # check that the formatting matches that of a pyccapt control module output HDF5 file
+        with h5py.File(self.filename, "r") as h5r:
+            self.supported = 0  # voting-based: one vote per required group that is found
+            required_groups = ["apt", "dld", "tdc"]
+            for req_grpnm in required_groups:
+                if req_grpnm in h5r.keys():
+                    self.supported += 1
+            if self.supported == 3:
+                print(f"{self.filename} is a supported pyccapt/control HDF5 file!")
+            else:
+                print(f"{self.filename} is not a supported pyccapt/control HDF5 file!")
+                return  # no exception is raised, self.supported < 3 is the only marker of an unsupported file
+
+
+class ReadPyccaptCalibrationFileFormat():
+    """Read FAU/Erlangen pyccapt (calibration module) HDF5 file format."""
+
+    def __init__(self, filename: str):
+        assert len(filename) > 2, "H5 file incorrect filename ending!"  # NOTE: asserts are skipped under python -O
+        assert filename.lower().endswith(".h5") or filename.lower().endswith(".hdf5"), \
+            "HDF5 file incorrect file type!"
+        self.filename = filename
+
+        self.filesize = os.path.getsize(self.filename)  # raises OSError when the file is not accessible
+        self.number_of_events = None  # set to the number of table rows once the file was loaded
+        self.version = "e955beb4f2627befb8b4d26f2e74e4c52e00394e"  # pyccapt commit this reader targets
+        self.df = None
+
+        with h5py.File(self.filename, "r") as h5r:
+            self.supported = 0  # voting-based: one vote per required entry that is found
+            required_entries = ["df",
+                                "df/axis0", "df/axis1",
+                                "df/block0_items", "df/block0_values",
+                                "df/block1_items", "df/block1_values"]
+            for entry in required_entries:
+                if entry in h5r.keys():
+                    self.supported += 1
+            if self.supported == 7:
+                print(f"{self.filename} is a supported pyccapt/calibration HDF5 file!")
+            else:
+                print(f"{self.filename} is not a supported pyccapt/calibration HDF5 file!")
+                return  # self.df stays None, so the get_* methods below fail for unsupported files
+
+        self.df = pd.read_hdf(self.filename)
+        self.number_of_events = np.shape(self.df)[0]  # each row of the table counts as one event
+    
+    def get_named_quantities(self, term: str):  # column term of self.df, None if there is no such column
+        if term in self.df.keys():
+            return self.df[term]
+        return None
+    
+    def get_reconstructed_positions(self):
+        """Return the x (nm), y (nm), z (nm) columns as an (n_events, 3) float32 NxField with unit nm."""
+
+        xyz = NxField()
+        xyz.typed_value = np.zeros(
+            [self.number_of_events, 3], np.float32)
+        xyz.unit = "nm"
+
+        dim = 0
+        for quant in ["x (nm)", "y (nm)", "z (nm)"]:  # one array column per coordinate
+            xyz.typed_value[:, dim] = np.asarray(self.get_named_quantities(quant), np.float32)
+            dim += 1
+        return xyz
+
+    def get_mass_to_charge_state_ratio(self):
+        """Return the mc_c (Da) column, the (calibrated) mass-to-charge-state ratio, as an (n_events, 1) float32 NxField."""
+
+        m_n = NxField()
+        m_n.typed_value = np.zeros(
+            [self.number_of_events, 1], np.float32)
+        m_n.unit = "Da"
+        # NOTE(review): a missing column makes get_named_quantities return None - confirm how that case should be handled
+        m_n.typed_value[:, 0] = np.asarray(self.get_named_quantities("mc_c (Da)"), np.float32)
+        return m_n
+
+
+class ReadPyccaptRangingFileFormat():
+    """Read FAU/Erlangen pyccapt (ranging module) HDF5 file format."""
+
+    def __init__(self, filename: str):
+        assert len(filename) > 2, "H5 file incorrect filename ending!"  # NOTE: asserts are skipped under python -O
+        assert filename.lower().endswith(".h5") or filename.lower().endswith(".hdf5"), \
+            "HDF5 file incorrect file type!"
+        self.filename = filename
+
+        self.filesize = os.path.getsize(self.filename)  # raises OSError when the file is not accessible
+        self.number_of_events = None  # never set by this class
+        self.version = "e955beb4f2627befb8b4d26f2e74e4c52e00394e"  # pyccapt commit this reader targets
+        self.df = None
+
+        with h5py.File(self.filename, "r") as h5r:
+            self.supported = 0  # voting-based: one vote per required entry that is found
+            required_entries = ["df",
+                                "df/axis0", "df/axis1",
+                                "df/block0_items", "df/block0_values",
+                                "df/block1_items", "df/block1_values",
+                                "df/block2_items", "df/block2_values"]
+            for entry in required_entries:
+                if entry in h5r.keys():
+                    self.supported += 1
+            if self.supported == 9:
+                print(f"{self.filename} is a supported pyccapt/ranging HDF5 file!")
+            else:
+                print(f"{self.filename} is not a supported pyccapt/ranging HDF5 file!")
+                return  # self.df stays None and self.rng is not created for unsupported files
+
+        self.df = pd.read_hdf(self.filename)
+        self.rng = {}
+        self.rng["molecular_ions"] = []  # one NxIon per table row that is not flagged "unranged"
+        print(np.shape(self.df)[0])  # reports the number of rows in the table
+        for idx in np.arange(0, np.shape(self.df)[0]):
+            if isinstance(self.df.iloc[idx, 6], str) is True:
+                if self.df.iloc[idx, 6] == "unranged":
+                    continue  # such rows define no ion
+            # columns are addressed by position: 6 element symbols, 7 atom count per symbol, 8 nucleon number per symbol
+            elements = self.df.iloc[idx, 6]
+            complexs = self.df.iloc[idx, 7]
+            isotopes = self.df.iloc[idx, 8]
+            # TODO: assertions (e.g. that elements, complexs, and isotopes have the same length) are missing
+            ivec = np.zeros((MAX_NUMBER_OF_ATOMS_PER_ION,), np.uint16)
+            hashvector = []
+            for idxj in np.arange(0, len(elements)):
+                symbol = elements[idxj]
+                if symbol in chemical_symbols and symbol != "X":  # symbols unknown to ase and "X" are ignored
+                    proton_number = atomic_numbers[symbol]
+                    neutron_number = isotopes[idxj] - proton_number
+                    for mult in np.arange(0, complexs[idxj]):  # one hash per atom, mult itself is unused
+                        hashvector.append(isotope_to_hash(proton_number, neutron_number))
+            ivec[0:len(hashvector)] = np.sort(np.asarray(hashvector, np.uint16), kind="stable")[::-1]
+            # hashes are stored in descending order, unused trailing entries of ivec stay zero
+            m_ion = NxIon()
+            m_ion.isotope_vector.typed_value = ivec
+            m_ion.nuclid_list.typed_value = isotope_vector_to_nuclid_list(ivec)
+            m_ion.charge_state.typed_value = np.int8(self.df.iloc[idx, 9])  # column 9 is used as the charge state
+            m_ion.add_range(self.df.iloc[idx, 3], self.df.iloc[idx, 4])  # presumably lower, upper bound - confirm
+            m_ion.update_human_readable_name()
+            # m_ion.report()
+            self.rng["molecular_ions"].append(m_ion)
+        print(f"{self.filename} parsed successfully")
diff --git a/pyproject.toml b/pyproject.toml
index 1b00af1..e3c0817 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,8 @@ classifiers = [
 dependencies = [
     "h5py>=3.6.0",
     "numpy>=1.21.2",
-    "pandas>=1.3.2",
+    "pandas",
+    "tables",
     "ase>=3.19.0",
     "radioactivedecay>=0.4.16"
 ]
diff --git a/tests/TestsForDevelopers.ipynb b/tests/TestsForDevelopers.ipynb
index b7e0e19..67fb6ee 100644
--- a/tests/TestsForDevelopers.ipynb
+++ b/tests/TestsForDevelopers.ipynb
@@ -30,6 +30,7 @@
     "import os\n",
     "import numpy as np\n",
     "import h5py\n",
+    "import pandas as pd\n",
     "from jupyterlab_h5web import H5Web\n",
     "from ifes_apt_tc_data_modeling.utils.utils import create_isotope_vector, \\\n",
     "    isotope_vector_to_dict_keyword, isotope_vector_to_human_readable_name, \\\n",
@@ -41,6 +42,76 @@
     "from ase.data import atomic_numbers, atomic_masses, chemical_symbols"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "e680a9d0-4005-40b0-ab45-a8f5fb215273",
+   "metadata": {},
+   "source": [
+    "## FAU/Erlangen pyccapt control/calibration/ranging module"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2378059c-dae4-46e8-88b0-9fd1ac8a5a70",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "prefix = f\"{os.getcwd()}/../../../../../paper_paper_paper/joss_nomad_apt/bb_analysis/data\"\n",
+    "fnm = [\"1748_Al.h5\",\n",
+    "       \"1748_Al_range_.h5\",\n",
+    "       \"1748_Nov-14-2023_13-31_Al.h5\"]\n",
+    "# df = pd.read_hdf(f\"{prefix}/ger_erlangen_pyccapt_format/{fnm[1]}\")\n",
+    "# H5Web(f\"{prefix}/ger_erlangen_pyccapt_format/{fnm[1]}\")\n",
+    "# df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "319c3c28-8d4d-46a5-bb73-7a8396d84020",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from ifes_apt_tc_data_modeling.pyccapt.pyccapt_reader import ReadPyccaptControlFileFormat, ReadPyccaptCalibrationFileFormat, ReadPyccaptRangingFileFormat"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ac6b2e02-371b-47ff-9c68-b23b6e1b4b83",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pyc_r = ReadPyccaptRangingFileFormat(f\"{prefix}/ger_erlangen_pyccapt_format/{fnm[1]}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e7178b8b-51f7-4270-acbe-77844692477b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pyc_m = ReadPyccaptControlFileFormat(f\"{prefix}/ger_erlangen_pyccapt_format/{fnm[2]}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c1f1c4d8-75fe-4884-b81a-338a6c5eab94",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pyc_c = ReadPyccaptCalibrationFileFormat(f\"{prefix}/ger_erlangen_pyccapt_format/{fnm[0]}\")\n",
+    "xyz = pyc_c.get_reconstructed_positions()\n",
+    "print(xyz.typed_value)\n",
+    "m_q = pyc_c.get_mass_to_charge_state_ratio()\n",
+    "print(m_q.typed_value)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "52294143-78c7-47bf-b39e-9e40eec3999d",