From cb6fe0b6d2596e81969a60ae394298f3e439c852 Mon Sep 17 00:00:00 2001 From: atomprobe-tc Date: Tue, 28 Nov 2023 15:40:32 +0100 Subject: [PATCH] Added support for pyccapt file formats and tested successfully with Mehrpad's example --- dev-requirements.txt | 18 ++ ifes_apt_tc_data_modeling/ato/ato_reader.py | 1 - .../pyccapt/pyccapt_reader.py | 213 ++++++++++++++++++ pyproject.toml | 3 +- tests/TestsForDevelopers.ipynb | 71 ++++++ 5 files changed, 304 insertions(+), 2 deletions(-) create mode 100644 ifes_apt_tc_data_modeling/pyccapt/pyccapt_reader.py diff --git a/dev-requirements.txt b/dev-requirements.txt index 0947a56..c34a224 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -29,6 +29,8 @@ bleach==6.0.0 # via # nbconvert # readme-renderer +blosc2==2.3.1 + # via tables certifi==2022.12.7 # via requests cffi==1.15.1 @@ -167,6 +169,8 @@ more-itertools==9.1.0 # via jaraco-classes mpmath==1.3.0 # via sympy +msgpack==1.0.7 + # via blosc2 nbclassic==0.5.6 # via # jupyterlab @@ -185,6 +189,8 @@ nbformat==5.8.0 # nbclient # nbconvert # notebook +ndindex==1.7 + # via blosc2 nest-asyncio==1.5.6 # via # ipykernel @@ -196,17 +202,22 @@ notebook==6.5.4 # via jupyterlab notebook-shim==0.2.3 # via nbclassic +numexpr==2.8.7 + # via tables numpy==1.24.3 # via # ase + # blosc2 # contourpy # h5grove # h5py # ifes-apt-tc-data-modeling (pyproject.toml) # matplotlib + # numexpr # pandas # radioactivedecay # scipy + # tables # tifffile orjson==3.8.11 # via h5grove @@ -218,6 +229,7 @@ packaging==23.1 # jupyterlab-server # matplotlib # nbconvert + # tables pandas==2.0.1 # via ifes-apt-tc-data-modeling (pyproject.toml) pandocfilters==1.5.0 @@ -249,6 +261,10 @@ ptyprocess==0.7.0 # terminado pure-eval==0.2.2 # via stack-data +py-cpuinfo==9.0.0 + # via + # blosc2 + # tables pycparser==2.21 # via cffi pygments==2.15.1 @@ -314,6 +330,8 @@ stack-data==0.6.2 # via ipython sympy==1.11.1 # via radioactivedecay +tables==3.9.2 + # via ifes-apt-tc-data-modeling (pyproject.toml) 
terminado==0.17.1 # via # jupyter-server diff --git a/ifes_apt_tc_data_modeling/ato/ato_reader.py b/ifes_apt_tc_data_modeling/ato/ato_reader.py index d9a75e1..a5ebce2 100644 --- a/ifes_apt_tc_data_modeling/ato/ato_reader.py +++ b/ifes_apt_tc_data_modeling/ato/ato_reader.py @@ -24,7 +24,6 @@ import numpy as np from ifes_apt_tc_data_modeling.nexus.nx_field import NxField - from ifes_apt_tc_data_modeling.utils.mmapped_io import get_memory_mapped_data diff --git a/ifes_apt_tc_data_modeling/pyccapt/pyccapt_reader.py b/ifes_apt_tc_data_modeling/pyccapt/pyccapt_reader.py new file mode 100644 index 0000000..5b793d5 --- /dev/null +++ b/ifes_apt_tc_data_modeling/pyccapt/pyccapt_reader.py @@ -0,0 +1,213 @@ +# pyccapt HDF5 file format reader used by atom probe microscopists. +# +# Copyright The NOMAD Authors. +# +# This file is part of NOMAD. See https://nomad-lab.eu for further info. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# pylint: disable=no-member,duplicate-code + +import os + +import h5py + +import numpy as np + +import pandas as pd + +from ase.data import atomic_numbers, chemical_symbols +from ifes_apt_tc_data_modeling.nexus.nx_ion import NxIon +from ifes_apt_tc_data_modeling.nexus.nx_field import NxField +from ifes_apt_tc_data_modeling.utils.utils import \ + isotope_to_hash, isotope_vector_to_nuclid_list, MAX_NUMBER_OF_ATOMS_PER_ION + +# this implementation focuses on the following state of the pyccapt repository +# https://github.com/mmonajem/pyccapt/commit/e955beb4f2627befb8b4d26f2e74e4c52e00394e + +# during the course of an atom probe measurement and analysis with FAU/Erlangen's Oxcart instrument +# several HDF5 files are generated with essentially two software tools. One is pyccapt which has +# a control module, a calibration module (where the voltage/bowl calibration and reconstruction are performed), +# and a module/functionalities to document the ranging, i.e. the ion type identification made. +# The other software typically used by the FAU/Erlangen atom probe group is Atom Probe Toolbox; +# structured as a set of Matlab live scripts, this toolbox offers data analysis functionalities, +# results are stored via an HDF5 file + +# specific comments +# pyccapt/control +# an HDF5 file keeping relevant quantities + +# pyccapt/calibration +# unfortunately the generated HDF5 file contains no internal provenance information +# stating with which pyccapt version it was generated, therefore developers of pyccapt should +# rather write the content of the HDF5 file explicitly dset by dset e.g. 
using h5py instead +# of the pandas HDF5 dump convenience functionality +# of course pandas stores its own version but that is not conclusive enough to infer with +# which pyccapt version and most importantly from which other context the file was generated +# this is an aspect of the FAIR RDM principles which the pyccapt approach currently ignores + + +class ReadPyccaptControlFileFormat(): + """Read FAU/Erlangen pyccapt (controle module) HDF5 file format.""" + + def __init__(self, filename: str): + assert len(filename) > 2, "H5 file incorrect filename ending!" + assert filename.lower().endswith(".h5") or filename.lower().endswith(".hdf5"), \ + "HDF5 file incorrect file type!" + self.filename = filename + + self.filesize = os.path.getsize(self.filename) + self.number_of_events = None + self.version = "e955beb4f2627befb8b4d26f2e74e4c52e00394e" + + # check that the formatting matches that of an pyccapt control module output HDF5 file + with h5py.File(self.filename, "r") as h5r: + self.supported = 0 # voting-based + required_groups = ["apt", "dld", "tdc"] + for req_grpnm in required_groups: + if req_grpnm in h5r.keys(): + self.supported += 1 + if self.supported == 3: + print(f"{self.filename} is a supported pyccapt/control HDF5 file!") + else: + print(f"{self.filename} is not a supported pyccapt/control HDF5 file!") + return + + +class ReadPyccaptCalibrationFileFormat(): + """Read FAU/Erlangen pyccapt (calibration module) HDF5 file format.""" + + def __init__(self, filename: str): + assert len(filename) > 2, "H5 file incorrect filename ending!" + assert filename.lower().endswith(".h5") or filename.lower().endswith(".hdf5"), \ + "HDF5 file incorrect file type!" 
+ self.filename = filename + + self.filesize = os.path.getsize(self.filename) + self.number_of_events = None + self.version = "e955beb4f2627befb8b4d26f2e74e4c52e00394e" + self.df = None + + with h5py.File(self.filename, "r") as h5r: + self.supported = 0 # voting-based + required_entries = ["df", + "df/axis0", "df/axis1", + "df/block0_items", "df/block0_values", + "df/block1_items", "df/block1_values"] + for entry in required_entries: + if entry in h5r.keys(): + self.supported += 1 + if self.supported == 7: + print(f"{self.filename} is a supported pyccapt/calibration HDF5 file!") + else: + print(f"{self.filename} is not a supported pyccapt/calibration HDF5 file!") + return + + self.df = pd.read_hdf(self.filename) + self.number_of_events = np.shape(self.df)[0] + + def get_named_quantities(self, term: str): + if term in self.df.keys(): + return self.df[term] + return None + + def get_reconstructed_positions(self): + """Read xyz columns.""" + + xyz = NxField() + xyz.typed_value = np.zeros( + [self.number_of_events, 3], np.float32) + xyz.unit = "nm" + + dim = 0 + for quant in ["x (nm)", "y (nm)", "z (nm)"]: + xyz.typed_value[:, dim] = np.asarray(self.get_named_quantities(quant), np.float32) + dim += 1 + return xyz + + def get_mass_to_charge_state_ratio(self): + """Read (calibrated) mass-to-charge-state-ratio column.""" + + m_n = NxField() + m_n.typed_value = np.zeros( + [self.number_of_events, 1], np.float32) + m_n.unit = "Da" + + m_n.typed_value[:, 0] = np.asarray(self.get_named_quantities("mc_c (Da)"), np.float32) + return m_n + + +class ReadPyccaptRangingFileFormat(): + """Read FAU/Erlangen pyccapt (ranging module) HDF5 file format.""" + + def __init__(self, filename: str): + assert len(filename) > 2, "H5 file incorrect filename ending!" + assert filename.lower().endswith(".h5") or filename.lower().endswith(".hdf5"), \ + "HDF5 file incorrect file type!" 
+ self.filename = filename + + self.filesize = os.path.getsize(self.filename) + self.number_of_events = None + self.version = "e955beb4f2627befb8b4d26f2e74e4c52e00394e" + self.df = None + + with h5py.File(self.filename, "r") as h5r: + self.supported = 0 # voting-based + required_entries = ["df", + "df/axis0", "df/axis1", + "df/block0_items", "df/block0_values", + "df/block1_items", "df/block1_values", + "df/block2_items", "df/block2_values"] + for entry in required_entries: + if entry in h5r.keys(): + self.supported += 1 + if self.supported == 9: + print(f"{self.filename} is a supported pyccapt/ranging HDF5 file!") + else: + print(f"{self.filename} is not a supported pyccapt/ranging HDF5 file!") + return + + self.df = pd.read_hdf(self.filename) + self.rng = {} + self.rng["molecular_ions"] = [] + print(np.shape(self.df)[0]) + for idx in np.arange(0, np.shape(self.df)[0]): + if isinstance(self.df.iloc[idx, 6], str) is True: + if self.df.iloc[idx, 6] == "unranged": + continue + + elements = self.df.iloc[idx, 6] + complexs = self.df.iloc[idx, 7] + isotopes = self.df.iloc[idx, 8] + # assertions + ivec = np.zeros((MAX_NUMBER_OF_ATOMS_PER_ION,), np.uint16) + hashvector = [] + for idxj in np.arange(0, len(elements)): + symbol = elements[idxj] + if symbol in chemical_symbols and symbol != "X": + proton_number = atomic_numbers[symbol] + neutron_number = isotopes[idxj] - proton_number + for mult in np.arange(0, complexs[idxj]): + hashvector.append(isotope_to_hash(proton_number, neutron_number)) + ivec[0:len(hashvector)] = np.sort(np.asarray(hashvector, np.uint16), kind="stable")[::-1] + + m_ion = NxIon() + m_ion.isotope_vector.typed_value = ivec + m_ion.nuclid_list.typed_value = isotope_vector_to_nuclid_list(ivec) + m_ion.charge_state.typed_value = np.int8(self.df.iloc[idx, 9]) + m_ion.add_range(self.df.iloc[idx, 3], self.df.iloc[idx, 4]) + m_ion.update_human_readable_name() + # m_ion.report() + self.rng["molecular_ions"].append(m_ion) + print(f"{self.filename} parsed 
successfully") diff --git a/pyproject.toml b/pyproject.toml index 1b00af1..e3c0817 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,8 @@ classifiers = [ dependencies = [ "h5py>=3.6.0", "numpy>=1.21.2", - "pandas>=1.3.2", + "pandas", + "tables", "ase>=3.19.0", "radioactivedecay>=0.4.16" ] diff --git a/tests/TestsForDevelopers.ipynb b/tests/TestsForDevelopers.ipynb index b7e0e19..67fb6ee 100644 --- a/tests/TestsForDevelopers.ipynb +++ b/tests/TestsForDevelopers.ipynb @@ -30,6 +30,7 @@ "import os\n", "import numpy as np\n", "import h5py\n", + "import pandas as pd\n", "from jupyterlab_h5web import H5Web\n", "from ifes_apt_tc_data_modeling.utils.utils import create_isotope_vector, \\\n", " isotope_vector_to_dict_keyword, isotope_vector_to_human_readable_name, \\\n", @@ -41,6 +42,76 @@ "from ase.data import atomic_numbers, atomic_masses, chemical_symbols" ] }, + { + "cell_type": "markdown", + "id": "e680a9d0-4005-40b0-ab45-a8f5fb215273", + "metadata": {}, + "source": [ + "## FAU/Erlangen pyccapt control/calibration/ranging module" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2378059c-dae4-46e8-88b0-9fd1ac8a5a70", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "prefix = f\"{os.getcwd()}/../../../../../paper_paper_paper/joss_nomad_apt/bb_analysis/data\"\n", + "fnm = [\"1748_Al.h5\",\n", + " \"1748_Al_range_.h5\",\n", + " \"1748_Nov-14-2023_13-31_Al.h5\"]\n", + "# df = pd.read_hdf(f\"{prefix}/ger_erlangen_pyccapt_format/{fnm[1]}\")\n", + "# H5Web(f\"{prefix}/ger_erlangen_pyccapt_format/{fnm[1]}\")\n", + "# df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "319c3c28-8d4d-46a5-bb73-7a8396d84020", + "metadata": {}, + "outputs": [], + "source": [ + "from ifes_apt_tc_data_modeling.pyccapt.pyccapt_reader import ReadPyccaptControlFileFormat, ReadPyccaptCalibrationFileFormat, ReadPyccaptRangingFileFormat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"ac6b2e02-371b-47ff-9c68-b23b6e1b4b83", + "metadata": {}, + "outputs": [], + "source": [ + "pyc_r = ReadPyccaptRangingFileFormat(f\"{prefix}/ger_erlangen_pyccapt_format/{fnm[1]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7178b8b-51f7-4270-acbe-77844692477b", + "metadata": {}, + "outputs": [], + "source": [ + "pyc_m = ReadPyccaptControlFileFormat(f\"{prefix}/ger_erlangen_pyccapt_format/{fnm[2]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1f1c4d8-75fe-4884-b81a-338a6c5eab94", + "metadata": {}, + "outputs": [], + "source": [ + "pyc_c = ReadPyccaptCalibrationFileFormat(f\"{prefix}/ger_erlangen_pyccapt_format/{fnm[0]}\")\n", + "xyz = pyc_c.get_reconstructed_positions()\n", + "print(xyz.typed_value)\n", + "m_q = pyc_c.get_mass_to_charge_state_ratio()\n", + "print(m_q.typed_value)" + ] + }, { "cell_type": "markdown", "id": "52294143-78c7-47bf-b39e-9e40eec3999d",