From b1064bc7f41acd31130634972afc3d4d17e3f47c Mon Sep 17 00:00:00 2001 From: sanbrock Date: Mon, 28 Oct 2024 22:33:28 +0100 Subject: [PATCH 1/9] carefull name generation for new BasicELN archive --- src/pynxtools/nomad/schema.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/pynxtools/nomad/schema.py b/src/pynxtools/nomad/schema.py index 98dd86f85..9aa755a3e 100644 --- a/src/pynxtools/nomad/schema.py +++ b/src/pynxtools/nomad/schema.py @@ -16,6 +16,7 @@ # limitations under the License. # +import hashlib import json import os import os.path @@ -905,8 +906,10 @@ def get_entry_reference(archive, f_name): EntityReference.normalize(self, archive, logger) if not self.reference: logger.info(f"{self.lab_id} to be created") - - f_name = f"{current_cls.__name__}_{self.lab_id}.archive.json" + f_name = re.split("([0-9a-zA-Z.]+)", self.lab_id)[1] + if len(f_name) != len(self.lab_id): + f_name = f_name + hashlib.md5(self.lab_id.encode()).hexdigest() + f_name = f"{current_cls.__name__}_{f_name}.archive.json" create_Entity(self.lab_id, archive, f_name) self.reference = get_entry_reference(archive, f_name) logger.info(f"{self.reference} - referenced directly") From 7a17d13385ed499eb33bdece88d58c76de908f79 Mon Sep 17 00:00:00 2001 From: sanbrock Date: Tue, 29 Oct 2024 02:16:04 +0100 Subject: [PATCH 2/9] processing NXroot, too --- src/pynxtools/nexus/nexus.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/pynxtools/nexus/nexus.py b/src/pynxtools/nexus/nexus.py index 679d4f94d..8188ac551 100644 --- a/src/pynxtools/nexus/nexus.py +++ b/src/pynxtools/nexus/nexus.py @@ -85,6 +85,11 @@ def decode_if_string( def get_nxdl_entry(hdf_info): """Get the nxdl application definition for an HDF5 node""" entry = hdf_info + if ( + "NX_class" in entry["hdf_node"].attrs.keys() + and decode_if_string(entry["hdf_node"].attrs["NX_class"]) == "NXroot" + ): + return "NXroot" while ( isinstance(entry["hdf_node"], h5py.Dataset) or "NX_class" not in entry["hdf_node"].attrs.keys() @@ -97,7 +102,7 @@ def get_nxdl_entry(hdf_info): nxdef = entry["hdf_node"]["definition"][()] return nxdef.decode() except KeyError: # 'NO Definition referenced' - return "NXentry" + return "NXroot" def get_nx_class_path(hdf_info): @@ -398,6 +403,8 @@ def get_inherited_hdf_nodes( path = hdf_path for pind in range(len(path)): + if len(path) == 1 and path[0] == "": + return (["NXroot"], ["/"], elist) hdf_info2 = [hdf_path, hdf_node, hdf_class_path] [ hdf_path, @@ -803,9 +810,7 @@ def not_yet_visited(self, root, name): def full_visit(self, root, hdf_node, name, func): """visiting recursivly all children, but avoiding endless cycles""" - # print(name) - if len(name) > 0: - func(name, hdf_node) + func(name, hdf_node) if isinstance(hdf_node, h5py.Group): for ch_name, child in hdf_node.items(): full_name = ch_name if len(name) == 0 else name + "/" + ch_name From 96477620e4a3ffa65cf20161ea64889643539f19 Mon Sep 17 00:00:00 2001 From: sanbrock Date: Tue, 29 Oct 2024 02:29:54 +0100 Subject: [PATCH 3/9] fix nexus test file --- tests/data/nexus/Ref_nexus_test.log | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/data/nexus/Ref_nexus_test.log b/tests/data/nexus/Ref_nexus_test.log index bd8b70692..d37370392 100644 --- a/tests/data/nexus/Ref_nexus_test.log +++ b/tests/data/nexus/Ref_nexus_test.log @@ -1,3 +1,32 @@ +DEBUG - ===== GROUP (// [NO NXentry found::]): +DEBUG - classpath: None +DEBUG - NOT IN SCHEMA +DEBUG - +DEBUG - ===== ATTRS (//@HDF5_Version) +DEBUG - value: 1.10.5 +DEBUG - classpath: None +DEBUG - NOT IN SCHEMA +DEBUG - +DEBUG - ===== ATTRS (//@file_name) +DEBUG - value: /home/tommaso/Desktop/NeXus/Test/201805_WSe2_arpes.nxs +DEBUG - classpath: None +DEBUG - NOT IN SCHEMA +DEBUG - +DEBUG - ===== ATTRS (//@file_time) +DEBUG - value: 2020-06-04T19:19:48.464472 +DEBUG - classpath: None +DEBUG - NOT IN SCHEMA +DEBUG - +DEBUG - ===== ATTRS (//@h5py_version) +DEBUG - value: 2.10.0 +DEBUG - classpath: None +DEBUG - NOT IN SCHEMA +DEBUG - +DEBUG - ===== ATTRS (//@nexusformat_version) +DEBUG - value: 0.5.2 +DEBUG - classpath: None +DEBUG - NOT IN SCHEMA +DEBUG - DEBUG - ===== GROUP (//entry [NXarpes::/NXentry]): DEBUG - classpath: ['NXentry'] DEBUG - classes: From 443c2092d1ef30abcaad4105ca2152abebfdd088 Mon Sep 17 00:00:00 2001 From: sanbrock Date: Tue, 29 Oct 2024 09:56:40 +0100 Subject: [PATCH 4/9] populate NXroot and its attributes --- src/pynxtools/nexus/nexus.py | 2 +- src/pynxtools/nomad/parser.py | 20 +++++++++++++------- src/pynxtools/nomad/schema.py | 4 +++- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/pynxtools/nexus/nexus.py b/src/pynxtools/nexus/nexus.py index 8188ac551..177074df4 100644 --- a/src/pynxtools/nexus/nexus.py +++ b/src/pynxtools/nexus/nexus.py @@ -404,7 +404,7 @@ def get_inherited_hdf_nodes( for pind in range(len(path)): if len(path) == 1 and path[0] == "": - return (["NXroot"], ["/"], elist) + return ([""], ["/"], elist) hdf_info2 = [hdf_path, hdf_node, hdf_class_path] [ hdf_path, diff --git a/src/pynxtools/nomad/parser.py b/src/pynxtools/nomad/parser.py index ea3dd3c3e..67a533381 100644 --- a/src/pynxtools/nomad/parser.py +++ b/src/pynxtools/nomad/parser.py @@ -159,15 +159,21 @@ def _collect_class(self, current: MSection): self._sample_class_refs[class_name].append(current) def _populate_data( - self, depth: int, nx_path: list, nx_def: str, hdf_node, current: MSection + self, depth: int, nx_path: list, nx_def: str, hdf_node, current: MSection, attr ): """ Populate attributes and fields """ - if depth < len(nx_path): + if attr: # it is an attribute of either field or group - nx_attr = nx_path[depth] - nx_parent: ET.Element = nx_path[depth - 1] + if nx_path[0] == "/": + nx_attr = nx_path[1] + nx_parent = nx_attr.getparent() + nx_root = True + else: + nx_root = False + nx_attr = nx_path[depth] + nx_parent: ET.Element = nx_path[depth - 1] if isinstance(nx_attr, str): if nx_attr != "units": @@ -191,7 +197,7 @@ def _populate_data( current = _to_section(attr_name, nx_def, nx_attr, current) try: - if nx_parent.tag.endswith("group"): + if nx_root or nx_parent.tag.endswith("group"): current.m_set_section_attribute(attr_name, attr_value) else: parent_html_name = nx_path[-2].get("name") @@ -323,7 +329,7 @@ def __nexus_populate(self, params: dict, attr=None): # pylint: disable=W0613 if nx_def is not None: nx_def = rename_nx_for_nomad(nx_def) - if nx_path is None: + if nx_path is None or nx_path == "/": return current: MSection = _to_section(None, nx_def, None, self.nx_root) @@ -340,7 +346,7 @@ def __nexus_populate(self, params: dict, attr=None): # pylint: disable=W0613 if nx_node.tag.endswith("group"): current.m_set_section_attribute("m_nx_data_path", current_hdf_path) current.m_set_section_attribute("m_nx_data_file", self.nxs_fname) - self._populate_data(depth, nx_path, nx_def, hdf_node, current) + self._populate_data(depth, nx_path, nx_def, hdf_node, current, attr) def get_sub_element_names(self, elem: MSection): return elem.m_def.all_aliases.keys() diff --git a/src/pynxtools/nomad/schema.py b/src/pynxtools/nomad/schema.py index 9aa755a3e..43a6639c8 100644 --- a/src/pynxtools/nomad/schema.py +++ b/src/pynxtools/nomad/schema.py @@ -745,7 +745,9 @@ def __create_package_from_nxdl_directories(nexus_section: Section) -> Package: for section in sections: package.section_definitions.append(section) - if section.nx_category == "application": + if section.nx_category == "application" or ( + section.nx_category == "base" and section.nx_name == "NXroot" + ): nexus_section.sub_sections.append( SubSection(section_def=section, name=section.name) ) From 1f0ff23ce351586a3d009d58091b271a461edba5 Mon Sep 17 00:00:00 2001 From: sanbrock Date: Tue, 29 Oct 2024 10:11:45 +0100 Subject: [PATCH 5/9] linting --- src/pynxtools/nomad/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pynxtools/nomad/parser.py b/src/pynxtools/nomad/parser.py index 67a533381..cb56de77f 100644 --- a/src/pynxtools/nomad/parser.py +++ b/src/pynxtools/nomad/parser.py @@ -166,14 +166,14 @@ def _populate_data( """ if attr: # it is an attribute of either field or group + nx_root = False if nx_path[0] == "/": nx_attr = nx_path[1] nx_parent = nx_attr.getparent() nx_root = True else: - nx_root = False nx_attr = nx_path[depth] - nx_parent: ET.Element = nx_path[depth - 1] + nx_parent = nx_path[depth - 1] if isinstance(nx_attr, str): if nx_attr != "units": From 1c8f05cf7e356a94890b65c370b885f3b634ae0e Mon Sep 17 00:00:00 2001 From: Lukas Pielsticker <50139597+lukaspie@users.noreply.github.com> Date: Thu, 31 Oct 2024 10:32:50 +0100 Subject: [PATCH 6/9] skip some NXroot attributes in generic test framework --- src/pynxtools/testing/nexus_conversion.py | 43 +++++++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/src/pynxtools/testing/nexus_conversion.py b/src/pynxtools/testing/nexus_conversion.py index ca732c413..cc5d62ea1 100644 --- a/src/pynxtools/testing/nexus_conversion.py +++ b/src/pynxtools/testing/nexus_conversion.py @@ -14,7 +14,10 @@ from pynxtools.dataconverter.convert import get_reader, transfer_data_into_template -from pynxtools.dataconverter.helpers import get_nxdl_root_and_path +from pynxtools.dataconverter.helpers import ( + get_nxdl_root_and_path, + add_default_root_attributes, +) from pynxtools.dataconverter.validation import validate_dict_against from pynxtools.dataconverter.writer import Writer from pynxtools.nexus.nexus import HandleNexus @@ -115,6 +118,9 @@ def convert_to_nexus( ) assert self.caplog.text == "" + add_default_root_attributes( + data=read_data, filename=os.path.basename(self.created_nexus) + ) Writer(read_data, nxdl_file, self.created_nexus).write() if NOMAD_AVAILABLE: @@ -133,7 +139,19 @@ def check_reproducibility_of_nexus(self): IGNORE_LINES = [ "DEBUG - value: v", "DEBUG - value: https://github.com/FAIRmat-NFDI/nexus_definitions/blob/", + "DEBUG - ===== GROUP (// [NXroot::]):", ] + SECTION_IGNORE = { + "ATTRS (//@file_name)": ["DEBUG - value:"], + "ATTRS (//@file_time)": ["DEBUG - value:"], + "ATTRS (//@file_update_time)": ["DEBUG - value:"], + "ATTRS (//@h5py_version)": ["DEBUG - value:"], + } + + section = None + section_ignore_lines = [] + section_separator = "DEBUG - ===== " + ref_log = get_log_file(self.ref_nexus_file, "ref_nexus.log", self.tmp_path) gen_log = get_log_file(self.created_nexus, "gen_nexus.log", self.tmp_path) with open(gen_log, "r", encoding="utf-8") as gen, open( @@ -142,14 +160,33 @@ def check_reproducibility_of_nexus(self): gen_lines = gen.readlines() ref_lines = ref.readlines() if len(gen_lines) != len(ref_lines): - assert False, "Log files are different" + assert False, ( + f"Log files are different: mismatched line counts. " + f"Generated file has {len(gen_lines)} lines, " + f"while reference file has {len(ref_lines)} lines." + ) for ind, (gen_l, ref_l) in enumerate(zip(gen_lines, ref_lines)): + skip_it = False + if gen_l.startswith(section_separator) and ref_l.startswith( + section_separator + ): + section = gen_l.rsplit(section_separator)[-1].strip() + section_ignore_lines = SECTION_IGNORE.get(section, []) if gen_l != ref_l: # skip ignored lines (mainly version conflicts) for ignore_line in IGNORE_LINES: if gen_l.startswith(ignore_line) and ref_l.startswith(ignore_line): + skip_it = True break - else: + if not skip_it: + # skip ignored lines for this section + for ignore_line in section_ignore_lines: + if gen_l.startswith(ignore_line) and ref_l.startswith( + ignore_line + ): + skip_it = True + break + if not skip_it: assert False, ( f"Log files are different at line {ind}" f" generated: {gen_l} \n referenced : {ref_l}" From c7d58a8c154a7abbca18dd6b28b8dfaae7490a28 Mon Sep 17 00:00:00 2001 From: Lukas Pielsticker <50139597+lukaspie@users.noreply.github.com> Date: Thu, 31 Oct 2024 10:39:21 +0100 Subject: [PATCH 7/9] ignore HDF5 version attribute in NXroot --- src/pynxtools/testing/nexus_conversion.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pynxtools/testing/nexus_conversion.py b/src/pynxtools/testing/nexus_conversion.py index cc5d62ea1..9a3e40fb7 100644 --- a/src/pynxtools/testing/nexus_conversion.py +++ b/src/pynxtools/testing/nexus_conversion.py @@ -142,6 +142,7 @@ def check_reproducibility_of_nexus(self): "DEBUG - ===== GROUP (// [NXroot::]):", ] SECTION_IGNORE = { + "ATTRS (//@HDF5_version)": ["DEBUG - value:"], "ATTRS (//@file_name)": ["DEBUG - value:"], "ATTRS (//@file_time)": ["DEBUG - value:"], "ATTRS (//@file_update_time)": ["DEBUG - value:"], From 3a9d9e32a6d4f259e7bc9ebf2e9a3b35e890d002 Mon Sep 17 00:00:00 2001 From: Lukas Pielsticker <50139597+lukaspie@users.noreply.github.com> Date: Thu, 31 Oct 2024 12:16:16 +0100 Subject: [PATCH 8/9] clean up ref log comparison --- src/pynxtools/testing/nexus_conversion.py | 87 ++++++++++++----------- 1 file changed, 47 insertions(+), 40 deletions(-) diff --git a/src/pynxtools/testing/nexus_conversion.py b/src/pynxtools/testing/nexus_conversion.py index 9a3e40fb7..a55b8c378 100644 --- a/src/pynxtools/testing/nexus_conversion.py +++ b/src/pynxtools/testing/nexus_conversion.py @@ -148,47 +148,54 @@ def check_reproducibility_of_nexus(self): "ATTRS (//@file_update_time)": ["DEBUG - value:"], "ATTRS (//@h5py_version)": ["DEBUG - value:"], } + SECTION_SEPARATOR = "DEBUG - ===== " - section = None - section_ignore_lines = [] - section_separator = "DEBUG - ===== " - - ref_log = get_log_file(self.ref_nexus_file, "ref_nexus.log", self.tmp_path) - gen_log = get_log_file(self.created_nexus, "gen_nexus.log", self.tmp_path) - with open(gen_log, "r", encoding="utf-8") as gen, open( - ref_log, "r", encoding="utf-8" - ) as ref: - gen_lines = gen.readlines() - ref_lines = ref.readlines() - if len(gen_lines) != len(ref_lines): - assert False, ( - f"Log files are different: mismatched line counts. " - f"Generated file has {len(gen_lines)} lines, " - f"while reference file has {len(ref_lines)} lines." + def should_skip_line(gen_l: str, ref_l: str, ignore_lines: list[str]) -> bool: + """Check if both lines start with any ignored prefix.""" + return any( + gen_l.startswith(ignore) and ref_l.startswith(ignore) + for ignore in ignore_lines ) - for ind, (gen_l, ref_l) in enumerate(zip(gen_lines, ref_lines)): - skip_it = False - if gen_l.startswith(section_separator) and ref_l.startswith( - section_separator - ): - section = gen_l.rsplit(section_separator)[-1].strip() - section_ignore_lines = SECTION_IGNORE.get(section, []) - if gen_l != ref_l: - # skip ignored lines (mainly version conflicts) - for ignore_line in IGNORE_LINES: - if gen_l.startswith(ignore_line) and ref_l.startswith(ignore_line): - skip_it = True - break - if not skip_it: - # skip ignored lines for this section - for ignore_line in section_ignore_lines: - if gen_l.startswith(ignore_line) and ref_l.startswith( - ignore_line - ): - skip_it = True - break - if not skip_it: + + def load_logs( + gen_log_path: str, ref_log_path: str + ) -> tuple[list[str], list[str]]: + """Load log files and return their contents as lists of lines.""" + with open(gen_log_path, "r", encoding="utf-8") as gen, open( + ref_log_path, "r", encoding="utf-8" + ) as ref: + return gen.readlines(), ref.readlines() + + def compare_logs(gen_lines: list[str], ref_lines: list[str]) -> None: + """Compare log lines, ignoring specific differences.""" + if len(gen_lines) != len(ref_lines): + assert False, ( + f"Log files are different: mismatched line counts. " + f"Generated file has {len(gen_lines)} lines, " + f"while reference file has {len(ref_lines)} lines." + ) + + section_ignore_lines = [] + section = None + for ind, (gen_l, ref_l) in enumerate(zip(gen_lines, ref_lines)): + if gen_l.startswith(SECTION_SEPARATOR) and ref_l.startswith( + SECTION_SEPARATOR + ): + section = gen_l.rsplit(SECTION_SEPARATOR)[-1].strip() + section_ignore_lines = SECTION_IGNORE.get(section, []) + + # Compare lines if not in ignore list + if gen_l != ref_l and not should_skip_line( + gen_l, ref_l, IGNORE_LINES + section_ignore_lines + ): assert False, ( - f"Log files are different at line {ind}" - f" generated: {gen_l} \n referenced : {ref_l}" + f"Log files are different at line {ind}\n" + f"generated: {gen_l}\nreferenced: {ref_l}" ) + + ref_log_path = get_log_file(self.ref_nexus_file, "ref_nexus.log", self.tmp_path) + gen_log_path = get_log_file(self.created_nexus, "gen_nexus.log", self.tmp_path) + gen_lines, ref_lines = load_logs(gen_log_path, ref_log_path) + + # Compare logs + compare_logs(gen_lines, ref_lines) From 4af0896396b6778a8f9497d61f10f7c13707c008 Mon Sep 17 00:00:00 2001 From: Lukas Pielsticker <50139597+lukaspie@users.noreply.github.com> Date: Thu, 31 Oct 2024 12:29:35 +0100 Subject: [PATCH 9/9] check against NOMAD develop branch again --- .github/workflows/pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 1a14e2804..034091f82 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -36,7 +36,7 @@ jobs: - name: Install nomad if: "${{ matrix.python_version != '3.8' && matrix.python_version != '3.12'}}" run: | - uv pip install nomad-lab@git+https://gitlab.mpcdf.mpg.de/nomad-lab/nomad-FAIR.git@Sprint_Nomad_BaseSection + uv pip install nomad-lab@git+https://gitlab.mpcdf.mpg.de/nomad-lab/nomad-FAIR.git - name: Install pynx run: | uv pip install ".[dev]"