From 5f4abdf9bdc297f8fe326f802a56ea3f1483ca3d Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 24 Oct 2023 15:28:51 -0400 Subject: [PATCH 1/4] add memmap_arrays option --- asdf/_asdf.py | 24 +++++++++++++++++++++++- asdf/_block/external.py | 2 +- asdf/_tests/_helpers.py | 6 +++--- asdf/testing/helpers.py | 2 +- 4 files changed, 28 insertions(+), 6 deletions(-) diff --git a/asdf/_asdf.py b/asdf/_asdf.py index 1691f2783..2e9118d16 100644 --- a/asdf/_asdf.py +++ b/asdf/_asdf.py @@ -74,6 +74,7 @@ def __init__( ignore_unrecognized_tag=False, ignore_implicit_conversion=False, copy_arrays=False, + memmap_arrays=NotSet, lazy_load=True, custom_schema=None, ): @@ -116,6 +117,14 @@ def __init__( When `False`, when reading files, attempt to memmap underlying data arrays when possible. + memmap_arrays : bool, optional + When `True`, when reading files, attempt to memmap underlying data + arrays when possible. When set, this argument will override + ``copy_arrays``. When not set, the ``copy_arrays`` will determine + if arrays are memory mapped or copied. ``copy_arrays`` will be + deprecated and the default will change in an upcoming asdf version + which by default will not memory map arrays. + lazy_load : bool, optional When `True` and the underlying file handle is seekable, data arrays will only be loaded lazily: i.e. when they are accessed @@ -168,6 +177,9 @@ def __init__( self._fd = None self._closed = False self._external_asdf_by_uri = {} + # if memmap_arrays is set, it overrides copy_arrays + if memmap_arrays is not NotSet: + copy_arrays = not memmap_arrays self._blocks = BlockManager(uri=uri, lazy_load=lazy_load, memmap=not copy_arrays) # this message is passed into find_references to only warn if # a reference was found @@ -1496,6 +1508,7 @@ def open_asdf( ignore_unrecognized_tag=False, _force_raw_types=False, copy_arrays=False, + memmap_arrays=NotSet, lazy_load=True, custom_schema=None, strict_extension_check=False, @@ -1539,6 +1552,14 @@ def open_asdf( When `False`, when reading files, attempt to memmap underlying data arrays when possible. + memmap_arrays : bool, optional + When `True`, when reading files, attempt to memmap underlying data + arrays when possible. When set, this argument will override + ``copy_arrays``. When not set, the ``copy_arrays`` will determine + if arrays are memory mapped or copied. ``copy_arrays`` will be + deprecated and the default will change in an upcoming asdf version + which by default will not memory map arrays. + lazy_load : bool, optional When `True` and the underlying file handle is seekable, data arrays will only be loaded lazily: i.e. when they are accessed @@ -1546,7 +1567,7 @@ def open_asdf( open during the lifetime of the tree. Setting to False causes all data arrays to be loaded up front, which means that they can be accessed even after the underlying file is closed. - Note: even if ``lazy_load`` is `False`, ``copy_arrays`` is still taken + Note: even if ``lazy_load`` is `False`, ``memmap_arrays`` is still taken into account. custom_schema : str, optional @@ -1590,6 +1611,7 @@ def open_asdf( ignore_version_mismatch=ignore_version_mismatch, ignore_unrecognized_tag=ignore_unrecognized_tag, copy_arrays=copy_arrays, + memmap_arrays=memmap_arrays, lazy_load=lazy_load, custom_schema=custom_schema, ) diff --git a/asdf/_block/external.py b/asdf/_block/external.py index 734e9a773..e91342204 100644 --- a/asdf/_block/external.py +++ b/asdf/_block/external.py @@ -36,7 +36,7 @@ def load(self, base_uri, uri, memmap=False, validate_checksums=False): from asdf import open as asdf_open with asdf_open( - resolved_uri, "r", lazy_load=False, copy_arrays=True, validate_checksums=validate_checksums + resolved_uri, "r", lazy_load=False, memmap_arrays=False, validate_checksums=validate_checksums ) as af: blk = af._blocks.blocks[0] if memmap and blk.header["compression"] == b"\0\0\0\0": diff --git a/asdf/_tests/_helpers.py b/asdf/_tests/_helpers.py index 693378f68..338ff13ee 100644 --- a/asdf/_tests/_helpers.py +++ b/asdf/_tests/_helpers.py @@ -267,7 +267,7 @@ def _assert_roundtrip_tree( with io.BytesIO() as buff: AsdfFile(tree, extensions=extensions, **init_options).write_to(buff, **write_options) buff.seek(0) - ff = asdf.open(buff, extensions=extensions, copy_arrays=True, lazy_load=False) + ff = asdf.open(buff, extensions=extensions, memmap_arrays=False, lazy_load=False) # Ensure that all the blocks are loaded for block in ff._blocks.blocks: assert block._data is not None and not callable(block._data) @@ -276,9 +276,9 @@ def _assert_roundtrip_tree( if asdf_check_func: asdf_check_func(ff) - # Now repeat with copy_arrays=False and a real file to test mmap() + # Now repeat with memmap_arrays=True and a real file to test mmap() AsdfFile(tree, extensions=extensions, **init_options).write_to(fname, **write_options) - with asdf.open(fname, mode="rw", extensions=extensions, copy_arrays=False, lazy_load=False) as ff: + with asdf.open(fname, mode="rw", extensions=extensions, memmap_arrays=True, lazy_load=False) as ff: for block in ff._blocks.blocks: assert block._data is not None and not callable(block._data) assert_tree_match(tree, ff.tree, ff, funcname=tree_match_func) diff --git a/asdf/testing/helpers.py b/asdf/testing/helpers.py index 0b70b2a32..d37aead4f 100644 --- a/asdf/testing/helpers.py +++ b/asdf/testing/helpers.py @@ -46,7 +46,7 @@ def roundtrip_object(obj, version=None): af.write_to(buff) buff.seek(0) - with asdf.open(buff, lazy_load=False, copy_arrays=True) as af: + with asdf.open(buff, lazy_load=False, memmap_arrays=False) as af: return af["obj"] From 0084f7b5b4fb07ec0af3a733cd58b373f0136dba Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 24 Oct 2023 16:13:10 -0400 Subject: [PATCH 2/4] test memmap_arrays copy_arrays interaction --- asdf/_tests/test_array_blocks.py | 53 ++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py index f2c2b626b..0b23fcb33 100644 --- a/asdf/_tests/test_array_blocks.py +++ b/asdf/_tests/test_array_blocks.py @@ -800,28 +800,43 @@ def test_unordered_block_index(): assert ff._blocks.blocks[1].loaded -def test_open_no_memmap(tmp_path): - tmpfile = os.path.join(str(tmp_path), "random.asdf") - +@pytest.fixture(scope="module") +def filename_with_array(tmp_path_factory): + fn = tmp_path_factory.mktemp("data") / "filename_with_array.asdf" tree = {"array": np.random.random((20, 20))} - ff = asdf.AsdfFile(tree) - ff.write_to(tmpfile) - - # Test that by default we use memmapped arrays when possible - with asdf.open(tmpfile) as af: - array = af.tree["array"] - # Make sure to access the block so that it gets loaded - assert af._blocks.blocks[0].memmap - assert isinstance(array.base, np.memmap) - - # Test that if we ask for copy, we do not get memmapped arrays - with asdf.open(tmpfile, copy_arrays=True) as af: + ff.write_to(fn) + return fn + + +@pytest.mark.parametrize( + "open_kwargs,should_memmap", + [ + ({}, True), + ({"copy_arrays": True}, False), + ({"copy_arrays": False}, True), + ({"memmap_arrays": True}, True), + ({"memmap_arrays": False}, False), + ({"copy_arrays": True, "memmap_arrays": True}, True), + ({"copy_arrays": False, "memmap_arrays": True}, True), + ({"copy_arrays": True, "memmap_arrays": False}, False), + ({"copy_arrays": False, "memmap_arrays": False}, False), + ], +) +def test_open_no_memmap(filename_with_array, open_kwargs, should_memmap): + """ + Test that asdf.open does not (or does) return memmaps for arrays + depending on a number of arguments including: + default (no kwargs) + copy_arrays + memmap_arrays (overwrites copy_arrays) + """ + with asdf.open(filename_with_array, lazy_load=False, **open_kwargs) as af: array = af.tree["array"] - assert not af._blocks.blocks[0].memmap - # We can't just check for isinstance(..., np.array) since this will - # be true for np.memmap as well - assert not isinstance(array.base, np.memmap) + if should_memmap: + assert isinstance(array.base, np.memmap) + else: + assert not isinstance(array.base, np.memmap) def test_add_block_before_fully_loaded(tmp_path): From 102615804c1330f1322d4b455408f7fbb4156f32 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 24 Oct 2023 16:24:42 -0400 Subject: [PATCH 3/4] add changelog entry --- CHANGES.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 4cc160fd9..3d0e7e3f8 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -15,6 +15,12 @@ The ASDF Standard is at v1.6.0 - Allow views of memmapped arrays to keep the backing mmap open to avoid segfaults [#1668] +- Introduce ``memmap_arrays`` argument to ``asdf.open`` that + overrides ``copy_arrays`` with documentation that describes + that the default for ``memmap_arrays`` when ``copy_arrays`` + is removed in an upcoming asdf release will be ``False`` and + asdf will no longer by-default memory map arrays. [#1667] + 3.0.1 (2023-10-30) ------------------ From 1ee4b999ae9c391b2286318f51e86794e3ee23b5 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 11 Dec 2023 10:13:53 -0500 Subject: [PATCH 4/4] rename memmap_arrays to memmap --- CHANGES.rst | 4 ++-- asdf/_asdf.py | 18 +++++++++--------- asdf/_block/external.py | 2 +- asdf/_tests/_helpers.py | 6 +++--- asdf/_tests/test_array_blocks.py | 14 +++++++------- asdf/testing/helpers.py | 2 +- 6 files changed, 23 insertions(+), 23 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 3d0e7e3f8..62a76753b 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -15,9 +15,9 @@ The ASDF Standard is at v1.6.0 - Allow views of memmapped arrays to keep the backing mmap open to avoid segfaults [#1668] -- Introduce ``memmap_arrays`` argument to ``asdf.open`` that +- Introduce ``memmap`` argument to ``asdf.open`` that overrides ``copy_arrays`` with documentation that describes - that the default for ``memmap_arrays`` when ``copy_arrays`` + that the default for ``memmap`` when ``copy_arrays`` is removed in an upcoming asdf release will be ``False`` and asdf will no longer by-default memory map arrays. [#1667] diff --git a/asdf/_asdf.py b/asdf/_asdf.py index 2e9118d16..69da27f9e 100644 --- a/asdf/_asdf.py +++ b/asdf/_asdf.py @@ -74,7 +74,7 @@ def __init__( ignore_unrecognized_tag=False, ignore_implicit_conversion=False, copy_arrays=False, - memmap_arrays=NotSet, + memmap=NotSet, lazy_load=True, custom_schema=None, ): @@ -117,7 +117,7 @@ def __init__( When `False`, when reading files, attempt to memmap underlying data arrays when possible. - memmap_arrays : bool, optional + memmap : bool, optional When `True`, when reading files, attempt to memmap underlying data arrays when possible. When set, this argument will override ``copy_arrays``. When not set, the ``copy_arrays`` will determine @@ -177,9 +177,9 @@ def __init__( self._fd = None self._closed = False self._external_asdf_by_uri = {} - # if memmap_arrays is set, it overrides copy_arrays - if memmap_arrays is not NotSet: - copy_arrays = not memmap_arrays + # if memmap is set, it overrides copy_arrays + if memmap is not NotSet: + copy_arrays = not memmap self._blocks = BlockManager(uri=uri, lazy_load=lazy_load, memmap=not copy_arrays) # this message is passed into find_references to only warn if # a reference was found @@ -1508,7 +1508,7 @@ def open_asdf( ignore_unrecognized_tag=False, _force_raw_types=False, copy_arrays=False, - memmap_arrays=NotSet, + memmap=NotSet, lazy_load=True, custom_schema=None, strict_extension_check=False, @@ -1552,7 +1552,7 @@ def open_asdf( When `False`, when reading files, attempt to memmap underlying data arrays when possible. - memmap_arrays : bool, optional + memmap : bool, optional When `True`, when reading files, attempt to memmap underlying data arrays when possible. When set, this argument will override ``copy_arrays``. When not set, the ``copy_arrays`` will determine @@ -1567,7 +1567,7 @@ def open_asdf( open during the lifetime of the tree. Setting to False causes all data arrays to be loaded up front, which means that they can be accessed even after the underlying file is closed. - Note: even if ``lazy_load`` is `False`, ``memmap_arrays`` is still taken + Note: even if ``lazy_load`` is `False`, ``memmap`` is still taken into account. custom_schema : str, optional @@ -1611,7 +1611,7 @@ def open_asdf( ignore_version_mismatch=ignore_version_mismatch, ignore_unrecognized_tag=ignore_unrecognized_tag, copy_arrays=copy_arrays, - memmap_arrays=memmap_arrays, + memmap=memmap, lazy_load=lazy_load, custom_schema=custom_schema, ) diff --git a/asdf/_block/external.py b/asdf/_block/external.py index e91342204..10a6531d0 100644 --- a/asdf/_block/external.py +++ b/asdf/_block/external.py @@ -36,7 +36,7 @@ def load(self, base_uri, uri, memmap=False, validate_checksums=False): from asdf import open as asdf_open with asdf_open( - resolved_uri, "r", lazy_load=False, memmap_arrays=False, validate_checksums=validate_checksums + resolved_uri, "r", lazy_load=False, memmap=False, validate_checksums=validate_checksums ) as af: blk = af._blocks.blocks[0] if memmap and blk.header["compression"] == b"\0\0\0\0": diff --git a/asdf/_tests/_helpers.py b/asdf/_tests/_helpers.py index 338ff13ee..daa8085ad 100644 --- a/asdf/_tests/_helpers.py +++ b/asdf/_tests/_helpers.py @@ -267,7 +267,7 @@ def _assert_roundtrip_tree( with io.BytesIO() as buff: AsdfFile(tree, extensions=extensions, **init_options).write_to(buff, **write_options) buff.seek(0) - ff = asdf.open(buff, extensions=extensions, memmap_arrays=False, lazy_load=False) + ff = asdf.open(buff, extensions=extensions, memmap=False, lazy_load=False) # Ensure that all the blocks are loaded for block in ff._blocks.blocks: assert block._data is not None and not callable(block._data) @@ -276,9 +276,9 @@ def _assert_roundtrip_tree( if asdf_check_func: asdf_check_func(ff) - # Now repeat with memmap_arrays=True and a real file to test mmap() + # Now repeat with memmap=True and a real file to test mmap() AsdfFile(tree, extensions=extensions, **init_options).write_to(fname, **write_options) - with asdf.open(fname, mode="rw", extensions=extensions, memmap_arrays=True, lazy_load=False) as ff: + with asdf.open(fname, mode="rw", extensions=extensions, memmap=True, lazy_load=False) as ff: for block in ff._blocks.blocks: assert block._data is not None and not callable(block._data) assert_tree_match(tree, ff.tree, ff, funcname=tree_match_func) diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py index 0b23fcb33..b6163ce08 100644 --- a/asdf/_tests/test_array_blocks.py +++ b/asdf/_tests/test_array_blocks.py @@ -815,12 +815,12 @@ def filename_with_array(tmp_path_factory): ({}, True), ({"copy_arrays": True}, False), ({"copy_arrays": False}, True), - ({"memmap_arrays": True}, True), - ({"memmap_arrays": False}, False), - ({"copy_arrays": True, "memmap_arrays": True}, True), - ({"copy_arrays": False, "memmap_arrays": True}, True), - ({"copy_arrays": True, "memmap_arrays": False}, False), - ({"copy_arrays": False, "memmap_arrays": False}, False), + ({"memmap": True}, True), + ({"memmap": False}, False), + ({"copy_arrays": True, "memmap": True}, True), + ({"copy_arrays": False, "memmap": True}, True), + ({"copy_arrays": True, "memmap": False}, False), + ({"copy_arrays": False, "memmap": False}, False), ], ) def test_open_no_memmap(filename_with_array, open_kwargs, should_memmap): @@ -829,7 +829,7 @@ def test_open_no_memmap(filename_with_array, open_kwargs, should_memmap): depending on a number of arguments including: default (no kwargs) copy_arrays - memmap_arrays (overwrites copy_arrays) + memmap (overwrites copy_arrays) """ with asdf.open(filename_with_array, lazy_load=False, **open_kwargs) as af: array = af.tree["array"] diff --git a/asdf/testing/helpers.py b/asdf/testing/helpers.py index d37aead4f..44caf8d0f 100644 --- a/asdf/testing/helpers.py +++ b/asdf/testing/helpers.py @@ -46,7 +46,7 @@ def roundtrip_object(obj, version=None): af.write_to(buff) buff.seek(0) - with asdf.open(buff, lazy_load=False, memmap_arrays=False) as af: + with asdf.open(buff, lazy_load=False, memmap=False) as af: return af["obj"]