diff --git a/CHANGES.rst b/CHANGES.rst
index 4cc160fd9..62a76753b 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -15,6 +15,12 @@ The ASDF Standard is at v1.6.0
 - Allow views of memmapped arrays to keep the backing mmap open to
   avoid segfaults [#1668]
 
+- Introduce a ``memmap`` argument to ``asdf.open`` that
+  overrides ``copy_arrays``. When ``copy_arrays`` is
+  removed in an upcoming asdf release, ``memmap`` will
+  default to ``False`` and asdf will no longer memory
+  map arrays by default. [#1667]
+
 3.0.1 (2023-10-30)
 ------------------
 
diff --git a/asdf/_asdf.py b/asdf/_asdf.py
index 1691f2783..69da27f9e 100644
--- a/asdf/_asdf.py
+++ b/asdf/_asdf.py
@@ -74,6 +74,7 @@ def __init__(
         ignore_unrecognized_tag=False,
         ignore_implicit_conversion=False,
         copy_arrays=False,
+        memmap=NotSet,
         lazy_load=True,
         custom_schema=None,
     ):
@@ -116,6 +117,14 @@ def __init__(
             When `False`, when reading files, attempt to memmap underlying data
             arrays when possible.
 
+        memmap : bool, optional
+            When `True`, when reading files, attempt to memmap underlying data
+            arrays when possible. When set, this argument overrides ``copy_arrays``;
+            when not set, ``copy_arrays`` determines whether arrays are memory
+            mapped or copied. ``copy_arrays`` will be deprecated in an upcoming
+            asdf version, at which point ``memmap`` will default to `False` and
+            arrays will no longer be memory mapped by default.
+
         lazy_load : bool, optional
             When `True` and the underlying file handle is seekable, data
             arrays will only be loaded lazily: i.e. when they are accessed
@@ -168,6 +177,9 @@ def __init__(
         self._fd = None
         self._closed = False
         self._external_asdf_by_uri = {}
+        # if memmap is set, it overrides copy_arrays
+        if memmap is not NotSet:
+            copy_arrays = not memmap
         self._blocks = BlockManager(uri=uri, lazy_load=lazy_load, memmap=not copy_arrays)
         # this message is passed into find_references to only warn if
         # a reference was found
@@ -1496,6 +1508,7 @@ def open_asdf(
     ignore_unrecognized_tag=False,
     _force_raw_types=False,
     copy_arrays=False,
+    memmap=NotSet,
     lazy_load=True,
     custom_schema=None,
     strict_extension_check=False,
@@ -1539,6 +1552,14 @@ def open_asdf(
         When `False`, when reading files, attempt to memmap underlying data
         arrays when possible.
 
+    memmap : bool, optional
+        When `True`, when reading files, attempt to memmap underlying data
+        arrays when possible. When set, this argument overrides ``copy_arrays``;
+        when not set, ``copy_arrays`` determines whether arrays are memory
+        mapped or copied. ``copy_arrays`` will be deprecated in an upcoming
+        asdf version, at which point ``memmap`` will default to `False` and
+        arrays will no longer be memory mapped by default.
+
     lazy_load : bool, optional
         When `True` and the underlying file handle is seekable, data
         arrays will only be loaded lazily: i.e. when they are accessed
@@ -1546,7 +1567,7 @@ def open_asdf(
         open during the lifetime of the tree. Setting to False causes
         all data arrays to be loaded up front, which means that they
         can be accessed even after the underlying file is closed.
-        Note: even if ``lazy_load`` is `False`, ``copy_arrays`` is still taken
+        Note: even if ``lazy_load`` is `False`, ``memmap`` is still taken
         into account.
 
     custom_schema : str, optional
@@ -1590,6 +1611,7 @@ def open_asdf(
         ignore_version_mismatch=ignore_version_mismatch,
         ignore_unrecognized_tag=ignore_unrecognized_tag,
         copy_arrays=copy_arrays,
+        memmap=memmap,
         lazy_load=lazy_load,
         custom_schema=custom_schema,
     )
diff --git a/asdf/_block/external.py b/asdf/_block/external.py
index 734e9a773..10a6531d0 100644
--- a/asdf/_block/external.py
+++ b/asdf/_block/external.py
@@ -36,7 +36,7 @@ def load(self, base_uri, uri, memmap=False, validate_checksums=False):
         from asdf import open as asdf_open
 
         with asdf_open(
-            resolved_uri, "r", lazy_load=False, copy_arrays=True, validate_checksums=validate_checksums
+            resolved_uri, "r", lazy_load=False, memmap=False, validate_checksums=validate_checksums
         ) as af:
             blk = af._blocks.blocks[0]
             if memmap and blk.header["compression"] == b"\0\0\0\0":
diff --git a/asdf/_tests/_helpers.py b/asdf/_tests/_helpers.py
index 693378f68..daa8085ad 100644
--- a/asdf/_tests/_helpers.py
+++ b/asdf/_tests/_helpers.py
@@ -267,7 +267,7 @@ def _assert_roundtrip_tree(
     with io.BytesIO() as buff:
         AsdfFile(tree, extensions=extensions, **init_options).write_to(buff, **write_options)
         buff.seek(0)
-        ff = asdf.open(buff, extensions=extensions, copy_arrays=True, lazy_load=False)
+        ff = asdf.open(buff, extensions=extensions, memmap=False, lazy_load=False)
         # Ensure that all the blocks are loaded
         for block in ff._blocks.blocks:
             assert block._data is not None and not callable(block._data)
@@ -276,9 +276,9 @@ def _assert_roundtrip_tree(
     if asdf_check_func:
         asdf_check_func(ff)
 
-    # Now repeat with copy_arrays=False and a real file to test mmap()
+    # Now repeat with memmap=True and a real file to test mmap()
     AsdfFile(tree, extensions=extensions, **init_options).write_to(fname, **write_options)
-    with asdf.open(fname, mode="rw", extensions=extensions, copy_arrays=False, lazy_load=False) as ff:
+    with asdf.open(fname, mode="rw", extensions=extensions, memmap=True, lazy_load=False) as ff:
         for block in ff._blocks.blocks:
             assert block._data is not None and not callable(block._data)
         assert_tree_match(tree, ff.tree, ff, funcname=tree_match_func)
diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py
index f2c2b626b..b6163ce08 100644
--- a/asdf/_tests/test_array_blocks.py
+++ b/asdf/_tests/test_array_blocks.py
@@ -800,28 +800,43 @@ def test_unordered_block_index():
     assert ff._blocks.blocks[1].loaded
 
 
-def test_open_no_memmap(tmp_path):
-    tmpfile = os.path.join(str(tmp_path), "random.asdf")
-
+@pytest.fixture(scope="module")
+def filename_with_array(tmp_path_factory):
+    fn = tmp_path_factory.mktemp("data") / "filename_with_array.asdf"
     tree = {"array": np.random.random((20, 20))}
-
     ff = asdf.AsdfFile(tree)
-    ff.write_to(tmpfile)
-
-    # Test that by default we use memmapped arrays when possible
-    with asdf.open(tmpfile) as af:
-        array = af.tree["array"]
-        # Make sure to access the block so that it gets loaded
-        assert af._blocks.blocks[0].memmap
-        assert isinstance(array.base, np.memmap)
-
-    # Test that if we ask for copy, we do not get memmapped arrays
-    with asdf.open(tmpfile, copy_arrays=True) as af:
+    ff.write_to(fn)
+    return fn
+
+
+@pytest.mark.parametrize(
+    "open_kwargs,should_memmap",
+    [
+        ({}, True),
+        ({"copy_arrays": True}, False),
+        ({"copy_arrays": False}, True),
+        ({"memmap": True}, True),
+        ({"memmap": False}, False),
+        ({"copy_arrays": True, "memmap": True}, True),
+        ({"copy_arrays": False, "memmap": True}, True),
+        ({"copy_arrays": True, "memmap": False}, False),
+        ({"copy_arrays": False, "memmap": False}, False),
+    ],
+)
+def test_open_no_memmap(filename_with_array, open_kwargs, should_memmap):
+    """
+    Test that asdf.open does (or does not) return memmapped arrays
+    depending on the following arguments:
+        default (no kwargs)
+        copy_arrays
+        memmap (overrides copy_arrays)
+    """
+    with asdf.open(filename_with_array, lazy_load=False, **open_kwargs) as af:
         array = af.tree["array"]
-        assert not af._blocks.blocks[0].memmap
-        # We can't just check for isinstance(..., np.array) since this will
-        # be true for np.memmap as well
-        assert not isinstance(array.base, np.memmap)
+        if should_memmap:
+            assert isinstance(array.base, np.memmap)
+        else:
+            assert not isinstance(array.base, np.memmap)
 
 
 def test_add_block_before_fully_loaded(tmp_path):
diff --git a/asdf/testing/helpers.py b/asdf/testing/helpers.py
index 0b70b2a32..44caf8d0f 100644
--- a/asdf/testing/helpers.py
+++ b/asdf/testing/helpers.py
@@ -46,7 +46,7 @@ def roundtrip_object(obj, version=None):
         af.write_to(buff)
         buff.seek(0)
 
-        with asdf.open(buff, lazy_load=False, copy_arrays=True) as af:
+        with asdf.open(buff, lazy_load=False, memmap=False) as af:
             return af["obj"]
 
 
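For reviewers, a minimal usage sketch of the behavior this patch adds (not part of the patch itself, assuming it is applied); the file name "example.asdf" and the array contents are illustrative only:

# Sketch: the new ``memmap`` argument and its precedence over ``copy_arrays``.
import asdf
import numpy as np

asdf.AsdfFile({"array": np.random.random((20, 20))}).write_to("example.asdf")

# memmap=False: the array is read into memory, not backed by an np.memmap.
with asdf.open("example.asdf", memmap=False, lazy_load=False) as af:
    assert not isinstance(af["array"].base, np.memmap)

# memmap=True: the array is memory mapped when possible.
with asdf.open("example.asdf", memmap=True, lazy_load=False) as af:
    assert isinstance(af["array"].base, np.memmap)

# When both are passed, memmap overrides copy_arrays.
with asdf.open("example.asdf", copy_arrays=True, memmap=True, lazy_load=False) as af:
    assert isinstance(af["array"].base, np.memmap)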