Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add memmap option to asdf.open #1667

Merged
merged 4 commits into from
Dec 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ The ASDF Standard is at v1.6.0
- Allow views of memmapped arrays to keep the backing mmap
open to avoid segfaults [#1668]

- Introduce ``memmap`` argument to ``asdf.open`` that
overrides ``copy_arrays`` with documentation that describes
that the default for ``memmap`` when ``copy_arrays``
is removed in an upcoming asdf release will be ``False`` and
asdf will no longer by-default memory map arrays. [#1667]

3.0.1 (2023-10-30)
------------------

Expand Down
24 changes: 23 additions & 1 deletion asdf/_asdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def __init__(
ignore_unrecognized_tag=False,
ignore_implicit_conversion=False,
copy_arrays=False,
memmap=NotSet,
lazy_load=True,
custom_schema=None,
):
Expand Down Expand Up @@ -116,6 +117,14 @@ def __init__(
When `False`, when reading files, attempt to memmap underlying data
arrays when possible.

memmap : bool, optional
When `True`, when reading files, attempt to memmap underlying data
arrays when possible. When set, this argument will override
``copy_arrays``. When not set, the ``copy_arrays`` will determine
if arrays are memory mapped or copied. ``copy_arrays`` will be
deprecated and the default will change in an upcoming asdf version
which by default will not memory map arrays.

lazy_load : bool, optional
When `True` and the underlying file handle is seekable, data
arrays will only be loaded lazily: i.e. when they are accessed
Expand Down Expand Up @@ -168,6 +177,9 @@ def __init__(
self._fd = None
self._closed = False
self._external_asdf_by_uri = {}
# if memmap is set, it overrides copy_arrays
if memmap is not NotSet:
copy_arrays = not memmap
self._blocks = BlockManager(uri=uri, lazy_load=lazy_load, memmap=not copy_arrays)
# this message is passed into find_references to only warn if
# a reference was found
Expand Down Expand Up @@ -1496,6 +1508,7 @@ def open_asdf(
ignore_unrecognized_tag=False,
_force_raw_types=False,
copy_arrays=False,
memmap=NotSet,
lazy_load=True,
custom_schema=None,
strict_extension_check=False,
Expand Down Expand Up @@ -1539,14 +1552,22 @@ def open_asdf(
When `False`, when reading files, attempt to memmap underlying data
arrays when possible.

memmap : bool, optional
When `True`, when reading files, attempt to memmap underlying data
arrays when possible. When set, this argument will override
``copy_arrays``. When not set, the ``copy_arrays`` will determine
if arrays are memory mapped or copied. ``copy_arrays`` will be
deprecated and the default will change in an upcoming asdf version
which by default will not memory map arrays.

lazy_load : bool, optional
When `True` and the underlying file handle is seekable, data
arrays will only be loaded lazily: i.e. when they are accessed
for the first time. In this case the underlying file must stay
open during the lifetime of the tree. Setting to False causes
all data arrays to be loaded up front, which means that they
can be accessed even after the underlying file is closed.
Note: even if ``lazy_load`` is `False`, ``copy_arrays`` is still taken
Note: even if ``lazy_load`` is `False`, ``memmap`` is still taken
into account.

custom_schema : str, optional
Expand Down Expand Up @@ -1590,6 +1611,7 @@ def open_asdf(
ignore_version_mismatch=ignore_version_mismatch,
ignore_unrecognized_tag=ignore_unrecognized_tag,
copy_arrays=copy_arrays,
memmap=memmap,
lazy_load=lazy_load,
custom_schema=custom_schema,
)
Expand Down
2 changes: 1 addition & 1 deletion asdf/_block/external.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def load(self, base_uri, uri, memmap=False, validate_checksums=False):
from asdf import open as asdf_open

with asdf_open(
resolved_uri, "r", lazy_load=False, copy_arrays=True, validate_checksums=validate_checksums
resolved_uri, "r", lazy_load=False, memmap=False, validate_checksums=validate_checksums
) as af:
blk = af._blocks.blocks[0]
if memmap and blk.header["compression"] == b"\0\0\0\0":
Expand Down
6 changes: 3 additions & 3 deletions asdf/_tests/_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def _assert_roundtrip_tree(
with io.BytesIO() as buff:
AsdfFile(tree, extensions=extensions, **init_options).write_to(buff, **write_options)
buff.seek(0)
ff = asdf.open(buff, extensions=extensions, copy_arrays=True, lazy_load=False)
ff = asdf.open(buff, extensions=extensions, memmap=False, lazy_load=False)
# Ensure that all the blocks are loaded
for block in ff._blocks.blocks:
assert block._data is not None and not callable(block._data)
Expand All @@ -276,9 +276,9 @@ def _assert_roundtrip_tree(
if asdf_check_func:
asdf_check_func(ff)

# Now repeat with copy_arrays=False and a real file to test mmap()
# Now repeat with memmap=True and a real file to test mmap()
AsdfFile(tree, extensions=extensions, **init_options).write_to(fname, **write_options)
with asdf.open(fname, mode="rw", extensions=extensions, copy_arrays=False, lazy_load=False) as ff:
with asdf.open(fname, mode="rw", extensions=extensions, memmap=True, lazy_load=False) as ff:
for block in ff._blocks.blocks:
assert block._data is not None and not callable(block._data)
assert_tree_match(tree, ff.tree, ff, funcname=tree_match_func)
Expand Down
53 changes: 34 additions & 19 deletions asdf/_tests/test_array_blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -800,28 +800,43 @@ def test_unordered_block_index():
assert ff._blocks.blocks[1].loaded


def test_open_no_memmap(tmp_path):
tmpfile = os.path.join(str(tmp_path), "random.asdf")

@pytest.fixture(scope="module")
def filename_with_array(tmp_path_factory):
fn = tmp_path_factory.mktemp("data") / "filename_with_array.asdf"
tree = {"array": np.random.random((20, 20))}

ff = asdf.AsdfFile(tree)
ff.write_to(tmpfile)

# Test that by default we use memmapped arrays when possible
with asdf.open(tmpfile) as af:
array = af.tree["array"]
# Make sure to access the block so that it gets loaded
assert af._blocks.blocks[0].memmap
assert isinstance(array.base, np.memmap)

# Test that if we ask for copy, we do not get memmapped arrays
with asdf.open(tmpfile, copy_arrays=True) as af:
ff.write_to(fn)
return fn


@pytest.mark.parametrize(
"open_kwargs,should_memmap",
[
({}, True),
({"copy_arrays": True}, False),
({"copy_arrays": False}, True),
({"memmap": True}, True),
({"memmap": False}, False),
({"copy_arrays": True, "memmap": True}, True),
({"copy_arrays": False, "memmap": True}, True),
({"copy_arrays": True, "memmap": False}, False),
({"copy_arrays": False, "memmap": False}, False),
],
)
def test_open_no_memmap(filename_with_array, open_kwargs, should_memmap):
"""
Test that asdf.open does not (or does) return memmaps for arrays
depending on a number of arguments including:
default (no kwargs)
copy_arrays
memmap (overwrites copy_arrays)
"""
with asdf.open(filename_with_array, lazy_load=False, **open_kwargs) as af:
array = af.tree["array"]
assert not af._blocks.blocks[0].memmap
# We can't just check for isinstance(..., np.array) since this will
# be true for np.memmap as well
assert not isinstance(array.base, np.memmap)
if should_memmap:
assert isinstance(array.base, np.memmap)
else:
assert not isinstance(array.base, np.memmap)


def test_add_block_before_fully_loaded(tmp_path):
Expand Down
2 changes: 1 addition & 1 deletion asdf/testing/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def roundtrip_object(obj, version=None):
af.write_to(buff)

buff.seek(0)
with asdf.open(buff, lazy_load=False, copy_arrays=True) as af:
with asdf.open(buff, lazy_load=False, memmap=False) as af:
return af["obj"]


Expand Down
Loading