Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow memmapped arrays to keep the backing mmap file open #1668

Merged
merged 4 commits into from
Dec 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ The ASDF Standard is at v1.6.0
- Deprecate automatic calling of ``AsdfFile.find_references`` during
``AsdfFile.__init__`` and ``asdf.open`` [#1708]

- Allow views of memmapped arrays to keep the backing mmap
open to avoid segfaults [#1668]

3.0.1 (2023-10-30)
------------------

Expand Down
4 changes: 2 additions & 2 deletions asdf/_asdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1070,9 +1070,9 @@ def update(
def rewrite():
self._fd.seek(0)
self._serial_write(self._fd, pad_blocks, include_block_index)
self._fd.truncate()
if self._fd.can_memmap():
self._fd.close_memmap()
self._fd.truncate()

# if we have no read blocks, we can just call write_to as no internal blocks are reused
if len(self._blocks.blocks) == 0:
Expand Down Expand Up @@ -1107,9 +1107,9 @@ def rewrite():

# close memmap to trigger arrays to reload themselves
self._fd.seek(end_of_file)
self._fd.truncate()
if self._fd.can_memmap():
self._fd.close_memmap()
self._fd.truncate()

finally:
self._post_write(fd)
Expand Down
2 changes: 2 additions & 0 deletions asdf/_core/_converters/ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@ def from_yaml_tree(self, node, tag, ctx):
elif isinstance(source, str):
# external
def data_callback(_attr=None, _ref=weakref.ref(ctx._blocks)):
if _attr not in (None, "cached_data", "data"):
raise AttributeError(f"_attr {_attr} is not supported")
blks = _ref()
if blks is None:
msg = "Failed to resolve reference to AsdfFile to read external block"
Expand Down
21 changes: 21 additions & 0 deletions asdf/_tests/_regtests/test_1334.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import numpy as np

import asdf


def test_memmap_view_access_after_close(tmp_path):
"""
Accessing a view of a memmap after the asdf file
is closed results in a segfault

https://github.com/asdf-format/asdf/issues/1334
"""

a = np.ones(10, dtype="uint8")
fn = tmp_path / "test.asdf"
asdf.AsdfFile({"a": a}).write_to(fn)

with asdf.open(fn, copy_arrays=False) as af:
v = af["a"][:5]

assert np.all(v == 1)
3 changes: 1 addition & 2 deletions asdf/_tests/_regtests/test_1523.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ def test_update_corrupts_stream_data(tmp_path):
with asdf.open(fn, mode="rw") as af:
af["a"] = np.arange(1000)
af.update()
# print(af['s']) # segmentation fault
np.testing.assert_array_equal(af["s"], [[1, 2, 3]])

with asdf.open(fn) as af:
# fails as af['s'] == [[116, 101, 111]]
np.testing.assert_array_equal(af["s"], [[1, 2, 3]])
8 changes: 4 additions & 4 deletions asdf/_tests/_regtests/test_1530.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import numpy as np
import pytest

import asdf


@pytest.mark.xfail(reason="fixing this may require subclassing ndarray")
def test_update_with_memmapped_data_can_make_view_data_invalid(tmp_path):
"""
Calling update with memmapped data can create invalid data in memmap views
Expand All @@ -31,5 +29,7 @@ def test_update_with_memmapped_data_can_make_view_data_invalid(tmp_path):
af.update()
np.testing.assert_array_equal(a, af["a"])
np.testing.assert_array_equal(b, af["b"])
assert False
# np.testing.assert_array_equal(va, ov) # segfault
# the view of 'a' taken above ('va') keeps the original memmap open
# and is not a valid view of af['a'] (as this now differs from the
# af['a'] used to generate the view).
assert not np.all(va == ov)
19 changes: 14 additions & 5 deletions asdf/generic_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -738,9 +738,6 @@ def fast_forward(self, size):
self.seek(size, SEEK_CUR)

def truncate(self, size=None):
# windows supports truncating as long as the file not opened
# more than once. So this must be called after closing all
# memmaps
if size is None:
self._fd.truncate()
else:
Expand Down Expand Up @@ -786,8 +783,11 @@ def memmap_array(self, offset, size):
return np.ndarray.__new__(np.memmap, shape=size, offset=offset, dtype="uint8", buffer=self._mmap)

def close_memmap(self):
if hasattr(self, "_mmap") and not self._mmap.closed:
self._mmap.close()
if hasattr(self, "_mmap"):
# we no longer close the _mmap here. This does mean that views of arrays
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we got the better end of that trade.

# that are backed by _mmap will keep the _mmap alive (and open). This is
# the cost of avoiding segfaults as np.memmap does not check if mmap is
# closed.
del self._mmap

def flush_memmap(self):
Expand Down Expand Up @@ -834,6 +834,15 @@ def close(self):
if self._close:
self._fix_permissions()

def truncate(self, size=None):
# windows supports truncating as long as the file not opened
# more than once. So this must be called after closing all
# memmaps
if sys.platform.startswith("win") and hasattr(self, "_mmap"):
self._mmap.close()
self.close_memmap()
super().truncate(size=size)


class MemoryIO(RandomAccessFile):
"""
Expand Down
13 changes: 11 additions & 2 deletions asdf/tags/core/ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,8 +259,17 @@ def _make_array(self):
# memory mapping.
if self._array is not None:
base = util.get_array_base(self._array)
if isinstance(base, np.memmap) and isinstance(base.base, mmap.mmap) and base.base.closed:
self._array = None
if isinstance(base, np.memmap) and isinstance(base.base, mmap.mmap):
# check if the underlying mmap matches the one generated by generic_io
try:
fd = self._data_callback(_attr="_fd")()
except AttributeError:
# external blocks do not have a '_fd' and don't need to be updated
fd = None
if fd is not None:
if getattr(fd, "_mmap", None) is not base.base:
self._array = None
del fd
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a question for my own educational purposes, what does explicitly deleting the fd variable accomplish?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question. The _fd attr is a weakef so I think I added this mostly out of habit. The explicit del of fd isn't needed here as fd isn't used or stored (so the reference will be released at the end of the function).


if self._array is None:
if isinstance(self._source, str):
Expand Down
Loading