diff --git a/CHANGES.rst b/CHANGES.rst index d713713c7..4cc160fd9 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -12,6 +12,9 @@ The ASDF Standard is at v1.6.0 - Deprecate automatic calling of ``AsdfFile.find_references`` during ``AsdfFile.__init__`` and ``asdf.open`` [#1708] +- Allow views of memmapped arrays to keep the backing mmap + open to avoid segfaults [#1668] + 3.0.1 (2023-10-30) ------------------ diff --git a/asdf/_asdf.py b/asdf/_asdf.py index b09b4f8d1..1691f2783 100644 --- a/asdf/_asdf.py +++ b/asdf/_asdf.py @@ -1070,9 +1070,9 @@ def update( def rewrite(): self._fd.seek(0) self._serial_write(self._fd, pad_blocks, include_block_index) + self._fd.truncate() if self._fd.can_memmap(): self._fd.close_memmap() - self._fd.truncate() # if we have no read blocks, we can just call write_to as no internal blocks are reused if len(self._blocks.blocks) == 0: @@ -1107,9 +1107,9 @@ def rewrite(): # close memmap to trigger arrays to reload themselves self._fd.seek(end_of_file) + self._fd.truncate() if self._fd.can_memmap(): self._fd.close_memmap() - self._fd.truncate() finally: self._post_write(fd) diff --git a/asdf/_core/_converters/ndarray.py b/asdf/_core/_converters/ndarray.py index b448a5e32..197d8b28c 100644 --- a/asdf/_core/_converters/ndarray.py +++ b/asdf/_core/_converters/ndarray.py @@ -161,6 +161,8 @@ def from_yaml_tree(self, node, tag, ctx): elif isinstance(source, str): # external def data_callback(_attr=None, _ref=weakref.ref(ctx._blocks)): + if _attr not in (None, "cached_data", "data"): + raise AttributeError(f"_attr {_attr} is not supported") blks = _ref() if blks is None: msg = "Failed to resolve reference to AsdfFile to read external block" diff --git a/asdf/_tests/_regtests/test_1334.py b/asdf/_tests/_regtests/test_1334.py new file mode 100644 index 000000000..383268ec9 --- /dev/null +++ b/asdf/_tests/_regtests/test_1334.py @@ -0,0 +1,21 @@ +import numpy as np + +import asdf + + +def test_memmap_view_access_after_close(tmp_path): + """ + Accessing a view of a memmap after the asdf file + is closed results in a segfault + + https://github.com/asdf-format/asdf/issues/1334 + """ + + a = np.ones(10, dtype="uint8") + fn = tmp_path / "test.asdf" + asdf.AsdfFile({"a": a}).write_to(fn) + + with asdf.open(fn, copy_arrays=False) as af: + v = af["a"][:5] + + assert np.all(v == 1) diff --git a/asdf/_tests/_regtests/test_1523.py b/asdf/_tests/_regtests/test_1523.py index 1777f769d..727f1ff91 100644 --- a/asdf/_tests/_regtests/test_1523.py +++ b/asdf/_tests/_regtests/test_1523.py @@ -23,8 +23,7 @@ def test_update_corrupts_stream_data(tmp_path): with asdf.open(fn, mode="rw") as af: af["a"] = np.arange(1000) af.update() - # print(af['s']) # segmentation fault + np.testing.assert_array_equal(af["s"], [[1, 2, 3]]) with asdf.open(fn) as af: - # fails as af['s'] == [[116, 101, 111]] np.testing.assert_array_equal(af["s"], [[1, 2, 3]]) diff --git a/asdf/_tests/_regtests/test_1530.py b/asdf/_tests/_regtests/test_1530.py index 353bca11d..8cf2f73ee 100644 --- a/asdf/_tests/_regtests/test_1530.py +++ b/asdf/_tests/_regtests/test_1530.py @@ -1,10 +1,8 @@ import numpy as np -import pytest import asdf -@pytest.mark.xfail(reason="fixing this may require subclassing ndarray") def test_update_with_memmapped_data_can_make_view_data_invalid(tmp_path): """ Calling update with memmapped data can create invalid data in memmap views @@ -31,5 +29,7 @@ def test_update_with_memmapped_data_can_make_view_data_invalid(tmp_path): af.update() np.testing.assert_array_equal(a, af["a"]) np.testing.assert_array_equal(b, af["b"]) - assert False - # np.testing.assert_array_equal(va, ov) # segfault + # the view of 'a' taken above ('va') keeps the original memmap open + # and is not a valid view of af['a'] (as this now differs from the + # af['a'] used to generate the view). + assert not np.all(va == ov) diff --git a/asdf/generic_io.py b/asdf/generic_io.py index b63563a67..5edc6b44d 100644 --- a/asdf/generic_io.py +++ b/asdf/generic_io.py @@ -738,9 +738,6 @@ def fast_forward(self, size): self.seek(size, SEEK_CUR) def truncate(self, size=None): - # windows supports truncating as long as the file not opened - # more than once. So this must be called after closing all - # memmaps if size is None: self._fd.truncate() else: @@ -786,8 +783,11 @@ def memmap_array(self, offset, size): return np.ndarray.__new__(np.memmap, shape=size, offset=offset, dtype="uint8", buffer=self._mmap) def close_memmap(self): - if hasattr(self, "_mmap") and not self._mmap.closed: - self._mmap.close() + if hasattr(self, "_mmap"): + # we no longer close the _mmap here. This does mean that views of arrays + # that are backed by _mmap will keep the _mmap alive (and open). This is + # the cost of avoiding segfaults as np.memmap does not check if mmap is + # closed. del self._mmap def flush_memmap(self): @@ -834,6 +834,15 @@ def close(self): if self._close: self._fix_permissions() + def truncate(self, size=None): + # windows supports truncating as long as the file not opened + # more than once. So this must be called after closing all + # memmaps + if sys.platform.startswith("win") and hasattr(self, "_mmap"): + self._mmap.close() + self.close_memmap() + super().truncate(size=size) + class MemoryIO(RandomAccessFile): """ diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index 5567b909f..b59e331aa 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -259,8 +259,17 @@ def _make_array(self): # memory mapping. if self._array is not None: base = util.get_array_base(self._array) - if isinstance(base, np.memmap) and isinstance(base.base, mmap.mmap) and base.base.closed: - self._array = None + if isinstance(base, np.memmap) and isinstance(base.base, mmap.mmap): + # check if the underlying mmap matches the one generated by generic_io + try: + fd = self._data_callback(_attr="_fd")() + except AttributeError: + # external blocks do not have a '_fd' and don't need to be updated + fd = None + if fd is not None: + if getattr(fd, "_mmap", None) is not base.base: + self._array = None + del fd if self._array is None: if isinstance(self._source, str):