From dfec9d964e900d99d8094370a7072d011ecb4832 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 26 Oct 2023 10:55:12 -0400 Subject: [PATCH 1/4] allow memmapped arrays to keep mmap open asdf uses a mmap to memory map files to produce np.memmap arrays (when copy_arrays is True, the current default). np.memmap does not check if the backing mmap is closed before accessing array data and asdf closes the mmap when the asdf file is closed. This results in segfaults (see issue 1334). This commit changes when happens to the mmap when the asdf file is closed. asdf will no longer force closure of the mmap. This will result in an open file pointer if any array still references the mmap (the mmap will likely be closed by the garbage collector when all arrays are collected which matches the behavior of np.memmap). --- asdf/_core/_converters/ndarray.py | 2 ++ asdf/_tests/_regtests/test_1523.py | 3 +-- asdf/_tests/_regtests/test_1530.py | 8 ++++---- asdf/generic_io.py | 7 +++++-- asdf/tags/core/ndarray.py | 13 +++++++++++-- 5 files changed, 23 insertions(+), 10 deletions(-) diff --git a/asdf/_core/_converters/ndarray.py b/asdf/_core/_converters/ndarray.py index b448a5e32..197d8b28c 100644 --- a/asdf/_core/_converters/ndarray.py +++ b/asdf/_core/_converters/ndarray.py @@ -161,6 +161,8 @@ def from_yaml_tree(self, node, tag, ctx): elif isinstance(source, str): # external def data_callback(_attr=None, _ref=weakref.ref(ctx._blocks)): + if _attr not in (None, "cached_data", "data"): + raise AttributeError(f"_attr {_attr} is not supported") blks = _ref() if blks is None: msg = "Failed to resolve reference to AsdfFile to read external block" diff --git a/asdf/_tests/_regtests/test_1523.py b/asdf/_tests/_regtests/test_1523.py index 1777f769d..727f1ff91 100644 --- a/asdf/_tests/_regtests/test_1523.py +++ b/asdf/_tests/_regtests/test_1523.py @@ -23,8 +23,7 @@ def test_update_corrupts_stream_data(tmp_path): with asdf.open(fn, mode="rw") as af: af["a"] = np.arange(1000) af.update() - # print(af['s']) # segmentation fault + np.testing.assert_array_equal(af["s"], [[1, 2, 3]]) with asdf.open(fn) as af: - # fails as af['s'] == [[116, 101, 111]] np.testing.assert_array_equal(af["s"], [[1, 2, 3]]) diff --git a/asdf/_tests/_regtests/test_1530.py b/asdf/_tests/_regtests/test_1530.py index 353bca11d..8cf2f73ee 100644 --- a/asdf/_tests/_regtests/test_1530.py +++ b/asdf/_tests/_regtests/test_1530.py @@ -1,10 +1,8 @@ import numpy as np -import pytest import asdf -@pytest.mark.xfail(reason="fixing this may require subclassing ndarray") def test_update_with_memmapped_data_can_make_view_data_invalid(tmp_path): """ Calling update with memmapped data can create invalid data in memmap views @@ -31,5 +29,7 @@ def test_update_with_memmapped_data_can_make_view_data_invalid(tmp_path): af.update() np.testing.assert_array_equal(a, af["a"]) np.testing.assert_array_equal(b, af["b"]) - assert False - # np.testing.assert_array_equal(va, ov) # segfault + # the view of 'a' taken above ('va') keeps the original memmap open + # and is not a valid view of af['a'] (as this now differs from the + # af['a'] used to generate the view). + assert not np.all(va == ov) diff --git a/asdf/generic_io.py b/asdf/generic_io.py index b63563a67..76a3b9157 100644 --- a/asdf/generic_io.py +++ b/asdf/generic_io.py @@ -786,8 +786,11 @@ def memmap_array(self, offset, size): return np.ndarray.__new__(np.memmap, shape=size, offset=offset, dtype="uint8", buffer=self._mmap) def close_memmap(self): - if hasattr(self, "_mmap") and not self._mmap.closed: - self._mmap.close() + if hasattr(self, "_mmap"): + # we no longer close the _mmap here. This does mean that views of arrays + # that are backed by _mmap will keep the _mmap alive (and open). This is + # the cost of avoiding segfaults as np.memmap does not check if mmap is + # closed. del self._mmap def flush_memmap(self): diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index 5567b909f..b59e331aa 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -259,8 +259,17 @@ def _make_array(self): # memory mapping. if self._array is not None: base = util.get_array_base(self._array) - if isinstance(base, np.memmap) and isinstance(base.base, mmap.mmap) and base.base.closed: - self._array = None + if isinstance(base, np.memmap) and isinstance(base.base, mmap.mmap): + # check if the underlying mmap matches the one generated by generic_io + try: + fd = self._data_callback(_attr="_fd")() + except AttributeError: + # external blocks do not have a '_fd' and don't need to be updated + fd = None + if fd is not None: + if getattr(fd, "_mmap", None) is not base.base: + self._array = None + del fd if self._array is None: if isinstance(self._source, str): From e81a88aeb99df4b6542f6edf85e15e6e577cb04b Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 26 Oct 2023 11:11:52 -0400 Subject: [PATCH 2/4] add test for view of memmapped array after close --- asdf/_tests/_regtests/test_1334.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 asdf/_tests/_regtests/test_1334.py diff --git a/asdf/_tests/_regtests/test_1334.py b/asdf/_tests/_regtests/test_1334.py new file mode 100644 index 000000000..383268ec9 --- /dev/null +++ b/asdf/_tests/_regtests/test_1334.py @@ -0,0 +1,21 @@ +import numpy as np + +import asdf + + +def test_memmap_view_access_after_close(tmp_path): + """ + Accessing a view of a memmap after the asdf file + is closed results in a segfault + + https://github.com/asdf-format/asdf/issues/1334 + """ + + a = np.ones(10, dtype="uint8") + fn = tmp_path / "test.asdf" + asdf.AsdfFile({"a": a}).write_to(fn) + + with asdf.open(fn, copy_arrays=False) as af: + v = af["a"][:5] + + assert np.all(v == 1) From 6173b89ab696e309b4b9bf6139798702d09eeb84 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 26 Oct 2023 11:13:42 -0400 Subject: [PATCH 3/4] update changelog --- CHANGES.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index d713713c7..4cc160fd9 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -12,6 +12,9 @@ The ASDF Standard is at v1.6.0 - Deprecate automatic calling of ``AsdfFile.find_references`` during ``AsdfFile.__init__`` and ``asdf.open`` [#1708] +- Allow views of memmapped arrays to keep the backing mmap + open to avoid segfaults [#1668] + 3.0.1 (2023-10-30) ------------------ From 0c0aef495e0e1a15cef731d9374856a4b76a71b9 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 26 Oct 2023 12:16:27 -0400 Subject: [PATCH 4/4] close mmap on windows on update --- asdf/_asdf.py | 4 ++-- asdf/generic_io.py | 12 +++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/asdf/_asdf.py b/asdf/_asdf.py index b09b4f8d1..1691f2783 100644 --- a/asdf/_asdf.py +++ b/asdf/_asdf.py @@ -1070,9 +1070,9 @@ def update( def rewrite(): self._fd.seek(0) self._serial_write(self._fd, pad_blocks, include_block_index) + self._fd.truncate() if self._fd.can_memmap(): self._fd.close_memmap() - self._fd.truncate() # if we have no read blocks, we can just call write_to as no internal blocks are reused if len(self._blocks.blocks) == 0: @@ -1107,9 +1107,9 @@ def rewrite(): # close memmap to trigger arrays to reload themselves self._fd.seek(end_of_file) + self._fd.truncate() if self._fd.can_memmap(): self._fd.close_memmap() - self._fd.truncate() finally: self._post_write(fd) diff --git a/asdf/generic_io.py b/asdf/generic_io.py index 76a3b9157..5edc6b44d 100644 --- a/asdf/generic_io.py +++ b/asdf/generic_io.py @@ -738,9 +738,6 @@ def fast_forward(self, size): self.seek(size, SEEK_CUR) def truncate(self, size=None): - # windows supports truncating as long as the file not opened - # more than once. So this must be called after closing all - # memmaps if size is None: self._fd.truncate() else: @@ -837,6 +834,15 @@ def close(self): if self._close: self._fix_permissions() + def truncate(self, size=None): + # windows supports truncating as long as the file not opened + # more than once. So this must be called after closing all + # memmaps + if sys.platform.startswith("win") and hasattr(self, "_mmap"): + self._mmap.close() + self.close_memmap() + super().truncate(size=size) + class MemoryIO(RandomAccessFile): """