From 4dc65a927e49509e430cfd8db6b1a62d1e20dae6 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 10 Apr 2023 11:46:37 -0400 Subject: [PATCH 001/154] move asdf.block to asdf._block --- asdf/_block/__init__.py | 5 + asdf/_block/block.py | 532 +++++++++++++++++++++++ asdf/{block.py => _block/manager.py} | 624 +-------------------------- asdf/_block/util.py | 87 ++++ asdf/_tests/_helpers.py | 2 +- asdf/_tests/test_array_blocks.py | 3 +- asdf/asdf.py | 8 +- asdf/commands/edit.py | 2 +- 8 files changed, 641 insertions(+), 622 deletions(-) create mode 100644 asdf/_block/__init__.py create mode 100644 asdf/_block/block.py rename asdf/{block.py => _block/manager.py} (58%) create mode 100644 asdf/_block/util.py diff --git a/asdf/_block/__init__.py b/asdf/_block/__init__.py new file mode 100644 index 000000000..575753691 --- /dev/null +++ b/asdf/_block/__init__.py @@ -0,0 +1,5 @@ +from .block import Block, UnloadedBlock +from .manager import BlockManager +from .util import calculate_updated_layout + +__all__ = ["Block", "UnloadedBlock", "BlockManager", "calculate_updated_layout"] diff --git a/asdf/_block/block.py b/asdf/_block/block.py new file mode 100644 index 000000000..de4b041e0 --- /dev/null +++ b/asdf/_block/block.py @@ -0,0 +1,532 @@ +import hashlib +import io +import struct + +from asdf import compression as mcompression +from asdf import constants, generic_io, util + + +class Block: + """ + Represents a single block in a ASDF file. This is an + implementation detail and should not be instantiated directly. + Instead, should only be created through the `BlockManager`. + """ + + _header = util.BinaryStruct( + [ + ("flags", "I"), + ("compression", "4s"), + ("allocated_size", "Q"), + ("used_size", "Q"), + ("data_size", "Q"), + ("checksum", "16s"), + ], + ) + + def __init__(self, data=None, uri=None, array_storage="internal", memmap=True, lazy_load=True, data_callback=None): + self._data_callback = data_callback + if self._data_callback is not None and data is not None: + msg = "Block.__init__ cannot contain non-None data and a non-None data_callback" + raise ValueError(msg) + self._data = data + self._uri = uri + self._array_storage = array_storage + + self._fd = None + self._offset = None + self._input_compression = None + self._output_compression = "input" + self._output_compression_kwargs = {} + self._checksum = None + self._should_memmap = memmap + self._memmapped = False + self._lazy_load = lazy_load + + self.update_size() + self._allocated = self._size + + def __repr__(self): + return "".format( + self._array_storage[:3], + self._offset, + self._allocated, + self._size, + ) + + def __len__(self): + return self._size + + @property + def offset(self): + return self._offset + + @offset.setter + def offset(self, offset): + self._offset = offset + + @property + def allocated(self): + return self._allocated + + @allocated.setter + def allocated(self, allocated): + self._allocated = allocated + + @property + def header_size(self): + return self._header.size + constants.BLOCK_HEADER_BOILERPLATE_SIZE + + @property + def data_offset(self): + return self._offset + self.header_size + + @property + def size(self): + return self._size + self.header_size + + @property + def end_offset(self): + """ + The offset of the end of the allocated space for the block, + and where the next block should begin. 
+ """ + return self.offset + self.header_size + self.allocated + + @property + def array_storage(self): + return self._array_storage + + @property + def input_compression(self): + """ + The compression codec used to read the block. + """ + return self._input_compression + + @input_compression.setter + def input_compression(self, compression): + self._input_compression = mcompression.validate(compression) + + @property + def output_compression(self): + """ + The compression codec used to write the block. + :return: + """ + if self._output_compression == "input": + return self._input_compression + return self._output_compression + + @output_compression.setter + def output_compression(self, compression): + self._output_compression = mcompression.validate(compression) + + @property + def output_compression_kwargs(self): + """ + The configuration options to the Compressor constructor + used to write the block. + :return: + """ + return self._output_compression_kwargs + + @output_compression_kwargs.setter + def output_compression_kwargs(self, config): + if config is None: + config = {} + self._output_compression_kwargs = config.copy() + + @property + def checksum(self): + return self._checksum + + def _set_checksum(self, checksum): + if checksum == b"\0" * 16: + self._checksum = None + else: + self._checksum = checksum + + def _calculate_checksum(self, array): + # The following line is safe because we're only using + # the MD5 as a checksum. + m = hashlib.new("md5") # noqa: S324 + m.update(array) + return m.digest() + + def validate_checksum(self): + """ + Validate the content of the block against the current checksum. + + Returns + ------- + valid : bool + `True` if the content is valid against the current + checksum or there is no current checksum. Otherwise, + `False`. + """ + if self._checksum: + checksum = self._calculate_checksum(self._flattened_data) + if checksum != self._checksum: + return False + return True + + def update_checksum(self): + """ + Update the checksum based on the current data contents. + """ + self._checksum = self._calculate_checksum(self._flattened_data) + + def update_size(self): + """ + Recalculate the on-disk size of the block. This causes any + compression steps to run. It should only be called when + updating the file in-place, otherwise the work is redundant. + """ + if self._data is not None: + data = self._flattened_data + self._data_size = data.nbytes + + if not self.output_compression: + self._size = self._data_size + else: + self._size = mcompression.get_compressed_size( + data, + self.output_compression, + config=self.output_compression_kwargs, + ) + else: + self._data_size = self._size = 0 + + def read(self, fd, past_magic=False, validate_checksum=False): + """ + Read a Block from the given Python file-like object. + + If the file is seekable and lazy_load is True, the reading + or memmapping of the actual data is postponed until an array + requests it. If the file is a stream or lazy_load is False, + the data will be read into memory immediately. + + As Block is used for reading, writing, configuring and + managing data there are circumstances where read should + not be used. For instance, if a data_callback is defined + a call to read would override the data corresponding to a + block and conflict with the use of the data_callback. To + signify this conflict, a RuntimeError is raised if read + is called on a block with a defined data_callback. 
+ + Parameters + ---------- + fd : GenericFile + + past_magic : bool, optional + If `True`, the file position is immediately after the + block magic token. If `False` (default), the file + position is exactly at the beginning of the block magic + token. + + validate_checksum : bool, optional + If `True`, validate the data against the checksum, and + raise a `ValueError` if the data doesn't match. + + Raises + ------ + + RuntimeError + Read was called on a block with a defined data_callback. + + ValueError + The read file contains invalid data. + """ + if self._data_callback is not None: + msg = "read called on a Block with a data_callback" + raise RuntimeError(msg) + offset = None + if fd.seekable(): + offset = fd.tell() + + if not past_magic: + buff = fd.read(len(constants.BLOCK_MAGIC)) + if len(buff) < 4: + return None + + if buff not in (constants.BLOCK_MAGIC, constants.INDEX_HEADER[: len(buff)]): + msg = ( + "Bad magic number in block. " + "This may indicate an internal inconsistency about the " + "sizes of the blocks in the file." + ) + raise ValueError(msg) + + if buff == constants.INDEX_HEADER[: len(buff)]: + return None + + elif offset is not None: + offset -= 4 + + buff = fd.read(2) + (header_size,) = struct.unpack(b">H", buff) + if header_size < self._header.size: + msg = f"Header size must be >= {self._header.size}" + raise ValueError(msg) + + buff = fd.read(header_size) + header = self._header.unpack(buff) + + # This is used by the documentation system, but nowhere else. + self._flags = header["flags"] + self._set_checksum(header["checksum"]) + + try: + self.input_compression = header["compression"] + except ValueError: + raise # TODO: hint extension? + + if self.input_compression is None and header["used_size"] != header["data_size"]: + msg = "used_size and data_size must be equal when no compression is used." + raise ValueError(msg) + + if header["flags"] & constants.BLOCK_FLAG_STREAMED and self.input_compression is not None: + msg = "Compression set on a streamed block." + raise ValueError(msg) + + if fd.seekable(): + # If the file is seekable, we can delay reading the actual + # data until later. + self._fd = fd + self._offset = offset + self._header_size = header_size + if header["flags"] & constants.BLOCK_FLAG_STREAMED: + # Support streaming blocks + self._array_storage = "streamed" + if self._lazy_load: + fd.fast_forward(-1) + self._data_size = self._size = self._allocated = (fd.tell() - self.data_offset) + 1 + else: + self._data = fd.read_into_array(-1) + self._data_size = self._size = self._allocated = len(self._data) + else: + self._allocated = header["allocated_size"] + self._size = header["used_size"] + self._data_size = header["data_size"] + if self._lazy_load: + fd.fast_forward(self._allocated) + else: + curpos = fd.tell() + self._memmap_data() + fd.seek(curpos) + if not self._memmapped: + self._data = self._read_data(fd, self._size, self._data_size) + fd.fast_forward(self._allocated - self._size) + else: + fd.fast_forward(self._allocated) + else: + # If the file is a stream, we need to get the data now. 
+ if header["flags"] & constants.BLOCK_FLAG_STREAMED: + # Support streaming blocks + self._array_storage = "streamed" + self._data = fd.read_into_array(-1) + self._data_size = self._size = self._allocated = len(self._data) + else: + self._allocated = header["allocated_size"] + self._size = header["used_size"] + self._data_size = header["data_size"] + self._data = self._read_data(fd, self._size, self._data_size) + fd.fast_forward(self._allocated - self._size) + fd.close() + + if validate_checksum and not self.validate_checksum(): + msg = f"Block at {self._offset} does not match given checksum" + raise ValueError(msg) + + return self + + def _read_data(self, fd, used_size, data_size): + """ + Read the block data from a file. + """ + if not self.input_compression: + return fd.read_into_array(used_size) + + return mcompression.decompress(fd, used_size, data_size, self.input_compression) + + def _memmap_data(self): + """ + Memory map the block data from the file. + """ + memmap = self._fd.can_memmap() and not self.input_compression + if self._should_memmap and memmap: + self._data = self._fd.memmap_array(self.data_offset, self._size) + self._memmapped = True + + @property + def _flattened_data(self): + """ + Retrieve flattened data suitable for writing. + + Returns + ------- + np.ndarray + 1D contiguous array. + """ + data = self.data + + # 'K' order flattens the array in the order that elements + # occur in memory, except axes with negative strides which + # are reversed. That is a problem for base arrays with + # negative strides and is an outstanding bug in this library. + return data.ravel(order="K") + + def write(self, fd): + """ + Write an internal block to the given Python file-like object. + """ + self._header_size = self._header.size + + if self._data_callback is not None: + self._data = self._data_callback() + data = self._flattened_data + self.update_size() + self._data = None + self._allocated = self._size + else: + data = self._flattened_data if self._data is not None else None + + flags = 0 + data_size = used_size = allocated_size = 0 + if self._array_storage == "streamed": + flags |= constants.BLOCK_FLAG_STREAMED + elif data is not None: + self._checksum = self._calculate_checksum(data) + data_size = data.nbytes + if not fd.seekable() and self.output_compression: + buff = io.BytesIO() + mcompression.compress(buff, data, self.output_compression, config=self.output_compression_kwargs) + self.allocated = self._size = buff.tell() + allocated_size = self.allocated + used_size = self._size + self.input_compression = self.output_compression + + if allocated_size < used_size: + msg = f"Block used size {used_size} larger than allocated size {allocated_size}" + raise RuntimeError(msg) + + checksum = self.checksum if self.checksum is not None else b"\x00" * 16 + + fd.write(constants.BLOCK_MAGIC) + fd.write(struct.pack(b">H", self._header_size)) + fd.write( + self._header.pack( + flags=flags, + compression=mcompression.to_compression_header(self.output_compression), + allocated_size=allocated_size, + used_size=used_size, + data_size=data_size, + checksum=checksum, + ), + ) + + if data is not None: + if self.output_compression: + if not fd.seekable(): + fd.write(buff.getvalue()) + else: + # If the file is seekable, we write the + # compressed data directly to it, then go back + # and write the resulting size in the block + # header. 
+ start = fd.tell() + mcompression.compress(fd, data, self.output_compression, config=self.output_compression_kwargs) + end = fd.tell() + self.allocated = self._size = end - start + fd.seek(self.offset + 6) + self._header.update(fd, allocated_size=self.allocated, used_size=self._size) + fd.seek(end) + else: + if used_size != data_size: + msg = f"Block used size {used_size} is not equal to the data size {data_size}" + raise RuntimeError(msg) + fd.write_array(data) + + @property + def data(self): + """ + Get the data for the block, as a numpy array. + """ + if self._data is not None: + return self._data + if self._data_callback is not None: + return self._data_callback() + if self._fd.is_closed(): + msg = "ASDF file has already been closed. Can not get the data." + raise OSError(msg) + + # Be nice and reset the file position after we're done + curpos = self._fd.tell() + try: + self._memmap_data() + if not self._memmapped: + self._fd.seek(self.data_offset) + self._data = self._read_data(self._fd, self._size, self._data_size) + finally: + self._fd.seek(curpos) + return self._data + + def close(self): + self._data = None + + def generate_read_data_callback(self): + """Used in SerializationContext.get_block_data_callback""" + + def callback(): + return self.data + + return callback + + +class UnloadedBlock: + """ + Represents an indexed, but not yet loaded, internal block. All + that is known about it is its offset. It converts itself to a + full-fledged block whenever the underlying data or more detail is + requested. + """ + + def __init__(self, fd, offset, memmap=True, lazy_load=True): + self._fd = fd + self._offset = offset + self._data = None + self._uri = None + self._array_storage = "internal" + self._input_compression = None + self._output_compression = "input" + self._output_compression_kwargs = {} + self._checksum = None + self._should_memmap = memmap + self._memmapped = False + self._lazy_load = lazy_load + self._data_callback = None + + def __len__(self): + self.load() + return len(self) + + def close(self): + pass + + @property + def array_storage(self): + return "internal" + + @property + def offset(self): + return self._offset + + def __getattr__(self, attr): + self.load() + return getattr(self, attr) + + def load(self): + self._fd.seek(self._offset, generic_io.SEEK_SET) + self.__class__ = Block + self.read(self._fd) diff --git a/asdf/block.py b/asdf/_block/manager.py similarity index 58% rename from asdf/block.py rename to asdf/_block/manager.py index b9cce1dfa..3f6e2477a 100644 --- a/asdf/block.py +++ b/asdf/_block/manager.py @@ -1,19 +1,17 @@ import copy -import hashlib -import io import os import re -import struct import weakref -from collections import namedtuple import numpy as np import yaml -from . import compression as mcompression -from . import constants, generic_io, treeutil, util, yamlutil -from .config import get_config -from .util import patched_urllib_parse +from asdf import compression as mcompression +from asdf import constants, generic_io, treeutil, util, yamlutil +from asdf.config import get_config +from asdf.util import patched_urllib_parse + +from .block import Block, UnloadedBlock class BlockManager: @@ -360,7 +358,8 @@ def write_external_blocks(self, uri, pad_blocks=False): uri : str The base uri of the external blocks """ - from . 
import asdf + + import asdf for i, block in enumerate(self.external_blocks): if uri is None: @@ -755,7 +754,7 @@ def find_or_create_block_for_array(self, arr): ------- block : Block """ - from .tags.core import ndarray + from asdf.tags.core import ndarray if isinstance(arr, ndarray.NDArrayType) and arr.block is not None and arr.block in self.blocks: return arr.block @@ -839,608 +838,3 @@ def __getitem__(self, arr): def close(self): for block in self.blocks: block.close() - - -class Block: - """ - Represents a single block in a ASDF file. This is an - implementation detail and should not be instantiated directly. - Instead, should only be created through the `BlockManager`. - """ - - _header = util.BinaryStruct( - [ - ("flags", "I"), - ("compression", "4s"), - ("allocated_size", "Q"), - ("used_size", "Q"), - ("data_size", "Q"), - ("checksum", "16s"), - ], - ) - - def __init__(self, data=None, uri=None, array_storage="internal", memmap=True, lazy_load=True, data_callback=None): - self._data_callback = data_callback - if self._data_callback is not None and data is not None: - msg = "Block.__init__ cannot contain non-None data and a non-None data_callback" - raise ValueError(msg) - self._data = data - self._uri = uri - self._array_storage = array_storage - - self._fd = None - self._offset = None - self._input_compression = None - self._output_compression = "input" - self._output_compression_kwargs = {} - self._checksum = None - self._should_memmap = memmap - self._memmapped = False - self._lazy_load = lazy_load - - self.update_size() - self._allocated = self._size - - def __repr__(self): - return f"" - - def __len__(self): - return self._size - - @property - def offset(self): - return self._offset - - @offset.setter - def offset(self, offset): - self._offset = offset - - @property - def allocated(self): - return self._allocated - - @allocated.setter - def allocated(self, allocated): - self._allocated = allocated - - @property - def header_size(self): - return self._header.size + constants.BLOCK_HEADER_BOILERPLATE_SIZE - - @property - def data_offset(self): - return self._offset + self.header_size - - @property - def size(self): - return self._size + self.header_size - - @property - def end_offset(self): - """ - The offset of the end of the allocated space for the block, - and where the next block should begin. - """ - return self.offset + self.header_size + self.allocated - - @property - def array_storage(self): - return self._array_storage - - @property - def input_compression(self): - """ - The compression codec used to read the block. - """ - return self._input_compression - - @input_compression.setter - def input_compression(self, compression): - self._input_compression = mcompression.validate(compression) - - @property - def output_compression(self): - """ - The compression codec used to write the block. - :return: - """ - if self._output_compression == "input": - return self._input_compression - return self._output_compression - - @output_compression.setter - def output_compression(self, compression): - self._output_compression = mcompression.validate(compression) - - @property - def output_compression_kwargs(self): - """ - The configuration options to the Compressor constructor - used to write the block. 
- :return: - """ - return self._output_compression_kwargs - - @output_compression_kwargs.setter - def output_compression_kwargs(self, config): - if config is None: - config = {} - self._output_compression_kwargs = config.copy() - - @property - def checksum(self): - return self._checksum - - def _set_checksum(self, checksum): - if checksum == b"\0" * 16: - self._checksum = None - else: - self._checksum = checksum - - def _calculate_checksum(self, array): - # The following line is safe because we're only using - # the MD5 as a checksum. - m = hashlib.new("md5") # noqa: S324 - m.update(array) - return m.digest() - - def validate_checksum(self): - """ - Validate the content of the block against the current checksum. - - Returns - ------- - valid : bool - `True` if the content is valid against the current - checksum or there is no current checksum. Otherwise, - `False`. - """ - if self._checksum: - checksum = self._calculate_checksum(self._flattened_data) - if checksum != self._checksum: - return False - return True - - def update_checksum(self): - """ - Update the checksum based on the current data contents. - """ - self._checksum = self._calculate_checksum(self._flattened_data) - - def update_size(self): - """ - Recalculate the on-disk size of the block. This causes any - compression steps to run. It should only be called when - updating the file in-place, otherwise the work is redundant. - """ - if self._data is not None: - data = self._flattened_data - self._data_size = data.nbytes - - if not self.output_compression: - self._size = self._data_size - else: - self._size = mcompression.get_compressed_size( - data, - self.output_compression, - config=self.output_compression_kwargs, - ) - else: - self._data_size = self._size = 0 - - def read(self, fd, past_magic=False, validate_checksum=False): - """ - Read a Block from the given Python file-like object. - - If the file is seekable and lazy_load is True, the reading - or memmapping of the actual data is postponed until an array - requests it. If the file is a stream or lazy_load is False, - the data will be read into memory immediately. - - As Block is used for reading, writing, configuring and - managing data there are circumstances where read should - not be used. For instance, if a data_callback is defined - a call to read would override the data corresponding to a - block and conflict with the use of the data_callback. To - signify this conflict, a RuntimeError is raised if read - is called on a block with a defined data_callback. - - Parameters - ---------- - fd : GenericFile - - past_magic : bool, optional - If `True`, the file position is immediately after the - block magic token. If `False` (default), the file - position is exactly at the beginning of the block magic - token. - - validate_checksum : bool, optional - If `True`, validate the data against the checksum, and - raise a `ValueError` if the data doesn't match. - - Raises - ------ - - RuntimeError - Read was called on a block with a defined data_callback. - - ValueError - The read file contains invalid data. - """ - if self._data_callback is not None: - msg = "read called on a Block with a data_callback" - raise RuntimeError(msg) - offset = None - if fd.seekable(): - offset = fd.tell() - - if not past_magic: - buff = fd.read(len(constants.BLOCK_MAGIC)) - if len(buff) < 4: - return None - - if buff not in (constants.BLOCK_MAGIC, constants.INDEX_HEADER[: len(buff)]): - msg = ( - "Bad magic number in block. 
" - "This may indicate an internal inconsistency about the " - "sizes of the blocks in the file." - ) - raise ValueError(msg) - - if buff == constants.INDEX_HEADER[: len(buff)]: - return None - - elif offset is not None: - offset -= 4 - - buff = fd.read(2) - (header_size,) = struct.unpack(b">H", buff) - if header_size < self._header.size: - msg = f"Header size must be >= {self._header.size}" - raise ValueError(msg) - - buff = fd.read(header_size) - header = self._header.unpack(buff) - - # This is used by the documentation system, but nowhere else. - self._flags = header["flags"] - self._set_checksum(header["checksum"]) - - try: - self.input_compression = header["compression"] - except ValueError: - raise # TODO: hint extension? - - if self.input_compression is None and header["used_size"] != header["data_size"]: - msg = "used_size and data_size must be equal when no compression is used." - raise ValueError(msg) - - if header["flags"] & constants.BLOCK_FLAG_STREAMED and self.input_compression is not None: - msg = "Compression set on a streamed block." - raise ValueError(msg) - - if fd.seekable(): - # If the file is seekable, we can delay reading the actual - # data until later. - self._fd = fd - self._offset = offset - self._header_size = header_size - if header["flags"] & constants.BLOCK_FLAG_STREAMED: - # Support streaming blocks - self._array_storage = "streamed" - if self._lazy_load: - fd.fast_forward(-1) - self._data_size = self._size = self._allocated = (fd.tell() - self.data_offset) + 1 - else: - self._data = fd.read_into_array(-1) - self._data_size = self._size = self._allocated = len(self._data) - else: - self._allocated = header["allocated_size"] - self._size = header["used_size"] - self._data_size = header["data_size"] - if self._lazy_load: - fd.fast_forward(self._allocated) - else: - curpos = fd.tell() - self._memmap_data() - fd.seek(curpos) - if not self._memmapped: - self._data = self._read_data(fd, self._size, self._data_size) - fd.fast_forward(self._allocated - self._size) - else: - fd.fast_forward(self._allocated) - else: - # If the file is a stream, we need to get the data now. - if header["flags"] & constants.BLOCK_FLAG_STREAMED: - # Support streaming blocks - self._array_storage = "streamed" - self._data = fd.read_into_array(-1) - self._data_size = self._size = self._allocated = len(self._data) - else: - self._allocated = header["allocated_size"] - self._size = header["used_size"] - self._data_size = header["data_size"] - self._data = self._read_data(fd, self._size, self._data_size) - fd.fast_forward(self._allocated - self._size) - fd.close() - - if validate_checksum and not self.validate_checksum(): - msg = f"Block at {self._offset} does not match given checksum" - raise ValueError(msg) - - return self - - def _read_data(self, fd, used_size, data_size): - """ - Read the block data from a file. - """ - if not self.input_compression: - return fd.read_into_array(used_size) - - return mcompression.decompress(fd, used_size, data_size, self.input_compression) - - def _memmap_data(self): - """ - Memory map the block data from the file. - """ - memmap = self._fd.can_memmap() and not self.input_compression - if self._should_memmap and memmap: - self._data = self._fd.memmap_array(self.data_offset, self._size) - self._memmapped = True - - @property - def _flattened_data(self): - """ - Retrieve flattened data suitable for writing. - - Returns - ------- - np.ndarray - 1D contiguous array. 
- """ - data = self.data - - # 'K' order flattens the array in the order that elements - # occur in memory, except axes with negative strides which - # are reversed. That is a problem for base arrays with - # negative strides and is an outstanding bug in this library. - return data.ravel(order="K") - - def write(self, fd): - """ - Write an internal block to the given Python file-like object. - """ - self._header_size = self._header.size - - if self._data_callback is not None: - self._data = self._data_callback() - data = self._flattened_data - self.update_size() - self._data = None - self._allocated = self._size - else: - data = self._flattened_data if self._data is not None else None - - flags = 0 - data_size = used_size = allocated_size = 0 - if self._array_storage == "streamed": - flags |= constants.BLOCK_FLAG_STREAMED - elif data is not None: - self._checksum = self._calculate_checksum(data) - data_size = data.nbytes - if not fd.seekable() and self.output_compression: - buff = io.BytesIO() - mcompression.compress(buff, data, self.output_compression, config=self.output_compression_kwargs) - self.allocated = self._size = buff.tell() - allocated_size = self.allocated - used_size = self._size - self.input_compression = self.output_compression - - if allocated_size < used_size: - msg = f"Block used size {used_size} larger than allocated size {allocated_size}" - raise RuntimeError(msg) - - checksum = self.checksum if self.checksum is not None else b"\x00" * 16 - - fd.write(constants.BLOCK_MAGIC) - fd.write(struct.pack(b">H", self._header_size)) - fd.write( - self._header.pack( - flags=flags, - compression=mcompression.to_compression_header(self.output_compression), - allocated_size=allocated_size, - used_size=used_size, - data_size=data_size, - checksum=checksum, - ), - ) - - if data is not None: - if self.output_compression: - if not fd.seekable(): - fd.write(buff.getvalue()) - else: - # If the file is seekable, we write the - # compressed data directly to it, then go back - # and write the resulting size in the block - # header. - start = fd.tell() - mcompression.compress(fd, data, self.output_compression, config=self.output_compression_kwargs) - end = fd.tell() - self.allocated = self._size = end - start - fd.seek(self.offset + 6) - self._header.update(fd, allocated_size=self.allocated, used_size=self._size) - fd.seek(end) - else: - if used_size != data_size: - msg = f"Block used size {used_size} is not equal to the data size {data_size}" - raise RuntimeError(msg) - fd.write_array(data) - - @property - def data(self): - """ - Get the data for the block, as a numpy array. - """ - if self._data is not None: - return self._data - if self._data_callback is not None: - return self._data_callback() - if self._fd.is_closed(): - msg = "ASDF file has already been closed. Can not get the data." - raise OSError(msg) - - # Be nice and reset the file position after we're done - curpos = self._fd.tell() - try: - self._memmap_data() - if not self._memmapped: - self._fd.seek(self.data_offset) - self._data = self._read_data(self._fd, self._size, self._data_size) - finally: - self._fd.seek(curpos) - return self._data - - def close(self): - self._data = None - - def generate_read_data_callback(self): - """Used in SerializationContext.get_block_data_callback""" - - def callback(): - return self.data - - return callback - - -class UnloadedBlock: - """ - Represents an indexed, but not yet loaded, internal block. All - that is known about it is its offset. 
It converts itself to a - full-fledged block whenever the underlying data or more detail is - requested. - """ - - def __init__(self, fd, offset, memmap=True, lazy_load=True): - self._fd = fd - self._offset = offset - self._data = None - self._uri = None - self._array_storage = "internal" - self._input_compression = None - self._output_compression = "input" - self._output_compression_kwargs = {} - self._checksum = None - self._should_memmap = memmap - self._memmapped = False - self._lazy_load = lazy_load - self._data_callback = None - - def __len__(self): - self.load() - return len(self) - - def close(self): - pass - - @property - def array_storage(self): - return "internal" - - @property - def offset(self): - return self._offset - - def __getattr__(self, attr): - self.load() - return getattr(self, attr) - - def load(self): - self._fd.seek(self._offset, generic_io.SEEK_SET) - self.__class__ = Block - self.read(self._fd) - - -def calculate_updated_layout(blocks, tree_size, pad_blocks, block_size): - """ - Calculates a block layout that will try to use as many blocks as - possible in their original locations, though at this point the - algorithm is fairly naive. The result will be stored in the - offsets of the blocks. - - Parameters - ---------- - blocks : Blocks instance - - tree_size : int - The amount of space to reserve for the tree at the beginning. - - Returns - ------- - Returns `False` if no good layout can be found and one is best off - rewriting the file serially, otherwise, returns `True`. - """ - - def unfix_block(i): - # If this algorithm gets more sophisticated we could carefully - # move memmapped blocks around without clobbering other ones. - - # TODO: Copy to a tmpfile on disk and memmap it from there. - entry = fixed[i] - copy = entry.block.data.copy() - entry.block.close() - entry.block._data = copy - del fixed[i] - free.append(entry.block) - - def fix_block(block, offset): - block.offset = offset - fixed.append(Entry(block.offset, block.offset + block.size, block)) - fixed.sort() - - Entry = namedtuple("Entry", ["start", "end", "block"]) - - fixed = [] - free = [] - for block in blocks._internal_blocks: - if block.offset is not None: - block.update_size() - fixed.append(Entry(block.offset, block.offset + block.size, block)) - else: - free.append(block) - - if not len(fixed): - return False - - fixed.sort() - - # Make enough room at the beginning for the tree, by popping off - # blocks at the beginning - while len(fixed) and fixed[0].start < tree_size: - unfix_block(0) - - if not len(fixed): - return False - - # This algorithm is pretty basic at this point -- it just looks - # for the first open spot big enough for the free block to fit. 
- while len(free): - block = free.pop() - last_end = tree_size - for entry in fixed: - if entry.start - last_end >= block.size: - fix_block(block, last_end) - break - last_end = entry.end - else: - padding = util.calculate_padding(entry.block.size, pad_blocks, block_size) - fix_block(block, last_end + padding) - - if blocks.streamed_block is not None: - padding = util.calculate_padding(fixed[-1].block.size, pad_blocks, block_size) - blocks.streamed_block.offset = fixed[-1].end + padding - - blocks._sort_blocks_by_offset() - - return True diff --git a/asdf/_block/util.py b/asdf/_block/util.py new file mode 100644 index 000000000..7319e2751 --- /dev/null +++ b/asdf/_block/util.py @@ -0,0 +1,87 @@ +from collections import namedtuple + +from asdf import util + + +def calculate_updated_layout(blocks, tree_size, pad_blocks, block_size): + """ + Calculates a block layout that will try to use as many blocks as + possible in their original locations, though at this point the + algorithm is fairly naive. The result will be stored in the + offsets of the blocks. + + Parameters + ---------- + blocks : Blocks instance + + tree_size : int + The amount of space to reserve for the tree at the beginning. + + Returns + ------- + Returns `False` if no good layout can be found and one is best off + rewriting the file serially, otherwise, returns `True`. + """ + + def unfix_block(i): + # If this algorithm gets more sophisticated we could carefully + # move memmapped blocks around without clobbering other ones. + + # TODO: Copy to a tmpfile on disk and memmap it from there. + entry = fixed[i] + copy = entry.block.data.copy() + entry.block.close() + entry.block._data = copy + del fixed[i] + free.append(entry.block) + + def fix_block(block, offset): + block.offset = offset + fixed.append(Entry(block.offset, block.offset + block.size, block)) + fixed.sort() + + Entry = namedtuple("Entry", ["start", "end", "block"]) + + fixed = [] + free = [] + for block in blocks._internal_blocks: + if block.offset is not None: + block.update_size() + fixed.append(Entry(block.offset, block.offset + block.size, block)) + else: + free.append(block) + + if not len(fixed): + return False + + fixed.sort() + + # Make enough room at the beginning for the tree, by popping off + # blocks at the beginning + while len(fixed) and fixed[0].start < tree_size: + unfix_block(0) + + if not len(fixed): + return False + + # This algorithm is pretty basic at this point -- it just looks + # for the first open spot big enough for the free block to fit. 
+ while len(free): + block = free.pop() + last_end = tree_size + for entry in fixed: + if entry.start - last_end >= block.size: + fix_block(block, last_end) + break + last_end = entry.end + else: + padding = util.calculate_padding(entry.block.size, pad_blocks, block_size) + fix_block(block, last_end + padding) + + if blocks.streamed_block is not None: + padding = util.calculate_padding(fixed[-1].block.size, pad_blocks, block_size) + blocks.streamed_block.offset = fixed[-1].end + padding + + blocks._sort_blocks_by_offset() + + return True diff --git a/asdf/_tests/_helpers.py b/asdf/_tests/_helpers.py index cdf147155..a60808369 100644 --- a/asdf/_tests/_helpers.py +++ b/asdf/_tests/_helpers.py @@ -23,9 +23,9 @@ import asdf from asdf import generic_io, versioning +from asdf._block import Block from asdf._resolver import Resolver, ResolverChain from asdf.asdf import AsdfFile, get_asdf_library_info -from asdf.block import Block from asdf.constants import YAML_TAG_PREFIX from asdf.exceptions import AsdfConversionWarning, AsdfDeprecationWarning from asdf.extension import _legacy diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py index b95be394b..9c163ad6e 100644 --- a/asdf/_tests/test_array_blocks.py +++ b/asdf/_tests/test_array_blocks.py @@ -8,7 +8,8 @@ from numpy.testing import assert_array_equal import asdf -from asdf import block, constants, generic_io +from asdf import _block as block +from asdf import constants, generic_io RNG = np.random.default_rng(6) diff --git a/asdf/asdf.py b/asdf/asdf.py index 4ff592034..699a05769 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -11,7 +11,8 @@ from . import _display as display from . import _node_info as node_info from . import _version as version -from . import block, constants, generic_io, reference, schema, treeutil, util, versioning, yamlutil +from . import constants, generic_io, reference, schema, treeutil, util, versioning, yamlutil +from ._block import BlockManager, calculate_updated_layout from ._helpers import validate_version from .config import config_context, get_config from .exceptions import ( @@ -150,8 +151,7 @@ def __init__( self._fd = None self._closed = False self._external_asdf_by_uri = {} - self._blocks = block.BlockManager(self, copy_arrays=copy_arrays, lazy_load=lazy_load) - # set the uri here so validation can generate any required external blocks + self._blocks = BlockManager(self, copy_arrays=copy_arrays, lazy_load=lazy_load) self._uri = uri if tree is None: # Bypassing the tree property here, to avoid validating @@ -1132,7 +1132,7 @@ def update( serialized_tree_size = tree_serialized.tell() + constants.MAX_BLOCKS_DIGITS * n_internal_blocks - if not block.calculate_updated_layout(self._blocks, serialized_tree_size, pad_blocks, fd.block_size): + if not calculate_updated_layout(self._blocks, serialized_tree_size, pad_blocks, fd.block_size): # If we don't have any blocks that are being reused, just # write out in a serial fashion. 
self._serial_write(fd, pad_blocks, include_block_index) diff --git a/asdf/commands/edit.py b/asdf/commands/edit.py index 77c4c44f6..61fc28d12 100644 --- a/asdf/commands/edit.py +++ b/asdf/commands/edit.py @@ -16,8 +16,8 @@ import yaml from asdf import constants, generic_io, schema, util +from asdf._block import BlockManager from asdf.asdf import AsdfFile, open_asdf -from asdf.block import BlockManager from .main import Command From e100e6bd97d60118634d3c3128017ffac631f93a Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 10 Apr 2023 12:02:50 -0400 Subject: [PATCH 002/154] remove deprecated AsdfFile.blocks --- asdf/_tests/test_deprecated.py | 6 ------ asdf/asdf.py | 13 ------------- 2 files changed, 19 deletions(-) diff --git a/asdf/_tests/test_deprecated.py b/asdf/_tests/test_deprecated.py index d6ed90ce8..d51b78c04 100644 --- a/asdf/_tests/test_deprecated.py +++ b/asdf/_tests/test_deprecated.py @@ -29,9 +29,3 @@ def test_asdf_type_format_tag(): with pytest.warns(AsdfDeprecationWarning, match="asdf.types.format_tag is deprecated"): asdf._types.format_tag asdf.testing.helpers.format_tag - - -def test_blocks_deprecated(): - af = asdf.AsdfFile() - with pytest.warns(AsdfDeprecationWarning, match="The property AsdfFile.blocks has been deprecated"): - af.blocks diff --git a/asdf/asdf.py b/asdf/asdf.py index 699a05769..db62752d0 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -638,19 +638,6 @@ def make_reference(self, path=None): """ return reference.make_reference(self, [] if path is None else path) - @property - def blocks(self): - """ - Get the block manager associated with the `AsdfFile`. - """ - warnings.warn( - "The property AsdfFile.blocks has been deprecated and will be removed " - "in asdf-3.0. Public use of the block manager is strongly discouraged " - "as there is no stable API", - AsdfDeprecationWarning, - ) - return self._blocks - def set_array_storage(self, arr, array_storage): """ Set the block type to use for the given array data. 
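For downstream code that previously reached for the deprecated ``AsdfFile.blocks`` property removed above, the public ``set_array_storage`` method (visible in the trailing context of that hunk) covers the common case of choosing how an array's block is stored, without touching the block manager directly. A minimal sketch; the array contents, storage choice, and file name are illustrative placeholders, not taken from the patch:

    import numpy as np
    import asdf

    arr = np.arange(100)                      # example data, not from the patch
    af = asdf.AsdfFile({"data": arr})
    # select the storage for the block backing ``arr`` via the public API
    # instead of the removed ``af.blocks`` block manager
    af.set_array_storage(arr, "inline")
    af.write_to("example.asdf")

The same call accepts the other storage types named in this series ("internal", "external"), so callers should not need the private block manager for this purpose.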
From e80729367025ed2e51102a9c7ab6b1311332f8b6 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 11 Apr 2023 10:08:05 -0400 Subject: [PATCH 003/154] add components to break apart block manager --- asdf/_block/io.py | 172 +++++++++++++++++++++ asdf/_block/key.py | 33 ++++ asdf/_block/options.py | 57 +++++++ asdf/_block/store.py | 81 ++++++++++ asdf/_tests/_block/__init__.py | 0 asdf/_tests/_block/test_io.py | 233 +++++++++++++++++++++++++++++ asdf/_tests/_block/test_key.py | 56 +++++++ asdf/_tests/_block/test_options.py | 96 ++++++++++++ asdf/_tests/_block/test_store.py | 135 +++++++++++++++++ asdf/compression.py | 2 +- 10 files changed, 864 insertions(+), 1 deletion(-) create mode 100644 asdf/_block/io.py create mode 100644 asdf/_block/key.py create mode 100644 asdf/_block/options.py create mode 100644 asdf/_block/store.py create mode 100644 asdf/_tests/_block/__init__.py create mode 100644 asdf/_tests/_block/test_io.py create mode 100644 asdf/_tests/_block/test_key.py create mode 100644 asdf/_tests/_block/test_options.py create mode 100644 asdf/_tests/_block/test_store.py diff --git a/asdf/_block/io.py b/asdf/_block/io.py new file mode 100644 index 000000000..06a69b10c --- /dev/null +++ b/asdf/_block/io.py @@ -0,0 +1,172 @@ +import hashlib +import io +import struct +import weakref + +from asdf import compression as mcompression +from asdf import constants, util + +BLOCK_HEADER = util.BinaryStruct( + [ + ("flags", "I"), + ("compression", "4s"), + ("allocated_size", "Q"), + ("used_size", "Q"), + ("data_size", "Q"), + ("checksum", "16s"), + ], +) + + +def calculate_block_checksum(data): + if data.ndim > 1: + data = data.ravel(order="K") + # The following line is safe because we're only using + # the MD5 as a checksum. + m = hashlib.new("md5") # noqa: S324 + m.update(data) + return m.digest() + + +def validate_block_header(header): + compression = mcompression.validate(header["compression"]) + if header["flags"] & constants.BLOCK_FLAG_STREAMED: + if compression is not None: + msg = "Compression set on a streamed block." + raise ValueError(msg) + else: + if compression is None and header["used_size"] != header["data_size"]: + msg = "used_size and data_size must be equal when no compression is used." 
+ raise ValueError(msg) + return header + + +def read_block_header(fd, offset=None): + if offset is not None: + fd.seek(offset) + + # read the header size + buff = fd.read(2) + header_size = struct.unpack(b">H", buff)[0] + if header_size < BLOCK_HEADER.size: + msg = f"Header size must be >= {BLOCK_HEADER.size}" + raise ValueError(msg) + + header = BLOCK_HEADER.unpack(fd.read(header_size)) + return validate_block_header(header) + + +def read_block_data(fd, header, offset=None, memmap=False): + if fd.seekable(): + if offset is not None: + fd.seek(offset) + else: + offset = fd.tell() + + # load and possibly decompress the data + # read the raw bytes + if header["flags"] & constants.BLOCK_FLAG_STREAMED: + used_size = -1 + else: + used_size = header["used_size"] + + # if no compression, just read data + compression = mcompression.validate(header["compression"]) + if compression: + # the old code ignored memmapping for compressed data + data = mcompression.decompress(fd, used_size, header["data_size"], compression) + else: + if memmap and fd.can_memmap(): + data = fd.memmap_array(offset, used_size) + fd.fast_forward(header["allocated_size"]) + else: + data = fd.read_into_array(used_size) + fd.fast_forward(header["allocated_size"] - header["used_size"]) + return data + + +def read_block(fd, offset=None, memmap=False, lazy_load=False): + # expects the fd or offset is past the block magic + if offset is None and fd.seekable(): + offset = fd.tell() + header = read_block_header(fd, offset) + data_offset = fd.tell() + if lazy_load: + # setup a callback to later load the data + fd_ref = weakref.ref(fd) + + def callback(): + fd = fd_ref() + if fd is None or fd.is_closed(): + msg = "Attempt to read data from closed file" + raise OSError(msg) + return read_block_data(fd, header, offset=data_offset, memmap=memmap) + + data = callback + else: + data = read_block_data(fd, header, offset=None, memmap=memmap) + return offset, header, data_offset, data + + +def validate_write_data(data): + if data.ndim != 1 or data.dtype != "uint8": + msg = "Data must be of ndim==1 and dtype==uint8" + raise ValueError(msg) + + +def generate_write_header(fd, data, stream=False, compression_kwargs=None, padding=False, **header_kwargs): + validate_write_data(data) + if stream: + header_kwargs["flags"] = header_kwargs.get("flags", 0) | constants.BLOCK_FLAG_STREAMED + header_kwargs["data_size"] = 0 + header_kwargs["checksum"] = b"\0" * 16 + else: + header_kwargs["data_size"] = data.nbytes + header_kwargs["checksum"] = calculate_block_checksum(data) + + header_kwargs["compression"] = mcompression.to_compression_header(header_kwargs.get("compression", None)) + + if header_kwargs["compression"] == b"\0\0\0\0": + used_size = header_kwargs["data_size"] + buff = None + else: + buff = io.BytesIO() + mcompression.compress(buff, data, header_kwargs["compression"], config=compression_kwargs) + used_size = buff.tell() + if stream: + header_kwargs["used_size"] = 0 + header_kwargs["allocated_size"] = 0 + else: + header_kwargs["used_size"] = used_size + padding = util.calculate_padding(used_size, padding, fd.block_size) + header_kwargs["allocated_size"] = header_kwargs.get("allocated_size", used_size + padding) + + if header_kwargs["allocated_size"] < header_kwargs["used_size"]: + msg = ( + f"Block used size {header_kwargs['used_size']} larger than " + f"allocated size {header_kwargs['allocated_size']}", + ) + raise RuntimeError(msg) + header = BLOCK_HEADER.pack(**header_kwargs) + padding_bytes = header_kwargs["allocated_size"] - 
header_kwargs["used_size"] + return header, buff, padding_bytes + + +def write_block(fd, data, offset=None, stream=False, compression_kwargs=None, padding=False, **header_kwargs): + # TODO fd is only used for padding calculation, bring this out + header, buff, padding_bytes = generate_write_header(fd, data, stream, compression_kwargs, padding, **header_kwargs) + + if offset is not None: + if fd.seekable(): + fd.seek(offset) + else: + msg = "write_block received offset for non-seekable file" + raise ValueError(msg) + fd.write(struct.pack(b">H", len(header))) + fd.write(header) + if buff is None: # data is uncompressed + fd.write_array(data) + else: + fd.write(buff.getvalue()) + fd.fast_forward(padding_bytes) + return BLOCK_HEADER.unpack(header) diff --git a/asdf/_block/key.py b/asdf/_block/key.py new file mode 100644 index 000000000..5615d7ea6 --- /dev/null +++ b/asdf/_block/key.py @@ -0,0 +1,33 @@ +import weakref + + +class Key: + _next = 0 + + @classmethod + def _next_key(cls): + key = cls._next + cls._next += 1 + return key + + def __init__(self, obj, key=None): + if key is None: + key = Key._next_key() + self._key = key + self._ref = weakref.ref(obj) + + def is_valid(self): + r = self._ref() + if r is None: + return False + del r + return True + + def __hash__(self): + return self._key + + def matches(self, obj): + r = self._ref() + if r is None: + return False + return r is obj diff --git a/asdf/_block/options.py b/asdf/_block/options.py new file mode 100644 index 000000000..a3e16af7c --- /dev/null +++ b/asdf/_block/options.py @@ -0,0 +1,57 @@ +from asdf import compression as mcompression + + +class Options: + def __init__(self, storage_type, compression_type=None, compression_kwargs=None): + self._storage_type = None + self._compression = None + self._compression_kwargs = None + + # set via setters + # set kwargs first to avoid overwrite when compression type changes + self.compression_kwargs = compression_kwargs + self.compression = compression_type + # set storage type last to possibly overwrite compression/compression_kwargs + self.storage_type = storage_type + + @property + def storage_type(self): + return self._storage_type + + @storage_type.setter + def storage_type(self, storage_type): + if storage_type not in ["internal", "external", "streamed", "inline"]: + msg = "array_storage must be one of 'internal', 'external', 'streamed' or 'inline'" + raise ValueError(msg) + self._storage_type = storage_type + + @property + def compression(self): + return self._compression + + @compression.setter + def compression(self, compression): + msg = f"Invalid compression {compression}" + if compression == "input": + # "input" compression will validate as the ASDF compression module made + # some assumptions about availability of information (that the input block + # is known). The Options here do not have the same assumption. 
+ raise ValueError(msg) + try: + compression = mcompression.validate(compression) + except ValueError: + raise ValueError(msg) + self._compression = compression + + @property + def compression_kwargs(self): + return self._compression_kwargs + + @compression_kwargs.setter + def compression_kwargs(self, kwargs): + if not kwargs: + kwargs = {} + self._compression_kwargs = kwargs + + def __copy__(self): + return type(self)(self._storage_type, self._compression, self._compression_kwargs) diff --git a/asdf/_block/store.py b/asdf/_block/store.py new file mode 100644 index 000000000..afe6468f2 --- /dev/null +++ b/asdf/_block/store.py @@ -0,0 +1,81 @@ +from .key import Key + + +class Store: + def __init__(self): + # store contains 2 layers of lookup: id(obj), Key + self._by_id = {} + + def get(self, obj, default=None): + if isinstance(obj, Key): + obj_id = id(obj._ref()) + obj_key = obj + else: + obj_id = id(obj) + obj_key = None + + # if id is unknown, return default + if obj_id not in self._by_id: + return default + + # first, lookup by id: O(1) + by_key = self._by_id[obj_id] + + # if we have a key, use it + if obj_key: + return by_key.get(obj_key, default) + + # look for a matching key: O(N) + for key, value in by_key.items(): + if key.matches(obj): + return value + + # no match, return default + return default + + def set(self, obj, value): + if isinstance(obj, Key): + obj_id = id(obj._ref()) + obj_key = obj + else: + obj_id = id(obj) + obj_key = None + + # if the id is unknown, just set it + if obj_id not in self._by_id: + if obj_key is None: + obj_key = Key(obj) + self._by_id[obj_id] = {obj_key: value} + return + + # if id is known + by_key = self._by_id[obj_id] + + # look for a matching matching key + if obj_key is None: + for key in by_key: + if key.matches(obj): + by_key[key] = value + return + # we didn't find a matching key, so make one + obj_key = Key(obj) + else: + # we already have a key, check if it's already in the store + if obj_key in by_key: + by_key[obj_key] = value + return + + # if no match was found, add using the key + self._by_id[obj_id][obj_key] = value + + def _cleanup(self, object_id=None): + if object_id is None: + for oid in set(self._by_id): + self._cleanup(oid) + return + by_key = self._by_id[object_id] + keys_to_remove = [k for k in by_key if not k.is_valid()] + for key in keys_to_remove: + del by_key[key] + if not len(by_key): + del self._by_id[object_id] diff --git a/asdf/_tests/_block/__init__.py b/asdf/_tests/_block/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/asdf/_tests/_block/test_io.py b/asdf/_tests/_block/test_io.py new file mode 100644 index 000000000..009b7ac5b --- /dev/null +++ b/asdf/_tests/_block/test_io.py @@ -0,0 +1,233 @@ +import io +import mmap + +import numpy as np +import pytest + +from asdf import constants, generic_io +from asdf._block import io as bio + + +def test_checksum(tmp_path): + my_array = np.arange(0, 64, dtype=" allocated_size + data = np.ones(30, dtype="uint8") + raw_fd = io.BytesIO() + fd = generic_io.get_file(raw_fd, mode="rw") + with pytest.raises(RuntimeError, match="Block used size.*"): + bio.write_block(fd, data, allocated_size=0) + assert fd.tell() == 0 + + +def test_fd_not_seekable(): + data = np.ones(30, dtype="uint8") + raw_fd = io.BytesIO() + fd = generic_io.get_file(raw_fd, mode="rw") + bio.write_block(fd, data) + + raw_fd.seek(0) + fd = generic_io.get_file(raw_fd, mode="rw") + + seekable = lambda: False # noqa: E731 + fd.seekable = seekable + + _, _, _, d = bio.read_block(fd) + + 
np.testing.assert_array_equal(d, data) + + with pytest.raises(ValueError, match="write_block received offset.*"): + bio.write_block(fd, data, offset=0) + + +def test_compressed_block(): + data = np.ones(30, dtype="uint8") + fd = generic_io.get_file(io.BytesIO(), mode="rw") + write_header = bio.write_block(fd, data, compression="zlib") + assert write_header["compression"] == b"zlib" + _, _, _, rdata = bio.read_block(fd, offset=0) + np.testing.assert_array_equal(rdata, data) + + +def test_stream_block(): + data = np.ones(10, dtype="uint8") + fd = generic_io.get_file(io.BytesIO(), mode="rw") + write_header = bio.write_block(fd, data, stream=True) + assert write_header["flags"] & constants.BLOCK_FLAG_STREAMED + # now write extra data to file + extra_data = np.ones(10, dtype="uint8") + fd.write_array(extra_data) + _, _, _, rdata = bio.read_block(fd, offset=0) + assert rdata.size == 20 + assert np.all(rdata == 1) + + +def test_read_from_closed(tmp_path): + fn = tmp_path / "test.blk" + data = np.ones(10, dtype="uint8") + with generic_io.get_file(fn, mode="w") as fd: + bio.write_block(fd, data, stream=True) + with generic_io.get_file(fn, mode="rw") as fd: + _, _, _, callback = bio.read_block(fd, offset=0, lazy_load=True) + with pytest.raises(OSError, match="Attempt to read data from closed file"): + callback() + + +@pytest.mark.parametrize("data", [np.ones(10, dtype="f4"), np.ones((3, 3), dtype="uint8")]) +def test_invalid_data(data): + fd = generic_io.get_file(io.BytesIO(), mode="rw") + with pytest.raises(ValueError, match="Data must be of.*"): + bio.write_block(fd, data, stream=True) diff --git a/asdf/_tests/_block/test_key.py b/asdf/_tests/_block/test_key.py new file mode 100644 index 000000000..2c9934252 --- /dev/null +++ b/asdf/_tests/_block/test_key.py @@ -0,0 +1,56 @@ +from asdf._block.key import Key + + +# a blank class for testing +class Foo: + pass + + +def test_unique_per_object(): + seen = set() + for _i in range(10): + bk = Key(Foo()) + assert bk not in seen + seen.add(bk) + + +def test_unique_same_object(): + seen = set() + f = Foo() + for _i in range(10): + bk = Key(f) + assert bk not in seen + seen.add(bk) + + +def test_matches_obj(): + f = Foo() + bk = Key(f) + assert bk.matches(f) + + +def test_is_valid(): + f = Foo() + bk = Key(f) + assert bk.is_valid() + del f + assert not bk.is_valid() + + +def test_memory_reuse(): + f = Foo() + bk = Key(f) + fid = id(f) + del f + objs = [] + for _ in range(100): + f = Foo() + objs.append(f) + if fid == id(f): + break + else: + raise AssertionError("Failed to find reused memory address") + + assert fid == id(f) + assert not bk.is_valid() + assert not bk.matches(f) diff --git a/asdf/_tests/_block/test_options.py b/asdf/_tests/_block/test_options.py new file mode 100644 index 000000000..f98bf0930 --- /dev/null +++ b/asdf/_tests/_block/test_options.py @@ -0,0 +1,96 @@ +import copy + +import pytest + +from asdf._block.options import Options + +valid_storage_types = ["internal", "external", "streamed", "inline"] +valid_compression_types = [None, "zlib", "bzp2", "lz4", ""] + +invalid_storage_types = ["foo", None] +invalid_compression_types = ["input", "foo"] + + +@pytest.mark.parametrize("storage", valid_storage_types) +def test_set_storage_init(storage): + o = Options(storage) + assert o.storage_type == storage + + +@pytest.mark.parametrize("storage", valid_storage_types) +def test_set_storage_attr(storage): + # start with a different storage type + o = Options("internal" if storage == "external" else "external") + o.storage_type = storage + assert 
o.storage_type == storage + + +@pytest.mark.parametrize("compression", valid_compression_types) +def test_set_compression_attr(compression): + o = Options("internal") + o.compression = compression + # allow "" to become None, both are falsey + assert o.compression == compression if compression else not o.compression + + +@pytest.mark.parametrize("compression", valid_compression_types) +def test_set_compression_init(compression): + o = Options("internal", compression) + # allow "" to become None, both are falsey + assert o.compression == compression if compression else not o.compression + + +def test_set_compression_kwargs_attr(): + o = Options("internal") + o.compression_kwargs = {"foo": 1} + assert o.compression_kwargs == {"foo": 1} + + +def test_set_compression_kwargs_init(): + o = Options("internal", compression_kwargs={"foo": 1}) + assert o.compression_kwargs == {"foo": 1} + + +def test_default_compression(): + o = Options("internal") + assert o.compression is None + + +@pytest.mark.parametrize("invalid_storage", invalid_storage_types) +def test_invalid_storage_type_init(invalid_storage): + with pytest.raises(ValueError, match="array_storage must be one of.*"): + Options(invalid_storage) + + +@pytest.mark.parametrize("invalid_storage", invalid_storage_types) +def test_invalid_storage_attr(invalid_storage): + o = Options("internal") + with pytest.raises(ValueError, match="array_storage must be one of.*"): + o.storage_type = invalid_storage + + +@pytest.mark.parametrize("invalid_compression", invalid_compression_types) +def test_invalid_compression_attr(invalid_compression): + o = Options("internal") + with pytest.raises(ValueError, match="Invalid compression.*"): + o.compression = invalid_compression + + +@pytest.mark.parametrize("invalid_compression", invalid_compression_types) +def test_invalid_compression_init(invalid_compression): + with pytest.raises(ValueError, match="Invalid compression.*"): + Options("internal", invalid_compression) + + +@pytest.mark.parametrize("storage", valid_storage_types) +@pytest.mark.parametrize("compression", valid_compression_types) +@pytest.mark.parametrize("compression_kwargs", [None, {"foo": 1}]) +def test_copy(storage, compression, compression_kwargs): + o = Options(storage, compression, compression_kwargs) + o2 = copy.copy(o) + assert o2 is not o + assert o2.storage_type == storage + # allow "" to become None, both are falsey + assert o2.compression == compression if compression else not o2.compression + # allow None to become {}, both are falsey + assert o2.compression_kwargs == compression_kwargs if compression_kwargs else not o2.compression_kwargs diff --git a/asdf/_tests/_block/test_store.py b/asdf/_tests/_block/test_store.py new file mode 100644 index 000000000..b2cf2de42 --- /dev/null +++ b/asdf/_tests/_block/test_store.py @@ -0,0 +1,135 @@ +from asdf._block.key import Key +from asdf._block.store import Store + + +# a blank class for testing +class Foo: + pass + + +def test_store_by_obj(): + f = Foo() + v = 42 + s = Store() + s.set(f, v) + assert s.get(f) == v + + +def test_get_missing_by_obj(): + f = Foo() + s = Store() + assert s.get(f) is None + + +def test_store_by_key(): + f = Foo() + v = 42 + s = Store() + k = Key(f) + s.set(k, v) + assert s.get(k) == v + + +def test_get_by_key(): + f = Foo() + v = 42 + s = Store() + k = Key(f) + s.set(k, v) + assert s.get(f) == v + + +def test_get_missing_key(): + f = Foo() + s = Store() + k = Key(f) + assert s.get(k) is None + + +def test_get_missing_key_same_obj(): + f = Foo() + v = 42 + s = Store() 
+ k = Key(f) + s.set(k, v) + k2 = Key(f) + assert s.get(k2) is None + + +def test_get_existing_default(): + f = Foo() + v = 42 + s = Store() + s.set(f, v) + assert s.get(f, 26) == v + + +def test_get_missing_default(): + f = Foo() + v = 42 + s = Store() + assert s.get(f, v) == v + + +def test_set_same_object(): + f = Foo() + v = 42 + s = Store() + s.set(f, 26) + s.set(f, v) + assert s.get(f) == v + + +def test_set_same_key(): + f = Foo() + s = Store() + k = Key(f) + v = 42 + s.set(k, 26) + s.set(k, v) + assert s.get(k) == v + + +def test_get_memory_reused(): + f = Foo() + s = Store() + v = 42 + s.set(f, v) + fid = id(f) + del f + for _ in range(100): + f = Foo() + if id(f) == fid: + break + else: + raise AssertionError("Failed to trigger memory reuse") + assert s.get(f) is None + + +def test_set_memory_reused(): + f = Foo() + s = Store() + v = 42 + s.set(f, v) + fid = id(f) + del f + for _ in range(100): + f = Foo() + if id(f) == fid: + break + else: + raise AssertionError("Failed to trigger memory reuse") + nv = 26 + s.set(f, nv) + assert s.get(f) is nv + + +def test_cleanup(): + f = Foo() + s = Store() + k = Key(f) + s.set(s, 42) + s.set(k, 26) + del f + s._cleanup() + assert s.get(k, None) is None diff --git a/asdf/compression.py b/asdf/compression.py index 8c0e2741d..89076e313 100644 --- a/asdf/compression.py +++ b/asdf/compression.py @@ -227,7 +227,7 @@ def to_compression_header(compression): header. """ if not compression: - return b"" + return b"\0\0\0\0" if isinstance(compression, str): return compression.encode("ascii") From 71ac3e7db6516585e7ec2078a0e94a44db33c984 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 12 Apr 2023 10:24:34 -0400 Subject: [PATCH 004/154] add read/write block index --- asdf/_block/io.py | 71 +++++++++++++++++++- asdf/_tests/_block/test_io.py | 118 ++++++++++++++++++++++++++++++++++ 2 files changed, 188 insertions(+), 1 deletion(-) diff --git a/asdf/_block/io.py b/asdf/_block/io.py index 06a69b10c..d29979098 100644 --- a/asdf/_block/io.py +++ b/asdf/_block/io.py @@ -1,8 +1,11 @@ import hashlib import io +import os import struct import weakref +import yaml + from asdf import compression as mcompression from asdf import constants, util @@ -91,7 +94,7 @@ def read_block(fd, offset=None, memmap=False, lazy_load=False): offset = fd.tell() header = read_block_header(fd, offset) data_offset = fd.tell() - if lazy_load: + if lazy_load and fd.seekable(): # setup a callback to later load the data fd_ref = weakref.ref(fd) @@ -170,3 +173,69 @@ def write_block(fd, data, offset=None, stream=False, compression_kwargs=None, pa fd.write(buff.getvalue()) fd.fast_forward(padding_bytes) return BLOCK_HEADER.unpack(header) + + +def candidate_offsets(min_offset, max_offset, block_size): + offset = (max_offset // block_size) * block_size + if offset == max_offset: + # don't include the max_offset + offset -= block_size + while offset > min_offset: + yield offset + offset -= block_size + if offset <= min_offset: + yield min_offset + + +def find_block_index(fd, min_offset=None, max_offset=None): + if min_offset is None: + min_offset = fd.tell() + if max_offset is None: + fd.seek(0, os.SEEK_END) + max_offset = fd.tell() + block_size = fd.block_size + block_index_offset = None + buff = b"" + pattern = constants.INDEX_HEADER + for offset in candidate_offsets(min_offset, max_offset, block_size): + fd.seek(offset) + buff = fd.read(block_size) + buff + index = buff.find(pattern) + if index != -1: + block_index_offset = offset + index + if block_index_offset >= max_offset: + return None + 
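# Editorial aside, not part of the patch: candidate_offsets steps backward from
# max_offset in block_size-aligned strides and always ends on min_offset, e.g.
# list(candidate_offsets(0, 10, 3)) == [9, 6, 3, 0] (see test_candidate_offsets
# below).  find_block_index reads one block_size chunk at each candidate offset,
# prepends it to the running buffer, and searches for constants.INDEX_HEADER,
# so the block index can be located near the end of the file without scanning
# every byte.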
break + buff = buff[: len(pattern)] + if block_index_offset is not None and block_index_offset < max_offset: + return block_index_offset + return None + + +def read_block_index(fd, offset=None): + if offset is not None: + fd.seek(offset) + buff = fd.read(len(constants.INDEX_HEADER)) + if buff != constants.INDEX_HEADER: + msg = "Failed to read block index header at offset {offset}" + raise OSError(msg) + return yaml.load(fd.read(-1), yaml.SafeLoader) + + +def write_block_index(fd, offsets, offset=None, yaml_version=None): + if yaml_version is None: + yaml_version = (1, 1) + if offset is not None: + fd.seek(offset) + fd.write(constants.INDEX_HEADER) + fd.write(b"\n") + yaml.dump( + offsets, + stream=fd, + Dumper=yaml.SafeDumper, + explicit_start=True, + explicit_end=True, + allow_unicode=True, + encoding="utf-8", + version=yaml_version, + ) diff --git a/asdf/_tests/_block/test_io.py b/asdf/_tests/_block/test_io.py index 009b7ac5b..288dc6b69 100644 --- a/asdf/_tests/_block/test_io.py +++ b/asdf/_tests/_block/test_io.py @@ -231,3 +231,121 @@ def test_invalid_data(data): fd = generic_io.get_file(io.BytesIO(), mode="rw") with pytest.raises(ValueError, match="Data must be of.*"): bio.write_block(fd, data, stream=True) + + +@pytest.mark.parametrize( + "options", + [ + (0, 10, 5, [5, 0]), + (0, 10, 3, [9, 6, 3, 0]), + (0, 10, 10, [0]), + (0, 10, 6, [6, 0]), + (0, 10, 11, [0]), + (0, 10, 4096, [0]), + ], +) +def test_candidate_offsets(options): + min_offset, max_offset, size, targets = options + for offset, target in zip(bio.candidate_offsets(min_offset, max_offset, size), targets): + assert offset == target + + +def generate_block_index_file(fn, values=None, offset=0): + if values is None: + values = [1, 2, 3] + with generic_io.get_file(fn, "w") as f: + f.write(b"\0" * offset) + bio.write_block_index(f, values) + + +def test_find_block_index(tmp_path): + offset = 42 + fn = tmp_path / "test" + generate_block_index_file(fn, offset=offset) + with generic_io.get_file(fn, "r") as fd: + assert bio.find_block_index(fd) == offset + + +def test_find_block_index_on_boundry(tmp_path): + fn = tmp_path / "test" + with generic_io.get_file(fn, "w") as fd: + block_size = fd.block_size + # put pattern across a block boundary + offset = block_size - (len(constants.INDEX_HEADER) // 2) + generate_block_index_file(fn, offset=offset) + with generic_io.get_file(fn, "r") as fd: + assert bio.find_block_index(fd) == offset + + +def test_missing_block_index(tmp_path): + fn = tmp_path / "test" + with open(fn, "w") as f: + f.write("\0" * 4096) + with generic_io.get_file(fn, "r") as fd: + assert bio.find_block_index(fd) is None + + +def test_less_than_min_offset_block_index(tmp_path): + fn = tmp_path / "test" + offset = 26 + min_offset = 42 + generate_block_index_file(fn, offset=offset) + with generic_io.get_file(fn, "r") as fd: + assert bio.find_block_index(fd, min_offset) is None + + +def test_greater_than_max_offset_block_index(tmp_path): + fn = tmp_path / "test" + offset = 72 + max_offset = 42 + generate_block_index_file(fn, offset=offset) + with generic_io.get_file(fn, "r") as fd: + assert bio.find_block_index(fd, 0, max_offset) is None + + +def test_read_block_index(tmp_path): + fn = tmp_path / "test" + values = [1, 2, 3] + generate_block_index_file(fn, values=values, offset=0) + with generic_io.get_file(fn, "r") as fd: + assert bio.read_block_index(fd) == values + + +def test_read_block_index_with_offset(tmp_path): + fn = tmp_path / "test" + values = [1, 2, 3] + offset = 42 + generate_block_index_file(fn, values=values, 
offset=offset) + with generic_io.get_file(fn, "r") as fd: + assert bio.read_block_index(fd, offset) == values + + +def test_read_block_index_pre_seek(tmp_path): + fn = tmp_path / "test" + values = [1, 2, 3] + offset = 42 + generate_block_index_file(fn, values=values, offset=offset) + with generic_io.get_file(fn, "r") as fd: + fd.seek(offset) + assert bio.read_block_index(fd) == values + + +def test_read_block_index_no_header(tmp_path): + fn = tmp_path / "test" + values = [1, 2, 3] + generate_block_index_file(fn, values=values, offset=0) + with generic_io.get_file(fn, "r") as fd: + fd.seek(len(constants.INDEX_HEADER)) + with pytest.raises(OSError, match="Failed to read block index.*"): + assert bio.read_block_index(fd) == values + + +def test_write_block_index_with_offset(tmp_path): + fn = tmp_path / "test" + offset = 50 + with generic_io.get_file(fn, "w") as fd: + fd.write(b"\0" * 100) + fd.seek(0) + bio.write_block_index(fd, [1, 2, 3], offset=offset) + with generic_io.get_file(fn, "r") as fd: + assert bio.find_block_index(fd) == offset From c1768bd11a2841b2009f7b96099e98fba5a08ea1 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 13 Apr 2023 12:11:51 -0400 Subject: [PATCH 005/154] add block reader/writer --- asdf/_block/io.py | 4 + asdf/_block/reader.py | 117 ++++++++++++++++++++++++++ asdf/_block/writer.py | 40 +++++++++ asdf/_tests/_block/test_io.py | 4 + asdf/_tests/_block/test_reader.py | 131 ++++++++++++++++++++++++++++++ asdf/_tests/_block/test_writer.py | 50 ++++++++++++ 6 files changed, 346 insertions(+) create mode 100644 asdf/_block/reader.py create mode 100644 asdf/_block/writer.py create mode 100644 asdf/_tests/_block/test_reader.py create mode 100644 asdf/_tests/_block/test_writer.py diff --git a/asdf/_block/io.py b/asdf/_block/io.py index d29979098..e460a3d6b 100644 --- a/asdf/_block/io.py +++ b/asdf/_block/io.py @@ -78,6 +78,7 @@ def read_block_data(fd, header, offset=None, memmap=False): if compression: # the old code ignored memmapping for compressed data data = mcompression.decompress(fd, used_size, header["data_size"], compression) + fd.fast_forward(header["allocated_size"] - header["used_size"]) else: if memmap and fd.can_memmap(): data = fd.memmap_array(offset, used_size) @@ -85,6 +86,8 @@ def read_block_data(fd, header, offset=None, memmap=False): else: data = fd.read_into_array(used_size) fd.fast_forward(header["allocated_size"] - header["used_size"]) + if (header["flags"] & constants.BLOCK_FLAG_STREAMED) and fd.seekable(): + fd.seek(0, os.SEEK_END) return data @@ -106,6 +109,7 @@ def callback(): return read_block_data(fd, header, offset=data_offset, memmap=memmap) data = callback + fd.fast_forward(header["allocated_size"]) else: data = read_block_data(fd, header, offset=None, memmap=memmap) return offset, header, data_offset, data diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py new file mode 100644 index 000000000..ecc5beeb2 --- /dev/null +++ b/asdf/_block/reader.py @@ -0,0 +1,117 @@ +import weakref + +from asdf import constants + +from . import io as bio + + +class ReadBlock: + def __init__(self, offset, fd, memmap, lazy_load, header=None, data_offset=None, data=None): + self.offset = offset + self._fd = weakref.ref(fd) + self.header = header + self.data_offset = data_offset + self._data = data + # TODO alternative to passing these down? 
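# Editorial note, not part of the patch: memmap and lazy_load are kept on the
# instance only so they can be forwarded to bio.read_block when load() (also
# used by reset()) re-reads this block, and the weakref to fd above means a
# ReadBlock does not by itself keep the underlying file handle open.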
+ self.memmap = memmap + self.lazy_load = lazy_load + if not lazy_load: + self.load() + + @property + def loaded(self): + return self._data is not None + + def load(self): + if self.loaded: + return + fd = self._fd() + if fd is None or fd.is_closed(): + raise OSError("Attempt to load block from closed file") + _, self.header, self.data_offset, self._data = bio.read_block( + fd, offset=self.offset, memmap=self.memmap, lazy_load=self.lazy_load + ) + + @property + def data(self): + if not self.loaded: + self.load() + if callable(self._data): + return self._data() + return self._data + + def reset(self, fd, offset): + self._fd = weakref.ref(fd) + self.offset = offset + self.header = None + self.data_offset = None + self._data = None + if not self.lazy_load: + self.load() + + +def read_blocks_serially(fd, memmap=False, lazy_load=False): + blocks = [] + buff = b"" + while True: + # the expectation is that this will begin PRIOR to the block magic + # read 4 bytes + buff += fd.read(4 - len(buff)) + if len(buff) < 4: + # we are done, there are no more blocks and no index + # TODO error? we shouldn't have extra bytes, the old code allows this + break + + if buff == constants.INDEX_HEADER[:4]: + # we hit the block index, which is not useful here + break + + if buff == constants.BLOCK_MAGIC: + # this is another block + offset, header, data_offset, data = bio.read_block(fd, memmap=memmap, lazy_load=lazy_load) + blocks.append(ReadBlock(offset, fd, memmap, lazy_load, header=header, data_offset=data_offset, data=data)) + if blocks[-1].header["flags"] & constants.BLOCK_FLAG_STREAMED: + # a file can only have 1 streamed block and it must be at the end so we + # can stop looking for more blocks + break + buff = b"" + else: + if len(blocks) or buff[0] != 0: + # if this is not the first block or we haven't found any + # blocks and the first byte is non-zero + msg = f"Invalid bytes while reading blocks {buff}" + raise OSError(msg) + # this is the first block, allow empty bytes before block + buff = buff.strip(b"\0") + return blocks + + +def read_blocks(fd, memmap=False, lazy_load=False): + if not lazy_load or not fd.seekable(): + # load all blocks serially + return read_blocks_serially(fd, memmap, lazy_load) + + # try to find block index + starting_offset = fd.tell() + index_offset = bio.find_block_index(fd, starting_offset) + if index_offset is None: + # if failed, load all blocks serially + fd.seek(starting_offset) + return read_blocks_serially(fd, memmap, lazy_load) + + # setup empty blocks + block_index = bio.read_block_index(fd, index_offset) + # skip magic for each block + blocks = [ReadBlock(offset + 4, fd, memmap, lazy_load) for offset in block_index] + try: + # load first and last blocks to check if the index looks correct + for index in (0, -1): + fd.seek(block_index[index]) + buff = fd.read(4) + if buff != constants.BLOCK_MAGIC: + raise OSError("Invalid block magic") + blocks[index].load() + except (OSError, ValueError): + fd.seek(starting_offset) + return read_blocks_serially(fd, memmap, lazy_load) + return blocks diff --git a/asdf/_block/writer.py b/asdf/_block/writer.py new file mode 100644 index 000000000..6281d8cba --- /dev/null +++ b/asdf/_block/writer.py @@ -0,0 +1,40 @@ +from asdf import constants + +from . 
import io as bio + + +class WriteBlock: + def __init__(self, data, compression=None, compression_kwargs=None): + self._data = data + self.compression = compression + self.compression_kwargs = compression_kwargs + + @property + def data(self): + if callable(self._data): + return self._data() + return self._data + + +def write_blocks(fd, blocks, padding=False, streamed_block=None, write_index=True): + offsets = [] + headers = [] + for blk in blocks: + offsets.append(fd.tell()) + fd.write(constants.BLOCK_MAGIC) + headers.append( + bio.write_block( + fd, + blk.data, + compression_kwargs=blk.compression_kwargs, + padding=padding, + compression=blk.compression, + ) + ) + if streamed_block is not None: + offsets.append(fd.tell()) + fd.write(constants.BLOCK_MAGIC) + headers.append(bio.write_block(fd, streamed_block.data, stream=True)) + elif len(blocks) and write_index: + bio.write_block_index(fd, offsets) + return offsets, headers diff --git a/asdf/_tests/_block/test_io.py b/asdf/_tests/_block/test_io.py index 288dc6b69..636da86bc 100644 --- a/asdf/_tests/_block/test_io.py +++ b/asdf/_tests/_block/test_io.py @@ -349,3 +349,7 @@ def test_write_block_index_with_offset(tmp_path): bio.write_block_index(fd, [1, 2, 3], offset=offset) with generic_io.get_file(fn, "r") as fd: assert bio.find_block_index(fd) == offset + + +# TODO test that file pointer is always at the end of a block after a read +# for all possible block types diff --git a/asdf/_tests/_block/test_reader.py b/asdf/_tests/_block/test_reader.py new file mode 100644 index 000000000..e08173947 --- /dev/null +++ b/asdf/_tests/_block/test_reader.py @@ -0,0 +1,131 @@ +import contextlib +import io +import mmap +import os + +import numpy as np +import pytest + +from asdf import constants, generic_io, util +from asdf._block import io as bio +from asdf._block.reader import read_blocks + + +@contextlib.contextmanager +def gen_blocks(fn=None, n=5, size=10, padding=0, padding_byte=b"\0", with_index=False, block_padding=False): + offsets = [] + if fn is not None: + with generic_io.get_file(fn, mode="w") as fd: + pass + + def check(blocks): + assert len(blocks) == n + for i, blk in enumerate(blocks): + assert blk.data.size == size + assert np.all(blk.data == i) + + with generic_io.get_file(fn or io.BytesIO(), mode="rw") as fd: + fd.write(padding_byte * padding) + for i in range(n): + offsets.append(fd.tell()) + fd.write(constants.BLOCK_MAGIC) + data = np.ones(size, dtype="uint8") * i + bio.write_block(fd, data, padding=block_padding) + if with_index: + bio.write_block_index(fd, offsets) + fd.seek(0) + yield fd, check + + +# test a few paddings to test read_blocks checking 4 bytes while searching for the first block +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("memmap", [True, False]) +@pytest.mark.parametrize("with_index", [True, False]) +@pytest.mark.parametrize("padding", [0, 3, 4, 5]) +def test_read(tmp_path, lazy_load, memmap, with_index, padding): + fn = tmp_path / "test.bin" + n = 5 + size = 10 + with gen_blocks(fn=fn, n=n, size=size, padding=padding, with_index=with_index) as (fd, check): + r = read_blocks(fd, memmap=memmap, lazy_load=lazy_load) + if lazy_load and with_index: + assert r[0].loaded + assert r[-1].loaded + for blk in r[1:-1]: + assert not blk.loaded + else: + for blk in r: + assert blk.loaded + if memmap: + for blk in r: + base = util.get_array_base(blk.data) + assert isinstance(base.base, mmap.mmap) + check(r) + + +def test_read_invalid_padding(): + with gen_blocks(padding=1, padding_byte=b"\1") as 
(fd, check): + with pytest.raises(OSError, match="Invalid bytes.*"): + check(read_blocks(fd)) + + +def test_read_post_padding(): + with gen_blocks(padding=1) as (fd, check): + fd.seek(0, os.SEEK_END) + # acceptable to have <4 bytes after the last block + fd.write(b"\0" * 3) + fd.seek(0) + check(read_blocks(fd)) + + +# TODO non-seekable + + +@pytest.mark.parametrize("invalid_block_index", [0, 1, -1]) +def test_invalid_block_index(tmp_path, invalid_block_index): + fn = tmp_path / "test.bin" + with gen_blocks(fn=fn, with_index=True) as (fd, check): + offset = bio.find_block_index(fd) + assert offset is not None + block_index = bio.read_block_index(fd, offset) + block_index[invalid_block_index] += 4 + fd.seek(offset) + bio.write_block_index(fd, block_index) + fd.seek(0) + # when the block index is read, only the first and last blocks + # are check, so any other invalid entry should result in failure + if invalid_block_index in (0, -1): + check(read_blocks(fd, lazy_load=True)) + else: + with pytest.raises(ValueError, match="Header size.*"): + check(read_blocks(fd, lazy_load=True)) + + +def test_invalid_block_in_index_with_valid_magic(tmp_path): + fn = tmp_path / "test.bin" + with gen_blocks(fn=fn, with_index=True, block_padding=1.0) as (fd, check): + offset = bio.find_block_index(fd) + assert offset is not None + block_index = bio.read_block_index(fd, offset) + # move the first block offset to the padding before + # the second block with enough space to write + # valid magic (but invalid header) + block_index[0] = block_index[1] - 6 + fd.seek(block_index[0]) + fd.write(constants.BLOCK_MAGIC) + fd.write(b"\0\0") + + fd.seek(offset) + bio.write_block_index(fd, block_index) + + fd.seek(0) + check(read_blocks(fd, lazy_load=True)) + + +def test_closed_file(tmp_path): + fn = tmp_path / "test.bin" + with gen_blocks(fn=fn, with_index=True) as (fd, check): + blocks = read_blocks(fd, lazy_load=True) + blk = blocks[1] + with pytest.raises(OSError, match="Attempt to load block from closed file"): + blk.load() diff --git a/asdf/_tests/_block/test_writer.py b/asdf/_tests/_block/test_writer.py new file mode 100644 index 000000000..b0d1dc82e --- /dev/null +++ b/asdf/_tests/_block/test_writer.py @@ -0,0 +1,50 @@ +import numpy as np +import pytest + +import asdf._block.io as bio +from asdf import constants, generic_io +from asdf._block import reader, writer + +# TODO write blocks, with compression_kwargs: how to check this worked? 
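# Editorial aside, not part of the patch: a minimal round trip through the
# writer/reader pair added in this commit, using only names that appear in the
# diffs above (WriteBlock, write_blocks, read_blocks, generic_io.get_file).
# It is an illustrative sketch of the intended usage, not asdf documentation.
import io

import numpy as np

from asdf import generic_io
from asdf._block import reader, writer

data = np.arange(10, dtype="uint8")
with generic_io.get_file(io.BytesIO(), mode="rw") as fd:
    # write one internal block (zlib-compressed here) followed by a block index
    writer.write_blocks(fd, [writer.WriteBlock(data, compression=b"zlib")], write_index=True)
    fd.seek(0)
    # lazy_load defaults to False, so the block data is read back immediately
    blocks = reader.read_blocks(fd)
    np.testing.assert_array_equal(blocks[0].data, data)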
+# TODO invalid inputs + + +@pytest.mark.parametrize("lazy", [True, False]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize("padding", [True, False, 0.1, 0.9]) +@pytest.mark.parametrize("compression", [None, b"zlib"]) +@pytest.mark.parametrize("stream", [True, False]) +def test_write_blocks(tmp_path, lazy, index, padding, compression, stream): + data = [np.ones(10, dtype=np.uint8), np.zeros(5, dtype=np.uint8)] + if lazy: + blocks = [writer.WriteBlock(lambda bd=d: bd, compression=compression) for d in data] + else: + blocks = [writer.WriteBlock(d, compression=compression) for d in data] + if stream: + streamed_block = writer.WriteBlock(np.ones(15, dtype=np.uint8)) + else: + streamed_block = None + fn = tmp_path / "test.bin" + with generic_io.get_file(fn, mode="w") as fd: + writer.write_blocks(fd, blocks, padding=padding, streamed_block=streamed_block, write_index=index) + with generic_io.get_file(fn, mode="r") as fd: + if index and not stream: + assert bio.find_block_index(fd) is not None + else: + assert bio.find_block_index(fd) is None + fd.seek(0) + read_blocks = reader.read_blocks(fd) + if stream: + assert len(read_blocks) == (len(data) + 1) + else: + assert len(read_blocks) == len(data) + for r, d in zip(read_blocks, data): + np.testing.assert_array_equal(r.data, d) + if compression is not None: + assert r.header["compression"] == compression + if padding: + assert r.header["allocated_size"] > r.header["used_size"] + if stream: + read_stream_block = read_blocks[-1] + np.testing.assert_array_equal(read_stream_block.data, streamed_block.data) + assert read_stream_block.header["flags"] & constants.BLOCK_FLAG_STREAMED From e64f8a380d178a2dcdaeae8987f86d81067bc794 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 13 Apr 2023 16:29:32 -0400 Subject: [PATCH 006/154] add _block.LinearStore for storing read blocks --- asdf/_block/reader.py | 6 ++- asdf/_block/store.py | 30 +++++++++++- asdf/_tests/_block/test_store.py | 80 +++++++++++++++++++++----------- 3 files changed, 84 insertions(+), 32 deletions(-) diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py index ecc5beeb2..8247a25d7 100644 --- a/asdf/_block/reader.py +++ b/asdf/_block/reader.py @@ -27,7 +27,8 @@ def load(self): return fd = self._fd() if fd is None or fd.is_closed(): - raise OSError("Attempt to load block from closed file") + msg = "Attempt to load block from closed file" + raise OSError(msg) _, self.header, self.data_offset, self._data = bio.read_block( fd, offset=self.offset, memmap=self.memmap, lazy_load=self.lazy_load ) @@ -109,7 +110,8 @@ def read_blocks(fd, memmap=False, lazy_load=False): fd.seek(block_index[index]) buff = fd.read(4) if buff != constants.BLOCK_MAGIC: - raise OSError("Invalid block magic") + msg = "Invalid block magic" + raise OSError(msg) blocks[index].load() except (OSError, ValueError): fd.seek(starting_offset) diff --git a/asdf/_block/store.py b/asdf/_block/store.py index afe6468f2..741d2c9aa 100644 --- a/asdf/_block/store.py +++ b/asdf/_block/store.py @@ -1,3 +1,5 @@ +import collections.abc + from .key import Key @@ -6,7 +8,7 @@ def __init__(self): # store contains 2 layers of lookup: id(obj), Key self._by_id = {} - def get(self, obj, default=None): + def lookup_by_object(self, obj, default=None): if isinstance(obj, Key): obj_id = id(obj._ref()) obj_key = obj @@ -33,7 +35,7 @@ def get(self, obj, default=None): # no match, return default return default - def set(self, obj, value): + def assign_object(self, obj, value): if isinstance(obj, Key): obj_id = id(obj._ref()) 
obj_key = obj @@ -79,3 +81,27 @@ def _cleanup(self, object_id=None): del by_key[key] if not len(by_key): del self._by_id[object_id] + + +class LinearStore(Store, collections.abc.Sequence): + def __init__(self, init=None): + super().__init__() + if init is None: + init = [] + self._items = init + + def lookup_by_object(self, obj): + index = super().lookup_by_object(obj) + if index is None: + return None + return self[index] + + def assign_object(self, obj, value): + index = self._items.index(value) + super().assign_object(obj, index) + + def __getitem__(self, index): + return self._items.__getitem__(index) + + def __len__(self, index): + return self._items.__len__() diff --git a/asdf/_tests/_block/test_store.py b/asdf/_tests/_block/test_store.py index b2cf2de42..469ced244 100644 --- a/asdf/_tests/_block/test_store.py +++ b/asdf/_tests/_block/test_store.py @@ -1,5 +1,7 @@ +import pytest + from asdf._block.key import Key -from asdf._block.store import Store +from asdf._block.store import LinearStore, Store # a blank class for testing @@ -11,14 +13,14 @@ def test_store_by_obj(): f = Foo() v = 42 s = Store() - s.set(f, v) - assert s.get(f) == v + s.assign_object(f, v) + assert s.lookup_by_object(f) == v def test_get_missing_by_obj(): f = Foo() s = Store() - assert s.get(f) is None + assert s.lookup_by_object(f) is None def test_store_by_key(): @@ -26,8 +28,8 @@ def test_store_by_key(): v = 42 s = Store() k = Key(f) - s.set(k, v) - assert s.get(k) == v + s.assign_object(k, v) + assert s.lookup_by_object(k) == v def test_get_by_key(): @@ -35,15 +37,15 @@ def test_get_by_key(): v = 42 s = Store() k = Key(f) - s.set(k, v) - assert s.get(f) == v + s.assign_object(k, v) + assert s.lookup_by_object(f) == v def test_get_missing_key(): f = Foo() s = Store() k = Key(f) - assert s.get(k) is None + assert s.lookup_by_object(k) is None def test_get_missing_key_same_obj(): @@ -51,33 +53,33 @@ def test_get_missing_key_same_obj(): v = 42 s = Store() k = Key(f) - s.set(k, v) + s.assign_object(k, v) k2 = Key(f) - assert s.get(k2) is None + assert s.lookup_by_object(k2) is None def test_get_existing_default(): f = Foo() v = 42 s = Store() - s.set(f, v) - assert s.get(f, 26) == v + s.assign_object(f, v) + assert s.lookup_by_object(f, 26) == v def test_get_missing_default(): f = Foo() v = 42 s = Store() - assert s.get(f, v) == v + assert s.lookup_by_object(f, v) == v def test_set_same_object(): f = Foo() v = 42 s = Store() - s.set(f, 26) - s.set(f, v) - assert s.get(f) == v + s.assign_object(f, 26) + s.assign_object(f, v) + assert s.lookup_by_object(f) == v def test_set_same_key(): @@ -85,16 +87,16 @@ def test_set_same_key(): s = Store() k = Key(f) v = 42 - s.set(k, 26) - s.set(k, v) - assert s.get(k) == v + s.assign_object(k, 26) + s.assign_object(k, v) + assert s.lookup_by_object(k) == v def test_get_memory_reused(): f = Foo() s = Store() v = 42 - s.set(f, v) + s.assign_object(f, v) fid = id(f) del f for _ in range(100): @@ -103,14 +105,14 @@ def test_get_memory_reused(): break else: raise AssertionError("Failed to trigger memory reuse") - assert s.get(f) is None + assert s.lookup_by_object(f) is None def test_set_memory_reused(): f = Foo() s = Store() v = 42 - s.set(f, v) + s.assign_object(f, v) fid = id(f) del f for _ in range(100): @@ -120,16 +122,38 @@ def test_set_memory_reused(): else: raise AssertionError("Failed to trigger memory reuse") nv = 26 - s.set(f, nv) - assert s.get(f) is nv + s.assign_object(f, nv) + assert s.lookup_by_object(f) is nv def test_cleanup(): f = Foo() s = Store() k = Key(f) - 
s.set(s, 42) - s.set(k, 26) + s.assign_object(s, 42) + s.assign_object(k, 26) del f s._cleanup() - assert s.get(k, None) is None + assert s.lookup_by_object(k, None) is None + + +def test_linear_store(): + foos = [Foo(), Foo(), Foo()] + values = ["a", "b", "c"] + s = LinearStore(values) + assert len(s) == len(values) + for f, v in zip(foos, values): + s.assign_object(f, v) + for f, v in zip(foos, values): + assert s.lookup_by_object(f) == v + + +def test_linear_store_missing_value(): + s = LinearStore() + with pytest.raises(ValueError, match=".*is not in list.*"): + s.assign_object(Foo(), "missing") + + +def test_linear_store_lookup_unknown_object(): + s = LinearStore() + assert s.lookup_by_object(Foo()) is None From fc6befcd1b528862c5332d022b9c8f60754ffeb2 Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 14 Apr 2023 09:44:32 -0400 Subject: [PATCH 007/154] fix _block.store.LinearStore.__len__ --- asdf/_block/store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asdf/_block/store.py b/asdf/_block/store.py index 741d2c9aa..5355d833c 100644 --- a/asdf/_block/store.py +++ b/asdf/_block/store.py @@ -103,5 +103,5 @@ def assign_object(self, obj, value): def __getitem__(self, index): return self._items.__getitem__(index) - def __len__(self, index): + def __len__(self): return self._items.__len__() From 12e4c3fade88a9503be93aa2ba9d83fc758d8006 Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 14 Apr 2023 09:56:41 -0400 Subject: [PATCH 008/154] break update --- asdf/_tests/tags/core/tests/test_ndarray.py | 1 + asdf/_tests/test_api.py | 1 + asdf/_tests/test_array_blocks.py | 16 ++++++++++++++++ asdf/_tests/test_block_converter.py | 3 +++ asdf/_tests/test_compression.py | 5 +++++ asdf/asdf.py | 1 + 6 files changed, 27 insertions(+) diff --git a/asdf/_tests/tags/core/tests/test_ndarray.py b/asdf/_tests/tags/core/tests/test_ndarray.py index 5de7a0ce4..8949a19ac 100644 --- a/asdf/_tests/tags/core/tests/test_ndarray.py +++ b/asdf/_tests/tags/core/tests/test_ndarray.py @@ -930,6 +930,7 @@ def test_readonly_inline(tmpdir): # Confirm that NDArrayType's internal array is regenerated # following an update. 
@pytest.mark.parametrize("pad_blocks", [True, False]) +@pytest.mark.xfail def test_block_data_change(pad_blocks, tmpdir): tmpfile = str(tmpdir.join("data.asdf")) tree = {"data": np.zeros(10, dtype="uint8")} diff --git a/asdf/_tests/test_api.py b/asdf/_tests/test_api.py index c8a49f8f9..df86c1609 100644 --- a/asdf/_tests/test_api.py +++ b/asdf/_tests/test_api.py @@ -147,6 +147,7 @@ def test_default_version(): assert ff.file_format_version == version_map["FILE_FORMAT"] +@pytest.mark.xfail def test_update_exceptions(tmp_path): path = str(tmp_path / "test.asdf") diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py index 9c163ad6e..add1d4fa9 100644 --- a/asdf/_tests/test_array_blocks.py +++ b/asdf/_tests/test_array_blocks.py @@ -116,6 +116,7 @@ def test_pad_blocks(tmp_path): assert_array_equal(ff.tree["my_array2"], my_array2) +@pytest.mark.xfail def test_update_expand_tree(tmp_path): tmp_path = str(tmp_path) testpath = os.path.join(tmp_path, "test.asdf") @@ -157,6 +158,7 @@ def test_update_expand_tree(tmp_path): assert_array_equal(ff.tree["arrays"][1], my_array2) +@pytest.mark.xfail def test_update_all_external(tmp_path): fn = tmp_path / "test.asdf" @@ -181,6 +183,7 @@ def _get_update_tree(): return {"arrays": [np.arange(64) * 1, np.arange(64) * 2, np.arange(64) * 3]} +@pytest.mark.xfail def test_update_delete_first_array(tmp_path): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -204,6 +207,7 @@ def test_update_delete_first_array(tmp_path): assert_array_equal(ff.tree["arrays"][1], tree["arrays"][2]) +@pytest.mark.xfail def test_update_delete_last_array(tmp_path): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -227,6 +231,7 @@ def test_update_delete_last_array(tmp_path): assert_array_equal(ff.tree["arrays"][1], tree["arrays"][1]) +@pytest.mark.xfail def test_update_delete_middle_array(tmp_path): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -254,6 +259,7 @@ def test_update_delete_middle_array(tmp_path): assert_array_equal(ff.tree["arrays"][1], tree["arrays"][2]) +@pytest.mark.xfail def test_update_replace_first_array(tmp_path): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -278,6 +284,7 @@ def test_update_replace_first_array(tmp_path): assert_array_equal(ff.tree["arrays"][2], tree["arrays"][2]) +@pytest.mark.xfail def test_update_replace_last_array(tmp_path): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -302,6 +309,7 @@ def test_update_replace_last_array(tmp_path): assert_array_equal(ff.tree["arrays"][2], np.arange(32)) +@pytest.mark.xfail def test_update_replace_middle_array(tmp_path): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -326,6 +334,7 @@ def test_update_replace_middle_array(tmp_path): assert_array_equal(ff.tree["arrays"][2], tree["arrays"][2]) +@pytest.mark.xfail def test_update_add_array(tmp_path): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -347,6 +356,7 @@ def test_update_add_array(tmp_path): assert_array_equal(ff.tree["arrays"][3], np.arange(32)) +@pytest.mark.xfail def test_update_add_array_at_end(tmp_path): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -373,6 +383,7 @@ def test_update_add_array_at_end(tmp_path): assert_array_equal(ff.tree["arrays"][3], np.arange(65536, dtype=" Date: Wed, 19 Apr 2023 18:08:45 -0400 Subject: [PATCH 009/154] messy write support A few things were disabled to limit the extent of the changes: - streamed blocks - external 
blocks - update - resolve_and_inline These changes are also a work-in-progress, more of an attempt to see if the updated strategy will work. --- asdf/_block/__init__.py | 2 +- asdf/_block/io.py | 12 +- asdf/_block/manager.py | 931 ++---------------- asdf/_block/old_manager.py | 835 ++++++++++++++++ asdf/_block/options.py | 8 +- asdf/_block/reader.py | 17 +- asdf/_block/writer.py | 10 +- asdf/_tests/_block/test_options.py | 12 +- asdf/_tests/_helpers.py | 9 +- asdf/_tests/commands/tests/test_defragment.py | 5 +- asdf/_tests/commands/tests/test_exploded.py | 2 + asdf/_tests/commands/tests/test_to_yaml.py | 2 + asdf/_tests/tags/core/tests/test_integer.py | 2 +- asdf/_tests/tags/core/tests/test_ndarray.py | 9 +- asdf/_tests/test_api.py | 14 +- asdf/_tests/test_array_blocks.py | 172 +--- asdf/_tests/test_compression.py | 2 +- asdf/_tests/test_file_format.py | 61 +- asdf/_tests/test_generic_io.py | 33 +- asdf/_tests/test_stream.py | 9 + asdf/_tests/test_yaml.py | 3 +- asdf/asdf.py | 307 +++--- asdf/tags/core/ndarray.py | 78 +- 23 files changed, 1274 insertions(+), 1261 deletions(-) create mode 100644 asdf/_block/old_manager.py diff --git a/asdf/_block/__init__.py b/asdf/_block/__init__.py index 575753691..10de0c6aa 100644 --- a/asdf/_block/__init__.py +++ b/asdf/_block/__init__.py @@ -1,5 +1,5 @@ from .block import Block, UnloadedBlock -from .manager import BlockManager +from .old_manager import BlockManager from .util import calculate_updated_layout __all__ = ["Block", "UnloadedBlock", "BlockManager", "calculate_updated_layout"] diff --git a/asdf/_block/io.py b/asdf/_block/io.py index e460a3d6b..2f6fa0e38 100644 --- a/asdf/_block/io.py +++ b/asdf/_block/io.py @@ -223,7 +223,17 @@ def read_block_index(fd, offset=None): if buff != constants.INDEX_HEADER: msg = "Failed to read block index header at offset {offset}" raise OSError(msg) - return yaml.load(fd.read(-1), yaml.SafeLoader) + try: + block_index = yaml.load(fd.read(-1), yaml.SafeLoader) + except yaml.parser.ParserError: + raise OSError("Failed to parse block index as yaml") + if ( + not isinstance(block_index, list) + or any(not isinstance(v, int) for v in block_index) + or block_index != sorted(block_index) + ): + raise OSError("Invalid block index") + return block_index def write_block_index(fd, offsets, offset=None, yaml_version=None): diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 3f6e2477a..795d94845 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -1,840 +1,127 @@ -import copy -import os -import re -import weakref +from asdf import constants, util -import numpy as np -import yaml +from . import store +from .options import Options +from .writer import WriteBlock -from asdf import compression as mcompression -from asdf import constants, generic_io, treeutil, util, yamlutil -from asdf.config import get_config -from asdf.util import patched_urllib_parse -from .block import Block, UnloadedBlock - - -class BlockManager: +class ReadBlocks(store.LinearStore): """ - Manages the `Block`s associated with a ASDF file. + {obj: block_index} : where obj is NDArrayType or other high level object + [block_0, block_1, ...] 
""" - def __init__(self, asdffile, copy_arrays=False, lazy_load=True): - self._asdffile = weakref.ref(asdffile) - - self._internal_blocks = [] - self._external_blocks = [] - self._inline_blocks = [] - self._streamed_blocks = [] - - self._block_type_mapping = { - "internal": self._internal_blocks, - "external": self._external_blocks, - "inline": self._inline_blocks, - "streamed": self._streamed_blocks, - } - - self._data_to_block_mapping = {} - self._key_to_block_mapping = {} - self._validate_checksums = False - self._memmap = not copy_arrays - self._lazy_load = lazy_load - self._internal_blocks_mapped = False - - def __len__(self): - """ - Return the total number of blocks being managed. - - This may not include all of the blocks in an open file, since - their reading may have been deferred. Call - `finish_reading_internal_blocks` to find the positions and - header information of all blocks in the file. - """ - return sum(len(x) for x in self._block_type_mapping.values()) - - def add(self, block, key=None): - """ - Add an internal block to the manager. - """ - if not self._internal_blocks_mapped: - # If the block index is missing we need to locate the remaining - # blocks so that we don't accidentally add our new block - # in the middle of the list. - self.finish_reading_internal_blocks() - - self._add(block, key=key) - - def _add(self, block, key=None): - block_set = self._block_type_mapping.get(block.array_storage, None) - if block_set is not None: - if block not in block_set: - block_set.append(block) - else: - msg = f"Unknown array storage type {block.array_storage}" - raise ValueError(msg) - - if block.array_storage == "streamed" and len(self._streamed_blocks) > 1: - msg = "Can not add second streaming block" - raise ValueError(msg) - - if block._data is not None or key is not None: - if key is None: - key = id(block._data) - self._data_to_block_mapping[key] = block - else: - self._key_to_block_mapping[key] = block + # def get_block_for_array(self, array): + # base = util.get_array_base(array) + # block_index = self.lookup_by_object(base) + # if block_index is None: + # return self.get_block_with_data(base) + # return self[block_index] + + # def get_block_with_data(self, array): + # base = util.get_array_base(array) + # for (block_index, block) in enumerate(self): + # if block._data is not None and not callable(block._data): + # if block._data is base: + # if self.lookup_by_object(base) is None: + # self.assign_array_to_block_index(base, block_index) + # return block + # return None + + # def assign_read_blocks(self): + # for (block_index, block) in enumerate(self): + # if block._data is not None and not callabale(block._data): + # self.assign_array_to_block_index(block._data, block_index) + # base = util.get_array_base(block._data) + + # def assign_array_to_block_index(self, array, block_index): + # base = util.get_array_base(array) + # self.assign_object(base, block_index) + + # def assign_array_to_block(self, array, block): + # block_index = self.index(block) + # self.assign_array_to_block_index(array, block_index) + pass + + +class BlockOptions(store.Store): + """ + {array_base: options} + read_blocks (instance of ReadBlocks) + """ - def remove(self, block): - """ - Remove a block from the manager. 
- """ - block_set = self._block_type_mapping.get(block.array_storage, None) - if block_set is not None: - if block in block_set: - block_set.remove(block) - for key, blk in list(self._data_to_block_mapping.items()): - if blk is block: - del self._data_to_block_mapping[key] - for key, blk in list(self._key_to_block_mapping.items()): - if blk is block: - del self._key_to_block_mapping[key] + def __init__(self, read_blocks=None): + super().__init__() + if read_blocks is None: + self._read_blocks = ReadBlocks([]) + elif isinstance(read_blocks, ReadBlocks): + self._read_blocks = read_blocks else: - msg = f"Unknown array storage type {block.array_storage}" - raise ValueError(msg) - - def set_array_storage(self, block, array_storage): - """ - Set the array storage type of the given block. - - Parameters - ---------- - block : Block instance - - array_storage : str - Must be one of: - - - ``internal``: The default. The array data will be - stored in a binary block in the same ASDF file. - - - ``external``: Store the data in a binary block in a - separate ASDF file. - - - ``inline``: Store the data as YAML inline in the tree. - - - ``streamed``: The special streamed inline block that - appears at the end of the file. - """ - if array_storage not in ["internal", "external", "streamed", "inline"]: - msg = "array_storage must be one of 'internal', 'external', 'streamed' or 'inline'" - raise ValueError(msg) + self._read_blocks = ReadBlocks(read_blocks) + + def get_options(self, array): + base = util.get_array_base(array) + options = self.lookup_by_object(base) + if options is None: + # look up by block with matching _data + for block in self._read_blocks: + if block._data is base: + # init options + if block.header["flags"] & constants.BLOCK_FLAG_STREAMED: + storage_type = "streamed" + else: + storage_type = "internal" + options = Options(storage_type, block.header["compression"]) + # set options + self.set_options(base, options) + break + if options is None: + options = Options() + self.set_options(base, options) + return options - if block.array_storage != array_storage: - if block in self.blocks: - self.remove(block) - block._array_storage = array_storage - self.add(block) - if array_storage == "streamed": - block.output_compression = None - block.output_compression_kwargs = None + def set_options(self, array, options): + base = util.get_array_base(array) + self.assign_object(base, options) - @property - def blocks(self): - """ - An iterator over all blocks being managed. + # TODO copy to allow for changing settings on write + # TODO make an 'update_options' - This may not include all of the blocks in an open file, - since their reading may have been deferred. Call - `finish_reading_internal_blocks` to find the positions and - header information of all blocks in the file. 
- """ - for block_set in self._block_type_mapping.values(): - yield from block_set +class Manager: + def __init__(self, read_blocks=None): + self.options = BlockOptions(read_blocks) + if read_blocks is None: + self.blocks = self.options._read_blocks + else: + self.blocks = read_blocks + self._write_blocks = [] + # TODO copy options and read_blocks on start of write + + def make_write_block(self, data, options): + # first, look for an existing block + for index, blk in enumerate(self._write_blocks): + if blk._data is data: + return index + # if no block is found, make a new block + self._write_blocks.append(WriteBlock(data, options.compression, options.compression_kwargs)) + # data_bytes = np.ndarray(-1, np.uint8, data.ravel(order='K').data) + # self._write_blocks.append(WriteBlock(data_bytes, options.compression, options.compression_kwargs)) + return len(self._write_blocks) - 1 + + # cludges for tests @property def internal_blocks(self): - """ - An iterator over all internal blocks being managed. - - This may not include all of the blocks in an open file, - since their reading may have been deferred. Call - `finish_reading_internal_blocks` to find the positions and - header information of all blocks in the file. - """ - for block_set in (self._internal_blocks, self._streamed_blocks): - yield from block_set + return self.blocks @property - def streamed_block(self): - """ - The streamed block (always the last internal block in a file), - or `None` if a streamed block is not present. - """ - self.finish_reading_internal_blocks() - - if len(self._streamed_blocks): - return self._streamed_blocks[0] - - return None - - @property - def external_blocks(self): - """ - An iterator over all external blocks being managed. - """ - yield from self._external_blocks - - @property - def inline_blocks(self): - """ - An iterator over all inline blocks being managed. - """ - yield from self._inline_blocks - - @property - def memmap(self): - """ - The flag which indicates whether the arrays are memory mapped - to the underlying file. - """ - return self._memmap - - @property - def lazy_load(self): - """ - The flag which indicates whether the blocks are lazily read. - """ - return self._lazy_load - - def has_blocks_with_offset(self): - """ - Returns `True` if any of the internal blocks currently have an - offset assigned. - """ - return any(block.offset is not None for block in self.internal_blocks) - - def _new_block(self): - return Block(memmap=self.memmap, lazy_load=self.lazy_load) - - def _sort_blocks_by_offset(self): - def sorter(x): - if x.offset is None: - msg = "Block is missing offset" - raise ValueError(msg) - - return x.offset - - self._internal_blocks.sort(key=sorter) - - def _read_next_internal_block(self, fd, past_magic=False): - # This assumes the file pointer is at the beginning of the - # block, (or beginning + 4 if past_magic is True) - block = self._new_block().read(fd, past_magic=past_magic, validate_checksum=self._validate_checksums) - if block is not None: - self._add(block) - - return block - - def read_internal_blocks(self, fd, past_magic=False, validate_checksums=False): - """ - Read internal blocks present in the file. If the file is - seekable, only the first block will be read, and the reading - of all others will be lazily deferred until an the loading of - an array requests it. - - Parameters - ---------- - fd : GenericFile - The file to read from. - - past_magic : bool, optional - If `True`, the file position is immediately after the - block magic token. 
If `False` (default), the file - position is exactly at the beginning of the block magic - token. - - validate_checksums : bool, optional - If `True`, validate the blocks against their checksums. - - """ - self._validate_checksums = validate_checksums - - while True: - block = self._read_next_internal_block(fd, past_magic=past_magic) - if block is None: - break - past_magic = False - - # If the file handle is seekable, we only read the first - # block and defer reading the rest until later. - if fd.seekable(): - break - - def finish_reading_internal_blocks(self): - """ - Read all remaining internal blocks present in the file, if any. - This is called before updating a file, since updating requires - knowledge of all internal blocks in the file. - """ - if not self._internal_blocks: - return - for block in self._internal_blocks: - if isinstance(block, UnloadedBlock): - block.load() - - last_block = self._internal_blocks[-1] - - # Read all of the remaining blocks in the file, if any - if last_block._fd is not None and last_block._fd.seekable(): - last_block._fd.seek(last_block.end_offset) - while True: - last_block = self._read_next_internal_block(last_block._fd, False) - if last_block is None: - break - - self._internal_blocks_mapped = True - - def write_internal_blocks_serial(self, fd, pad_blocks=False): - """ - Write all blocks to disk serially. - - Parameters - ---------- - fd : generic_io.GenericFile - The file to write internal blocks to. The file position - should be after the tree. - """ - for block in self.internal_blocks: - if block.output_compression: - block.offset = fd.tell() - block.write(fd) - else: - if block.input_compression: - block.update_size() - padding = util.calculate_padding(block.size, pad_blocks, fd.block_size) - block.allocated = block._size + padding - block.offset = fd.tell() - block.write(fd) - fd.fast_forward(block.allocated - block._size) - - def write_internal_blocks_random_access(self, fd): - """ - Write all blocks to disk at their specified offsets. All - internal blocks must have an offset assigned at this point. - - Parameters - ---------- - fd : generic_io.GenericFile - The file to write internal blocks to. The file position - should be after the tree. - """ - self._sort_blocks_by_offset() - - iter_ = self.internal_blocks - last_block = next(iter_) - # We need to explicitly clear anything between the tree - # and the first block, otherwise there may be other block - # markers left over which will throw off block indexing. - # We don't need to do this between each block. - fd.clear(last_block.offset - fd.tell()) - - for block in iter_: - last_block.allocated = (block.offset - last_block.offset) - last_block.header_size - fd.seek(last_block.offset) - last_block.write(fd) - last_block = block - - last_block.allocated = last_block.size - fd.seek(last_block.offset) - last_block.write(fd) - - fd.truncate(last_block.end_offset) - - def write_external_blocks(self, uri, pad_blocks=False): - """ - Write all blocks to disk serially. - - Parameters - ---------- - uri : str - The base uri of the external blocks - """ - - import asdf - - for i, block in enumerate(self.external_blocks): - if uri is None: - msg = "Can't write external blocks, since URI of main file is unknown." 
- raise ValueError(msg) - subfd = self.get_external_uri(uri, i) - asdffile = asdf.AsdfFile() - blk = copy.copy(block) - blk._array_storage = "internal" - asdffile._blocks.add(blk) - blk._used = True - # skip the new block manager here - asdffile._write_to(subfd, pad_blocks=pad_blocks, all_array_storage="internal") - - def write_block_index(self, fd, ctx): - """ - Write the block index. - - Parameters - ---------- - fd : GenericFile - The file to write to. The file pointer should be at the - end of the file. - """ - if len(self._internal_blocks) and not len(self._streamed_blocks): - fd.write(constants.INDEX_HEADER) - fd.write(b"\n") - offsets = [x.offset for x in self.internal_blocks] - - yaml_version = tuple(int(x) for x in ctx.version_map["YAML_VERSION"].split(".")) - - yaml.dump( - offsets, - Dumper=yamlutil._yaml_base_dumper, - stream=fd, - explicit_start=True, - explicit_end=True, - version=yaml_version, - allow_unicode=True, - encoding="utf-8", - ) - - _re_index_content = re.compile(rb"^" + constants.INDEX_HEADER + rb"\r?\n%YAML.*\.\.\.\r?\n?$") - _re_index_misc = re.compile(rb"^[\n\r\x20-\x7f]+$") - - def read_block_index(self, fd, ctx): - """ - Read the block index. - - Parameters - ---------- - fd : GenericFile - The file to read from. It must be seekable. - """ - # This reads the block index by reading backward from the end - # of the file. This tries to be as conservative as possible, - # since not reading an index isn't a deal breaker -- - # everything can still be read from the file, only slower. - # Importantly, it must remain "transactionally clean", and not - # create any blocks until we're sure the block index makes - # sense. + def _internal_blocks(self): + return self.blocks - if not fd.seekable(): - return + def set_array_storage(self, data, storage): + options = self.options.get_options(data) + options.storage_type = storage + self.options.set_options(data, options) - if not len(self._internal_blocks): - return - - first_block = self._internal_blocks[0] - first_block_end = first_block.end_offset - - fd.seek(0, generic_io.SEEK_END) - file_size = block_end = fd.tell() - # We want to read on filesystem block boundaries. We use - # "block_end - 5" here because we need to read at least 5 - # bytes in the first block. - block_start = ((block_end - 5) // fd.block_size) * fd.block_size - buff_size = block_end - block_start - - content = b"" - - fd.seek(block_start, generic_io.SEEK_SET) - buff = fd.read(buff_size) - - # Extra '\0' bytes are allowed after the ..., mainly to - # workaround poor truncation support on Windows - buff = buff.rstrip(b"\0") - content = buff - - # We need an explicit YAML end marker, or there's no - # block index - for ending in (b"...", b"...\r\n", b"...\n"): - if content.endswith(ending): - break - else: - return - - # Read blocks in reverse order from the end of the file - while True: - # Look for the index header - idx = content.rfind(constants.INDEX_HEADER) - if idx != -1: - content = content[idx:] - index_start = block_start + idx - break - - # If the rest of it starts to look like binary - # values, bail... 
- if not self._re_index_misc.match(buff): - return - - if block_start <= first_block_end: - return - - block_end = block_start - block_start = max(block_end - fd.block_size, first_block_end) - - fd.seek(block_start, generic_io.SEEK_SET) - buff_size = block_end - block_start - buff = fd.read(buff_size) - content = buff + content - - yaml_content = content[content.find(b"\n") + 1 :] - - # The following call to yaml.load is safe because we're - # using pyyaml's SafeLoader. - offsets = yaml.load(yaml_content, Loader=yamlutil._yaml_base_loader) # noqa: S506 - - # Make sure the indices look sane - if not isinstance(offsets, list) or len(offsets) == 0: - return - - last_offset = 0 - for x in offsets: - if not isinstance(x, int) or x > file_size or x < 0 or x <= last_offset + Block._header.size: - return - last_offset = x - - # We always read the first block, so we can confirm that the - # first entry in the block index matches the first block - if offsets[0] != first_block.offset: - return - - if len(offsets) == 1: - # If there's only one block in the index, we've already - # loaded the first block, so just return: we have nothing - # left to do - return - - # One last sanity check: Read the last block in the index and - # make sure it makes sense. - fd.seek(offsets[-1], generic_io.SEEK_SET) - try: - block = self._new_block().read(fd) - except (ValueError, OSError): - return - - # Now see if the end of the last block leads right into the index - if block.end_offset != index_start: - return - - # It seems we're good to go, so instantiate the UnloadedBlock - # objects - for offset in offsets[1:-1]: - self._internal_blocks.append(UnloadedBlock(fd, offset, memmap=self.memmap, lazy_load=self.lazy_load)) - - # We already read the last block in the file -- no need to read it again - self._internal_blocks.append(block) - - # Record that all block locations have been mapped out (used to avoid - # unnecessary calls to finish_reading_internal_blocks later). - self._internal_blocks_mapped = True - - # Materialize the internal blocks if we are not lazy - if not self.lazy_load: - self.finish_reading_internal_blocks() - - def get_external_filename(self, filename, index): - """ - Given a main filename and an index number, return a new file - name for referencing an external block. - """ - filename = os.path.splitext(filename)[0] - return filename + f"{index:04d}.asdf" - - def get_external_uri(self, uri, index): - """ - Given a main URI and an index number, return a new URI for - saving an external block. 
- """ - if uri is None: - uri = "" - parts = list(patched_urllib_parse.urlparse(uri)) - path = parts[2] - dirname, filename = os.path.split(path) - filename = self.get_external_filename(filename, index) - path = os.path.join(dirname, filename) - parts[2] = path - return patched_urllib_parse.urlunparse(parts) - - def _find_used_blocks(self, tree, ctx, remove=True): - reserved_blocks = set() - - for node in treeutil.iter_tree(tree): - if ctx.extension_manager.handles_type(type(node)): - converter = ctx.extension_manager.get_converter_for_type(type(node)) - sctx = ctx._create_serialization_context() - tag = converter.select_tag(node, sctx) - for key in converter.reserve_blocks(node, tag): - reserved_blocks.add(self.find_or_create_block(key)) - else: - hook = ctx._type_index.get_hook_for_type("reserve_blocks", type(node), ctx.version_string) - if hook is not None: - for block in hook(node, ctx): - reserved_blocks.add(block) - - if remove: - for block in list(self.blocks): - if not getattr(block, "_used", False) and block not in reserved_blocks: - self.remove(block) - return None - for block in list(self.blocks): - if getattr(block, "_used", False): - reserved_blocks.add(block) - return reserved_blocks - - def _handle_global_block_settings(self, block): - cfg = get_config() - all_array_storage = cfg.all_array_storage - if all_array_storage: - self.set_array_storage(block, all_array_storage) - - all_array_compression = cfg.all_array_compression - all_array_compression_kwargs = cfg.all_array_compression_kwargs - # Only override block compression algorithm if it wasn't explicitly set - # by AsdfFile.set_array_compression. - if all_array_compression != "input": - block.output_compression = all_array_compression - block.output_compression_kwargs = all_array_compression_kwargs - - if all_array_storage is None: - threshold = get_config().array_inline_threshold - if threshold is not None and block.array_storage in ["internal", "inline"]: - if np.prod(block.data.shape) < threshold: - self.set_array_storage(block, "inline") - else: - self.set_array_storage(block, "internal") - - def finalize(self, ctx): - """ - At this point, we have a complete set of blocks for the file, - with no extras. - - Here, they are reindexed, and possibly reorganized. - """ - # TODO: Should this reset the state (what's external and what - # isn't) afterword? - - self._find_used_blocks(ctx.tree, ctx) - - for block in list(self.blocks): - self._handle_global_block_settings(block) - - def get_block_by_key(self, key): - if key not in self._key_to_block_mapping: - msg = f"Unknown block key {key}" - raise KeyError(msg) - return self._key_to_block_mapping[key] - - def get_block(self, source): - """ - Given a "source identifier", return a block. - - Parameters - ---------- - source : any - If an integer, refers to the index of an internal block. - If a string, is a uri to an external block. - - Returns - ------- - buffer : buffer - """ - # If an "int", it is the index of an internal block - if isinstance(source, int): - if source == -1: - if len(self._streamed_blocks): - return self._streamed_blocks[0] - # If we don't have a streamed block, fall through so - # we can read all of the blocks, ultimately arriving - # at the last one, which, if all goes well is a - # streamed block. 
- - # First, look in the blocks we've already read - elif source >= 0: - if source < len(self._internal_blocks): - return self._internal_blocks[source] - else: - msg = f"Invalid source id {source}" - raise ValueError(msg) - - # If we have a streamed block or we already know we have - # no blocks, reading any further isn't going to yield any - # new blocks. - if len(self._streamed_blocks) or len(self._internal_blocks) == 0: - msg = f"Block '{source}' not found." - raise ValueError(msg) - - # If the desired block hasn't already been read, and the - # file is seekable, and we have at least one internal - # block, then we can move the file pointer to the end of - # the last known internal block, and start looking for - # more internal blocks. This is "deferred block loading". - last_block = self._internal_blocks[-1] - - if last_block._fd is not None and last_block._fd.seekable(): - last_block._fd.seek(last_block.end_offset) - while True: - next_block = self._read_next_internal_block(last_block._fd, False) - if next_block is None: - break - if len(self._internal_blocks) - 1 == source: - return next_block - last_block = next_block - - if source == -1 and last_block.array_storage == "streamed": - return last_block - - msg = f"Block '{source}' not found." - raise ValueError(msg) - - if isinstance(source, str): - asdffile = self._asdffile().open_external(source) - block = asdffile._blocks._internal_blocks[0] - self.set_array_storage(block, "external") - - # Handle the case of inline data - elif isinstance(source, list): - block = Block(data=np.array(source), array_storage="inline") - - else: - msg = f"Unknown source '{source}'" - raise TypeError(msg) - - return block - - def get_source(self, block): - """ - Get a source identifier for a given block. - - Parameters - ---------- - block : Block - - Returns - ------- - source_id : str - May be an integer for an internal block, or a URI for an - external block. - """ - for i, internal_block in enumerate(self.internal_blocks): - if block == internal_block: - if internal_block.array_storage == "streamed": - return -1 - return i - - for i, external_block in enumerate(self.external_blocks): - if block == external_block: - if self._asdffile().uri is None: - msg = "Can't write external blocks, since URI of main file is unknown." - raise ValueError(msg) - - parts = list(patched_urllib_parse.urlparse(self._asdffile().uri)) - path = parts[2] - filename = os.path.basename(path) - return self.get_external_filename(filename, i) - - msg = "block not found." - raise ValueError(msg) - - def find_or_create_block_for_array(self, arr): - """ - For a given array, looks for an existing block containing its - underlying data. If not found, adds a new block to the block - list. Returns the index in the block list to the array. - - Parameters - ---------- - arr : numpy.ndarray - - Returns - ------- - block : Block - """ - from asdf.tags.core import ndarray - - if isinstance(arr, ndarray.NDArrayType) and arr.block is not None and arr.block in self.blocks: - return arr.block - - base = util.get_array_base(arr) - block = self._data_to_block_mapping.get(id(base)) - if block is not None: - return block - - block = Block(base) - self.add(block) - self._handle_global_block_settings(block) - return block - - def find_or_create_block(self, key): - """ - For a given hashable key, looks for an existing block. If not - found, adds a new block to the block list. Returns the index - in the block list to the array. 
- - Parameters - ---------- - key : hashable - - Returns - ------- - block : Block - """ - block = self._key_to_block_mapping.get(key) - if block is not None: - return block - - block = Block() - self.add(block, key=key) - self._handle_global_block_settings(block) - - return block - - def get_streamed_block(self): - """ - Get the streamed block, which is always the last one. A - streamed block, on writing, does not manage data of its own, - but the user is expected to stream it to disk directly. - """ - block = self.streamed_block - if block is None: - block = Block(array_storage="streamed") - self.add(block) - return block - - def add_inline(self, array): - """ - Add an inline block for ``array`` to the block set. - """ - block = Block(array, array_storage="inline") - self.add(block) - return block - - def get_output_compressions(self): - """ - Get the list of unique compressions used on blocks. - """ - return list({b.output_compression for b in self.blocks}) - - def get_output_compression_extensions(self): - """ - Infer the compression extensions used on blocks. - Note that this is somewhat indirect and could be fooled if a new extension - for the same compression label is loaded after the compression of the block. - """ - ext = [] - for label in self.get_output_compressions(): - compressor = mcompression._get_compressor_from_extensions(label, return_extension=True) - if compressor is not None: - ext += [compressor[1]] # second item is the extension - return ext - - def __getitem__(self, arr): - return self.find_or_create_block_for_array(arr) - - def close(self): - for block in self.blocks: - block.close() + def __len__(self): + return len(self.blocks) diff --git a/asdf/_block/old_manager.py b/asdf/_block/old_manager.py new file mode 100644 index 000000000..37398e68b --- /dev/null +++ b/asdf/_block/old_manager.py @@ -0,0 +1,835 @@ +import copy +import os +import re +import weakref + +import numpy as np +import yaml + +from asdf import compression as mcompression +from asdf import constants, generic_io, treeutil, util, yamlutil +from asdf.config import get_config +from asdf.util import patched_urllib_parse + +from .block import Block, UnloadedBlock + + +class BlockManager: + """ + Manages the `Block`s associated with a ASDF file. + """ + + def __init__(self, asdffile, copy_arrays=False, lazy_load=True): + self._asdffile = weakref.ref(asdffile) + + self._internal_blocks = [] + self._external_blocks = [] + self._inline_blocks = [] + self._streamed_blocks = [] + + self._block_type_mapping = { + "internal": self._internal_blocks, + "external": self._external_blocks, + "inline": self._inline_blocks, + "streamed": self._streamed_blocks, + } + + self._data_to_block_mapping = {} + self._validate_checksums = False + self._memmap = not copy_arrays + self._lazy_load = lazy_load + self._internal_blocks_mapped = False + + def __len__(self): + """ + Return the total number of blocks being managed. + + This may not include all of the blocks in an open file, since + their reading may have been deferred. Call + `finish_reading_internal_blocks` to find the positions and + header information of all blocks in the file. + """ + return sum(len(x) for x in self._block_type_mapping.values()) + + def add(self, block, key=None): + """ + Add an internal block to the manager. + """ + if not self._internal_blocks_mapped: + # If the block index is missing we need to locate the remaining + # blocks so that we don't accidentally add our new block + # in the middle of the list. 
+ self.finish_reading_internal_blocks() + + self._add(block, key=key) + + def _add(self, block, key=None): + block_set = self._block_type_mapping.get(block.array_storage, None) + if block_set is not None: + if block not in block_set: + block_set.append(block) + else: + msg = f"Unknown array storage type {block.array_storage}" + raise ValueError(msg) + + if block.array_storage == "streamed" and len(self._streamed_blocks) > 1: + msg = "Can not add second streaming block" + raise ValueError(msg) + + if block._data is not None or key is not None: + if key is None: + key = id(util.get_array_base(block._data)) + self._data_to_block_mapping[key] = block + + def remove(self, block): + """ + Remove a block from the manager. + """ + block_set = self._block_type_mapping.get(block.array_storage, None) + if block_set is not None: + if block in block_set: + block_set.remove(block) + for key, blk in list(self._data_to_block_mapping.items()): + if blk is block: + del self._data_to_block_mapping[key] + else: + msg = f"Unknown array storage type {block.array_storage}" + raise ValueError(msg) + + def set_array_storage(self, block, array_storage): + """ + Set the array storage type of the given block. + + Parameters + ---------- + block : Block instance + + array_storage : str + Must be one of: + + - ``internal``: The default. The array data will be + stored in a binary block in the same ASDF file. + + - ``external``: Store the data in a binary block in a + separate ASDF file. + + - ``inline``: Store the data as YAML inline in the tree. + + - ``streamed``: The special streamed inline block that + appears at the end of the file. + """ + if array_storage not in ["internal", "external", "streamed", "inline"]: + msg = "array_storage must be one of 'internal', 'external', 'streamed' or 'inline'" + raise ValueError(msg) + + if block.array_storage != array_storage: + if block in self.blocks: + self.remove(block) + block._array_storage = array_storage + self.add(block) + if array_storage == "streamed": + block.output_compression = None + block.output_compression_kwargs = None + + @property + def blocks(self): + """ + An iterator over all blocks being managed. + + This may not include all of the blocks in an open file, + since their reading may have been deferred. Call + `finish_reading_internal_blocks` to find the positions and + header information of all blocks in the file. + """ + for block_set in self._block_type_mapping.values(): + yield from block_set + + @property + def internal_blocks(self): + """ + An iterator over all internal blocks being managed. + + This may not include all of the blocks in an open file, + since their reading may have been deferred. Call + `finish_reading_internal_blocks` to find the positions and + header information of all blocks in the file. + """ + for block_set in (self._internal_blocks, self._streamed_blocks): + yield from block_set + + @property + def streamed_block(self): + """ + The streamed block (always the last internal block in a file), + or `None` if a streamed block is not present. + """ + self.finish_reading_internal_blocks() + + if len(self._streamed_blocks): + return self._streamed_blocks[0] + + return None + + @property + def external_blocks(self): + """ + An iterator over all external blocks being managed. + """ + yield from self._external_blocks + + @property + def inline_blocks(self): + """ + An iterator over all inline blocks being managed. 
+        """
+        yield from self._inline_blocks
+
+    @property
+    def memmap(self):
+        """
+        The flag which indicates whether the arrays are memory mapped
+        to the underlying file.
+        """
+        return self._memmap
+
+    @property
+    def lazy_load(self):
+        """
+        The flag which indicates whether the blocks are lazily read.
+        """
+        return self._lazy_load
+
+    def has_blocks_with_offset(self):
+        """
+        Returns `True` if any of the internal blocks currently have an
+        offset assigned.
+        """
+        return any(block.offset is not None for block in self.internal_blocks)
+
+    def _new_block(self):
+        return Block(memmap=self.memmap, lazy_load=self.lazy_load)
+
+    def _sort_blocks_by_offset(self):
+        def sorter(x):
+            if x.offset is None:
+                msg = "Block is missing offset"
+                raise ValueError(msg)
+
+            return x.offset
+
+        self._internal_blocks.sort(key=sorter)
+
+    def _read_next_internal_block(self, fd, past_magic=False):
+        # This assumes the file pointer is at the beginning of the
+        # block, (or beginning + 4 if past_magic is True)
+        block = self._new_block().read(fd, past_magic=past_magic, validate_checksum=self._validate_checksums)
+        if block is not None:
+            self._add(block)
+
+        return block
+
+    def read_internal_blocks(self, fd, past_magic=False, validate_checksums=False):
+        """
+        Read internal blocks present in the file. If the file is
+        seekable, only the first block will be read, and the reading
+        of all others will be lazily deferred until the loading of
+        an array requests it.
+
+        Parameters
+        ----------
+        fd : GenericFile
+            The file to read from.
+
+        past_magic : bool, optional
+            If `True`, the file position is immediately after the
+            block magic token. If `False` (default), the file
+            position is exactly at the beginning of the block magic
+            token.
+
+        validate_checksums : bool, optional
+            If `True`, validate the blocks against their checksums.
+
+        """
+        self._validate_checksums = validate_checksums
+
+        while True:
+            block = self._read_next_internal_block(fd, past_magic=past_magic)
+            if block is None:
+                break
+            past_magic = False
+
+            # If the file handle is seekable, we only read the first
+            # block and defer reading the rest until later.
+            if fd.seekable():
+                break
+
+    def finish_reading_internal_blocks(self):
+        """
+        Read all remaining internal blocks present in the file, if any.
+        This is called before updating a file, since updating requires
+        knowledge of all internal blocks in the file.
+        """
+        if not self._internal_blocks:
+            return
+        for block in self._internal_blocks:
+            if isinstance(block, UnloadedBlock):
+                block.load()
+
+        last_block = self._internal_blocks[-1]
+
+        # Read all of the remaining blocks in the file, if any
+        if last_block._fd is not None and last_block._fd.seekable():
+            last_block._fd.seek(last_block.end_offset)
+            while True:
+                last_block = self._read_next_internal_block(last_block._fd, False)
+                if last_block is None:
+                    break
+
+        self._internal_blocks_mapped = True
+
+    def write_internal_blocks_serial(self, fd, pad_blocks=False):
+        """
+        Write all blocks to disk serially.
+
+        Parameters
+        ----------
+        fd : generic_io.GenericFile
+            The file to write internal blocks to. The file position
+            should be after the tree.
+ """ + for block in self.internal_blocks: + if block.output_compression: + block.offset = fd.tell() + block.write(fd) + else: + if block.input_compression: + block.update_size() + padding = util.calculate_padding(block.size, pad_blocks, fd.block_size) + block.allocated = block._size + padding + block.offset = fd.tell() + block.write(fd) + fd.fast_forward(block.allocated - block._size) + + def write_internal_blocks_random_access(self, fd): + """ + Write all blocks to disk at their specified offsets. All + internal blocks must have an offset assigned at this point. + + Parameters + ---------- + fd : generic_io.GenericFile + The file to write internal blocks to. The file position + should be after the tree. + """ + self._sort_blocks_by_offset() + + iter_ = self.internal_blocks + last_block = next(iter_) + # We need to explicitly clear anything between the tree + # and the first block, otherwise there may be other block + # markers left over which will throw off block indexing. + # We don't need to do this between each block. + fd.clear(last_block.offset - fd.tell()) + + for block in iter_: + last_block.allocated = (block.offset - last_block.offset) - last_block.header_size + fd.seek(last_block.offset) + last_block.write(fd) + last_block = block + + last_block.allocated = last_block.size + fd.seek(last_block.offset) + last_block.write(fd) + + fd.truncate(last_block.end_offset) + + def write_external_blocks(self, uri, pad_blocks=False): + """ + Write all blocks to disk serially. + + Parameters + ---------- + uri : str + The base uri of the external blocks + """ + + import asdf + + for i, block in enumerate(self.external_blocks): + if uri is None: + msg = "Can't write external blocks, since URI of main file is unknown." + raise ValueError(msg) + subfd = self.get_external_uri(uri, i) + asdffile = asdf.AsdfFile() + blk = copy.copy(block) + blk._array_storage = "internal" + asdffile._blocks.add(blk) + blk._used = True + # skip the new block manager here + asdffile._write_to(subfd, pad_blocks=pad_blocks, all_array_storage="internal") + + def write_block_index(self, fd, ctx): + """ + Write the block index. + + Parameters + ---------- + fd : GenericFile + The file to write to. The file pointer should be at the + end of the file. + """ + if len(self._internal_blocks) and not len(self._streamed_blocks): + fd.write(constants.INDEX_HEADER) + fd.write(b"\n") + offsets = [x.offset for x in self.internal_blocks] + + yaml_version = tuple(int(x) for x in ctx.version_map["YAML_VERSION"].split(".")) + + yaml.dump( + offsets, + Dumper=yamlutil._yaml_base_dumper, + stream=fd, + explicit_start=True, + explicit_end=True, + version=yaml_version, + allow_unicode=True, + encoding="utf-8", + ) + + _re_index_content = re.compile(rb"^" + constants.INDEX_HEADER + rb"\r?\n%YAML.*\.\.\.\r?\n?$") + _re_index_misc = re.compile(rb"^[\n\r\x20-\x7f]+$") + + def read_block_index(self, fd, ctx): + """ + Read the block index. + + Parameters + ---------- + fd : GenericFile + The file to read from. It must be seekable. + """ + # This reads the block index by reading backward from the end + # of the file. This tries to be as conservative as possible, + # since not reading an index isn't a deal breaker -- + # everything can still be read from the file, only slower. + # Importantly, it must remain "transactionally clean", and not + # create any blocks until we're sure the block index makes + # sense. 
+ + if not fd.seekable(): + return + + if not len(self._internal_blocks): + return + + first_block = self._internal_blocks[0] + first_block_end = first_block.end_offset + + fd.seek(0, generic_io.SEEK_END) + file_size = block_end = fd.tell() + # We want to read on filesystem block boundaries. We use + # "block_end - 5" here because we need to read at least 5 + # bytes in the first block. + block_start = ((block_end - 5) // fd.block_size) * fd.block_size + buff_size = block_end - block_start + + content = b"" + + fd.seek(block_start, generic_io.SEEK_SET) + buff = fd.read(buff_size) + + # Extra '\0' bytes are allowed after the ..., mainly to + # workaround poor truncation support on Windows + buff = buff.rstrip(b"\0") + content = buff + + # We need an explicit YAML end marker, or there's no + # block index + for ending in (b"...", b"...\r\n", b"...\n"): + if content.endswith(ending): + break + else: + return + + # Read blocks in reverse order from the end of the file + while True: + # Look for the index header + idx = content.rfind(constants.INDEX_HEADER) + if idx != -1: + content = content[idx:] + index_start = block_start + idx + break + + # If the rest of it starts to look like binary + # values, bail... + if not self._re_index_misc.match(buff): + return + + if block_start <= first_block_end: + return + + block_end = block_start + block_start = max(block_end - fd.block_size, first_block_end) + + fd.seek(block_start, generic_io.SEEK_SET) + buff_size = block_end - block_start + buff = fd.read(buff_size) + content = buff + content + + yaml_content = content[content.find(b"\n") + 1 :] + + # The following call to yaml.load is safe because we're + # using pyyaml's SafeLoader. + offsets = yaml.load(yaml_content, Loader=yamlutil._yaml_base_loader) # noqa: S506 + + # Make sure the indices look sane + if not isinstance(offsets, list) or len(offsets) == 0: + return + + last_offset = 0 + for x in offsets: + if not isinstance(x, int) or x > file_size or x < 0 or x <= last_offset + Block._header.size: + return + last_offset = x + + # We always read the first block, so we can confirm that the + # first entry in the block index matches the first block + if offsets[0] != first_block.offset: + return + + if len(offsets) == 1: + # If there's only one block in the index, we've already + # loaded the first block, so just return: we have nothing + # left to do + return + + # One last sanity check: Read the last block in the index and + # make sure it makes sense. + fd.seek(offsets[-1], generic_io.SEEK_SET) + try: + block = self._new_block().read(fd) + except (ValueError, OSError): + return + + # Now see if the end of the last block leads right into the index + if block.end_offset != index_start: + return + + # It seems we're good to go, so instantiate the UnloadedBlock + # objects + for offset in offsets[1:-1]: + self._internal_blocks.append(UnloadedBlock(fd, offset, memmap=self.memmap, lazy_load=self.lazy_load)) + + # We already read the last block in the file -- no need to read it again + self._internal_blocks.append(block) + + # Record that all block locations have been mapped out (used to avoid + # unnecessary calls to finish_reading_internal_blocks later). + self._internal_blocks_mapped = True + + # Materialize the internal blocks if we are not lazy + if not self.lazy_load: + self.finish_reading_internal_blocks() + + def get_external_filename(self, filename, index): + """ + Given a main filename and an index number, return a new file + name for referencing an external block. 
+        """
+        filename = os.path.splitext(filename)[0]
+        return filename + f"{index:04d}.asdf"
+
+    def get_external_uri(self, uri, index):
+        """
+        Given a main URI and an index number, return a new URI for
+        saving an external block.
+        """
+        if uri is None:
+            uri = ""
+        parts = list(patched_urllib_parse.urlparse(uri))
+        path = parts[2]
+        dirname, filename = os.path.split(path)
+        filename = self.get_external_filename(filename, index)
+        path = os.path.join(dirname, filename)
+        parts[2] = path
+        return patched_urllib_parse.urlunparse(parts)
+
+    def _find_used_blocks(self, tree, ctx, remove=True):
+        reserved_blocks = set()
+
+        for node in treeutil.iter_tree(tree):
+            if ctx.extension_manager.handles_type(type(node)):
+                converter = ctx.extension_manager.get_converter_for_type(type(node))
+                sctx = ctx._create_serialization_context()
+                tag = converter.select_tag(node, sctx)
+                for key in converter.reserve_blocks(node, tag, sctx):
+                    reserved_blocks.add(self.find_or_create_block(key))
+            else:
+                hook = ctx._type_index.get_hook_for_type("reserve_blocks", type(node), ctx.version_string)
+                if hook is not None:
+                    for block in hook(node, ctx):
+                        reserved_blocks.add(block)
+
+        if remove:
+            for block in list(self.blocks):
+                if getattr(block, "_used", 0) == 0 and block not in reserved_blocks:
+                    self.remove(block)
+            return None
+        for block in list(self.blocks):
+            if getattr(block, "_used", 0):
+                reserved_blocks.add(block)
+        return reserved_blocks
+
+    def _handle_global_block_settings(self, block):
+        cfg = get_config()
+        all_array_storage = cfg.all_array_storage
+        if all_array_storage:
+            self.set_array_storage(block, all_array_storage)
+
+        all_array_compression = cfg.all_array_compression
+        all_array_compression_kwargs = cfg.all_array_compression_kwargs
+        # Only override block compression algorithm if it wasn't explicitly set
+        # by AsdfFile.set_array_compression.
+        if all_array_compression != "input":
+            block.output_compression = all_array_compression
+            block.output_compression_kwargs = all_array_compression_kwargs
+
+        if all_array_storage is None:
+            threshold = get_config().array_inline_threshold
+            if threshold is not None and block.array_storage in ["internal", "inline"]:
+                if np.prod(block.data.shape) < threshold:
+                    self.set_array_storage(block, "inline")
+                else:
+                    self.set_array_storage(block, "internal")
+
+    def finalize(self, ctx):
+        """
+        At this point, we have a complete set of blocks for the file,
+        with no extras.
+
+        Here, they are reindexed, and possibly reorganized.
+        """
+        # TODO: Should this reset the state (what's external and what
+        # isn't) afterward?
+
+        self._find_used_blocks(ctx.tree, ctx)
+
+        for block in list(self.blocks):
+            self._handle_global_block_settings(block)
+
+    def get_block_by_key(self, key):
+        if key not in self._data_to_block_mapping:
+            msg = f"Unknown block key {key}"
+            raise KeyError(msg)
+        return self._data_to_block_mapping[key]
+
+    def get_block(self, source):
+        """
+        Given a "source identifier", return a block.
+
+        Parameters
+        ----------
+        source : any
+            If an integer, refers to the index of an internal block.
+            If a string, is a uri to an external block.
+
+        Returns
+        -------
+        buffer : buffer
+        """
+        # If an "int", it is the index of an internal block
+        if isinstance(source, int):
+            if source == -1:
+                if len(self._streamed_blocks):
+                    return self._streamed_blocks[0]
+                # If we don't have a streamed block, fall through so
+                # we can read all of the blocks, ultimately arriving
+                # at the last one, which, if all goes well is a
+                # streamed block.
+ + # First, look in the blocks we've already read + elif source >= 0: + if source < len(self._internal_blocks): + return self._internal_blocks[source] + else: + msg = f"Invalid source id {source}" + raise ValueError(msg) + + # If we have a streamed block or we already know we have + # no blocks, reading any further isn't going to yield any + # new blocks. + if len(self._streamed_blocks) or len(self._internal_blocks) == 0: + msg = f"Block '{source}' not found." + raise ValueError(msg) + + # If the desired block hasn't already been read, and the + # file is seekable, and we have at least one internal + # block, then we can move the file pointer to the end of + # the last known internal block, and start looking for + # more internal blocks. This is "deferred block loading". + last_block = self._internal_blocks[-1] + + if last_block._fd is not None and last_block._fd.seekable(): + last_block._fd.seek(last_block.end_offset) + while True: + next_block = self._read_next_internal_block(last_block._fd, False) + if next_block is None: + break + if len(self._internal_blocks) - 1 == source: + return next_block + last_block = next_block + + if source == -1 and last_block.array_storage == "streamed": + return last_block + + msg = f"Block '{source}' not found." + raise ValueError(msg) + + if isinstance(source, str): + asdffile = self._asdffile().open_external(source) + block = asdffile._blocks._internal_blocks[0] + self.set_array_storage(block, "external") + + # Handle the case of inline data + elif isinstance(source, list): + block = Block(data=np.array(source), array_storage="inline") + + else: + msg = f"Unknown source '{source}'" + raise TypeError(msg) + + return block + + def get_source(self, block): + """ + Get a source identifier for a given block. + + Parameters + ---------- + block : Block + + Returns + ------- + source_id : str + May be an integer for an internal block, or a URI for an + external block. + """ + for i, internal_block in enumerate(self.internal_blocks): + if block == internal_block: + if internal_block.array_storage == "streamed": + return -1 + return i + + for i, external_block in enumerate(self.external_blocks): + if block == external_block: + if self._asdffile().uri is None: + msg = "Can't write external blocks, since URI of main file is unknown." + raise ValueError(msg) + + parts = list(patched_urllib_parse.urlparse(self._asdffile().uri)) + path = parts[2] + filename = os.path.basename(path) + return self.get_external_filename(filename, i) + + msg = "block not found." + raise ValueError(msg) + + def find_or_create_block_for_array(self, arr): + """ + For a given array, looks for an existing block containing its + underlying data. If not found, adds a new block to the block + list. Returns the index in the block list to the array. + + Parameters + ---------- + arr : numpy.ndarray + + Returns + ------- + block : Block + """ + from asdf.tags.core import ndarray + + if isinstance(arr, ndarray.NDArrayType) and arr.block is not None and arr.block in self.blocks: + return arr.block + + base = util.get_array_base(arr) + block = self._data_to_block_mapping.get(id(base)) + if block is not None: + return block + + block = Block(base) + self.add(block) + self._handle_global_block_settings(block) + return block + + def find_or_create_block(self, key): + """ + For a given hashable key, looks for an existing block. If not + found, adds a new block to the block list. Returns the index + in the block list to the array. 
+ + Parameters + ---------- + key : hashable + + Returns + ------- + block : Block + """ + block = self._data_to_block_mapping.get(key) + if block is not None: + return block + + block = Block() + self.add(block, key=key) + self._handle_global_block_settings(block) + self._data_to_block_mapping[key] = block + + return block + + def get_streamed_block(self): + """ + Get the streamed block, which is always the last one. A + streamed block, on writing, does not manage data of its own, + but the user is expected to stream it to disk directly. + """ + block = self.streamed_block + if block is None: + block = Block(array_storage="streamed") + self.add(block) + return block + + def add_inline(self, array): + """ + Add an inline block for ``array`` to the block set. + """ + block = Block(array, array_storage="inline") + self.add(block) + return block + + def get_output_compressions(self): + """ + Get the list of unique compressions used on blocks. + """ + return list({b.output_compression for b in self.blocks}) + + def get_output_compression_extensions(self): + """ + Infer the compression extensions used on blocks. + Note that this is somewhat indirect and could be fooled if a new extension + for the same compression label is loaded after the compression of the block. + """ + ext = [] + for label in self.get_output_compressions(): + compressor = mcompression._get_compressor_from_extensions(label, return_extension=True) + if compressor is not None: + ext += [compressor[1]] # second item is the extension + return ext + + def __getitem__(self, arr): + return self.find_or_create_block_for_array(arr) + + def close(self): + for block in self.blocks: + block.close() diff --git a/asdf/_block/options.py b/asdf/_block/options.py index a3e16af7c..3425e60a9 100644 --- a/asdf/_block/options.py +++ b/asdf/_block/options.py @@ -1,8 +1,12 @@ from asdf import compression as mcompression +from asdf.config import config_context class Options: - def __init__(self, storage_type, compression_type=None, compression_kwargs=None): + def __init__(self, storage_type=None, compression_type=None, compression_kwargs=None): + if storage_type is None: + with config_context() as cfg: + storage_type = cfg.all_array_storage or "internal" self._storage_type = None self._compression = None self._compression_kwargs = None @@ -31,7 +35,7 @@ def compression(self): @compression.setter def compression(self, compression): - msg = f"Invalid compression {compression}" + msg = f"Invalid compression type: {compression}" if compression == "input": # "input" compression will validate as the ASDF compression module made # some assumptions about availability of information (that the input block diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py index 8247a25d7..9f0974ebb 100644 --- a/asdf/_block/reader.py +++ b/asdf/_block/reader.py @@ -9,7 +9,7 @@ class ReadBlock: def __init__(self, offset, fd, memmap, lazy_load, header=None, data_offset=None, data=None): self.offset = offset self._fd = weakref.ref(fd) - self.header = header + self._header = header self.data_offset = data_offset self._data = data # TODO alternative to passing these down? 
@@ -29,7 +29,7 @@ def load(self): if fd is None or fd.is_closed(): msg = "Attempt to load block from closed file" raise OSError(msg) - _, self.header, self.data_offset, self._data = bio.read_block( + _, self._header, self.data_offset, self._data = bio.read_block( fd, offset=self.offset, memmap=self.memmap, lazy_load=self.lazy_load ) @@ -41,6 +41,12 @@ def data(self): return self._data() return self._data + @property + def header(self): + if not self.loaded: + self.load() + return self._header + def reset(self, fd, offset): self._fd = weakref.ref(fd) self.offset = offset @@ -101,7 +107,12 @@ def read_blocks(fd, memmap=False, lazy_load=False): return read_blocks_serially(fd, memmap, lazy_load) # setup empty blocks - block_index = bio.read_block_index(fd, index_offset) + try: + block_index = bio.read_block_index(fd, index_offset) + except OSError: + # failed to read block index, fall back to serial reading + fd.seek(starting_offset) + return read_blocks_serially(fd, memmap, lazy_load) # skip magic for each block blocks = [ReadBlock(offset + 4, fd, memmap, lazy_load) for offset in block_index] try: diff --git a/asdf/_block/writer.py b/asdf/_block/writer.py index 6281d8cba..54546ffc2 100644 --- a/asdf/_block/writer.py +++ b/asdf/_block/writer.py @@ -1,3 +1,5 @@ +import numpy as np + from asdf import constants from . import io as bio @@ -15,6 +17,10 @@ def data(self): return self._data() return self._data + @property + def data_bytes(self): + return np.ndarray(-1, np.uint8, self.data.ravel(order="K").data) + def write_blocks(fd, blocks, padding=False, streamed_block=None, write_index=True): offsets = [] @@ -25,7 +31,7 @@ def write_blocks(fd, blocks, padding=False, streamed_block=None, write_index=Tru headers.append( bio.write_block( fd, - blk.data, + blk.data_bytes, compression_kwargs=blk.compression_kwargs, padding=padding, compression=blk.compression, @@ -34,7 +40,7 @@ def write_blocks(fd, blocks, padding=False, streamed_block=None, write_index=Tru if streamed_block is not None: offsets.append(fd.tell()) fd.write(constants.BLOCK_MAGIC) - headers.append(bio.write_block(fd, streamed_block.data, stream=True)) + headers.append(bio.write_block(fd, streamed_block.data_bytes, stream=True)) elif len(blocks) and write_index: bio.write_block_index(fd, offsets) return offsets, headers diff --git a/asdf/_tests/_block/test_options.py b/asdf/_tests/_block/test_options.py index f98bf0930..22bade26c 100644 --- a/asdf/_tests/_block/test_options.py +++ b/asdf/_tests/_block/test_options.py @@ -3,11 +3,13 @@ import pytest from asdf._block.options import Options +from asdf.config import config_context valid_storage_types = ["internal", "external", "streamed", "inline"] +valid_default_storage_types = [st for st in valid_storage_types if st != "streamed"] valid_compression_types = [None, "zlib", "bzp2", "lz4", ""] -invalid_storage_types = ["foo", None] +invalid_storage_types = ["foo", "bar"] invalid_compression_types = ["input", "foo"] @@ -17,6 +19,14 @@ def test_set_storage_init(storage): assert o.storage_type == storage +@pytest.mark.parametrize("storage", valid_default_storage_types) +def test_default_storage_init(storage): + with config_context() as cfg: + cfg.all_array_storage = storage + o = Options() + assert o.storage_type == storage + + @pytest.mark.parametrize("storage", valid_storage_types) def test_set_storage_attr(storage): # start with a different storage type diff --git a/asdf/_tests/_helpers.py b/asdf/_tests/_helpers.py index a60808369..31dc24ede 100644 --- a/asdf/_tests/_helpers.py +++ 
b/asdf/_tests/_helpers.py @@ -23,7 +23,6 @@ import asdf from asdf import generic_io, versioning -from asdf._block import Block from asdf._resolver import Resolver, ResolverChain from asdf.asdf import AsdfFile, get_asdf_library_info from asdf.constants import YAML_TAG_PREFIX @@ -264,8 +263,8 @@ def _assert_roundtrip_tree( ff = asdf.open(buff, extensions=extensions, copy_arrays=True, lazy_load=False) # Ensure that all the blocks are loaded for block in ff._blocks._internal_blocks: - assert isinstance(block, Block) - assert block._data is not None + # assert isinstance(block, Block) + assert block._data is not None and not callable(block._data) # The underlying file is closed at this time and everything should still work assert_tree_match(tree, ff.tree, ff, funcname=tree_match_func) if asdf_check_func: @@ -275,8 +274,8 @@ def _assert_roundtrip_tree( AsdfFile(tree, extensions=extensions, **init_options).write_to(fname, **write_options) with asdf.open(fname, mode="rw", extensions=extensions, copy_arrays=False, lazy_load=False) as ff: for block in ff._blocks._internal_blocks: - assert isinstance(block, Block) - assert block._data is not None + # assert isinstance(block, Block) + assert block._data is not None and not callable(block._data) assert_tree_match(tree, ff.tree, ff, funcname=tree_match_func) if asdf_check_func: asdf_check_func(ff) diff --git a/asdf/_tests/commands/tests/test_defragment.py b/asdf/_tests/commands/tests/test_defragment.py index 201bd61a8..e21b7f951 100644 --- a/asdf/_tests/commands/tests/test_defragment.py +++ b/asdf/_tests/commands/tests/test_defragment.py @@ -23,7 +23,8 @@ def _test_defragment(tmpdir, codec): out_path = os.path.join(str(tmpdir), "original.defragment.asdf") ff = AsdfFile(tree) ff.write_to(path) - assert len(ff._blocks) == 2 + with asdf.open(path) as af: + assert len(af._blocks.blocks) == 2 result = main.main_from_args(["defragment", path, "-o", out_path, "-c", codec]) @@ -38,7 +39,7 @@ def _test_defragment(tmpdir, codec): with asdf.open(os.path.join(str(tmpdir), "original.defragment.asdf")) as ff: assert_tree_match(ff.tree, tree) - assert len(list(ff._blocks.internal_blocks)) == 2 + assert len(ff._blocks.blocks) == 2 def test_defragment_zlib(tmpdir): diff --git a/asdf/_tests/commands/tests/test_exploded.py b/asdf/_tests/commands/tests/test_exploded.py index e431fbb2d..ea9dc680d 100644 --- a/asdf/_tests/commands/tests/test_exploded.py +++ b/asdf/_tests/commands/tests/test_exploded.py @@ -1,6 +1,7 @@ import os import numpy as np +import pytest import asdf from asdf import AsdfFile @@ -8,6 +9,7 @@ from asdf.commands import main +@pytest.mark.xfail(reason="external blocks are broken") def test_explode_then_implode(tmpdir): x = np.arange(0, 10, dtype=float) diff --git a/asdf/_tests/commands/tests/test_to_yaml.py b/asdf/_tests/commands/tests/test_to_yaml.py index a64a24977..15c1519ca 100644 --- a/asdf/_tests/commands/tests/test_to_yaml.py +++ b/asdf/_tests/commands/tests/test_to_yaml.py @@ -1,6 +1,7 @@ import os import numpy as np +import pytest import asdf from asdf import AsdfFile @@ -8,6 +9,7 @@ from asdf.commands import main +@pytest.mark.xfail(reason="resolve and inline is broken") def test_to_yaml(tmpdir): x = np.arange(0, 10, dtype=float) diff --git a/asdf/_tests/tags/core/tests/test_integer.py b/asdf/_tests/tags/core/tests/test_integer.py index 3cd690c25..622416e72 100644 --- a/asdf/_tests/tags/core/tests/test_integer.py +++ b/asdf/_tests/tags/core/tests/test_integer.py @@ -69,7 +69,7 @@ def test_integer_storage_duplication(tmpdir): af.write_to(tmpfile) 
with asdf.open(tmpfile, _force_raw_types=True) as rf: - assert len(af._blocks) == 1 + assert len(rf._blocks) == 1 assert rf.tree["integer1"]["words"]["source"] == 0 assert rf.tree["integer2"]["words"]["source"] == 0 diff --git a/asdf/_tests/tags/core/tests/test_ndarray.py b/asdf/_tests/tags/core/tests/test_ndarray.py index 8949a19ac..eff843ebf 100644 --- a/asdf/_tests/tags/core/tests/test_ndarray.py +++ b/asdf/_tests/tags/core/tests/test_ndarray.py @@ -65,8 +65,8 @@ def check_asdf(asdf): assert tree["science_data"].ctypes.data == tree["skipping"].ctypes.data - assert len(list(asdf._blocks.internal_blocks)) == 1 - assert next(asdf._blocks.internal_blocks)._size == 80 + assert len(asdf._blocks.blocks) == 1 + assert asdf._blocks.blocks[0].header["data_size"] == 80 if "w" in asdf._mode: tree["science_data"][0] = 42 @@ -137,7 +137,7 @@ def test_dont_load_data(): repr(ff.tree) for block in ff._blocks.internal_blocks: - assert block._data is None + assert callable(block._data) def test_table_inline(tmpdir): @@ -260,7 +260,7 @@ def test_inline(): buff = io.BytesIO() ff = asdf.AsdfFile(tree) - ff._blocks.set_array_storage(ff._blocks[tree["science_data"]], "inline") + ff.set_array_storage(x, "inline") ff.write_to(buff) buff.seek(0) @@ -397,6 +397,7 @@ def test_simple_table(): ff.write_to(io.BytesIO()) +@pytest.mark.xfail(reason="resolve and inline is broken") def test_unicode_to_list(tmpdir): arr = np.array(["", "𐀠"], dtype=" foo: ! [1, 2, 3] -... - """ +...""" # Check that fully qualified explicit tags work buff = helpers.yaml_to_asdf(yaml, yaml_headers=False) diff --git a/asdf/asdf.py b/asdf/asdf.py index 7f45a0d3f..1244b48b0 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -12,7 +12,10 @@ from . import _node_info as node_info from . import _version as version from . import constants, generic_io, reference, schema, treeutil, util, versioning, yamlutil -from ._block import BlockManager, calculate_updated_layout +from ._block import reader as block_reader +from ._block import writer as block_writer +from ._block.manager import Manager as BlockManager +from ._block.options import Options as BlockOptions from ._helpers import validate_version from .config import config_context, get_config from .exceptions import ( @@ -151,7 +154,9 @@ def __init__( self._fd = None self._closed = False self._external_asdf_by_uri = {} - self._blocks = BlockManager(self, copy_arrays=copy_arrays, lazy_load=lazy_load) + self._blocks = BlockManager() + self._blocks.lazy_load = lazy_load + self._blocks.memmap = not copy_arrays self._uri = uri if tree is None: # Bypassing the tree property here, to avoid validating @@ -460,7 +465,6 @@ def close(self): for external in self._external_asdf_by_uri.values(): external.close() self._external_asdf_by_uri.clear() - self._blocks.close() def copy(self): return self.__class__( @@ -660,8 +664,9 @@ def set_array_storage(self, arr, array_storage): - ``inline``: Store the data as YAML inline in the tree. 
""" - block = self._blocks[arr] - self._blocks.set_array_storage(block, array_storage) + options = self._blocks.options.get_options(arr) + options.storage_type = array_storage + self._blocks.options.set_options(arr, options) def get_array_storage(self, arr): """ @@ -671,7 +676,7 @@ def get_array_storage(self, arr): ---------- arr : numpy.ndarray """ - return self._blocks[arr].array_storage + return self._blocks.options.get_options(arr).storage_type def set_array_compression(self, arr, compression, **compression_kwargs): """ @@ -699,8 +704,9 @@ def set_array_compression(self, arr, compression, **compression_kwargs): If there is no prior file, acts as None. """ - self._blocks[arr].output_compression = compression - self._blocks[arr].output_compression_kwargs = compression_kwargs + options = self._blocks.options.get_options(arr) + options.compression = compression + options.compression_kwargs = compression_kwargs def get_array_compression(self, arr): """ @@ -714,11 +720,11 @@ def get_array_compression(self, arr): ------- compression : str or None """ - return self._blocks[arr].output_compression + return self._blocks.options.get_options(arr).compression def get_array_compression_kwargs(self, arr): """ """ - return self._blocks[arr].output_compression_kwargs + return self._blocks.options.get_options(arr).compression_kwargs @classmethod def _parse_header_line(cls, line): @@ -821,8 +827,8 @@ def _open_asdf( self.extensions = extensions yaml_token = fd.read(4) - has_blocks = False tree = None + read_blocks = [] if yaml_token == b"%YAM": reader = fd.reader_until( constants.YAML_END_MARKER_REGEX, @@ -842,13 +848,19 @@ def _open_asdf( # now, but we don't do anything special with it until # after the blocks have been read tree = yamlutil.load_tree(reader) - has_blocks = fd.seek_until(constants.BLOCK_MAGIC, 4, include=True, exception=False) + # has_blocks = fd.seek_until(constants.BLOCK_MAGIC, 4, include=True, exception=False) + read_blocks = block_reader.read_blocks(fd, self._blocks.memmap, self._blocks.lazy_load) elif yaml_token == constants.BLOCK_MAGIC: - has_blocks = True + # this file has only blocks + raise NotImplementedError("Support for block only file does not yet exist") + # since we're after the magic, if seekable, just reset + # if not seekable, read first block, then read the reset serially, add them all up elif yaml_token != b"": msg = "ASDF file appears to contain garbage after header." raise OSError(msg) + self._blocks.blocks._items = read_blocks + if tree is None: # At this point the tree should be tagged, but we want it to be # tagged with the core/asdf version appropriate to this file's @@ -856,10 +868,6 @@ def _open_asdf( # to select the correct tag for us. 
tree = yamlutil.custom_tree_to_tagged_tree(AsdfObject(), self) - if has_blocks: - self._blocks.read_internal_blocks(fd, past_magic=True, validate_checksums=validate_checksums) - self._blocks.read_block_index(fd, self) - tree = reference.find_references(tree, self) if self.version <= versioning.FILL_DEFAULTS_MAX_VERSION and get_config().legacy_fill_schema_defaults: @@ -930,7 +938,9 @@ def _write_tree(self, tree, fd, pad_blocks): if len(tree): serialization_context = self._create_serialization_context() - compression_extensions = self._blocks.get_output_compression_extensions() + # TODO fix output compression extensions + # compression_extensions = self._blocks.get_output_compression_extensions() + compression_extensions = [] for ext in compression_extensions: serialization_context._mark_extension_used(ext) @@ -969,16 +979,26 @@ def _pre_write(self, fd): # This is where we'd do some more sophisticated block # reorganization, if necessary - self._blocks.finalize(self) + # self._blocks.finalize(self) self._tree["asdf_library"] = get_asdf_library_info() def _serial_write(self, fd, pad_blocks, include_block_index): + self._blocks._write_blocks = [] self._write_tree(self._tree, fd, pad_blocks) - self._blocks.write_internal_blocks_serial(fd, pad_blocks) - self._blocks.write_external_blocks(fd.uri, pad_blocks) - if include_block_index: - self._blocks.write_block_index(fd, self) + if len(self._blocks._write_blocks): + block_writer.write_blocks( + fd, + self._blocks._write_blocks, + pad_blocks, + streamed_block=None, # TODO streamed block + write_index=include_block_index, + ) + # TODO external blocks + # self._blocks.write_internal_blocks_serial(fd, pad_blocks) + # self._blocks.write_external_blocks(fd.uri, pad_blocks) + # if include_block_index: + # self._blocks.write_block_index(fd, self) def _random_write(self, fd, pad_blocks, include_block_index): self._write_tree(self._tree, fd, False) @@ -1055,91 +1075,91 @@ def update( """ raise NotImplementedError("broken update") - with config_context() as config: - if all_array_storage is not NotSet: - config.all_array_storage = all_array_storage - if all_array_compression is not NotSet: - config.all_array_compression = all_array_compression - if compression_kwargs is not NotSet: - config.all_array_compression_kwargs = compression_kwargs - - fd = self._fd - - if fd is None: - msg = "Can not update, since there is no associated file" - raise ValueError(msg) - - if not fd.writable(): - msg = ( - "Can not update, since associated file is read-only. Make " - "sure that the AsdfFile was opened with mode='rw' and the " - "underlying file handle is writable." - ) - raise OSError(msg) - - if version is not None: - self.version = version - - if config.all_array_storage == "external": - # If the file is fully exploded, there's no benefit to - # update, so just use write_to() - self.write_to(fd) - fd.truncate() - return - - if not fd.seekable(): - msg = "Can not update, since associated file is not seekable" - raise OSError(msg) - - self._blocks.finish_reading_internal_blocks() - - # flush all pending memmap writes - if fd.can_memmap(): - fd.flush_memmap() - - self._pre_write(fd) - - try: - fd.seek(0) - - if not self._blocks.has_blocks_with_offset(): - # If we don't have any blocks that are being reused, just - # write out in a serial fashion. - self._serial_write(fd, pad_blocks, include_block_index) - fd.truncate() - return - - # Estimate how big the tree will be on disk by writing the - # YAML out in memory. 
Since the block indices aren't yet - # known, we have to count the number of block references and - # add enough space to accommodate the largest block number - # possible there. - tree_serialized = io.BytesIO() - self._write_tree(self._tree, tree_serialized, pad_blocks=False) - n_internal_blocks = len(self._blocks._internal_blocks) - - serialized_tree_size = tree_serialized.tell() + constants.MAX_BLOCKS_DIGITS * n_internal_blocks - - if not calculate_updated_layout(self._blocks, serialized_tree_size, pad_blocks, fd.block_size): - # If we don't have any blocks that are being reused, just - # write out in a serial fashion. - self._serial_write(fd, pad_blocks, include_block_index) - fd.truncate() - return - - fd.seek(0) - self._random_write(fd, pad_blocks, include_block_index) - fd.flush() - finally: - self._post_write(fd) - # close memmaps so they will regenerate - if fd.can_memmap(): - fd.close_memmap() - # also clean any memmapped blocks - for b in self._blocks._internal_blocks: - if b._memmapped: - b._memmapped = False - b._data = None + # with config_context() as config: + # if all_array_storage is not NotSet: + # config.all_array_storage = all_array_storage + # if all_array_compression is not NotSet: + # config.all_array_compression = all_array_compression + # if compression_kwargs is not NotSet: + # config.all_array_compression_kwargs = compression_kwargs + + # fd = self._fd + + # if fd is None: + # msg = "Can not update, since there is no associated file" + # raise ValueError(msg) + + # if not fd.writable(): + # msg = ( + # "Can not update, since associated file is read-only. Make " + # "sure that the AsdfFile was opened with mode='rw' and the " + # "underlying file handle is writable." + # ) + # raise OSError(msg) + + # if version is not None: + # self.version = version + + # if config.all_array_storage == "external": + # # If the file is fully exploded, there's no benefit to + # # update, so just use write_to() + # self.write_to(fd) + # fd.truncate() + # return + + # if not fd.seekable(): + # msg = "Can not update, since associated file is not seekable" + # raise OSError(msg) + + # self._blocks.finish_reading_internal_blocks() + + # # flush all pending memmap writes + # if fd.can_memmap(): + # fd.flush_memmap() + + # self._pre_write(fd) + + # try: + # fd.seek(0) + + # if not self._blocks.has_blocks_with_offset(): + # # If we don't have any blocks that are being reused, just + # # write out in a serial fashion. + # self._serial_write(fd, pad_blocks, include_block_index) + # fd.truncate() + # return + + # # Estimate how big the tree will be on disk by writing the + # # YAML out in memory. Since the block indices aren't yet + # # known, we have to count the number of block references and + # # add enough space to accommodate the largest block number + # # possible there. + # tree_serialized = io.BytesIO() + # self._write_tree(self._tree, tree_serialized, pad_blocks=False) + # n_internal_blocks = len(self._blocks._internal_blocks) + + # serialized_tree_size = tree_serialized.tell() + constants.MAX_BLOCKS_DIGITS * n_internal_blocks + + # if not calculate_updated_layout(self._blocks, serialized_tree_size, pad_blocks, fd.block_size): + # # If we don't have any blocks that are being reused, just + # # write out in a serial fashion. 
+ # self._serial_write(fd, pad_blocks, include_block_index) + # fd.truncate() + # return + + # fd.seek(0) + # self._random_write(fd, pad_blocks, include_block_index) + # fd.flush() + # finally: + # self._post_write(fd) + # # close memmaps so they will regenerate + # if fd.can_memmap(): + # fd.close_memmap() + # # also clean any memmapped blocks + # for b in self._blocks._internal_blocks: + # if b._memmapped: + # b._memmapped = False + # b._data = None def write_to( self, @@ -1213,77 +1233,6 @@ def write_to( Update the ASDF Standard version of this AsdfFile before writing. """ - with config_context() as config: - if all_array_storage is not NotSet: - config.all_array_storage = all_array_storage - if all_array_compression is not NotSet: - config.all_array_compression = all_array_compression - if compression_kwargs is not NotSet: - config.all_array_compression_kwargs = compression_kwargs - - used_blocks = self._blocks._find_used_blocks(self.tree, self, remove=False) - - naf = AsdfFile( - {}, - uri=self._uri, - extensions=self.extensions, - version=self.version, - ignore_version_mismatch=self._ignore_version_mismatch, - ignore_unrecognized_tag=self._ignore_unrecognized_tag, - ignore_implicit_conversion=self._ignore_implicit_conversion, - ) - naf._tree = copy.copy(self.tree) # avoid an extra validate - - # deep copy keys that will be modified during write - modified_keys = ["history", "asdf_library"] - for k in modified_keys: - if k in self.tree: - naf._tree[k] = copy.deepcopy(self.tree[k]) - - # copy over block storage and other settings - block_to_key_mapping = {v: k for k, v in self._blocks._key_to_block_mapping.items()} - # this creates blocks in the new block manager that correspond to blocks - # in the original file - for b in self._blocks.blocks: - if b not in used_blocks: - continue - if b in self._blocks._streamed_blocks and b._data is None: - # streamed blocks might not have data - # add a streamed block to naf - blk = naf._blocks.get_streamed_block() - # mark this block as used so it doesn't get removed - blk._used = True - elif b._data is not None or b._fd is not None: # this block has data - arr = b.data - blk = naf._blocks[arr] - blk._used = True - naf.set_array_storage(arr, b.array_storage) - naf.set_array_compression(arr, b.output_compression, **b.output_compression_kwargs) - else: # this block does not have data - key = block_to_key_mapping[b] - blk = naf._blocks.find_or_create_block(key) - blk._used = True - blk._data_callback = b._data_callback - naf._write_to( - fd, - all_array_storage=all_array_storage, - all_array_compression=all_array_compression, - compression_kwargs=compression_kwargs, - pad_blocks=pad_blocks, - include_block_index=include_block_index, - version=version, - ) - - def _write_to( - self, - fd, - all_array_storage=NotSet, - all_array_compression=NotSet, - compression_kwargs=NotSet, - pad_blocks=False, - include_block_index=True, - version=None, - ): with config_context() as config: if all_array_storage is not NotSet: config.all_array_storage = all_array_storage diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index be8a8e466..a9f79f0c6 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -1,4 +1,3 @@ -import mmap import sys import numpy as np @@ -6,6 +5,7 @@ from asdf import _types, util from asdf._jsonschema import ValidationError +from asdf.config import config_context _datatype_names = { "int8": "i1", @@ -242,7 +242,8 @@ def __init__(self, source, shape, dtype, offset, strides, order, mask, asdffile) if 
isinstance(source, list): self._array = inline_data_asarray(source, dtype) self._array = self._apply_mask(self._array, self._mask) - self._block = asdffile._blocks.add_inline(self._array) + # self._block = asdffile._blocks.add_inline(self._array) + asdffile.set_array_storage(self._array, "inline") if shape is not None and ( (shape[0] == "*" and self._array.shape[1:] != tuple(shape[1:])) or (self._array.shape != tuple(shape)) ): @@ -263,15 +264,19 @@ def _make_array(self): # closed and replaced. We need to check here and re-generate # the array if necessary, otherwise we risk segfaults when # memory mapping. - if self._array is not None: - base = util.get_array_base(self._array) - if isinstance(base, np.memmap) and isinstance(base.base, mmap.mmap) and base.base.closed: - self._array = None + # if self._array is not None: + # base = util.get_array_base(self._array) + # if isinstance(base, np.memmap) and isinstance(base.base, mmap.mmap) and base.base.closed: + # self._array = None if self._array is None: block = self.block - shape = self.get_actual_shape(self._shape, self._strides, self._dtype, len(block)) + # shape = self.get_actual_shape(self._shape, self._strides, self._dtype, len(block)) + # TODO streaming blocks have 0 data size + shape = self.get_actual_shape(self._shape, self._strides, self._dtype, block.header["data_size"]) + if callable(block._data): + block._data = block.data self._array = np.ndarray(shape, self._dtype, block.data, self._offset, self._strides, self._order) self._array = self._apply_mask(self._array, self._mask) return self._array @@ -338,7 +343,8 @@ def get_actual_shape(self, shape, strides, dtype, block_size): @property def block(self): if self._block is None: - self._block = self._asdffile._blocks.get_block(self._source) + self._block = self._asdffile._blocks.blocks[self._source] + # self._block = self._asdffile._blocks.get_block(self._source) return self._block @property @@ -346,7 +352,8 @@ def shape(self): if self._shape is None: return self.__array__().shape if "*" in self._shape: - return tuple(self.get_actual_shape(self._shape, self._strides, self._dtype, len(self.block))) + # return tuple(self.get_actual_shape(self._shape, self._strides, self._dtype, len(self.block))) + return tuple(self.get_actual_shape(self._shape, self._strides, self._dtype, self.block.header["data_size"])) return tuple(self._shape) @property @@ -451,7 +458,23 @@ def to_tree(cls, data, ctx): shape = data.shape - block = ctx._blocks.find_or_create_block_for_array(data) + options = ctx._blocks.options.get_options(data) + with config_context() as cfg: + if cfg.all_array_storage is not None: + options.storage_type = cfg.all_array_storage + if cfg.all_array_compression != "input": + options.compression = cfg.all_array_compression + options.compression_kwargs = cfg.all_array_compression_kwargs + inline_threshold = cfg.array_inline_threshold + + # block = ctx._blocks.find_or_create_block_for_array(data) + # foo + if inline_threshold is not None and options.storage_type in ("inline", "internal"): + if data.size < inline_threshold: + options.storage_type = "inline" + else: + options.storage_type = "internal" + ctx._blocks.options.set_options(data, options) # Compute the offset relative to the base array and not the # block data, in case the block is compressed. 
@@ -460,26 +483,31 @@ def to_tree(cls, data, ctx): strides = None if data.flags.c_contiguous else data.strides dtype, byteorder = numpy_dtype_to_asdf_datatype( data.dtype, - include_byteorder=(block.array_storage != "inline"), + # include_byteorder=(block.array_storage != "inline"), + include_byteorder=(options.storage_type != "inline"), ) result = {} result["shape"] = list(shape) - if block.array_storage == "streamed": + # if block.array_storage == "streamed": + if options.storage_type == "streamed": result["shape"][0] = "*" - if block.array_storage == "inline": + # if block.array_storage == "inline": + if options.storage_type == "inline": listdata = numpy_array_to_list(data) result["data"] = listdata result["datatype"] = dtype else: result["shape"] = list(shape) - if block.array_storage == "streamed": + if options.storage_type == "streamed": result["shape"][0] = "*" - result["source"] = ctx._blocks.get_source(block) + # result["source"] = ctx._blocks.get_source(block) + # convert data to byte array + result["source"] = ctx._blocks.make_write_block(base, options) result["datatype"] = dtype result["byteorder"] = byteorder @@ -490,8 +518,10 @@ def to_tree(cls, data, ctx): result["strides"] = list(strides) if isinstance(data, ma.MaskedArray) and np.any(data.mask): - if block.array_storage == "inline": - ctx._blocks.set_array_storage(ctx._blocks[data.mask], "inline") + # if block.array_storage == "inline": + if options.storage_type == "inline": + # ctx._blocks.set_array_storage(ctx._blocks[data.mask], "inline") + ctx.set_array_storage(data.mask, "inline") result["mask"] = data.mask @@ -538,13 +568,13 @@ def assert_allclose(cls, old, new): else: cls._assert_equality(old, new, assert_allclose) - @classmethod - def copy_to_new_asdf(cls, node, asdffile): - if isinstance(node, NDArrayType): - array = node._make_array() - asdffile._blocks.set_array_storage(asdffile._blocks[array], node.block.array_storage) - return node._make_array() - return node + # @classmethod + # def copy_to_new_asdf(cls, node, asdffile): + # if isinstance(node, NDArrayType): + # array = node._make_array() + # asdffile._blocks.set_array_storage(asdffile._blocks[array], node.block.array_storage) + # return node._make_array() + # return node def _make_operation(name): From 7944f03ff622e1e0fea6b25f70404ef42ebed8b7 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 19 Apr 2023 18:39:50 -0400 Subject: [PATCH 010/154] fix resolve_and_inline --- asdf/_tests/tags/core/tests/test_ndarray.py | 1 - asdf/_tests/test_api.py | 1 - asdf/asdf.py | 5 ++--- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/asdf/_tests/tags/core/tests/test_ndarray.py b/asdf/_tests/tags/core/tests/test_ndarray.py index eff843ebf..40e4c93cc 100644 --- a/asdf/_tests/tags/core/tests/test_ndarray.py +++ b/asdf/_tests/tags/core/tests/test_ndarray.py @@ -397,7 +397,6 @@ def test_simple_table(): ff.write_to(io.BytesIO()) -@pytest.mark.xfail(reason="resolve and inline is broken") def test_unicode_to_list(tmpdir): arr = np.array(["", "𐀠"], dtype=" Date: Thu, 20 Apr 2023 14:38:24 -0400 Subject: [PATCH 011/154] stream blocks working --- asdf/_block/manager.py | 15 ++++++++++++++- asdf/_block/writer.py | 5 ++++- asdf/_tests/test_api.py | 2 +- asdf/_tests/test_stream.py | 16 +++------------- asdf/asdf.py | 7 +++++-- asdf/stream.py | 6 ++++-- asdf/tags/core/ndarray.py | 34 ++++++++++++++++++++++++++++------ 7 files changed, 59 insertions(+), 26 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 795d94845..8d5b27e7b 100644 --- 
a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -81,6 +81,13 @@ def get_options(self, array): return options def set_options(self, array, options): + if options.storage_type == "streamed": + for d in self._by_id.values(): + for opt in d.values(): + if opt.storage_type == "streamed": + if opt is options: + continue + raise ValueError("Can not add second streaming block") base = util.get_array_base(array) self.assign_object(base, options) @@ -95,8 +102,9 @@ def __init__(self, read_blocks=None): self.blocks = self.options._read_blocks else: self.blocks = read_blocks - self._write_blocks = [] # TODO copy options and read_blocks on start of write + self._write_blocks = [] + self._streamed_block = None def make_write_block(self, data, options): # first, look for an existing block @@ -109,6 +117,11 @@ def make_write_block(self, data, options): # self._write_blocks.append(WriteBlock(data_bytes, options.compression, options.compression_kwargs)) return len(self._write_blocks) - 1 + def set_streamed_block(self, data): + if self._streamed_block is not None and data is not self._streamed_block.data: + raise ValueError("Can not add second streaming block") + self._streamed_block = WriteBlock(data) + # cludges for tests @property def internal_blocks(self): diff --git a/asdf/_block/writer.py b/asdf/_block/writer.py index 54546ffc2..e362b4c89 100644 --- a/asdf/_block/writer.py +++ b/asdf/_block/writer.py @@ -19,7 +19,10 @@ def data(self): @property def data_bytes(self): - return np.ndarray(-1, np.uint8, self.data.ravel(order="K").data) + data = self.data + if data is not None: + return np.ndarray(-1, np.uint8, data.ravel(order="K").data) + return np.ndarray(0, np.uint8) def write_blocks(fd, blocks, padding=False, streamed_block=None, write_index=True): diff --git a/asdf/_tests/test_api.py b/asdf/_tests/test_api.py index 88d8ffcbb..36bcb0757 100644 --- a/asdf/_tests/test_api.py +++ b/asdf/_tests/test_api.py @@ -147,7 +147,7 @@ def test_default_version(): assert ff.file_format_version == version_map["FILE_FORMAT"] -@pytest.mark.xfail +@pytest.mark.xfail(reason="update is broken") def test_update_exceptions(tmp_path): path = str(tmp_path / "test.asdf") diff --git a/asdf/_tests/test_stream.py b/asdf/_tests/test_stream.py index 2ecac46e7..eacb406dc 100644 --- a/asdf/_tests/test_stream.py +++ b/asdf/_tests/test_stream.py @@ -9,7 +9,6 @@ from asdf import generic_io, stream -@pytest.mark.xfail(reason="stream is broken") def test_stream(): buff = io.BytesIO() @@ -29,7 +28,6 @@ def test_stream(): assert np.all(row == i) -@pytest.mark.xfail(reason="stream is broken") def test_stream_write_nothing(): """ Test that if you write nothing, you get a zero-length array @@ -49,7 +47,6 @@ def test_stream_write_nothing(): assert ff.tree["stream"].shape == (0, 6, 2) -@pytest.mark.xfail(reason="stream is broken") def test_stream_twice(): """ Test that if you write nothing, you get a zero-length array @@ -72,7 +69,6 @@ def test_stream_twice(): assert ff.tree["stream2"].shape == (50, 12, 2) -@pytest.mark.xfail(reason="stream is broken") def test_stream_with_nonstream(): buff = io.BytesIO() @@ -89,15 +85,13 @@ def test_stream_with_nonstream(): buff.seek(0) with asdf.open(buff) as ff: - assert len(ff._blocks) == 1 + assert len(ff._blocks) == 2 assert_array_equal(ff.tree["nonstream"], np.array([1, 2, 3, 4], np.int64)) assert ff.tree["stream"].shape == (100, 6, 2) - assert len(ff._blocks) == 2 for i, row in enumerate(ff.tree["stream"]): assert np.all(row == i) -@pytest.mark.xfail(reason="stream is broken") def 
test_stream_real_file(tmp_path): path = os.path.join(str(tmp_path), "test.asdf") @@ -114,15 +108,13 @@ def test_stream_real_file(tmp_path): fd.write(np.array([i] * 12, np.float64).tobytes()) with asdf.open(path) as ff: - assert len(ff._blocks) == 1 + assert len(ff._blocks) == 2 assert_array_equal(ff.tree["nonstream"], np.array([1, 2, 3, 4], np.int64)) assert ff.tree["stream"].shape == (100, 6, 2) - assert len(ff._blocks) == 2 for i, row in enumerate(ff.tree["stream"]): assert np.all(row == i) -@pytest.mark.xfail(reason="stream is broken") def test_stream_to_stream(): tree = {"nonstream": np.array([1, 2, 3, 4], np.int64), "stream": stream.Stream([6, 2], np.float64)} @@ -144,7 +136,6 @@ def test_stream_to_stream(): assert np.all(row == i) -@pytest.mark.xfail(reason="stream is broken") def test_array_to_stream(tmp_path): tree = { "stream": np.array([1, 2, 3, 4], np.int64), @@ -161,6 +152,7 @@ def test_array_to_stream(tmp_path): assert_array_equal(ff.tree["stream"], [1, 2, 3, 4, 5, 6, 7, 8]) buff.seek(0) ff2 = asdf.AsdfFile(ff) + ff2.set_array_storage(ff2["stream"], "streamed") ff2.write_to(buff) assert b"shape: ['*']" in buff.getvalue() @@ -177,7 +169,6 @@ def test_array_to_stream(tmp_path): assert b"shape: ['*']" in buff.getvalue() -@pytest.mark.xfail(reason="stream is broken") def test_too_many_streams(): tree = {"stream1": np.array([1, 2, 3, 4], np.int64), "stream2": np.array([1, 2, 3, 4], np.int64)} @@ -187,7 +178,6 @@ def test_too_many_streams(): ff.set_array_storage(tree["stream2"], "streamed") -@pytest.mark.xfail(reason="stream is broken") def test_stream_repr_and_str(): tree = {"stream": stream.Stream([16], np.int64)} diff --git a/asdf/asdf.py b/asdf/asdf.py index 5f204190a..cd4a71d3e 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -985,15 +985,18 @@ def _pre_write(self, fd): def _serial_write(self, fd, pad_blocks, include_block_index): self._blocks._write_blocks = [] + self._blocks._streamed_block = None self._write_tree(self._tree, fd, pad_blocks) - if len(self._blocks._write_blocks): + if len(self._blocks._write_blocks) or self._blocks._streamed_block: block_writer.write_blocks( fd, self._blocks._write_blocks, pad_blocks, - streamed_block=None, # TODO streamed block + streamed_block=self._blocks._streamed_block, write_index=include_block_index, ) + self._blocks._write_blocks = [] + self._blocks._streamed_block = None # TODO external blocks # self._blocks.write_internal_blocks_serial(fd, pad_blocks) # self._blocks.write_external_blocks(fd.uri, pad_blocks) diff --git a/asdf/stream.py b/asdf/stream.py index 05984ba97..586af891e 100644 --- a/asdf/stream.py +++ b/asdf/stream.py @@ -40,11 +40,13 @@ def reserve_blocks(cls, data, ctx): @classmethod def from_tree(cls, data, ctx): - return ndarray.NDArrayType.from_tree(data, ctx) + # this is never called because tags always trigger loading with NDArrayType + raise NotImplementedError("never called") @classmethod def to_tree(cls, data, ctx): - ctx._blocks.get_streamed_block() + # TODO previously, stream never passed on data? 
+ ctx._blocks.set_streamed_block(data._array) result = {} result["source"] = -1 diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index a9f79f0c6..0da474e0f 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -271,12 +271,17 @@ def _make_array(self): if self._array is None: block = self.block - # shape = self.get_actual_shape(self._shape, self._strides, self._dtype, len(block)) - # TODO streaming blocks have 0 data size - shape = self.get_actual_shape(self._shape, self._strides, self._dtype, block.header["data_size"]) if callable(block._data): block._data = block.data + + # streaming blocks have 0 data size + shape = self.get_actual_shape( + self._shape, + self._strides, + self._dtype, + block.header["data_size"] or block._data.size, + ) self._array = np.ndarray(shape, self._dtype, block.data, self._offset, self._strides, self._order) self._array = self._apply_mask(self._array, self._mask) return self._array @@ -344,6 +349,11 @@ def get_actual_shape(self, shape, strides, dtype, block_size): def block(self): if self._block is None: self._block = self._asdffile._blocks.blocks[self._source] + if self._source == -1: + if callable(self._block._data): + self._block._data = self._block.data + if self._asdffile.get_array_storage(self._block.data) != "streamed": + self._asdffile.set_array_storage(self._block.data, "streamed") # self._block = self._asdffile._blocks.get_block(self._source) return self._block @@ -353,7 +363,16 @@ def shape(self): return self.__array__().shape if "*" in self._shape: # return tuple(self.get_actual_shape(self._shape, self._strides, self._dtype, len(self.block))) - return tuple(self.get_actual_shape(self._shape, self._strides, self._dtype, self.block.header["data_size"])) + if not self.block.header["data_size"]: + self._make_array() + return tuple( + self.get_actual_shape( + self._shape, + self._strides, + self._dtype, + self.block.header["data_size"] or self.block._data.size, + ) + ) return tuple(self._shape) @property @@ -490,7 +509,6 @@ def to_tree(cls, data, ctx): result = {} result["shape"] = list(shape) - # if block.array_storage == "streamed": if options.storage_type == "streamed": result["shape"][0] = "*" @@ -507,7 +525,11 @@ def to_tree(cls, data, ctx): # result["source"] = ctx._blocks.get_source(block) # convert data to byte array - result["source"] = ctx._blocks.make_write_block(base, options) + if options.storage_type == "streamed": + ctx._blocks.set_streamed_block(base) + result["source"] = -1 + else: + result["source"] = ctx._blocks.make_write_block(base, options) result["datatype"] = dtype result["byteorder"] = byteorder From b912b78d21eb32183dfeda5e2921e50fb09a19cf Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 21 Apr 2023 09:52:55 -0400 Subject: [PATCH 012/154] external blocks working --- asdf/_block/manager.py | 59 ++++++++++++++++++++- asdf/_tests/commands/tests/test_exploded.py | 2 - asdf/_tests/test_array_blocks.py | 1 - asdf/_tests/test_generic_io.py | 12 ++--- asdf/asdf.py | 11 ++-- asdf/tags/core/ndarray.py | 16 +++--- 6 files changed, 75 insertions(+), 26 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 8d5b27e7b..237eeafc4 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -1,8 +1,10 @@ -from asdf import constants, util +import os + +from asdf import constants, generic_io, util from . 
import store from .options import Options -from .writer import WriteBlock +from .writer import WriteBlock, write_blocks class ReadBlocks(store.LinearStore): @@ -95,6 +97,30 @@ def set_options(self, array, options): # TODO make an 'update_options' +def make_external_uri(uri, index): + if uri is None: + uri = "" + parts = list(util.patched_urllib_parse.urlparse(uri)) + path = parts[2] + dirname, filename = os.path.split(path) + filename = os.path.splitext(filename)[0] + f"{index:04d}.asdf" + return filename + # path = os.path.join(dirname, filename) + # parts[2] = path + # return util.patched_urllib_parse.urlunparse(parts) + + +def resolve_external_uri(uri, relative): + if uri is None: + uri = "" + parts = list(util.patched_urllib_parse.urlparse(uri)) + path = parts[2] + dirname, filename = os.path.split(path) + path = os.path.join(dirname, relative) + parts[2] = path + return util.patched_urllib_parse.urlunparse(parts) + + class Manager: def __init__(self, read_blocks=None): self.options = BlockOptions(read_blocks) @@ -104,9 +130,38 @@ def __init__(self, read_blocks=None): self.blocks = read_blocks # TODO copy options and read_blocks on start of write self._write_blocks = [] + self._external_write_blocks = [] self._streamed_block = None + self._write_fd = None + + def _write_external_blocks(self): + from asdf import AsdfFile + + if not len(self._external_write_blocks): + return + + if self._write_fd.uri is None: + raise ValueError("Can't write external blocks, since URI of main file is unknown.") + + for blk in self._external_write_blocks: + uri = resolve_external_uri(self._write_fd.uri, blk._uri) + af = AsdfFile() + with generic_io.get_file(uri, mode="w") as f: + af.write_to(f, include_block_index=False) + write_blocks(f, [blk]) def make_write_block(self, data, options): + if options.storage_type == "external": + for index, blk in enumerate(self._external_write_blocks): + if blk._data is data: + # this external uri is already ready to go + return blk._uri + # need to set up new external block + index = len(self._external_write_blocks) + blk = WriteBlock(data, options.compression, options.compression_kwargs) + blk._uri = make_external_uri(self._write_fd.uri, index) + self._external_write_blocks.append(blk) + return blk._uri # first, look for an existing block for index, blk in enumerate(self._write_blocks): if blk._data is data: diff --git a/asdf/_tests/commands/tests/test_exploded.py b/asdf/_tests/commands/tests/test_exploded.py index ea9dc680d..e431fbb2d 100644 --- a/asdf/_tests/commands/tests/test_exploded.py +++ b/asdf/_tests/commands/tests/test_exploded.py @@ -1,7 +1,6 @@ import os import numpy as np -import pytest import asdf from asdf import AsdfFile @@ -9,7 +8,6 @@ from asdf.commands import main -@pytest.mark.xfail(reason="external blocks are broken") def test_explode_then_implode(tmpdir): x = np.arange(0, 10, dtype=float) diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py index 5e303e71f..f3f7e872b 100644 --- a/asdf/_tests/test_array_blocks.py +++ b/asdf/_tests/test_array_blocks.py @@ -15,7 +15,6 @@ RNG = np.random.default_rng(6) -@pytest.mark.xfail(reason="external blocks are broken") def test_external_block(tmp_path): tmp_path = str(tmp_path) diff --git a/asdf/_tests/test_generic_io.py b/asdf/_tests/test_generic_io.py index 38e6e18d4..474372268 100644 --- a/asdf/_tests/test_generic_io.py +++ b/asdf/_tests/test_generic_io.py @@ -273,12 +273,11 @@ def get_read_fd(): return fd with _roundtrip(tree, get_write_fd, get_read_fd) as ff: - assert 
len(list(ff._blocks.internal_blocks)) == 2 - assert isinstance(next(ff._blocks.internal_blocks)._data, np.ndarray) + assert len(ff._blocks.blocks) == 2 + assert isinstance(ff._blocks.blocks[0]._data, np.ndarray) assert (ff.tree["science_data"] == tree["science_data"]).all() -@pytest.mark.xfail(reason="external blocks are broken") def test_exploded_filesystem(tree, tmp_path): path = os.path.join(str(tmp_path), "test.asdf") @@ -289,11 +288,9 @@ def get_read_fd(): return generic_io.get_file(path, mode="r") with _roundtrip(tree, get_write_fd, get_read_fd, write_options={"all_array_storage": "external"}) as ff: - assert len(list(ff._blocks.internal_blocks)) == 0 - assert len(list(ff._blocks.external_blocks)) == 2 + assert len(ff._blocks.blocks) == 0 -@pytest.mark.xfail(reason="external blocks are broken") def test_exploded_filesystem_fail(tree, tmp_path): path = os.path.join(str(tmp_path), "test.asdf") @@ -314,7 +311,6 @@ def get_read_fd(): helpers.assert_tree_match(tree, ff.tree) -@pytest.mark.xfail(reason="external blocks are broken") @pytest.mark.remote_data() def test_exploded_http(tree, httpserver): path = os.path.join(httpserver.tmpdir, "test.asdf") @@ -330,7 +326,6 @@ def get_read_fd(): assert len(list(ff._blocks.external_blocks)) == 2 -@pytest.mark.xfail(reason="external blocks are broken") def test_exploded_stream_write(small_tree): # Writing an exploded file to an output stream should fail, since # we can't write "files" alongside it. @@ -341,7 +336,6 @@ def test_exploded_stream_write(small_tree): ff.write_to(io.BytesIO(), all_array_storage="external") -@pytest.mark.xfail(reason="external blocks are broken") def test_exploded_stream_read(tmp_path, small_tree): # Reading from an exploded input file should fail, but only once # the data block is accessed. This behavior is important so that diff --git a/asdf/asdf.py b/asdf/asdf.py index cd4a71d3e..3905edfb5 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -985,6 +985,7 @@ def _pre_write(self, fd): def _serial_write(self, fd, pad_blocks, include_block_index): self._blocks._write_blocks = [] + self._blocks._external_write_blocks = [] self._blocks._streamed_block = None self._write_tree(self._tree, fd, pad_blocks) if len(self._blocks._write_blocks) or self._blocks._streamed_block: @@ -995,7 +996,10 @@ def _serial_write(self, fd, pad_blocks, include_block_index): streamed_block=self._blocks._streamed_block, write_index=include_block_index, ) + if len(self._blocks._external_write_blocks): + self._blocks._write_external_blocks() self._blocks._write_blocks = [] + self._blocks._external_write_blocks = [] self._blocks._streamed_block = None # TODO external blocks # self._blocks.write_internal_blocks_serial(fd, pad_blocks) @@ -1248,11 +1252,7 @@ def write_to( self.version = version with generic_io.get_file(fd, mode="w") as fd: - # TODO: This is not ideal: we really should pass the URI through - # explicitly to wherever it is required instead of making it an - # attribute of the AsdfFile. 
- if self._uri is None: - self._uri = fd.uri + self._blocks._write_fd = fd self._pre_write(fd) try: @@ -1260,6 +1260,7 @@ def write_to( fd.flush() finally: self._post_write(fd) + self._blocks._write_fd = None def find_references(self): """ diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index 0da474e0f..93156107c 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -348,13 +348,15 @@ def get_actual_shape(self, shape, strides, dtype, block_size): @property def block(self): if self._block is None: - self._block = self._asdffile._blocks.blocks[self._source] - if self._source == -1: - if callable(self._block._data): - self._block._data = self._block.data - if self._asdffile.get_array_storage(self._block.data) != "streamed": - self._asdffile.set_array_storage(self._block.data, "streamed") - # self._block = self._asdffile._blocks.get_block(self._source) + if isinstance(self._source, str): + self._block = self._asdffile.open_external(self._source)._blocks.blocks[0] + else: + self._block = self._asdffile._blocks.blocks[self._source] + if self._source == -1: + if callable(self._block._data): + self._block._data = self._block.data + if self._asdffile.get_array_storage(self._block.data) != "streamed": + self._asdffile.set_array_storage(self._block.data, "streamed") return self._block @property From eecd86207906c671c9ad0376ec45eca7519affe4 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 27 Apr 2023 13:32:10 -0400 Subject: [PATCH 013/154] add block data caching for ndarraytype --- asdf/_block/manager.py | 2 +- asdf/_block/reader.py | 7 +++++++ asdf/_tests/test_generic_io.py | 12 +++++------ asdf/tags/core/ndarray.py | 37 ++++++++++++++++++++-------------- 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 237eeafc4..1f4e98149 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -67,7 +67,7 @@ def get_options(self, array): if options is None: # look up by block with matching _data for block in self._read_blocks: - if block._data is base: + if block._cached_data is base or block._data is base: # init options if block.header["flags"] & constants.BLOCK_FLAG_STREAMED: storage_type = "streamed" diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py index 9f0974ebb..3f7af43c7 100644 --- a/asdf/_block/reader.py +++ b/asdf/_block/reader.py @@ -12,6 +12,7 @@ def __init__(self, offset, fd, memmap, lazy_load, header=None, data_offset=None, self._header = header self.data_offset = data_offset self._data = data + self._cached_data = None # TODO alternative to passing these down? 
self.memmap = memmap self.lazy_load = lazy_load @@ -41,6 +42,12 @@ def data(self): return self._data() return self._data + @property + def cached_data(self): + if self._cached_data is None: + self._cached_data = self.data + return self._cached_data + @property def header(self): if not self.loaded: diff --git a/asdf/_tests/test_generic_io.py b/asdf/_tests/test_generic_io.py index 474372268..6bb3c929f 100644 --- a/asdf/_tests/test_generic_io.py +++ b/asdf/_tests/test_generic_io.py @@ -113,7 +113,7 @@ def get_read_fd(): with _roundtrip(tree, get_write_fd, get_read_fd) as ff: assert len(ff._blocks.blocks) == 2 - assert isinstance(ff._blocks.blocks[0].data, np.core.memmap) + assert isinstance(ff._blocks.blocks[0].cached_data, np.core.memmap) def test_open2(tree, tmp_path): @@ -136,7 +136,7 @@ def get_read_fd(): with _roundtrip(tree, get_write_fd, get_read_fd) as ff: assert len(ff._blocks.blocks) == 2 - assert isinstance(ff._blocks.blocks[0].data, np.core.memmap) + assert isinstance(ff._blocks.blocks[0].cached_data, np.core.memmap) @pytest.mark.parametrize("mode", ["r", "w", "rw"]) @@ -173,7 +173,7 @@ def get_read_fd(): with _roundtrip(tree, get_write_fd, get_read_fd) as ff: assert len(ff._blocks.blocks) == 2 - assert isinstance(ff._blocks.blocks[0].data, np.core.memmap) + assert isinstance(ff._blocks.blocks[0].cached_data, np.core.memmap) ff.tree["science_data"][0] = 42 @@ -209,7 +209,7 @@ def get_read_fd(): with _roundtrip(tree, get_write_fd, get_read_fd) as ff: assert len(ff._blocks.blocks) == 2 - assert not isinstance(ff._blocks.blocks[0].data, np.core.memmap) + assert not isinstance(ff._blocks.blocks[0].cached_data, np.core.memmap) ff.tree["science_data"][0] = 42 @@ -225,7 +225,7 @@ def get_read_fd(): with _roundtrip(tree, get_write_fd, get_read_fd) as ff: assert len(ff._blocks.blocks) == 2 - assert not isinstance(ff._blocks.blocks[0].data, np.core.memmap) + assert not isinstance(ff._blocks.blocks[0].cached_data, np.core.memmap) ff.tree["science_data"][0] = 42 @@ -253,7 +253,7 @@ def get_read_fd(): with _roundtrip(tree, get_write_fd, get_read_fd) as ff: assert len(ff._blocks.blocks) == 2 - assert not isinstance(ff._blocks.blocks[0].data, np.core.memmap) + assert not isinstance(ff._blocks.blocks[0].cached_data, np.core.memmap) @pytest.mark.remote_data() diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index 93156107c..946ece166 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -1,3 +1,4 @@ +import mmap import sys import numpy as np @@ -264,25 +265,29 @@ def _make_array(self): # closed and replaced. We need to check here and re-generate # the array if necessary, otherwise we risk segfaults when # memory mapping. 
- # if self._array is not None: - # base = util.get_array_base(self._array) - # if isinstance(base, np.memmap) and isinstance(base.base, mmap.mmap) and base.base.closed: - # self._array = None + if self._array is not None: + base = util.get_array_base(self._array) + if isinstance(base, np.memmap) and isinstance(base.base, mmap.mmap) and base.base.closed: + self._array = None if self._array is None: block = self.block - if callable(block._data): - block._data = block.data + # cached data is used here so that multiple NDArrayTypes will all use + # the same base array + data = block.cached_data + + if hasattr(data, "base") and isinstance(data.base, mmap.mmap) and data.base.closed: + raise OSError("Attempt to read data from a closed file") # streaming blocks have 0 data size shape = self.get_actual_shape( self._shape, self._strides, self._dtype, - block.header["data_size"] or block._data.size, + block.header["data_size"] or data.size, ) - self._array = np.ndarray(shape, self._dtype, block.data, self._offset, self._strides, self._order) + self._array = np.ndarray(shape, self._dtype, data, self._offset, self._strides, self._order) self._array = self._apply_mask(self._array, self._mask) return self._array @@ -353,10 +358,9 @@ def block(self): else: self._block = self._asdffile._blocks.blocks[self._source] if self._source == -1: - if callable(self._block._data): - self._block._data = self._block.data - if self._asdffile.get_array_storage(self._block.data) != "streamed": - self._asdffile.set_array_storage(self._block.data, "streamed") + arr = self._make_array() + if self._asdffile.get_array_storage(arr) != "streamed": + self._asdffile.set_array_storage(arr, "streamed") return self._block @property @@ -366,13 +370,13 @@ def shape(self): if "*" in self._shape: # return tuple(self.get_actual_shape(self._shape, self._strides, self._dtype, len(self.block))) if not self.block.header["data_size"]: - self._make_array() + return self._make_array().shape return tuple( self.get_actual_shape( self._shape, self._strides, self._dtype, - self.block.header["data_size"] or self.block._data.size, + self.block.header["data_size"], ) ) return tuple(self._shape) @@ -447,7 +451,10 @@ def from_tree(cls, node, ctx): strides = node.get("strides", None) mask = node.get("mask", None) - return cls(source, shape, dtype, offset, strides, "A", mask, ctx) + instance = cls(source, shape, dtype, offset, strides, "A", mask, ctx) + if isinstance(source, int): + ctx._blocks.blocks.assign_object(instance, ctx._blocks.blocks[source]) + return instance msg = "Invalid ndarray description." 
raise TypeError(msg) From e610dd6ac057443fe64e962ab28cdfe7b3c4bf36 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 27 Apr 2023 17:26:36 -0400 Subject: [PATCH 014/154] convert ndarray to using callback instead of raw block access --- asdf/_block/callback.py | 19 ++++++++++++ asdf/_block/manager.py | 5 ++++ asdf/_block/reader.py | 2 +- asdf/tags/core/ndarray.py | 63 +++++++++++++++++---------------------- 4 files changed, 52 insertions(+), 37 deletions(-) create mode 100644 asdf/_block/callback.py diff --git a/asdf/_block/callback.py b/asdf/_block/callback.py new file mode 100644 index 000000000..8c07d526b --- /dev/null +++ b/asdf/_block/callback.py @@ -0,0 +1,19 @@ +import weakref + + +class DataCallback: + def __init__(self, index, read_blocks): + self._index = index + self._read_blocks_ref = weakref.ref(read_blocks) + + def __call__(self, _attr=None): + read_blocks = self._read_blocks_ref() + if read_blocks is None: + msg = "Attempt to read block data from missing block" + raise OSError(msg) + if _attr is None: + return read_blocks[self._index].data + else: + # _attr allows NDArrayType to have low level block access for things + # like reading the header and cached_data + return getattr(read_blocks[self._index], _attr) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 1f4e98149..0a6841a86 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -3,6 +3,7 @@ from asdf import constants, generic_io, util from . import store +from .callback import DataCallback from .options import Options from .writer import WriteBlock, write_blocks @@ -128,6 +129,7 @@ def __init__(self, read_blocks=None): self.blocks = self.options._read_blocks else: self.blocks = read_blocks + self._data_callbacks = store.Store() # TODO copy options and read_blocks on start of write self._write_blocks = [] self._external_write_blocks = [] @@ -177,6 +179,9 @@ def set_streamed_block(self, data): raise ValueError("Can not add second streaming block") self._streamed_block = WriteBlock(data) + def _get_data_callback(self, index): + return DataCallback(index, self.blocks) + # cludges for tests @property def internal_blocks(self): diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py index 3f7af43c7..ad0b48ca2 100644 --- a/asdf/_block/reader.py +++ b/asdf/_block/reader.py @@ -7,7 +7,7 @@ class ReadBlock: def __init__(self, offset, fd, memmap, lazy_load, header=None, data_offset=None, data=None): - self.offset = offset + self.offset = offset # after magic self._fd = weakref.ref(fd) self._header = header self.data_offset = data_offset diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index 946ece166..af9f647cd 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -235,15 +235,17 @@ class NDArrayType(_types._AsdfType): def __init__(self, source, shape, dtype, offset, strides, order, mask, asdffile): self._asdffile = asdffile + # source can be a: + # - list of numbers for an inline block + # - string for an external block + # - a data callback for an internal block self._source = source - self._block = None self._array = None self._mask = mask if isinstance(source, list): self._array = inline_data_asarray(source, dtype) self._array = self._apply_mask(self._array, self._mask) - # self._block = asdffile._blocks.add_inline(self._array) asdffile.set_array_storage(self._array, "inline") if shape is not None and ( (shape[0] == "*" and self._array.shape[1:] != tuple(shape[1:])) or (self._array.shape != tuple(shape)) @@ -271,11 +273,14 @@ def _make_array(self): self._array 
= None if self._array is None: - block = self.block - - # cached data is used here so that multiple NDArrayTypes will all use - # the same base array - data = block.cached_data + if isinstance(self._source, str): + data = ( + self._asdffile.open_external(self._source, lazy_load=False, copy_arrays=True)._blocks.blocks[0].data + ) + else: + # cached data is used here so that multiple NDArrayTypes will all use + # the same base array + data = self._source(_attr="cached_data") if hasattr(data, "base") and isinstance(data.base, mmap.mmap) and data.base.closed: raise OSError("Attempt to read data from a closed file") @@ -285,7 +290,7 @@ def _make_array(self): self._shape, self._strides, self._dtype, - block.header["data_size"] or data.size, + data.size, ) self._array = np.ndarray(shape, self._dtype, data, self._offset, self._strides, self._order) self._array = self._apply_mask(self._array, self._mask) @@ -350,33 +355,22 @@ def get_actual_shape(self, shape, strides, dtype, block_size): msg = f"Invalid shape '{shape}'" raise ValueError(msg) - @property - def block(self): - if self._block is None: - if isinstance(self._source, str): - self._block = self._asdffile.open_external(self._source)._blocks.blocks[0] - else: - self._block = self._asdffile._blocks.blocks[self._source] - if self._source == -1: - arr = self._make_array() - if self._asdffile.get_array_storage(arr) != "streamed": - self._asdffile.set_array_storage(arr, "streamed") - return self._block - @property def shape(self): - if self._shape is None: + if self._shape is None or self._array is not None: return self.__array__().shape if "*" in self._shape: - # return tuple(self.get_actual_shape(self._shape, self._strides, self._dtype, len(self.block))) - if not self.block.header["data_size"]: + if isinstance(self._source, str): + return self._make_array().shape + data_size = self._source(_attr="header")["data_size"] + if not data_size: return self._make_array().shape return tuple( self.get_actual_shape( self._shape, self._strides, self._dtype, - self.block.header["data_size"], + data_size, ) ) return tuple(self._shape) @@ -451,23 +445,20 @@ def from_tree(cls, node, ctx): strides = node.get("strides", None) mask = node.get("mask", None) - instance = cls(source, shape, dtype, offset, strides, "A", mask, ctx) if isinstance(source, int): - ctx._blocks.blocks.assign_object(instance, ctx._blocks.blocks[source]) + block_index = source + source = ctx._blocks._get_data_callback(source) + else: + block_index = None + instance = cls(source, shape, dtype, offset, strides, "A", mask, ctx) + if block_index is not None: + ctx._blocks.blocks.assign_object(instance, ctx._blocks.blocks[block_index]) + ctx._blocks._data_callbacks.assign_object(instance, source) return instance msg = "Invalid ndarray description." raise TypeError(msg) - @classmethod - def reserve_blocks(cls, data, ctx): - # Find all of the used data buffers so we can add or rearrange - # them if necessary - if isinstance(data, np.ndarray): - yield ctx._blocks.find_or_create_block_for_array(data) - elif isinstance(data, NDArrayType): - yield data.block - @classmethod def to_tree(cls, data, ctx): # The ndarray-1.0.0 schema does not permit 0 valued strides. 
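A short usage sketch of the DataCallback added in the patch above (illustrative only: `manager` is a hypothetical Manager instance, and the `_attr` calls mirror how NDArrayType uses its `source` callback in the diff; this is development code, not a released asdf API):

from asdf._block.callback import DataCallback  # module added by this patch series

callback = DataCallback(index=0, read_blocks=manager.blocks)  # manager: hypothetical Manager
data = callback()                     # lazily fetch block 0's data through a weakref to the read blocks
header = callback(_attr="header")     # low-level access, e.g. header["data_size"] for streamed blocks
base = callback(_attr="cached_data")  # shared base array so multiple NDArrayTypes reuse one buffer

Because the callback only holds a weak reference to the read-block store, calling it after that store is gone raises an OSError rather than silently returning stale data.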
From 9aae606d8aa1b14a8bcfe8f5186cd1b270ed82ec Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 27 Apr 2023 17:46:17 -0400 Subject: [PATCH 015/154] remove cludges added to new block manager these allowed smaller updates to tests by providing a few methods that performed similar to the old block manager --- asdf/_block/manager.py | 26 +++++++++++---------- asdf/_tests/_helpers.py | 4 ++-- asdf/_tests/commands/tests/test_exploded.py | 4 ++-- asdf/_tests/tags/core/tests/test_integer.py | 2 +- asdf/_tests/tags/core/tests/test_ndarray.py | 8 +++---- asdf/_tests/test_array_blocks.py | 12 +++++----- asdf/_tests/test_file_format.py | 4 ++-- asdf/_tests/test_stream.py | 12 +++++----- asdf/asdf.py | 14 ++++------- 9 files changed, 42 insertions(+), 44 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 0a6841a86..b74e398bf 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -182,19 +182,21 @@ def set_streamed_block(self, data): def _get_data_callback(self, index): return DataCallback(index, self.blocks) - # cludges for tests - @property - def internal_blocks(self): - return self.blocks - - @property - def _internal_blocks(self): - return self.blocks - - def set_array_storage(self, data, storage): + def _set_array_storage(self, data, storage): options = self.options.get_options(data) options.storage_type = storage self.options.set_options(data, options) - def __len__(self): - return len(self.blocks) + def _get_array_storage(self, data): + return self.options.get_options(data).storage_type + + def _set_array_compression(self, arr, compression, **compression_kwargs): + options = self.options.get_options(arr) + options.compression = compression + options.compression_kwargs = compression_kwargs + + def _get_array_compression(self, arr): + return self.options.get_options(arr).compression + + def _get_array_compression_kwargs(self, arr): + return self.options.get_options(arr).compression_kwargs diff --git a/asdf/_tests/_helpers.py b/asdf/_tests/_helpers.py index 31dc24ede..75a48dd37 100644 --- a/asdf/_tests/_helpers.py +++ b/asdf/_tests/_helpers.py @@ -262,7 +262,7 @@ def _assert_roundtrip_tree( buff.seek(0) ff = asdf.open(buff, extensions=extensions, copy_arrays=True, lazy_load=False) # Ensure that all the blocks are loaded - for block in ff._blocks._internal_blocks: + for block in ff._blocks.blocks: # assert isinstance(block, Block) assert block._data is not None and not callable(block._data) # The underlying file is closed at this time and everything should still work @@ -273,7 +273,7 @@ def _assert_roundtrip_tree( # Now repeat with copy_arrays=False and a real file to test mmap() AsdfFile(tree, extensions=extensions, **init_options).write_to(fname, **write_options) with asdf.open(fname, mode="rw", extensions=extensions, copy_arrays=False, lazy_load=False) as ff: - for block in ff._blocks._internal_blocks: + for block in ff._blocks.blocks: # assert isinstance(block, Block) assert block._data is not None and not callable(block._data) assert_tree_match(tree, ff.tree, ff, funcname=tree_match_func) diff --git a/asdf/_tests/commands/tests/test_exploded.py b/asdf/_tests/commands/tests/test_exploded.py index e431fbb2d..1c08f4272 100644 --- a/asdf/_tests/commands/tests/test_exploded.py +++ b/asdf/_tests/commands/tests/test_exploded.py @@ -25,7 +25,7 @@ def test_explode_then_implode(tmpdir): # inline. 
ff.write_to(path, all_array_storage="internal") with asdf.open(path) as af: - assert len(af._blocks._internal_blocks) == 2 + assert len(af._blocks.blocks) == 2 result = main.main_from_args(["explode", path]) @@ -48,7 +48,7 @@ def test_explode_then_implode(tmpdir): with asdf.open(str(tmpdir.join("original_exploded_all.asdf"))) as af: assert_tree_match(af.tree, tree) - assert len(af._blocks) == 2 + assert len(af._blocks.blocks) == 2 def test_file_not_found(tmpdir): diff --git a/asdf/_tests/tags/core/tests/test_integer.py b/asdf/_tests/tags/core/tests/test_integer.py index 622416e72..2b3528613 100644 --- a/asdf/_tests/tags/core/tests/test_integer.py +++ b/asdf/_tests/tags/core/tests/test_integer.py @@ -69,7 +69,7 @@ def test_integer_storage_duplication(tmpdir): af.write_to(tmpfile) with asdf.open(tmpfile, _force_raw_types=True) as rf: - assert len(rf._blocks) == 1 + assert len(rf._blocks.blocks) == 1 assert rf.tree["integer1"]["words"]["source"] == 0 assert rf.tree["integer2"]["words"]["source"] == 0 diff --git a/asdf/_tests/tags/core/tests/test_ndarray.py b/asdf/_tests/tags/core/tests/test_ndarray.py index 40e4c93cc..1b1fe0ab9 100644 --- a/asdf/_tests/tags/core/tests/test_ndarray.py +++ b/asdf/_tests/tags/core/tests/test_ndarray.py @@ -136,7 +136,7 @@ def test_dont_load_data(): str(ff.tree["science_data"]) repr(ff.tree) - for block in ff._blocks.internal_blocks: + for block in ff._blocks.blocks: assert callable(block._data) @@ -266,7 +266,7 @@ def test_inline(): buff.seek(0) with asdf.open(buff, mode="rw") as ff: helpers.assert_tree_match(tree, ff.tree) - assert len(list(ff._blocks.internal_blocks)) == 0 + assert len(list(ff._blocks.blocks)) == 0 buff = io.BytesIO() ff.write_to(buff) @@ -292,7 +292,7 @@ def check_asdf(asdf): m = tree["masked_array"] assert np.all(m.mask[6:]) - assert len(asdf._blocks) == 2 + assert len(asdf._blocks.blocks) == 2 helpers.assert_roundtrip_tree(tree, tmpdir, asdf_check_func=check_asdf) @@ -422,7 +422,7 @@ def test_inline_masked_array(tmpdir): f.write_to(testfile) with asdf.open(testfile) as f2: - assert len(list(f2._blocks.internal_blocks)) == 0 + assert len(list(f2._blocks.blocks)) == 0 assert_array_equal(f.tree["test"], f2.tree["test"]) with open(testfile, "rb") as fd: diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py index f3f7e872b..91d08a070 100644 --- a/asdf/_tests/test_array_blocks.py +++ b/asdf/_tests/test_array_blocks.py @@ -460,12 +460,12 @@ def test_seek_until_on_block_boundary(): buff = io.BytesIO(content) ff = asdf.open(buff) - assert len(ff._blocks) == 1 + assert len(ff._blocks.blocks) == 1 buff.seek(0) fd = generic_io.InputStream(buff, "r") ff = asdf.open(fd) - assert len(ff._blocks) == 1 + assert len(ff._blocks.blocks) == 1 def test_checksum(tmp_path): @@ -621,7 +621,7 @@ def test_short_file_find_block_index(): buff.seek(0) with asdf.open(buff) as ff: - assert len(ff._blocks) == 2 + assert len(ff._blocks.blocks) == 2 assert ff._blocks.blocks[1].loaded @@ -650,7 +650,7 @@ def test_invalid_block_index_values(): buff.seek(0) with asdf.open(buff) as ff: - assert len(ff._blocks) == 10 + assert len(ff._blocks.blocks) == 10 assert ff._blocks.blocks[1].loaded @@ -694,7 +694,7 @@ def test_invalid_block_index_offset(block_index_index): buff.seek(0) with asdf.open(buff) as ff: - assert len(ff._blocks) == 10 + assert len(ff._blocks.blocks) == 10 for i, a in enumerate(arrays): assert ff._blocks.blocks[i].loaded assert_array_equal(ff["arrays"][i], a) @@ -725,7 +725,7 @@ def test_unordered_block_index(): buff.seek(0) with 
asdf.open(buff) as ff: - assert len(ff._blocks) == 10 + assert len(ff._blocks.blocks) == 10 assert ff._blocks.blocks[1].loaded diff --git a/asdf/_tests/test_file_format.py b/asdf/_tests/test_file_format.py index e65aad28c..36b315c72 100644 --- a/asdf/_tests/test_file_format.py +++ b/asdf/_tests/test_file_format.py @@ -88,14 +88,14 @@ def test_empty_file(): with asdf.open(buff) as ff: assert ff.tree == {} - assert len(ff._blocks) == 0 + assert len(ff._blocks.blocks) == 0 buff = io.BytesIO(b"#ASDF 1.0.0\n#ASDF_STANDARD 1.0.0") buff.seek(0) with asdf.open(buff) as ff: assert ff.tree == {} - assert len(ff._blocks) == 0 + assert len(ff._blocks.blocks) == 0 @pytest.mark.filterwarnings("ignore::astropy.io.fits.verify.VerifyWarning") diff --git a/asdf/_tests/test_stream.py b/asdf/_tests/test_stream.py index eacb406dc..65b83431e 100644 --- a/asdf/_tests/test_stream.py +++ b/asdf/_tests/test_stream.py @@ -22,7 +22,7 @@ def test_stream(): buff.seek(0) with asdf.open(buff) as ff: - assert len(ff._blocks) == 1 + assert len(ff._blocks.blocks) == 1 assert ff.tree["stream"].shape == (100, 6, 2) for i, row in enumerate(ff.tree["stream"]): assert np.all(row == i) @@ -43,7 +43,7 @@ def test_stream_write_nothing(): buff.seek(0) with asdf.open(buff) as ff: - assert len(ff._blocks) == 1 + assert len(ff._blocks.blocks) == 1 assert ff.tree["stream"].shape == (0, 6, 2) @@ -64,7 +64,7 @@ def test_stream_twice(): buff.seek(0) ff = asdf.open(buff) - assert len(ff._blocks) == 1 + assert len(ff._blocks.blocks) == 1 assert ff.tree["stream"].shape == (100, 6, 2) assert ff.tree["stream2"].shape == (50, 12, 2) @@ -85,7 +85,7 @@ def test_stream_with_nonstream(): buff.seek(0) with asdf.open(buff) as ff: - assert len(ff._blocks) == 2 + assert len(ff._blocks.blocks) == 2 assert_array_equal(ff.tree["nonstream"], np.array([1, 2, 3, 4], np.int64)) assert ff.tree["stream"].shape == (100, 6, 2) for i, row in enumerate(ff.tree["stream"]): @@ -108,7 +108,7 @@ def test_stream_real_file(tmp_path): fd.write(np.array([i] * 12, np.float64).tobytes()) with asdf.open(path) as ff: - assert len(ff._blocks) == 2 + assert len(ff._blocks.blocks) == 2 assert_array_equal(ff.tree["nonstream"], np.array([1, 2, 3, 4], np.int64)) assert ff.tree["stream"].shape == (100, 6, 2) for i, row in enumerate(ff.tree["stream"]): @@ -129,7 +129,7 @@ def test_stream_to_stream(): buff.seek(0) with asdf.open(generic_io.InputStream(buff, "r")) as ff: - assert len(ff._blocks) == 2 + assert len(ff._blocks.blocks) == 2 assert_array_equal(ff.tree["nonstream"], np.array([1, 2, 3, 4], np.int64)) assert ff.tree["stream"].shape == (100, 6, 2) for i, row in enumerate(ff.tree["stream"]): diff --git a/asdf/asdf.py b/asdf/asdf.py index 3905edfb5..083041ca6 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -664,9 +664,7 @@ def set_array_storage(self, arr, array_storage): - ``inline``: Store the data as YAML inline in the tree. """ - options = self._blocks.options.get_options(arr) - options.storage_type = array_storage - self._blocks.options.set_options(arr, options) + self._blocks._set_array_storage(arr, array_storage) def get_array_storage(self, arr): """ @@ -676,7 +674,7 @@ def get_array_storage(self, arr): ---------- arr : numpy.ndarray """ - return self._blocks.options.get_options(arr).storage_type + return self._blocks._get_array_storage(arr) def set_array_compression(self, arr, compression, **compression_kwargs): """ @@ -704,9 +702,7 @@ def set_array_compression(self, arr, compression, **compression_kwargs): If there is no prior file, acts as None. 
""" - options = self._blocks.options.get_options(arr) - options.compression = compression - options.compression_kwargs = compression_kwargs + self._blocks._set_array_compression(arr, compression, **compression_kwargs) def get_array_compression(self, arr): """ @@ -720,11 +716,11 @@ def get_array_compression(self, arr): ------- compression : str or None """ - return self._blocks.options.get_options(arr).compression + return self._blocks._get_array_compression(arr) def get_array_compression_kwargs(self, arr): """ """ - return self._blocks.options.get_options(arr).compression_kwargs + return self._blocks._get_array_compression_kwargs(arr) @classmethod def _parse_header_line(cls, line): From 8d4e686eaf491af2837d01ed02532f3148e008e4 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 27 Apr 2023 18:01:42 -0400 Subject: [PATCH 016/154] reduce usage of asdffile reference in ndarray --- asdf/tags/core/ndarray.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index af9f647cd..f1df7c1d1 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -246,7 +246,6 @@ def __init__(self, source, shape, dtype, offset, strides, order, mask, asdffile) if isinstance(source, list): self._array = inline_data_asarray(source, dtype) self._array = self._apply_mask(self._array, self._mask) - asdffile.set_array_storage(self._array, "inline") if shape is not None and ( (shape[0] == "*" and self._array.shape[1:] != tuple(shape[1:])) or (self._array.shape != tuple(shape)) ): @@ -428,7 +427,9 @@ def __getattribute__(self, name): @classmethod def from_tree(cls, node, ctx): if isinstance(node, list): - return cls(node, None, None, None, None, None, None, ctx) + instance = cls(node, None, None, None, None, None, None, ctx) + ctx._blocks._set_array_storage(instance, "inline") + return instance if isinstance(node, dict): source = node.get("source") @@ -454,6 +455,9 @@ def from_tree(cls, node, ctx): if block_index is not None: ctx._blocks.blocks.assign_object(instance, ctx._blocks.blocks[block_index]) ctx._blocks._data_callbacks.assign_object(instance, source) + else: + if not isinstance(source, str): + ctx._blocks._set_array_storage(instance, "inline") return instance msg = "Invalid ndarray description." 
@@ -486,8 +490,6 @@ def to_tree(cls, data, ctx): options.compression_kwargs = cfg.all_array_compression_kwargs inline_threshold = cfg.array_inline_threshold - # block = ctx._blocks.find_or_create_block_for_array(data) - # foo if inline_threshold is not None and options.storage_type in ("inline", "internal"): if data.size < inline_threshold: options.storage_type = "inline" @@ -543,7 +545,7 @@ def to_tree(cls, data, ctx): # if block.array_storage == "inline": if options.storage_type == "inline": # ctx._blocks.set_array_storage(ctx._blocks[data.mask], "inline") - ctx.set_array_storage(data.mask, "inline") + ctx._blocks._set_array_storage(data.mask, "inline") result["mask"] = data.mask From 4d7183aad49add60dfd3376b13c78998b95722a8 Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 28 Apr 2023 09:49:21 -0400 Subject: [PATCH 017/154] associate objects with write blocks --- asdf/_block/manager.py | 20 ++++++++++----- asdf/asdf.py | 13 ++-------- asdf/extension/_serialization_context.py | 31 ++++++++++-------------- asdf/stream.py | 2 +- asdf/tags/core/ndarray.py | 4 +-- 5 files changed, 32 insertions(+), 38 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index b74e398bf..e8e6cc96f 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -131,11 +131,17 @@ def __init__(self, read_blocks=None): self.blocks = read_blocks self._data_callbacks = store.Store() # TODO copy options and read_blocks on start of write - self._write_blocks = [] + self._write_blocks = store.LinearStore() self._external_write_blocks = [] self._streamed_block = None self._write_fd = None + def _clear_write(self): + self._write_blocks = store.LinearStore() + self._external_write_blocks = [] + self._streamed_block = None + # self._write_fd = None + def _write_external_blocks(self): from asdf import AsdfFile @@ -152,7 +158,7 @@ def _write_external_blocks(self): af.write_to(f, include_block_index=False) write_blocks(f, [blk]) - def make_write_block(self, data, options): + def make_write_block(self, data, options, obj): if options.storage_type == "external": for index, blk in enumerate(self._external_write_blocks): if blk._data is data: @@ -167,17 +173,19 @@ def make_write_block(self, data, options): # first, look for an existing block for index, blk in enumerate(self._write_blocks): if blk._data is data: + self._write_blocks.assign_object(obj, blk) return index # if no block is found, make a new block - self._write_blocks.append(WriteBlock(data, options.compression, options.compression_kwargs)) - # data_bytes = np.ndarray(-1, np.uint8, data.ravel(order='K').data) - # self._write_blocks.append(WriteBlock(data_bytes, options.compression, options.compression_kwargs)) + blk = WriteBlock(data, options.compression, options.compression_kwargs) + self._write_blocks._items.append(blk) + self._write_blocks.assign_object(obj, blk) return len(self._write_blocks) - 1 - def set_streamed_block(self, data): + def set_streamed_block(self, data, obj): if self._streamed_block is not None and data is not self._streamed_block.data: raise ValueError("Can not add second streaming block") self._streamed_block = WriteBlock(data) + # TODO associate object with streamed block def _get_data_callback(self, index): return DataCallback(index, self.blocks) diff --git a/asdf/asdf.py b/asdf/asdf.py index 083041ca6..51361eb3b 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -980,9 +980,7 @@ def _pre_write(self, fd): self._tree["asdf_library"] = get_asdf_library_info() def _serial_write(self, fd, pad_blocks, include_block_index): - 
self._blocks._write_blocks = [] - self._blocks._external_write_blocks = [] - self._blocks._streamed_block = None + self._blocks._clear_write() self._write_tree(self._tree, fd, pad_blocks) if len(self._blocks._write_blocks) or self._blocks._streamed_block: block_writer.write_blocks( @@ -994,14 +992,7 @@ def _serial_write(self, fd, pad_blocks, include_block_index): ) if len(self._blocks._external_write_blocks): self._blocks._write_external_blocks() - self._blocks._write_blocks = [] - self._blocks._external_write_blocks = [] - self._blocks._streamed_block = None - # TODO external blocks - # self._blocks.write_internal_blocks_serial(fd, pad_blocks) - # self._blocks.write_external_blocks(fd.uri, pad_blocks) - # if include_block_index: - # self._blocks.write_block_index(fd, self) + self._blocks._clear_write() def _random_write(self, fd, pad_blocks, include_block_index): self._write_tree(self._tree, fd, False) diff --git a/asdf/extension/_serialization_context.py b/asdf/extension/_serialization_context.py index 6a6723500..eba47ccc5 100644 --- a/asdf/extension/_serialization_context.py +++ b/asdf/extension/_serialization_context.py @@ -11,11 +11,11 @@ class SerializationContext: classes (like Converters) via method arguments. """ - def __init__(self, version, extension_manager, url, block_manager): + def __init__(self, version, extension_manager, url, blocks): self._version = validate_version(version) self._extension_manager = extension_manager self._url = url - self._block_manager = block_manager + self._blocks = blocks self.__extensions_used = set() @@ -93,8 +93,12 @@ def get_block_data_callback(self, index): A callable that when called (with no arguments) returns the block data as a one dimensional array of uint8 """ - blk = self._block_manager.get_block(index) - return blk.generate_read_data_callback() + blk = self._blocks.blocks[index] + + def callback(blk=blk): + return blk.data + + return callback def assign_block_key(self, block_index, key): """ @@ -119,14 +123,7 @@ def assign_block_key(self, block_index, key): key : hashable A unique hashable key to associate with a block """ - blk = self._block_manager.get_block(block_index) - if self._block_manager._key_to_block_mapping.get(key, blk) is not blk: - msg = f"key {key} is already assigned to a block" - raise ValueError(msg) - if blk in self._block_manager._key_to_block_mapping.values(): - msg = f"block {block_index} is already assigned to a key" - raise ValueError(msg) - self._block_manager._key_to_block_mapping[key] = blk + self._blocks.blocks.assign_object(key, self._blocks.blocks[block_index]) def find_block_index(self, lookup_key, data_callback=None): """ @@ -153,9 +150,7 @@ def find_block_index(self, lookup_key, data_callback=None): Index of the block where data returned from data_callback will be written. 
""" - new_block = lookup_key not in self._block_manager._key_to_block_mapping - blk = self._block_manager.find_or_create_block(lookup_key) - # if we're not creating a block, don't update the data callback - if data_callback is not None and (new_block or (blk._data_callback is None and blk._fd is None)): - blk._data_callback = data_callback - return self._block_manager.get_source(blk) + + # TODO eventually this will need to map memmap blocks to not rewrite data + # TODO lookup options from previous block + return self._blocks.make_write_block(data_callback, BlockOptions(), lookup_key) diff --git a/asdf/stream.py b/asdf/stream.py index 586af891e..29fe68735 100644 --- a/asdf/stream.py +++ b/asdf/stream.py @@ -46,7 +46,7 @@ def from_tree(cls, data, ctx): @classmethod def to_tree(cls, data, ctx): # TODO previously, stream never passed on data? - ctx._blocks.set_streamed_block(data._array) + ctx._blocks.set_streamed_block(data._array, data) result = {} result["source"] = -1 diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index f1df7c1d1..7d743b1e8 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -528,10 +528,10 @@ def to_tree(cls, data, ctx): # result["source"] = ctx._blocks.get_source(block) # convert data to byte array if options.storage_type == "streamed": - ctx._blocks.set_streamed_block(base) + ctx._blocks.set_streamed_block(base, data) result["source"] = -1 else: - result["source"] = ctx._blocks.make_write_block(base, options) + result["source"] = ctx._blocks.make_write_block(base, options, data) result["datatype"] = dtype result["byteorder"] = byteorder From a944de617e968a70cf67dc6170520621b47a40b2 Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 28 Apr 2023 12:09:09 -0400 Subject: [PATCH 018/154] associate streamed block with object --- asdf/_block/manager.py | 12 ++++++++---- asdf/tags/core/ndarray.py | 5 +++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index e8e6cc96f..fbea9c904 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -1,4 +1,5 @@ import os +import weakref from asdf import constants, generic_io, util @@ -85,8 +86,10 @@ def get_options(self, array): def set_options(self, array, options): if options.storage_type == "streamed": - for d in self._by_id.values(): - for opt in d.values(): + for oid, by_key in self._by_id.items(): + for key, opt in by_key.items(): + if not key.is_valid(): + continue if opt.storage_type == "streamed": if opt is options: continue @@ -130,16 +133,17 @@ def __init__(self, read_blocks=None): else: self.blocks = read_blocks self._data_callbacks = store.Store() - # TODO copy options and read_blocks on start of write self._write_blocks = store.LinearStore() self._external_write_blocks = [] self._streamed_block = None + self._streamed_obj = None self._write_fd = None def _clear_write(self): self._write_blocks = store.LinearStore() self._external_write_blocks = [] self._streamed_block = None + self._streamed_obj = None # self._write_fd = None def _write_external_blocks(self): @@ -185,7 +189,7 @@ def set_streamed_block(self, data, obj): if self._streamed_block is not None and data is not self._streamed_block.data: raise ValueError("Can not add second streaming block") self._streamed_block = WriteBlock(data) - # TODO associate object with streamed block + self._streamed_obj = weakref.ref(obj) def _get_data_callback(self, index): return DataCallback(index, self.blocks) diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py 
index 7d743b1e8..9554f6fa0 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -464,7 +464,8 @@ def from_tree(cls, node, ctx): raise TypeError(msg) @classmethod - def to_tree(cls, data, ctx): + def to_tree(cls, obj, ctx): + data = obj # The ndarray-1.0.0 schema does not permit 0 valued strides. # Perhaps we'll want to allow this someday, to efficiently # represent an array of all the same value. @@ -531,7 +532,7 @@ def to_tree(cls, data, ctx): ctx._blocks.set_streamed_block(base, data) result["source"] = -1 else: - result["source"] = ctx._blocks.make_write_block(base, options, data) + result["source"] = ctx._blocks.make_write_block(base, options, obj) result["datatype"] = dtype result["byteorder"] = byteorder From 2a22176f9d4f6c5bf3a2031f2ba344bcea61a798 Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 28 Apr 2023 12:11:04 -0400 Subject: [PATCH 019/154] Revert "break update" This reverts commit c968a9e58c1973168296bf8b4539cea81cc58b4a. --- asdf/_tests/tags/core/tests/test_ndarray.py | 1 - asdf/_tests/test_api.py | 1 - asdf/_tests/test_array_blocks.py | 16 ---------------- asdf/_tests/test_block_converter.py | 4 ---- asdf/_tests/test_compression.py | 5 ----- asdf/asdf.py | 1 - 6 files changed, 28 deletions(-) diff --git a/asdf/_tests/tags/core/tests/test_ndarray.py b/asdf/_tests/tags/core/tests/test_ndarray.py index 1b1fe0ab9..2334c319e 100644 --- a/asdf/_tests/tags/core/tests/test_ndarray.py +++ b/asdf/_tests/tags/core/tests/test_ndarray.py @@ -930,7 +930,6 @@ def test_readonly_inline(tmpdir): # Confirm that NDArrayType's internal array is regenerated # following an update. @pytest.mark.parametrize("pad_blocks", [True, False]) -@pytest.mark.xfail def test_block_data_change(pad_blocks, tmpdir): tmpfile = str(tmpdir.join("data.asdf")) tree = {"data": np.zeros(10, dtype="uint8")} diff --git a/asdf/_tests/test_api.py b/asdf/_tests/test_api.py index 36bcb0757..3e0f4cb57 100644 --- a/asdf/_tests/test_api.py +++ b/asdf/_tests/test_api.py @@ -147,7 +147,6 @@ def test_default_version(): assert ff.file_format_version == version_map["FILE_FORMAT"] -@pytest.mark.xfail(reason="update is broken") def test_update_exceptions(tmp_path): path = str(tmp_path / "test.asdf") diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py index 91d08a070..88053cbb5 100644 --- a/asdf/_tests/test_array_blocks.py +++ b/asdf/_tests/test_array_blocks.py @@ -108,7 +108,6 @@ def test_pad_blocks(tmp_path): assert_array_equal(ff.tree["my_array2"], my_array2) -@pytest.mark.xfail def test_update_expand_tree(tmp_path): tmp_path = str(tmp_path) testpath = os.path.join(tmp_path, "test.asdf") @@ -150,7 +149,6 @@ def test_update_expand_tree(tmp_path): assert_array_equal(ff.tree["arrays"][1], my_array2) -@pytest.mark.xfail def test_update_all_external(tmp_path): fn = tmp_path / "test.asdf" @@ -175,7 +173,6 @@ def _get_update_tree(): return {"arrays": [np.arange(64) * 1, np.arange(64) * 2, np.arange(64) * 3]} -@pytest.mark.xfail def test_update_delete_first_array(tmp_path): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -199,7 +196,6 @@ def test_update_delete_first_array(tmp_path): assert_array_equal(ff.tree["arrays"][1], tree["arrays"][2]) -@pytest.mark.xfail def test_update_delete_last_array(tmp_path): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -223,7 +219,6 @@ def test_update_delete_last_array(tmp_path): assert_array_equal(ff.tree["arrays"][1], tree["arrays"][1]) -@pytest.mark.xfail def test_update_delete_middle_array(tmp_path): tmp_path 
= str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -251,7 +246,6 @@ def test_update_delete_middle_array(tmp_path): assert_array_equal(ff.tree["arrays"][1], tree["arrays"][2]) -@pytest.mark.xfail def test_update_replace_first_array(tmp_path): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -276,7 +270,6 @@ def test_update_replace_first_array(tmp_path): assert_array_equal(ff.tree["arrays"][2], tree["arrays"][2]) -@pytest.mark.xfail def test_update_replace_last_array(tmp_path): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -301,7 +294,6 @@ def test_update_replace_last_array(tmp_path): assert_array_equal(ff.tree["arrays"][2], np.arange(32)) -@pytest.mark.xfail def test_update_replace_middle_array(tmp_path): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -326,7 +318,6 @@ def test_update_replace_middle_array(tmp_path): assert_array_equal(ff.tree["arrays"][2], tree["arrays"][2]) -@pytest.mark.xfail def test_update_add_array(tmp_path): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -348,7 +339,6 @@ def test_update_add_array(tmp_path): assert_array_equal(ff.tree["arrays"][3], np.arange(32)) -@pytest.mark.xfail def test_update_add_array_at_end(tmp_path): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -375,7 +365,6 @@ def test_update_add_array_at_end(tmp_path): assert_array_equal(ff.tree["arrays"][3], np.arange(65536, dtype=" Date: Fri, 28 Apr 2023 13:17:27 -0400 Subject: [PATCH 020/154] update working block converter and compression extensions broken --- asdf/_block/io.py | 5 +- asdf/_block/reader.py | 2 + asdf/_tests/test_array_blocks.py | 60 ++----- asdf/asdf.py | 260 +++++++++++++++++++++---------- 4 files changed, 198 insertions(+), 129 deletions(-) diff --git a/asdf/_block/io.py b/asdf/_block/io.py index 2f6fa0e38..7066f5ea0 100644 --- a/asdf/_block/io.py +++ b/asdf/_block/io.py @@ -106,7 +106,10 @@ def callback(): if fd is None or fd.is_closed(): msg = "Attempt to read data from closed file" raise OSError(msg) - return read_block_data(fd, header, offset=data_offset, memmap=memmap) + position = fd.tell() + data = read_block_data(fd, header, offset=data_offset, memmap=memmap) + fd.seek(position) + return data data = callback fd.fast_forward(header["allocated_size"]) diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py index ad0b48ca2..99d03df37 100644 --- a/asdf/_block/reader.py +++ b/asdf/_block/reader.py @@ -30,9 +30,11 @@ def load(self): if fd is None or fd.is_closed(): msg = "Attempt to load block from closed file" raise OSError(msg) + position = fd.tell() _, self._header, self.data_offset, self._data = bio.read_block( fd, offset=self.offset, memmap=self.memmap, lazy_load=self.lazy_load ) + fd.seek(position) @property def data(self): diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py index 88053cbb5..77265beca 100644 --- a/asdf/_tests/test_array_blocks.py +++ b/asdf/_tests/test_array_blocks.py @@ -119,17 +119,15 @@ def test_update_expand_tree(tmp_path): ff = asdf.AsdfFile(tree) ff.set_array_storage(tree["arrays"][2], "inline") - assert len(list(ff._blocks.inline_blocks)) == 1 ff.write_to(testpath, pad_blocks=True) with asdf.open(testpath, mode="rw") as ff: + assert len(list(ff._blocks.blocks)) == 2 assert_array_equal(ff.tree["arrays"][0], my_array) - orig_offset = ff._blocks[ff.tree["arrays"][0]].offset ff.tree["extra"] = [0] * 6000 ff.update() with asdf.open(testpath) as ff: - assert orig_offset <= 
ff._blocks[ff.tree["arrays"][0]].offset - assert ff._blocks[ff.tree["arrays"][2]].array_storage == "inline" + assert ff.get_array_storage(ff.tree["arrays"][2]) == "inline" assert_array_equal(ff.tree["arrays"][0], my_array) assert_array_equal(ff.tree["arrays"][1], my_array2) @@ -138,13 +136,11 @@ def test_update_expand_tree(tmp_path): ff.set_array_storage(tree["arrays"][2], "inline") ff.write_to(os.path.join(tmp_path, "test2.asdf"), pad_blocks=True) with asdf.open(os.path.join(tmp_path, "test2.asdf"), mode="rw") as ff: - orig_offset = ff._blocks[ff.tree["arrays"][0]].offset ff.tree["extra"] = [0] * 2 ff.update() with asdf.open(os.path.join(tmp_path, "test2.asdf")) as ff: - assert orig_offset == ff._blocks[ff.tree["arrays"][0]].offset - assert ff._blocks[ff.tree["arrays"][2]].array_storage == "inline" + assert ff.get_array_storage(ff.tree["arrays"][2]) == "inline" assert_array_equal(ff.tree["arrays"][0], my_array) assert_array_equal(ff.tree["arrays"][1], my_array2) @@ -234,14 +230,12 @@ def test_update_delete_middle_array(tmp_path): with asdf.open(os.path.join(tmp_path, "test.asdf"), mode="rw") as ff: del ff.tree["arrays"][1] ff.update() - assert len(ff._blocks._internal_blocks) == 2 + assert len(ff._blocks.blocks) == 2 assert os.stat(path).st_size <= original_size with asdf.open(os.path.join(tmp_path, "test.asdf")) as ff: assert len(ff.tree["arrays"]) == 2 - assert ff.tree["arrays"][0]._source == 0 - assert ff.tree["arrays"][1]._source == 1 assert_array_equal(ff.tree["arrays"][0], tree["arrays"][0]) assert_array_equal(ff.tree["arrays"][1], tree["arrays"][2]) @@ -354,7 +348,7 @@ def test_update_add_array_at_end(tmp_path): with asdf.open(os.path.join(tmp_path, "test.asdf"), mode="rw") as ff: ff.tree["arrays"].append(np.arange(65536, dtype="= original_size @@ -486,7 +480,7 @@ def test_checksum_update(tmp_path): ff.update() with asdf.open(path, validate_checksums=True) as ff: - assert ff._blocks._internal_blocks[0].checksum == b"T\xaf~[\x90\x8a\x88^\xc2B\x96D,N\xadL" + assert ff._blocks.blocks[0].header["checksum"] == b"T\xaf~[\x90\x8a\x88^\xc2B\x96D,N\xadL" def test_block_index(): @@ -808,27 +802,20 @@ def test_write_to_update_storage_options(tmp_path, all_array_storage, all_array_ compression_kwargs = {"compresslevel": 1} def assert_result(ff): - if "array" not in ff: - # this was called from _write_to while making an external block - # so don't check the result - return if all_array_storage == "external": assert "test0000.asdf" in os.listdir(tmp_path) else: assert "test0000.asdf" not in os.listdir(tmp_path) if all_array_storage == "internal": - assert len(ff._blocks._internal_blocks) == 1 + assert len(ff._blocks.blocks) == 1 else: - assert len(ff._blocks._internal_blocks) == 0 - blk = ff._blocks[ff["array"]] - - target_compression = all_array_compression or None - if target_compression == "input": - target_compression = None - assert blk.output_compression == target_compression + assert len(ff._blocks.blocks) == 0 - target_compression_kwargs = compression_kwargs or {} - assert blk._output_compression_kwargs == target_compression_kwargs + if all_array_storage == "internal": + target_compression = all_array_compression or None + if target_compression == "input": + target_compression = None + assert ff.get_array_compression(ff["array"]) == target_compression arr1 = np.ones((8, 8)) tree = {"array": arr1} @@ -836,18 +823,6 @@ def assert_result(ff): ff1 = asdf.AsdfFile(tree) - # as a new AsdfFile is used for write_to and we want - # to check blocks here, we patch _write_to to allow us - # to 
inspect the blocks in the new AsdfFile before - # it falls out of scope - original = asdf.AsdfFile._write_to - - def patched(self, *args, **kwargs): - original(self, *args, **kwargs) - assert_result(self) - - asdf.AsdfFile._write_to = patched - # first check write_to ff1.write_to( fn, @@ -856,10 +831,9 @@ def patched(self, *args, **kwargs): compression_kwargs=compression_kwargs, ) - asdf.AsdfFile._write_to = original - # then reuse the file to check update with asdf.open(fn, mode="rw") as ff2: + assert_result(ff2) arr2 = np.ones((8, 8)) * 42 ff2["array"] = arr2 ff2.update( @@ -924,18 +898,18 @@ def test_remove_blocks(tmp_path): af.write_to(fn1) with asdf.open(fn1, mode="rw") as af: - assert len(af._blocks._internal_blocks) == 2 + assert len(af._blocks.blocks) == 2 af["a"] = None af.write_to(fn2) with asdf.open(fn1, mode="rw") as af: - assert len(af._blocks._internal_blocks) == 2 + assert len(af._blocks.blocks) == 2 af["a"] = None af.update() for fn in (fn1, fn2): with asdf.open(fn) as af: - assert len(af._blocks._internal_blocks) == 1 + assert len(af._blocks.blocks) == 1 def test_write_to_before_update(tmp_path): diff --git a/asdf/asdf.py b/asdf/asdf.py index 8d9a7d8b8..ea0e5f551 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -5,6 +5,7 @@ import pathlib import time import warnings +import weakref from packaging.version import Version @@ -12,7 +13,9 @@ from . import _node_info as node_info from . import _version as version from . import constants, generic_io, reference, schema, treeutil, util, versioning, yamlutil +from ._block import io as bio from ._block import reader as block_reader +from ._block import store as block_store from ._block import writer as block_writer from ._block.manager import Manager as BlockManager from ._block.options import Options as BlockOptions @@ -1068,91 +1071,178 @@ def update( writing. """ - # with config_context() as config: - # if all_array_storage is not NotSet: - # config.all_array_storage = all_array_storage - # if all_array_compression is not NotSet: - # config.all_array_compression = all_array_compression - # if compression_kwargs is not NotSet: - # config.all_array_compression_kwargs = compression_kwargs - - # fd = self._fd - - # if fd is None: - # msg = "Can not update, since there is no associated file" - # raise ValueError(msg) - - # if not fd.writable(): - # msg = ( - # "Can not update, since associated file is read-only. Make " - # "sure that the AsdfFile was opened with mode='rw' and the " - # "underlying file handle is writable." - # ) - # raise OSError(msg) - - # if version is not None: - # self.version = version - - # if config.all_array_storage == "external": - # # If the file is fully exploded, there's no benefit to - # # update, so just use write_to() - # self.write_to(fd) - # fd.truncate() - # return - - # if not fd.seekable(): - # msg = "Can not update, since associated file is not seekable" - # raise OSError(msg) - - # self._blocks.finish_reading_internal_blocks() - - # # flush all pending memmap writes - # if fd.can_memmap(): - # fd.flush_memmap() - - # self._pre_write(fd) - - # try: - # fd.seek(0) - - # if not self._blocks.has_blocks_with_offset(): - # # If we don't have any blocks that are being reused, just - # # write out in a serial fashion. - # self._serial_write(fd, pad_blocks, include_block_index) - # fd.truncate() - # return - - # # Estimate how big the tree will be on disk by writing the - # # YAML out in memory. 
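The test rewrites above drop direct pokes at ff._blocks internals in favor of the public storage accessors. A short round trip using only that public API (file name arbitrary) performs the same checks; the final line still peeks at the private block manager, as the tests in this series do:

    import numpy as np
    import asdf

    tree = {"small": np.arange(8), "big": np.zeros(1000, dtype="uint8")}
    af = asdf.AsdfFile(tree)
    af.set_array_storage(tree["small"], "inline")    # serialized into the YAML tree
    af.set_array_storage(tree["big"], "internal")    # serialized as a binary block
    af.write_to("storage_example.asdf")

    with asdf.open("storage_example.asdf") as af2:
        assert af2.get_array_storage(af2["small"]) == "inline"
        assert af2.get_array_storage(af2["big"]) == "internal"
        assert len(af2._blocks.blocks) == 1          # only the internal array occupies a block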
Since the block indices aren't yet - # # known, we have to count the number of block references and - # # add enough space to accommodate the largest block number - # # possible there. - # tree_serialized = io.BytesIO() - # self._write_tree(self._tree, tree_serialized, pad_blocks=False) - # n_internal_blocks = len(self._blocks._internal_blocks) - - # serialized_tree_size = tree_serialized.tell() + constants.MAX_BLOCKS_DIGITS * n_internal_blocks - - # if not calculate_updated_layout(self._blocks, serialized_tree_size, pad_blocks, fd.block_size): - # # If we don't have any blocks that are being reused, just - # # write out in a serial fashion. - # self._serial_write(fd, pad_blocks, include_block_index) - # fd.truncate() - # return - - # fd.seek(0) - # self._random_write(fd, pad_blocks, include_block_index) - # fd.flush() - # finally: - # self._post_write(fd) - # # close memmaps so they will regenerate - # if fd.can_memmap(): - # fd.close_memmap() - # # also clean any memmapped blocks - # for b in self._blocks._internal_blocks: - # if b._memmapped: - # b._memmapped = False - # b._data = None + with config_context() as config: + if all_array_storage is not NotSet: + config.all_array_storage = all_array_storage + if all_array_compression is not NotSet: + config.all_array_compression = all_array_compression + if compression_kwargs is not NotSet: + config.all_array_compression_kwargs = compression_kwargs + + fd = self._fd + + if fd is None: + msg = "Can not update, since there is no associated file" + raise ValueError(msg) + + if not fd.writable(): + msg = ( + "Can not update, since associated file is read-only. Make " + "sure that the AsdfFile was opened with mode='rw' and the " + "underlying file handle is writable." + ) + raise OSError(msg) + + if not fd.seekable(): + msg = "Can not update, since associated file is not seekable" + raise OSError(msg) + + if version is not None: + self.version = version + + # flush all pending memmap writes + if fd.can_memmap(): + fd.flush_memmap() + + # TODO shortcuts for + # - no read blocks + if len(self._blocks.blocks) == 0 and self._blocks._streamed_block is None: + self.write_to(self._fd) + if self._fd.can_memmap(): + self._fd.close_memmap() + self._fd.truncate() + return + # - all external + if config.all_array_storage == "external": + self.write_to(self._fd) + if self._fd.can_memmap(): + self._fd.close_memmap() + self._fd.truncate() + return + # - no write blocks + + self._pre_write(fd) + + # TODO wrap a sensible try/finally + # prepare block manager for writing + self._blocks._clear_write() + + # write out tree to temporary buffer + tree_fd = generic_io.get_file(io.BytesIO(), mode="rw") + self._write_tree(self._tree, tree_fd, False) + new_tree_size = tree_fd.tell() + end_of_file = new_tree_size + + # TODO short circuit here if no blocks are used + + # find where to start writing blocks (either end of new tree or end of last 'free' block) + last_block = None + for blk in self._blocks.blocks[::-1]: + if not blk.memmap and (blk._cached_data is not None or not callable(blk._data)): + continue + last_block = blk + break + if last_block is None: + new_block_start = new_tree_size + else: + new_block_start = max( + last_block.data_offset + last_block.header["allocated_size"], + new_tree_size, + ) + + if len(self._blocks._external_write_blocks): + self._blocks._write_external_blocks() + + # do we have any blocks to write? 
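From the user's side, the reworked update() above is exercised exactly the way the tests in this series do: open the file read-write, change the tree, call update(), and the blocks are rewritten or shifted in place as needed. A minimal sketch (file name arbitrary):

    import numpy as np
    import asdf

    asdf.AsdfFile({"a": np.zeros(1000, dtype="uint8")}).write_to("update_example.asdf")

    with asdf.open("update_example.asdf", mode="rw") as af:
        af["extra"] = [0] * 6000        # grow the YAML tree so the existing block must move
        af["b"] = np.arange(100)        # and add a second block
        af.update()

    with asdf.open("update_example.asdf") as af:
        np.testing.assert_array_equal(af["b"], np.arange(100))
        np.testing.assert_array_equal(af["a"], np.zeros(1000, dtype="uint8"))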
+ if len(self._blocks._write_blocks) or self._blocks._streamed_block: + self._fd.seek(new_block_start) + offsets, headers = block_writer.write_blocks( + self._fd, + self._blocks._write_blocks, + pad_blocks, + streamed_block=self._blocks._streamed_block, + write_index=False, # don't write an index as we will modify the offsets + ) + new_block_end = self._fd.tell() + end_of_file = new_block_end + + # move blocks to start TODO as 'chunks' + self._fd.seek(new_block_start) + block_data = self._fd.read(new_block_end - new_block_start) + self._fd.seek(new_tree_size) + self._fd.write(block_data) + # update offset to point at correct locations + offsets = [o - (new_block_start - new_tree_size) for o in offsets] + + # write index if no streamed block + if include_block_index and self._blocks._streamed_block is None: + bio.write_block_index(self._fd, offsets) + end_of_file = self._fd.tell() + + # map new blocks to old blocks + new_read_blocks = block_store.LinearStore() + for i, (offset, header) in enumerate(zip(offsets, headers)): + if i == len(self._blocks._write_blocks): # this is a streamed block + obj = self._blocks._streamed_obj() + wblk = self._blocks._streamed_block + else: + wblk = self._blocks._write_blocks[i] + # find object associated with wblk + obj = None + for oid, by_key in self._blocks._write_blocks._by_id.items(): + for key, index in by_key.items(): + if self._blocks._write_blocks[index] is wblk: + obj = key._ref() + break + if obj is None: + msg = "Update failed to associate blocks" + raise OSError(msg) + + # does the obj have an old read block? + rblk = self._blocks.blocks.lookup_by_object(obj) + if rblk is not None: + memmap = rblk.memmap + data = None + if not rblk.memmap: + if rblk._cached_data is not None: + data = rblk._cached_data + elif not callable(rblk._data): + data = rblk._data + else: + memmap = self._blocks.memmap + data = None + + # we have to be lazy here as the current memmap is invalid + new_read_block = block_reader.ReadBlock( + offset + 4, self._fd, memmap, True, header=header, data=data + ) + new_read_blocks._items.append(new_read_block) + new_index = len(new_read_blocks) - 1 + new_read_blocks.assign_object(obj, new_read_block) + + # update data callbacks to point to new blocks + cb = self._blocks._data_callbacks.lookup_by_object(obj) + if cb is not None: + cb._index = new_index + cb._read_blocks_ref = weakref.ref(new_read_blocks) + + # update read blocks to reflect new state + self._blocks.blocks = new_read_blocks + self._blocks.options._read_blocks = new_read_blocks + + # now write the tree + self._fd.seek(0) + tree_fd.seek(0) + self._fd.write(tree_fd.read()) + + # TODO post_write + # close memmap to trigger arrays to reload themselves + if self._fd.can_memmap(): + self._fd.close_memmap() + self._fd.seek(end_of_file) + self._fd.truncate() + + self._blocks._clear_write() def write_to( self, From aab0522a3545591a37a886df5a3515ed897af22e Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 28 Apr 2023 13:42:16 -0400 Subject: [PATCH 021/154] output compression extension list working --- asdf/_block/manager.py | 22 +++++++++++++++++++++- asdf/_tests/commands/tests/test_to_yaml.py | 7 +++---- asdf/asdf.py | 12 +++++++----- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index fbea9c904..9844580cf 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -1,7 +1,7 @@ import os import weakref -from asdf import constants, generic_io, util +from asdf import config, constants, generic_io, util from . 
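The "move blocks to start" step in the hunk above first writes the new blocks after the old content, then copies that byte range down so it begins right after the new tree, and finally shifts every recorded offset by the distance moved. The same bookkeeping on a plain BytesIO, just to make the arithmetic concrete (a sketch, not asdf internals):

    import io

    fd = io.BytesIO()
    new_tree_size = 100                    # pretend the freshly written tree ends here
    new_block_start = 160                  # blocks were first written past the old data
    fd.write(b"T" * new_tree_size)                         # the new tree
    fd.write(b"." * (new_block_start - new_tree_size))     # stale bytes to reclaim

    offsets = []
    for payload in (b"A" * 30, b"B" * 20):                 # two "blocks"
        offsets.append(fd.tell())
        fd.write(payload)
    new_block_end = fd.tell()

    # copy the block region down so it starts right after the tree, then truncate
    fd.seek(new_block_start)
    chunk = fd.read(new_block_end - new_block_start)
    fd.seek(new_tree_size)
    fd.write(chunk)
    fd.truncate()

    # every offset moves by the same distance the bytes moved
    shift = new_block_start - new_tree_size
    offsets = [o - shift for o in offsets]

    fd.seek(offsets[1])
    assert fd.read(20) == b"B" * 20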
import store from .callback import DataCallback @@ -100,6 +100,23 @@ def set_options(self, array, options): # TODO copy to allow for changing settings on write # TODO make an 'update_options' + def get_output_compressions(self): + compressions = set() + cfg = config.get_config() + if cfg.all_array_compression == "input": + for blk in self._read_blocks: + if blk.header["compression"]: + compressions.add(blk.header["compression"]) + else: + compressions.add(cfg.all_array_compression) + for _, by_key in self._by_id.items(): + for key, opts in by_key.items(): + if not key.is_valid(): + continue + if opts.compression: + compressions.add(opts.compression) + return compressions + def make_external_uri(uri, index): if uri is None: @@ -212,3 +229,6 @@ def _get_array_compression(self, arr): def _get_array_compression_kwargs(self, arr): return self.options.get_options(arr).compression_kwargs + + def get_output_compressions(self): + return self.options.get_output_compressions() diff --git a/asdf/_tests/commands/tests/test_to_yaml.py b/asdf/_tests/commands/tests/test_to_yaml.py index 15c1519ca..e2987beea 100644 --- a/asdf/_tests/commands/tests/test_to_yaml.py +++ b/asdf/_tests/commands/tests/test_to_yaml.py @@ -1,7 +1,6 @@ import os import numpy as np -import pytest import asdf from asdf import AsdfFile @@ -9,7 +8,6 @@ from asdf.commands import main -@pytest.mark.xfail(reason="resolve and inline is broken") def test_to_yaml(tmpdir): x = np.arange(0, 10, dtype=float) @@ -23,7 +21,8 @@ def test_to_yaml(tmpdir): path = os.path.join(str(tmpdir), "original.asdf") ff = AsdfFile(tree) ff.write_to(path) - assert len(ff._blocks) == 2 + with asdf.open(path) as ff2: + assert len(ff2._blocks.blocks) == 2 result = main.main_from_args(["to_yaml", path]) @@ -36,4 +35,4 @@ def test_to_yaml(tmpdir): with asdf.open(os.path.join(str(tmpdir), "original.yaml")) as ff: assert_tree_match(ff.tree, tree) - assert len(list(ff._blocks.internal_blocks)) == 0 + assert len(list(ff._blocks.blocks)) == 0 diff --git a/asdf/asdf.py b/asdf/asdf.py index ea0e5f551..0cc7c1190 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -12,6 +12,7 @@ from . import _display as display from . import _node_info as node_info from . import _version as version +from . import compression as mcompression from . 
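get_output_compressions above collects every compression label the next write will need, either from per-array options or, when the global setting is "input", from the headers of the blocks that were read; the following asdf.py hunk uses that set to mark the matching compression extensions as used. From the public API this is driven by the compression accessors, for example (file names arbitrary; the preserve-on-rewrite behavior described in the comment is the intent of the default "input" setting):

    import numpy as np
    import asdf

    arr = np.zeros(10000, dtype="uint8")
    af = asdf.AsdfFile({"arr": arr})
    af.set_array_compression(arr, "zlib")
    af.write_to("compressed_example.asdf")

    with asdf.open("compressed_example.asdf") as af2:
        assert af2.get_array_compression(af2["arr"]) == "zlib"
        # with the default all_array_compression="input", rewriting keeps zlib
        af2.write_to("compressed_copy.asdf")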
import constants, generic_io, reference, schema, treeutil, util, versioning, yamlutil from ._block import io as bio from ._block import reader as block_reader @@ -937,11 +938,12 @@ def _write_tree(self, tree, fd, pad_blocks): if len(tree): serialization_context = self._create_serialization_context() - # TODO fix output compression extensions - # compression_extensions = self._blocks.get_output_compression_extensions() - compression_extensions = [] - for ext in compression_extensions: - serialization_context._mark_extension_used(ext) + for compression in self._blocks.get_output_compressions(): + # lookup extension + compressor = mcompression._get_compressor_from_extensions(compression, return_extension=True) + if compressor is not None: + # mark it as used + serialization_context._mark_extension_used(compressor[1]) def _tree_finalizer(tagged_tree): """ From 081dd433c98345c842daa48c36d4397687e0edde Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 28 Apr 2023 13:47:28 -0400 Subject: [PATCH 022/154] fix block converter tests --- asdf/_tests/test_block_converter.py | 38 +++++++---------------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/asdf/_tests/test_block_converter.py b/asdf/_tests/test_block_converter.py index 444b6a029..4e639ad10 100644 --- a/asdf/_tests/test_block_converter.py +++ b/asdf/_tests/test_block_converter.py @@ -79,35 +79,25 @@ def test_block_converter_block_allocation(tmp_path): af = asdf.AsdfFile({"a": None}) # now assign to the tree item (avoiding validation) af["a"] = a - # the AsdfFile instance should have no blocks - assert len(af._blocks._internal_blocks) == 0 - # validate will make a block - af.validate() - assert len(af._blocks._internal_blocks) == 1 - assert np.all(af._blocks._internal_blocks[0].data.tobytes() == a.payload) - # a second validate shouldn't result in more blocks - af.validate() - assert len(af._blocks._internal_blocks) == 1 - # write_to will create blocks here because + # they currently hold storage settings fn = tmp_path / "test.asdf" af.write_to(fn) - assert len(af._blocks._internal_blocks) == 1 # if we read a file with asdf.open(fn, mode="rw") as af: fn2 = tmp_path / "test2.asdf" # there should be 1 block - assert len(af._blocks._internal_blocks) == 1 + assert len(af._blocks.blocks) == 1 # validate should use that block af.validate() - assert len(af._blocks._internal_blocks) == 1 + assert len(af._blocks.blocks) == 1 # as should write_to af.write_to(fn2) - assert len(af._blocks._internal_blocks) == 1 + assert len(af._blocks.blocks) == 1 # and update af.update() - assert len(af._blocks._internal_blocks) == 1 + assert len(af._blocks.blocks) == 1 @with_extension(BlockExtension) @@ -186,34 +176,24 @@ def test_block_data_callback_converter(tmp_path): af = asdf.AsdfFile({"a": None}) # now assign to the tree item (avoiding validation) af["a"] = a - # the AsdfFile instance should have no blocks - assert len(af._blocks._internal_blocks) == 0 - # validate will make a block - af.validate() - assert len(af._blocks._internal_blocks) == 1 - assert np.all(af._blocks._internal_blocks[0].data == a.data) - # a second validate shouldn't result in more blocks - af.validate() - assert len(af._blocks._internal_blocks) == 1 # write_to will use the block fn1 = tmp_path / "test.asdf" af.write_to(fn1) - assert len(af._blocks._internal_blocks) == 1 # if we read a file with asdf.open(fn1, mode="rw") as af: fn2 = tmp_path / "test2.asdf" # there should be 1 block - assert len(af._blocks._internal_blocks) == 1 + assert len(af._blocks.blocks) == 1 # validate 
should use that block af.validate() - assert len(af._blocks._internal_blocks) == 1 + assert len(af._blocks.blocks) == 1 # as should write_to af.write_to(fn2) - assert len(af._blocks._internal_blocks) == 1 + assert len(af._blocks.blocks) == 1 # and update af.update() - assert len(af._blocks._internal_blocks) == 1 + assert len(af._blocks.blocks) == 1 # check that data was preserved for fn in (fn1, fn2): From ca364eff64aa420ada8e406f57fd6b7a0360bfce Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 28 Apr 2023 14:28:12 -0400 Subject: [PATCH 023/154] rebased, broke 2 tests test_external_block_url (which relies on write_blocks being allocated during validate) test_write_to_no_tree_modification (the tree is being modified again) --- asdf/_tests/test_block_converter.py | 54 ----------------------------- 1 file changed, 54 deletions(-) diff --git a/asdf/_tests/test_block_converter.py b/asdf/_tests/test_block_converter.py index 4e639ad10..a63a06a4b 100644 --- a/asdf/_tests/test_block_converter.py +++ b/asdf/_tests/test_block_converter.py @@ -19,7 +19,6 @@ class BlockConverter(Converter): tags = ["asdf://somewhere.org/tags/block_data-1.0.0"] types = [BlockData] _return_invalid_keys = False - _double_assign_block = False def to_yaml_tree(self, obj, tag, ctx): # lookup source for obj @@ -36,10 +35,6 @@ def from_yaml_tree(self, node, tag, ctx): data = ctx.get_block_data_callback(block_index)() obj = BlockData(data.tobytes()) ctx.assign_block_key(block_index, obj._asdf_key) - if self._double_assign_block: - self._double_assign_block = False - key2 = asdf.util.BlockKey() - ctx.assign_block_key(block_index, key2) return obj def reserve_blocks(self, obj, tag): @@ -100,28 +95,6 @@ def test_block_converter_block_allocation(tmp_path): assert len(af._blocks.blocks) == 1 -@with_extension(BlockExtension) -def test_invalid_reserve_block_keys(tmp_path): - a = BlockData(b"abcdefg") - af = asdf.AsdfFile({"a": a}) - fn = tmp_path / "test.asdf" - BlockExtension.converters[0]._return_invalid_keys = True - with pytest.raises(TypeError, match="unhashable type: .*"): - af.write_to(fn) - - -@with_extension(BlockExtension) -def test_double_assign_block(tmp_path): - a = BlockData(b"abcdefg") - af = asdf.AsdfFile({"a": a}) - fn = tmp_path / "test.asdf" - af.write_to(fn) - BlockExtension.converters[0]._double_assign_block = True - with pytest.raises(ValueError, match="block 0 is already assigned to a key"): - with asdf.open(fn): - pass - - class BlockDataCallback: """An example object that uses the data callback to read block data""" @@ -226,30 +199,3 @@ def test_block_with_callback_removal(tmp_path): af[remove_key] = None af.update() af[check_key] = b.data - - -def test_seralization_context_block_access(): - af = asdf.AsdfFile() - sctx = af._create_serialization_context() - - # finding an index for an unknown block should - # create one - key = 42 - arr = np.ones(3, dtype="uint8") - index = sctx.find_block_index(key, lambda: arr) - assert len(af._blocks) == 1 - assert id(arr) == id(sctx.get_block_data_callback(index)()) - # finding the same block should not create a new one - index = sctx.find_block_index(key, lambda: arr) - assert len(af._blocks) == 1 - - new_key = 26 - with pytest.raises(ValueError, match="block 0 is already assigned to a key"): - sctx.assign_block_key(index, new_key) - assert len(af._blocks) == 1 - - arr2 = np.zeros(3, dtype="uint8") - # test that providing a new callback won't overwrite - # the first one - index = sctx.find_block_index(key, lambda: arr2) - assert id(arr2) != 
id(sctx.get_block_data_callback(index)()) From 8cf80242ecbb10e78a3711f5a0f0ec58967f4730 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 1 May 2023 12:05:58 -0400 Subject: [PATCH 024/154] add tests for issues 1013 1505 1520 1523 1526 1530 --- asdf/_tests/_issues/__init__.py | 0 asdf/_tests/_issues/test_1013.py | 41 ++++++++++++++++++++++++++++++++ asdf/_tests/_issues/test_1505.py | 23 ++++++++++++++++++ asdf/_tests/_issues/test_1520.py | 35 +++++++++++++++++++++++++++ asdf/_tests/_issues/test_1523.py | 30 +++++++++++++++++++++++ asdf/_tests/_issues/test_1526.py | 34 ++++++++++++++++++++++++++ asdf/_tests/_issues/test_1530.py | 33 +++++++++++++++++++++++++ asdf/_tests/test_array_blocks.py | 16 ------------- 8 files changed, 196 insertions(+), 16 deletions(-) create mode 100644 asdf/_tests/_issues/__init__.py create mode 100644 asdf/_tests/_issues/test_1013.py create mode 100644 asdf/_tests/_issues/test_1505.py create mode 100644 asdf/_tests/_issues/test_1520.py create mode 100644 asdf/_tests/_issues/test_1523.py create mode 100644 asdf/_tests/_issues/test_1526.py create mode 100644 asdf/_tests/_issues/test_1530.py diff --git a/asdf/_tests/_issues/__init__.py b/asdf/_tests/_issues/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/asdf/_tests/_issues/test_1013.py b/asdf/_tests/_issues/test_1013.py new file mode 100644 index 000000000..27dafe3fc --- /dev/null +++ b/asdf/_tests/_issues/test_1013.py @@ -0,0 +1,41 @@ +import numpy as np + +import asdf + + +def test_1013(tmp_path): + class FooType: + def __init__(self, data): + self.data = data + + class FooConverter: + tags = ["asdf://somewhere.org/tag/foo-1.0.0"] + types = [FooType] + + def to_yaml_tree(self, obj, tag, ctx): + if obj.data.ndim < 2: + ctx._blocks._set_array_storage(obj.data, "inline") + return {"data": obj.data} + + def from_yaml_tree(self, obj, tag, ctx): + return FooType(obj["data"]) + + class FooExtension: + converters = [FooConverter()] + tags = ["asdf://somewhere.org/tag/foo-1.0.0"] + extension_uri = "asdf://somewhere.org/extensions/foo-1.0.0" + + with asdf.config_context() as cfg: + cfg.add_extension(FooExtension()) + + fn = tmp_path / "test.asdf" + + for shape in [3, (3, 3)]: + arr = np.zeros(shape) + n_blocks = 0 if arr.ndim == 1 else 1 + af = asdf.AsdfFile({"foo": FooType(arr)}) + af.write_to(fn) + + with asdf.open(fn) as af: + np.testing.assert_array_equal(af["foo"].data, arr) + assert len(af._blocks.blocks) == n_blocks diff --git a/asdf/_tests/_issues/test_1505.py b/asdf/_tests/_issues/test_1505.py new file mode 100644 index 000000000..7a0c8796d --- /dev/null +++ b/asdf/_tests/_issues/test_1505.py @@ -0,0 +1,23 @@ +import numpy as np + +import asdf + + +def test_1505(tmp_path): + """ + Calling update after write_to fails + + https://github.com/asdf-format/asdf/issues/1505 + """ + fn1 = tmp_path / "test1.asdf" + fn2 = tmp_path / "test2.asdf" + + tree = {"a": np.zeros(3), "b": np.ones(3)} + af = asdf.AsdfFile(tree) + + af.write_to(fn1) + + with asdf.open(fn1, mode="rw") as af: + af["a"] = None + af.write_to(fn2) + af.update() diff --git a/asdf/_tests/_issues/test_1520.py b/asdf/_tests/_issues/test_1520.py new file mode 100644 index 000000000..649d3c857 --- /dev/null +++ b/asdf/_tests/_issues/test_1520.py @@ -0,0 +1,35 @@ +import numpy as np + +import asdf + + +def test_1520(tmp_path): + """ + A failed update can corrupt the file + + https://github.com/asdf-format/asdf/issues/1520 + """ + fn = tmp_path / "test.asdf" + n_arrays = 10 + array_size = 10000 + + # make a tree with many arrays that will 
compress well + af = asdf.AsdfFile() + for i in range(n_arrays): + af[i] = np.zeros(array_size, dtype="uint8") + i + af.set_array_compression(af[i], "zlib") + af.write_to(fn) + + with asdf.open(fn, mode="rw") as af: + # now make the data difficult to compress + for i in range(n_arrays): + assert np.all(af[i] == i) + af[i][:] = np.random.randint(255, size=array_size) + af[i][0] = i + 1 + # this no longer causes update to fail + assert False + af.update() + + with asdf.open(fn, mode="r") as af: + for i in range(n_arrays): + assert af[i][0] == i + 1 diff --git a/asdf/_tests/_issues/test_1523.py b/asdf/_tests/_issues/test_1523.py new file mode 100644 index 000000000..65451df51 --- /dev/null +++ b/asdf/_tests/_issues/test_1523.py @@ -0,0 +1,30 @@ +import numpy as np + +import asdf + + +def test_1523(tmp_path): + """ + update corrupts stream data + https://github.com/asdf-format/asdf/issues/1523 + """ + fn = tmp_path / "stream.asdf" + + s = asdf.Stream([3], np.uint8) + asdf.AsdfFile({"s": s}).write_to(fn) + + with open(fn, "rb+") as f: + f.seek(0, 2) + f.write(b"\x01\x02\x03") + + with asdf.open(fn) as af: + np.testing.assert_array_equal(af["s"], [[1, 2, 3]]) + + with asdf.open(fn, mode="rw") as af: + af["a"] = np.arange(1000) + af.update() + # print(af['s']) # segmentation fault + + with asdf.open(fn) as af: + # fails as af['s'] == [[116, 101, 111]] + np.testing.assert_array_equal(af["s"], [[1, 2, 3]]) diff --git a/asdf/_tests/_issues/test_1526.py b/asdf/_tests/_issues/test_1526.py new file mode 100644 index 000000000..d70c0715e --- /dev/null +++ b/asdf/_tests/_issues/test_1526.py @@ -0,0 +1,34 @@ +import os + +import numpy as np + +import asdf + + +def test_1526(tmp_path): + """ + Rewriting a file with external blocks fails if arrays are not first accessed + + https://github.com/asdf-format/asdf/issues/1526 + """ + arrs = [np.arange(3) + i for i in range(3)] + af = asdf.AsdfFile({"arrs": arrs}) + [af.set_array_storage(a, "external") for a in arrs] + + dns = [] + for i in range(2): + dn = tmp_path / f"d{i}" + if not os.path.exists(dn): + os.makedirs(dn) + dns.append(dn) + fns = [dn / "test.asdf" for dn in dns] + + # write to d0 + af.write_to(fns[0]) + + with asdf.open(fns[0]) as af2: + af2["arrs"][0] = 42 + af2.write_to(fns[1]) + + for dn in dns: + assert len(os.listdir(dn)) == 4 diff --git a/asdf/_tests/_issues/test_1530.py b/asdf/_tests/_issues/test_1530.py new file mode 100644 index 000000000..cdd82a169 --- /dev/null +++ b/asdf/_tests/_issues/test_1530.py @@ -0,0 +1,33 @@ +import numpy as np + +import asdf + + +def test_1530(tmp_path): + """ + Calling update with memmapped data can create invalid data in memmap views + + https://github.com/asdf-format/asdf/issues/1530 + + A view of a memmapped array can return invalid data or segfault + after an update + """ + fn = tmp_path / "test.asdf" + a = np.zeros(10, dtype="uint8") + b = np.ones(10, dtype="uint8") + ov = a[:3] + + af = asdf.AsdfFile({"a": a, "b": b}) + af.write_to(fn) + + with asdf.open(fn, mode="rw", copy_arrays=False) as af: + va = af["a"][:3] + np.testing.assert_array_equal(a, af["a"]) + np.testing.assert_array_equal(b, af["b"]) + np.testing.assert_array_equal(va, ov) + af["c"] = "a" * 10000 + af.update() + np.testing.assert_array_equal(a, af["a"]) + np.testing.assert_array_equal(b, af["b"]) + assert False + # np.testing.assert_array_equal(va, ov) # segfault diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py index 77265beca..c3236dc08 100644 --- a/asdf/_tests/test_array_blocks.py +++ 
b/asdf/_tests/test_array_blocks.py @@ -910,19 +910,3 @@ def test_remove_blocks(tmp_path): for fn in (fn1, fn2): with asdf.open(fn) as af: assert len(af._blocks.blocks) == 1 - - -def test_write_to_before_update(tmp_path): - # this is a regression test for: https://github.com/asdf-format/asdf/issues/1505 - fn1 = tmp_path / "test1.asdf" - fn2 = tmp_path / "test2.asdf" - - tree = {"a": np.zeros(3), "b": np.ones(3)} - af = asdf.AsdfFile(tree) - - af.write_to(fn1) - - with asdf.open(fn1, mode="rw") as af: - af["a"] = None - af.write_to(fn2) - af.update() From 2c4f6845a3a0d40d7bb36fdb084957b991a2f79d Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 1 May 2023 13:15:08 -0400 Subject: [PATCH 025/154] prevent modification of tree or blocks during write/validate --- asdf/_block/manager.py | 3 ++ asdf/_tests/_issues/test_1013.py | 6 +++- asdf/asdf.py | 51 ++++++++++++++++++++++---------- 3 files changed, 43 insertions(+), 17 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 9844580cf..da209bf72 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -180,6 +180,9 @@ def _write_external_blocks(self): write_blocks(f, [blk]) def make_write_block(self, data, options, obj): + # if we're not actually writing just return a junk index + # if self._write_fd is None: + # return constants.MAX_BLOCKS + 1 if options.storage_type == "external": for index, blk in enumerate(self._external_write_blocks): if blk._data is data: diff --git a/asdf/_tests/_issues/test_1013.py b/asdf/_tests/_issues/test_1013.py index 27dafe3fc..88b9e4818 100644 --- a/asdf/_tests/_issues/test_1013.py +++ b/asdf/_tests/_issues/test_1013.py @@ -33,8 +33,12 @@ class FooExtension: for shape in [3, (3, 3)]: arr = np.zeros(shape) n_blocks = 0 if arr.ndim == 1 else 1 - af = asdf.AsdfFile({"foo": FooType(arr)}) + af = asdf.AsdfFile() + # avoid a call to validate that will set the storage type + assert af.get_array_storage(arr) == "internal" + af.tree = {"foo": FooType(arr)} af.write_to(fn) + assert af.get_array_storage(arr) == "internal" with asdf.open(fn) as af: np.testing.assert_array_equal(af["foo"].data, arr) diff --git a/asdf/asdf.py b/asdf/asdf.py index 0cc7c1190..3d1b8e7b6 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -396,7 +396,7 @@ def _process_user_extensions(self, extensions): return result - def _update_extension_history(self, serialization_context): + def _update_extension_history(self, tree, serialization_context): """ Update the extension metadata on this file's tree to reflect extensions used during serialization. @@ -409,20 +409,20 @@ def _update_extension_history(self, serialization_context): if serialization_context.version < versioning.NEW_HISTORY_FORMAT_MIN_VERSION: return - if "history" not in self.tree: - self.tree["history"] = {"extensions": []} + if "history" not in tree: + tree["history"] = {"extensions": []} # Support clients who are still using the old history format - elif isinstance(self.tree["history"], list): - histlist = self.tree["history"] - self.tree["history"] = {"entries": histlist, "extensions": []} + elif isinstance(tree["history"], list): + histlist = tree["history"] + tree["history"] = {"entries": histlist, "extensions": []} warnings.warn( "The ASDF history format has changed in order to " "support metadata about extensions. 
History entries " "should now be stored under tree['history']['entries'].", AsdfWarning, ) - elif "extensions" not in self.tree["history"]: - self.tree["history"]["extensions"] = [] + elif "extensions" not in tree["history"]: + tree["history"]["extensions"] = [] for extension in serialization_context._extensions_used: ext_name = extension.class_name @@ -434,17 +434,17 @@ def _update_extension_history(self, serialization_context): if extension.compressors: ext_meta["supported_compression"] = [comp.label.decode("ascii") for comp in extension.compressors] - for i, entry in enumerate(self.tree["history"]["extensions"]): + for i, entry in enumerate(tree["history"]["extensions"]): # Update metadata about this extension if it already exists if ( entry.extension_uri is not None and entry.extension_uri == extension.extension_uri or entry.extension_class in extension.legacy_class_names ): - self.tree["history"]["extensions"][i] = ext_meta + tree["history"]["extensions"][i] = ext_meta break else: - self.tree["history"]["extensions"].append(ext_meta) + tree["history"]["extensions"].append(ext_meta) @property def file_format_version(self): @@ -604,6 +604,8 @@ def comments(self): return self._comments def _validate(self, tree, custom=True, reading=False): + previous_options = copy.deepcopy(self._blocks.options) + # If we're validating on read then the tree # is already guaranteed to be in tagged form. tagged_tree = tree if reading else yamlutil.custom_tree_to_tagged_tree(tree, self) @@ -613,6 +615,9 @@ def _validate(self, tree, custom=True, reading=False): if custom and self._custom_schema: schema.validate(tagged_tree, self, self._custom_schema, reading=reading) + self._blocks.options = previous_options + self._blocks.options._read_blocks = self._blocks.blocks + def validate(self): """ Validate the current state of the tree against the ASDF schema. @@ -952,10 +957,10 @@ def _tree_finalizer(tagged_tree): yamlutil.dump_tree to update extension metadata after the tree has been converted to tagged objects. 
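In the _serial_write change just below, the tree handed to the YAML writer is a shallow copy.copy of the user's tree with only the history entry deep-copied, so write-time additions such as asdf_library never leak back into the caller's tree while large payloads are still shared rather than duplicated. The distinction, in plain Python:

    import copy

    user_tree = {"data": bytearray(b"\x00" * 4), "history": {"entries": []}}

    tree = copy.copy(user_tree)                       # new top-level dict, same values
    tree["history"] = copy.deepcopy(user_tree["history"])

    tree["asdf_library"] = {"name": "asdf"}           # write-time metadata
    tree["history"]["entries"].append("written")

    assert "asdf_library" not in user_tree            # caller's tree is untouched
    assert user_tree["history"]["entries"] == []
    assert tree["data"] is user_tree["data"]          # the payload is still shared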
""" - self._update_extension_history(serialization_context) - if "history" in self.tree: + self._update_extension_history(tree, serialization_context) + if "history" in tree: tagged_tree["history"] = yamlutil.custom_tree_to_tagged_tree( - self.tree["history"], + tree["history"], self, _serialization_context=serialization_context, ) @@ -982,11 +987,21 @@ def _pre_write(self, fd): # reorganization, if necessary # self._blocks.finalize(self) - self._tree["asdf_library"] = get_asdf_library_info() + # self._tree["asdf_library"] = get_asdf_library_info() def _serial_write(self, fd, pad_blocks, include_block_index): self._blocks._clear_write() - self._write_tree(self._tree, fd, pad_blocks) + + # prep a tree for a writing + tree = copy.copy(self._tree) + tree["asdf_library"] = get_asdf_library_info() + if "history" in self._tree: + tree["history"] = copy.deepcopy(self._tree["history"]) + + # TODO copy block options + previous_options = copy.deepcopy(self._blocks.options) + + self._write_tree(tree, fd, pad_blocks) if len(self._blocks._write_blocks) or self._blocks._streamed_block: block_writer.write_blocks( fd, @@ -998,6 +1013,8 @@ def _serial_write(self, fd, pad_blocks, include_block_index): if len(self._blocks._external_write_blocks): self._blocks._write_external_blocks() self._blocks._clear_write() + self._blocks.options = previous_options + self._blocks.options._read_blocks = self._blocks.blocks def _random_write(self, fd, pad_blocks, include_block_index): self._write_tree(self._tree, fd, False) @@ -1124,6 +1141,7 @@ def update( # - no write blocks self._pre_write(fd) + self._tree["asdf_library"] = get_asdf_library_info() # TODO wrap a sensible try/finally # prepare block manager for writing @@ -1243,6 +1261,7 @@ def update( self._fd.close_memmap() self._fd.seek(end_of_file) self._fd.truncate() + self._post_write(fd) self._blocks._clear_write() From afe172d89a4fcd2a0ae3363f34a8688dfa60ff9a Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 1 May 2023 13:22:03 -0400 Subject: [PATCH 026/154] minor reorganization of 1013 test --- asdf/_tests/_issues/test_1013.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/asdf/_tests/_issues/test_1013.py b/asdf/_tests/_issues/test_1013.py index 88b9e4818..b10e43eaf 100644 --- a/asdf/_tests/_issues/test_1013.py +++ b/asdf/_tests/_issues/test_1013.py @@ -33,11 +33,11 @@ class FooExtension: for shape in [3, (3, 3)]: arr = np.zeros(shape) n_blocks = 0 if arr.ndim == 1 else 1 - af = asdf.AsdfFile() - # avoid a call to validate that will set the storage type + af = asdf.AsdfFile({"foo": FooType(arr)}) assert af.get_array_storage(arr) == "internal" - af.tree = {"foo": FooType(arr)} af.write_to(fn) + # make sure write_to doesn't change the settings outside of the + # writing context assert af.get_array_storage(arr) == "internal" with asdf.open(fn) as af: From 58c0a93221add57b1b43e058ce4a3a8317ec742a Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 1 May 2023 15:20:05 -0400 Subject: [PATCH 027/154] external block fixes pass AsdfFile.__init__ uri (if provided) to block manager inspect source of NDArrayType to detect external blocks to preserve storage type across writes --- asdf/_block/manager.py | 6 ++++-- asdf/_tests/_issues/test_1526.py | 5 +++-- asdf/asdf.py | 2 +- asdf/tags/core/ndarray.py | 10 +++++++++- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index da209bf72..6c1af18f9 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -143,7 +143,7 @@ def 
resolve_external_uri(uri, relative): class Manager: - def __init__(self, read_blocks=None): + def __init__(self, read_blocks=None, uri=None): self.options = BlockOptions(read_blocks) if read_blocks is None: self.blocks = self.options._read_blocks @@ -155,6 +155,7 @@ def __init__(self, read_blocks=None): self._streamed_block = None self._streamed_obj = None self._write_fd = None + self._uri = uri def _clear_write(self): self._write_blocks = store.LinearStore() @@ -191,7 +192,8 @@ def make_write_block(self, data, options, obj): # need to set up new external block index = len(self._external_write_blocks) blk = WriteBlock(data, options.compression, options.compression_kwargs) - blk._uri = make_external_uri(self._write_fd.uri, index) + base_uri = self._uri or self._write_fd.uri + blk._uri = make_external_uri(base_uri, index) self._external_write_blocks.append(blk) return blk._uri # first, look for an existing block diff --git a/asdf/_tests/_issues/test_1526.py b/asdf/_tests/_issues/test_1526.py index d70c0715e..6ff9b22cc 100644 --- a/asdf/_tests/_issues/test_1526.py +++ b/asdf/_tests/_issues/test_1526.py @@ -28,7 +28,8 @@ def test_1526(tmp_path): with asdf.open(fns[0]) as af2: af2["arrs"][0] = 42 + # write to d1 af2.write_to(fns[1]) - for dn in dns: - assert len(os.listdir(dn)) == 4 + assert len(os.listdir(dns[0])) == 4 + assert len(os.listdir(dns[1])) == 3 diff --git a/asdf/asdf.py b/asdf/asdf.py index 3d1b8e7b6..40eb1a8dc 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -158,7 +158,7 @@ def __init__( self._fd = None self._closed = False self._external_asdf_by_uri = {} - self._blocks = BlockManager() + self._blocks = BlockManager(uri=uri) self._blocks.lazy_load = lazy_load self._blocks.memmap = not copy_arrays self._uri = uri diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index 9554f6fa0..36ff041e0 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -6,6 +6,7 @@ from asdf import _types, util from asdf._jsonschema import ValidationError +from asdf._block.options import Options from asdf.config import config_context _datatype_names = { @@ -482,7 +483,14 @@ def to_tree(cls, obj, ctx): shape = data.shape - options = ctx._blocks.options.get_options(data) + if isinstance(obj, NDArrayType) and isinstance(obj._source, str): + # this is an external block, if we have no other settings, keep it as external + options = ctx._blocks.options.lookup_by_object(data) + if options is None: + options = Options("external") + else: + options = ctx._blocks.options.get_options(data) + with config_context() as cfg: if cfg.all_array_storage is not None: options.storage_type = cfg.all_array_storage From d462b316ff28de24f95ae7ccd98860d64a6b76e1 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 2 May 2023 09:54:11 -0400 Subject: [PATCH 028/154] remove old block manager --- asdf/_block/__init__.py | 5 - asdf/_block/block.py | 532 -------------------- asdf/_block/old_manager.py | 835 ------------------------------- asdf/_block/util.py | 87 ---- asdf/_tests/test_array_blocks.py | 72 --- asdf/commands/edit.py | 21 +- 6 files changed, 7 insertions(+), 1545 deletions(-) delete mode 100644 asdf/_block/block.py delete mode 100644 asdf/_block/old_manager.py delete mode 100644 asdf/_block/util.py diff --git a/asdf/_block/__init__.py b/asdf/_block/__init__.py index 10de0c6aa..e69de29bb 100644 --- a/asdf/_block/__init__.py +++ b/asdf/_block/__init__.py @@ -1,5 +0,0 @@ -from .block import Block, UnloadedBlock -from .old_manager import BlockManager -from .util import calculate_updated_layout - 
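With the uri handling above, external block file names are derived from the target URI (the AsdfFile's own uri when one was given, otherwise the file being written), so writing test.asdf with external storage produces sibling test0000.asdf-style files, which is what the #1526 regression test checks. A public-API version of that check (directory and file names arbitrary):

    import os
    import numpy as np
    import asdf

    arrs = [np.arange(3) + i for i in range(3)]
    af = asdf.AsdfFile({"arrs": arrs})
    for a in arrs:
        af.set_array_storage(a, "external")

    os.makedirs("external_example", exist_ok=True)
    af.write_to("external_example/test.asdf")

    # the main file plus one external block file per array
    assert len(os.listdir("external_example")) == 4

Re-writing a file opened from disk should now keep those arrays external as well, since to_tree preserves the storage type for NDArrayType instances backed by an external source.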
-__all__ = ["Block", "UnloadedBlock", "BlockManager", "calculate_updated_layout"] diff --git a/asdf/_block/block.py b/asdf/_block/block.py deleted file mode 100644 index de4b041e0..000000000 --- a/asdf/_block/block.py +++ /dev/null @@ -1,532 +0,0 @@ -import hashlib -import io -import struct - -from asdf import compression as mcompression -from asdf import constants, generic_io, util - - -class Block: - """ - Represents a single block in a ASDF file. This is an - implementation detail and should not be instantiated directly. - Instead, should only be created through the `BlockManager`. - """ - - _header = util.BinaryStruct( - [ - ("flags", "I"), - ("compression", "4s"), - ("allocated_size", "Q"), - ("used_size", "Q"), - ("data_size", "Q"), - ("checksum", "16s"), - ], - ) - - def __init__(self, data=None, uri=None, array_storage="internal", memmap=True, lazy_load=True, data_callback=None): - self._data_callback = data_callback - if self._data_callback is not None and data is not None: - msg = "Block.__init__ cannot contain non-None data and a non-None data_callback" - raise ValueError(msg) - self._data = data - self._uri = uri - self._array_storage = array_storage - - self._fd = None - self._offset = None - self._input_compression = None - self._output_compression = "input" - self._output_compression_kwargs = {} - self._checksum = None - self._should_memmap = memmap - self._memmapped = False - self._lazy_load = lazy_load - - self.update_size() - self._allocated = self._size - - def __repr__(self): - return "".format( - self._array_storage[:3], - self._offset, - self._allocated, - self._size, - ) - - def __len__(self): - return self._size - - @property - def offset(self): - return self._offset - - @offset.setter - def offset(self, offset): - self._offset = offset - - @property - def allocated(self): - return self._allocated - - @allocated.setter - def allocated(self, allocated): - self._allocated = allocated - - @property - def header_size(self): - return self._header.size + constants.BLOCK_HEADER_BOILERPLATE_SIZE - - @property - def data_offset(self): - return self._offset + self.header_size - - @property - def size(self): - return self._size + self.header_size - - @property - def end_offset(self): - """ - The offset of the end of the allocated space for the block, - and where the next block should begin. - """ - return self.offset + self.header_size + self.allocated - - @property - def array_storage(self): - return self._array_storage - - @property - def input_compression(self): - """ - The compression codec used to read the block. - """ - return self._input_compression - - @input_compression.setter - def input_compression(self, compression): - self._input_compression = mcompression.validate(compression) - - @property - def output_compression(self): - """ - The compression codec used to write the block. - :return: - """ - if self._output_compression == "input": - return self._input_compression - return self._output_compression - - @output_compression.setter - def output_compression(self, compression): - self._output_compression = mcompression.validate(compression) - - @property - def output_compression_kwargs(self): - """ - The configuration options to the Compressor constructor - used to write the block. 
- :return: - """ - return self._output_compression_kwargs - - @output_compression_kwargs.setter - def output_compression_kwargs(self, config): - if config is None: - config = {} - self._output_compression_kwargs = config.copy() - - @property - def checksum(self): - return self._checksum - - def _set_checksum(self, checksum): - if checksum == b"\0" * 16: - self._checksum = None - else: - self._checksum = checksum - - def _calculate_checksum(self, array): - # The following line is safe because we're only using - # the MD5 as a checksum. - m = hashlib.new("md5") # noqa: S324 - m.update(array) - return m.digest() - - def validate_checksum(self): - """ - Validate the content of the block against the current checksum. - - Returns - ------- - valid : bool - `True` if the content is valid against the current - checksum or there is no current checksum. Otherwise, - `False`. - """ - if self._checksum: - checksum = self._calculate_checksum(self._flattened_data) - if checksum != self._checksum: - return False - return True - - def update_checksum(self): - """ - Update the checksum based on the current data contents. - """ - self._checksum = self._calculate_checksum(self._flattened_data) - - def update_size(self): - """ - Recalculate the on-disk size of the block. This causes any - compression steps to run. It should only be called when - updating the file in-place, otherwise the work is redundant. - """ - if self._data is not None: - data = self._flattened_data - self._data_size = data.nbytes - - if not self.output_compression: - self._size = self._data_size - else: - self._size = mcompression.get_compressed_size( - data, - self.output_compression, - config=self.output_compression_kwargs, - ) - else: - self._data_size = self._size = 0 - - def read(self, fd, past_magic=False, validate_checksum=False): - """ - Read a Block from the given Python file-like object. - - If the file is seekable and lazy_load is True, the reading - or memmapping of the actual data is postponed until an array - requests it. If the file is a stream or lazy_load is False, - the data will be read into memory immediately. - - As Block is used for reading, writing, configuring and - managing data there are circumstances where read should - not be used. For instance, if a data_callback is defined - a call to read would override the data corresponding to a - block and conflict with the use of the data_callback. To - signify this conflict, a RuntimeError is raised if read - is called on a block with a defined data_callback. - - Parameters - ---------- - fd : GenericFile - - past_magic : bool, optional - If `True`, the file position is immediately after the - block magic token. If `False` (default), the file - position is exactly at the beginning of the block magic - token. - - validate_checksum : bool, optional - If `True`, validate the data against the checksum, and - raise a `ValueError` if the data doesn't match. - - Raises - ------ - - RuntimeError - Read was called on a block with a defined data_callback. - - ValueError - The read file contains invalid data. - """ - if self._data_callback is not None: - msg = "read called on a Block with a data_callback" - raise RuntimeError(msg) - offset = None - if fd.seekable(): - offset = fd.tell() - - if not past_magic: - buff = fd.read(len(constants.BLOCK_MAGIC)) - if len(buff) < 4: - return None - - if buff not in (constants.BLOCK_MAGIC, constants.INDEX_HEADER[: len(buff)]): - msg = ( - "Bad magic number in block. 
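The checksum logic being deleted here (and reimplemented in the new block code) is simply an MD5 digest of the block's bytes, with sixteen zero bytes meaning no checksum was recorded. A stand-alone sketch of that check, independent of asdf:

    import hashlib
    import numpy as np

    data = np.arange(10, dtype="uint8")
    stored = hashlib.new("md5", data.tobytes()).digest()      # 16 bytes, as stored on disk

    def checksum_ok(array, stored_checksum):
        if stored_checksum == b"\x00" * 16:                   # no checksum recorded
            return True
        return hashlib.new("md5", array.tobytes()).digest() == stored_checksum

    assert checksum_ok(data, stored)
    assert not checksum_ok(data + 1, stored)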
" - "This may indicate an internal inconsistency about the " - "sizes of the blocks in the file." - ) - raise ValueError(msg) - - if buff == constants.INDEX_HEADER[: len(buff)]: - return None - - elif offset is not None: - offset -= 4 - - buff = fd.read(2) - (header_size,) = struct.unpack(b">H", buff) - if header_size < self._header.size: - msg = f"Header size must be >= {self._header.size}" - raise ValueError(msg) - - buff = fd.read(header_size) - header = self._header.unpack(buff) - - # This is used by the documentation system, but nowhere else. - self._flags = header["flags"] - self._set_checksum(header["checksum"]) - - try: - self.input_compression = header["compression"] - except ValueError: - raise # TODO: hint extension? - - if self.input_compression is None and header["used_size"] != header["data_size"]: - msg = "used_size and data_size must be equal when no compression is used." - raise ValueError(msg) - - if header["flags"] & constants.BLOCK_FLAG_STREAMED and self.input_compression is not None: - msg = "Compression set on a streamed block." - raise ValueError(msg) - - if fd.seekable(): - # If the file is seekable, we can delay reading the actual - # data until later. - self._fd = fd - self._offset = offset - self._header_size = header_size - if header["flags"] & constants.BLOCK_FLAG_STREAMED: - # Support streaming blocks - self._array_storage = "streamed" - if self._lazy_load: - fd.fast_forward(-1) - self._data_size = self._size = self._allocated = (fd.tell() - self.data_offset) + 1 - else: - self._data = fd.read_into_array(-1) - self._data_size = self._size = self._allocated = len(self._data) - else: - self._allocated = header["allocated_size"] - self._size = header["used_size"] - self._data_size = header["data_size"] - if self._lazy_load: - fd.fast_forward(self._allocated) - else: - curpos = fd.tell() - self._memmap_data() - fd.seek(curpos) - if not self._memmapped: - self._data = self._read_data(fd, self._size, self._data_size) - fd.fast_forward(self._allocated - self._size) - else: - fd.fast_forward(self._allocated) - else: - # If the file is a stream, we need to get the data now. - if header["flags"] & constants.BLOCK_FLAG_STREAMED: - # Support streaming blocks - self._array_storage = "streamed" - self._data = fd.read_into_array(-1) - self._data_size = self._size = self._allocated = len(self._data) - else: - self._allocated = header["allocated_size"] - self._size = header["used_size"] - self._data_size = header["data_size"] - self._data = self._read_data(fd, self._size, self._data_size) - fd.fast_forward(self._allocated - self._size) - fd.close() - - if validate_checksum and not self.validate_checksum(): - msg = f"Block at {self._offset} does not match given checksum" - raise ValueError(msg) - - return self - - def _read_data(self, fd, used_size, data_size): - """ - Read the block data from a file. - """ - if not self.input_compression: - return fd.read_into_array(used_size) - - return mcompression.decompress(fd, used_size, data_size, self.input_compression) - - def _memmap_data(self): - """ - Memory map the block data from the file. - """ - memmap = self._fd.can_memmap() and not self.input_compression - if self._should_memmap and memmap: - self._data = self._fd.memmap_array(self.data_offset, self._size) - self._memmapped = True - - @property - def _flattened_data(self): - """ - Retrieve flattened data suitable for writing. - - Returns - ------- - np.ndarray - 1D contiguous array. 
- """ - data = self.data - - # 'K' order flattens the array in the order that elements - # occur in memory, except axes with negative strides which - # are reversed. That is a problem for base arrays with - # negative strides and is an outstanding bug in this library. - return data.ravel(order="K") - - def write(self, fd): - """ - Write an internal block to the given Python file-like object. - """ - self._header_size = self._header.size - - if self._data_callback is not None: - self._data = self._data_callback() - data = self._flattened_data - self.update_size() - self._data = None - self._allocated = self._size - else: - data = self._flattened_data if self._data is not None else None - - flags = 0 - data_size = used_size = allocated_size = 0 - if self._array_storage == "streamed": - flags |= constants.BLOCK_FLAG_STREAMED - elif data is not None: - self._checksum = self._calculate_checksum(data) - data_size = data.nbytes - if not fd.seekable() and self.output_compression: - buff = io.BytesIO() - mcompression.compress(buff, data, self.output_compression, config=self.output_compression_kwargs) - self.allocated = self._size = buff.tell() - allocated_size = self.allocated - used_size = self._size - self.input_compression = self.output_compression - - if allocated_size < used_size: - msg = f"Block used size {used_size} larger than allocated size {allocated_size}" - raise RuntimeError(msg) - - checksum = self.checksum if self.checksum is not None else b"\x00" * 16 - - fd.write(constants.BLOCK_MAGIC) - fd.write(struct.pack(b">H", self._header_size)) - fd.write( - self._header.pack( - flags=flags, - compression=mcompression.to_compression_header(self.output_compression), - allocated_size=allocated_size, - used_size=used_size, - data_size=data_size, - checksum=checksum, - ), - ) - - if data is not None: - if self.output_compression: - if not fd.seekable(): - fd.write(buff.getvalue()) - else: - # If the file is seekable, we write the - # compressed data directly to it, then go back - # and write the resulting size in the block - # header. - start = fd.tell() - mcompression.compress(fd, data, self.output_compression, config=self.output_compression_kwargs) - end = fd.tell() - self.allocated = self._size = end - start - fd.seek(self.offset + 6) - self._header.update(fd, allocated_size=self.allocated, used_size=self._size) - fd.seek(end) - else: - if used_size != data_size: - msg = f"Block used size {used_size} is not equal to the data size {data_size}" - raise RuntimeError(msg) - fd.write_array(data) - - @property - def data(self): - """ - Get the data for the block, as a numpy array. - """ - if self._data is not None: - return self._data - if self._data_callback is not None: - return self._data_callback() - if self._fd.is_closed(): - msg = "ASDF file has already been closed. Can not get the data." - raise OSError(msg) - - # Be nice and reset the file position after we're done - curpos = self._fd.tell() - try: - self._memmap_data() - if not self._memmapped: - self._fd.seek(self.data_offset) - self._data = self._read_data(self._fd, self._size, self._data_size) - finally: - self._fd.seek(curpos) - return self._data - - def close(self): - self._data = None - - def generate_read_data_callback(self): - """Used in SerializationContext.get_block_data_callback""" - - def callback(): - return self.data - - return callback - - -class UnloadedBlock: - """ - Represents an indexed, but not yet loaded, internal block. All - that is known about it is its offset. 
It converts itself to a - full-fledged block whenever the underlying data or more detail is - requested. - """ - - def __init__(self, fd, offset, memmap=True, lazy_load=True): - self._fd = fd - self._offset = offset - self._data = None - self._uri = None - self._array_storage = "internal" - self._input_compression = None - self._output_compression = "input" - self._output_compression_kwargs = {} - self._checksum = None - self._should_memmap = memmap - self._memmapped = False - self._lazy_load = lazy_load - self._data_callback = None - - def __len__(self): - self.load() - return len(self) - - def close(self): - pass - - @property - def array_storage(self): - return "internal" - - @property - def offset(self): - return self._offset - - def __getattr__(self, attr): - self.load() - return getattr(self, attr) - - def load(self): - self._fd.seek(self._offset, generic_io.SEEK_SET) - self.__class__ = Block - self.read(self._fd) diff --git a/asdf/_block/old_manager.py b/asdf/_block/old_manager.py deleted file mode 100644 index 37398e68b..000000000 --- a/asdf/_block/old_manager.py +++ /dev/null @@ -1,835 +0,0 @@ -import copy -import os -import re -import weakref - -import numpy as np -import yaml - -from asdf import compression as mcompression -from asdf import constants, generic_io, treeutil, util, yamlutil -from asdf.config import get_config -from asdf.util import patched_urllib_parse - -from .block import Block, UnloadedBlock - - -class BlockManager: - """ - Manages the `Block`s associated with a ASDF file. - """ - - def __init__(self, asdffile, copy_arrays=False, lazy_load=True): - self._asdffile = weakref.ref(asdffile) - - self._internal_blocks = [] - self._external_blocks = [] - self._inline_blocks = [] - self._streamed_blocks = [] - - self._block_type_mapping = { - "internal": self._internal_blocks, - "external": self._external_blocks, - "inline": self._inline_blocks, - "streamed": self._streamed_blocks, - } - - self._data_to_block_mapping = {} - self._validate_checksums = False - self._memmap = not copy_arrays - self._lazy_load = lazy_load - self._internal_blocks_mapped = False - - def __len__(self): - """ - Return the total number of blocks being managed. - - This may not include all of the blocks in an open file, since - their reading may have been deferred. Call - `finish_reading_internal_blocks` to find the positions and - header information of all blocks in the file. - """ - return sum(len(x) for x in self._block_type_mapping.values()) - - def add(self, block, key=None): - """ - Add an internal block to the manager. - """ - if not self._internal_blocks_mapped: - # If the block index is missing we need to locate the remaining - # blocks so that we don't accidentally add our new block - # in the middle of the list. - self.finish_reading_internal_blocks() - - self._add(block, key=key) - - def _add(self, block, key=None): - block_set = self._block_type_mapping.get(block.array_storage, None) - if block_set is not None: - if block not in block_set: - block_set.append(block) - else: - msg = f"Unknown array storage type {block.array_storage}" - raise ValueError(msg) - - if block.array_storage == "streamed" and len(self._streamed_blocks) > 1: - msg = "Can not add second streaming block" - raise ValueError(msg) - - if block._data is not None or key is not None: - if key is None: - key = id(util.get_array_base(block._data)) - self._data_to_block_mapping[key] = block - - def remove(self, block): - """ - Remove a block from the manager. 
- """ - block_set = self._block_type_mapping.get(block.array_storage, None) - if block_set is not None: - if block in block_set: - block_set.remove(block) - for key, blk in list(self._data_to_block_mapping.items()): - if blk is block: - del self._data_to_block_mapping[key] - else: - msg = f"Unknown array storage type {block.array_storage}" - raise ValueError(msg) - - def set_array_storage(self, block, array_storage): - """ - Set the array storage type of the given block. - - Parameters - ---------- - block : Block instance - - array_storage : str - Must be one of: - - - ``internal``: The default. The array data will be - stored in a binary block in the same ASDF file. - - - ``external``: Store the data in a binary block in a - separate ASDF file. - - - ``inline``: Store the data as YAML inline in the tree. - - - ``streamed``: The special streamed inline block that - appears at the end of the file. - """ - if array_storage not in ["internal", "external", "streamed", "inline"]: - msg = "array_storage must be one of 'internal', 'external', 'streamed' or 'inline'" - raise ValueError(msg) - - if block.array_storage != array_storage: - if block in self.blocks: - self.remove(block) - block._array_storage = array_storage - self.add(block) - if array_storage == "streamed": - block.output_compression = None - block.output_compression_kwargs = None - - @property - def blocks(self): - """ - An iterator over all blocks being managed. - - This may not include all of the blocks in an open file, - since their reading may have been deferred. Call - `finish_reading_internal_blocks` to find the positions and - header information of all blocks in the file. - """ - for block_set in self._block_type_mapping.values(): - yield from block_set - - @property - def internal_blocks(self): - """ - An iterator over all internal blocks being managed. - - This may not include all of the blocks in an open file, - since their reading may have been deferred. Call - `finish_reading_internal_blocks` to find the positions and - header information of all blocks in the file. - """ - for block_set in (self._internal_blocks, self._streamed_blocks): - yield from block_set - - @property - def streamed_block(self): - """ - The streamed block (always the last internal block in a file), - or `None` if a streamed block is not present. - """ - self.finish_reading_internal_blocks() - - if len(self._streamed_blocks): - return self._streamed_blocks[0] - - return None - - @property - def external_blocks(self): - """ - An iterator over all external blocks being managed. - """ - yield from self._external_blocks - - @property - def inline_blocks(self): - """ - An iterator over all inline blocks being managed. - """ - yield from self._inline_blocks - - @property - def memmap(self): - """ - The flag which indicates whether the arrays are memory mapped - to the underlying file. - """ - return self._memmap - - @property - def lazy_load(self): - """ - The flag which indicates whether the blocks are lazily read. - """ - return self._lazy_load - - def has_blocks_with_offset(self): - """ - Returns `True` if any of the internal blocks currently have an - offset assigned. 
- """ - return any(block.offset is not None for block in self.internal_blocks) - - def _new_block(self): - return Block(memmap=self.memmap, lazy_load=self.lazy_load) - - def _sort_blocks_by_offset(self): - def sorter(x): - if x.offset is None: - msg = "Block is missing offset" - raise ValueError(msg) - - return x.offset - - self._internal_blocks.sort(key=sorter) - - def _read_next_internal_block(self, fd, past_magic=False): - # This assumes the file pointer is at the beginning of the - # block, (or beginning + 4 if past_magic is True) - block = self._new_block().read(fd, past_magic=past_magic, validate_checksum=self._validate_checksums) - if block is not None: - self._add(block) - - return block - - def read_internal_blocks(self, fd, past_magic=False, validate_checksums=False): - """ - Read internal blocks present in the file. If the file is - seekable, only the first block will be read, and the reading - of all others will be lazily deferred until an the loading of - an array requests it. - - Parameters - ---------- - fd : GenericFile - The file to read from. - - past_magic : bool, optional - If `True`, the file position is immediately after the - block magic token. If `False` (default), the file - position is exactly at the beginning of the block magic - token. - - validate_checksums : bool, optional - If `True`, validate the blocks against their checksums. - - """ - self._validate_checksums = validate_checksums - - while True: - block = self._read_next_internal_block(fd, past_magic=past_magic) - if block is None: - break - past_magic = False - - # If the file handle is seekable, we only read the first - # block and defer reading the rest until later. - if fd.seekable(): - break - - def finish_reading_internal_blocks(self): - """ - Read all remaining internal blocks present in the file, if any. - This is called before updating a file, since updating requires - knowledge of all internal blocks in the file. - """ - if not self._internal_blocks: - return - for block in self._internal_blocks: - if isinstance(block, UnloadedBlock): - block.load() - - last_block = self._internal_blocks[-1] - - # Read all of the remaining blocks in the file, if any - if last_block._fd is not None and last_block._fd.seekable(): - last_block._fd.seek(last_block.end_offset) - while True: - last_block = self._read_next_internal_block(last_block._fd, False) - if last_block is None: - break - - self._internal_blocks_mapped = True - - def write_internal_blocks_serial(self, fd, pad_blocks=False): - """ - Write all blocks to disk serially. - - Parameters - ---------- - fd : generic_io.GenericFile - The file to write internal blocks to. The file position - should be after the tree. - """ - for block in self.internal_blocks: - if block.output_compression: - block.offset = fd.tell() - block.write(fd) - else: - if block.input_compression: - block.update_size() - padding = util.calculate_padding(block.size, pad_blocks, fd.block_size) - block.allocated = block._size + padding - block.offset = fd.tell() - block.write(fd) - fd.fast_forward(block.allocated - block._size) - - def write_internal_blocks_random_access(self, fd): - """ - Write all blocks to disk at their specified offsets. All - internal blocks must have an offset assigned at this point. - - Parameters - ---------- - fd : generic_io.GenericFile - The file to write internal blocks to. The file position - should be after the tree. 
- """ - self._sort_blocks_by_offset() - - iter_ = self.internal_blocks - last_block = next(iter_) - # We need to explicitly clear anything between the tree - # and the first block, otherwise there may be other block - # markers left over which will throw off block indexing. - # We don't need to do this between each block. - fd.clear(last_block.offset - fd.tell()) - - for block in iter_: - last_block.allocated = (block.offset - last_block.offset) - last_block.header_size - fd.seek(last_block.offset) - last_block.write(fd) - last_block = block - - last_block.allocated = last_block.size - fd.seek(last_block.offset) - last_block.write(fd) - - fd.truncate(last_block.end_offset) - - def write_external_blocks(self, uri, pad_blocks=False): - """ - Write all blocks to disk serially. - - Parameters - ---------- - uri : str - The base uri of the external blocks - """ - - import asdf - - for i, block in enumerate(self.external_blocks): - if uri is None: - msg = "Can't write external blocks, since URI of main file is unknown." - raise ValueError(msg) - subfd = self.get_external_uri(uri, i) - asdffile = asdf.AsdfFile() - blk = copy.copy(block) - blk._array_storage = "internal" - asdffile._blocks.add(blk) - blk._used = True - # skip the new block manager here - asdffile._write_to(subfd, pad_blocks=pad_blocks, all_array_storage="internal") - - def write_block_index(self, fd, ctx): - """ - Write the block index. - - Parameters - ---------- - fd : GenericFile - The file to write to. The file pointer should be at the - end of the file. - """ - if len(self._internal_blocks) and not len(self._streamed_blocks): - fd.write(constants.INDEX_HEADER) - fd.write(b"\n") - offsets = [x.offset for x in self.internal_blocks] - - yaml_version = tuple(int(x) for x in ctx.version_map["YAML_VERSION"].split(".")) - - yaml.dump( - offsets, - Dumper=yamlutil._yaml_base_dumper, - stream=fd, - explicit_start=True, - explicit_end=True, - version=yaml_version, - allow_unicode=True, - encoding="utf-8", - ) - - _re_index_content = re.compile(rb"^" + constants.INDEX_HEADER + rb"\r?\n%YAML.*\.\.\.\r?\n?$") - _re_index_misc = re.compile(rb"^[\n\r\x20-\x7f]+$") - - def read_block_index(self, fd, ctx): - """ - Read the block index. - - Parameters - ---------- - fd : GenericFile - The file to read from. It must be seekable. - """ - # This reads the block index by reading backward from the end - # of the file. This tries to be as conservative as possible, - # since not reading an index isn't a deal breaker -- - # everything can still be read from the file, only slower. - # Importantly, it must remain "transactionally clean", and not - # create any blocks until we're sure the block index makes - # sense. - - if not fd.seekable(): - return - - if not len(self._internal_blocks): - return - - first_block = self._internal_blocks[0] - first_block_end = first_block.end_offset - - fd.seek(0, generic_io.SEEK_END) - file_size = block_end = fd.tell() - # We want to read on filesystem block boundaries. We use - # "block_end - 5" here because we need to read at least 5 - # bytes in the first block. 
- block_start = ((block_end - 5) // fd.block_size) * fd.block_size - buff_size = block_end - block_start - - content = b"" - - fd.seek(block_start, generic_io.SEEK_SET) - buff = fd.read(buff_size) - - # Extra '\0' bytes are allowed after the ..., mainly to - # workaround poor truncation support on Windows - buff = buff.rstrip(b"\0") - content = buff - - # We need an explicit YAML end marker, or there's no - # block index - for ending in (b"...", b"...\r\n", b"...\n"): - if content.endswith(ending): - break - else: - return - - # Read blocks in reverse order from the end of the file - while True: - # Look for the index header - idx = content.rfind(constants.INDEX_HEADER) - if idx != -1: - content = content[idx:] - index_start = block_start + idx - break - - # If the rest of it starts to look like binary - # values, bail... - if not self._re_index_misc.match(buff): - return - - if block_start <= first_block_end: - return - - block_end = block_start - block_start = max(block_end - fd.block_size, first_block_end) - - fd.seek(block_start, generic_io.SEEK_SET) - buff_size = block_end - block_start - buff = fd.read(buff_size) - content = buff + content - - yaml_content = content[content.find(b"\n") + 1 :] - - # The following call to yaml.load is safe because we're - # using pyyaml's SafeLoader. - offsets = yaml.load(yaml_content, Loader=yamlutil._yaml_base_loader) # noqa: S506 - - # Make sure the indices look sane - if not isinstance(offsets, list) or len(offsets) == 0: - return - - last_offset = 0 - for x in offsets: - if not isinstance(x, int) or x > file_size or x < 0 or x <= last_offset + Block._header.size: - return - last_offset = x - - # We always read the first block, so we can confirm that the - # first entry in the block index matches the first block - if offsets[0] != first_block.offset: - return - - if len(offsets) == 1: - # If there's only one block in the index, we've already - # loaded the first block, so just return: we have nothing - # left to do - return - - # One last sanity check: Read the last block in the index and - # make sure it makes sense. - fd.seek(offsets[-1], generic_io.SEEK_SET) - try: - block = self._new_block().read(fd) - except (ValueError, OSError): - return - - # Now see if the end of the last block leads right into the index - if block.end_offset != index_start: - return - - # It seems we're good to go, so instantiate the UnloadedBlock - # objects - for offset in offsets[1:-1]: - self._internal_blocks.append(UnloadedBlock(fd, offset, memmap=self.memmap, lazy_load=self.lazy_load)) - - # We already read the last block in the file -- no need to read it again - self._internal_blocks.append(block) - - # Record that all block locations have been mapped out (used to avoid - # unnecessary calls to finish_reading_internal_blocks later). - self._internal_blocks_mapped = True - - # Materialize the internal blocks if we are not lazy - if not self.lazy_load: - self.finish_reading_internal_blocks() - - def get_external_filename(self, filename, index): - """ - Given a main filename and an index number, return a new file - name for referencing an external block. - """ - filename = os.path.splitext(filename)[0] - return filename + f"{index:04d}.asdf" - - def get_external_uri(self, uri, index): - """ - Given a main URI and an index number, return a new URI for - saving an external block. 
- """ - if uri is None: - uri = "" - parts = list(patched_urllib_parse.urlparse(uri)) - path = parts[2] - dirname, filename = os.path.split(path) - filename = self.get_external_filename(filename, index) - path = os.path.join(dirname, filename) - parts[2] = path - return patched_urllib_parse.urlunparse(parts) - - def _find_used_blocks(self, tree, ctx, remove=True): - reserved_blocks = set() - - for node in treeutil.iter_tree(tree): - if ctx.extension_manager.handles_type(type(node)): - converter = ctx.extension_manager.get_converter_for_type(type(node)) - sctx = ctx._create_serialization_context() - tag = converter.select_tag(node, sctx) - for key in converter.reserve_blocks(node, tag, sctx): - reserved_blocks.add(self.find_or_create_block(key)) - else: - hook = ctx._type_index.get_hook_for_type("reserve_blocks", type(node), ctx.version_string) - if hook is not None: - for block in hook(node, ctx): - reserved_blocks.add(block) - - if remove: - for block in list(self.blocks): - if getattr(block, "_used", 0) == 0 and block not in reserved_blocks: - self.remove(block) - return None - for block in list(self.blocks): - if getattr(block, "_used", 0): - reserved_blocks.add(block) - return reserved_blocks - - def _handle_global_block_settings(self, block): - cfg = get_config() - all_array_storage = cfg.all_array_storage - if all_array_storage: - self.set_array_storage(block, all_array_storage) - - all_array_compression = cfg.all_array_compression - all_array_compression_kwargs = cfg.all_array_compression_kwargs - # Only override block compression algorithm if it wasn't explicitly set - # by AsdfFile.set_array_compression. - if all_array_compression != "input": - block.output_compression = all_array_compression - block.output_compression_kwargs = all_array_compression_kwargs - - if all_array_storage is None: - threshold = get_config().array_inline_threshold - if threshold is not None and block.array_storage in ["internal", "inline"]: - if np.prod(block.data.shape) < threshold: - self.set_array_storage(block, "inline") - else: - self.set_array_storage(block, "internal") - - def finalize(self, ctx): - """ - At this point, we have a complete set of blocks for the file, - with no extras. - - Here, they are reindexed, and possibly reorganized. - """ - # TODO: Should this reset the state (what's external and what - # isn't) afterword? - - self._find_used_blocks(ctx.tree, ctx) - - for block in list(self.blocks): - self._handle_global_block_settings(block) - - def get_block_by_key(self, key): - if key not in self._data_to_block_mapping: - msg = f"Unknown block key {key}" - raise KeyError(msg) - return self._data_to_block_mapping[key] - - def get_block(self, source): - """ - Given a "source identifier", return a block. - - Parameters - ---------- - source : any - If an integer, refers to the index of an internal block. - If a string, is a uri to an external block. - - Returns - ------- - buffer : buffer - """ - # If an "int", it is the index of an internal block - if isinstance(source, int): - if source == -1: - if len(self._streamed_blocks): - return self._streamed_blocks[0] - # If we don't have a streamed block, fall through so - # we can read all of the blocks, ultimately arriving - # at the last one, which, if all goes well is a - # streamed block. 
- - # First, look in the blocks we've already read - elif source >= 0: - if source < len(self._internal_blocks): - return self._internal_blocks[source] - else: - msg = f"Invalid source id {source}" - raise ValueError(msg) - - # If we have a streamed block or we already know we have - # no blocks, reading any further isn't going to yield any - # new blocks. - if len(self._streamed_blocks) or len(self._internal_blocks) == 0: - msg = f"Block '{source}' not found." - raise ValueError(msg) - - # If the desired block hasn't already been read, and the - # file is seekable, and we have at least one internal - # block, then we can move the file pointer to the end of - # the last known internal block, and start looking for - # more internal blocks. This is "deferred block loading". - last_block = self._internal_blocks[-1] - - if last_block._fd is not None and last_block._fd.seekable(): - last_block._fd.seek(last_block.end_offset) - while True: - next_block = self._read_next_internal_block(last_block._fd, False) - if next_block is None: - break - if len(self._internal_blocks) - 1 == source: - return next_block - last_block = next_block - - if source == -1 and last_block.array_storage == "streamed": - return last_block - - msg = f"Block '{source}' not found." - raise ValueError(msg) - - if isinstance(source, str): - asdffile = self._asdffile().open_external(source) - block = asdffile._blocks._internal_blocks[0] - self.set_array_storage(block, "external") - - # Handle the case of inline data - elif isinstance(source, list): - block = Block(data=np.array(source), array_storage="inline") - - else: - msg = f"Unknown source '{source}'" - raise TypeError(msg) - - return block - - def get_source(self, block): - """ - Get a source identifier for a given block. - - Parameters - ---------- - block : Block - - Returns - ------- - source_id : str - May be an integer for an internal block, or a URI for an - external block. - """ - for i, internal_block in enumerate(self.internal_blocks): - if block == internal_block: - if internal_block.array_storage == "streamed": - return -1 - return i - - for i, external_block in enumerate(self.external_blocks): - if block == external_block: - if self._asdffile().uri is None: - msg = "Can't write external blocks, since URI of main file is unknown." - raise ValueError(msg) - - parts = list(patched_urllib_parse.urlparse(self._asdffile().uri)) - path = parts[2] - filename = os.path.basename(path) - return self.get_external_filename(filename, i) - - msg = "block not found." - raise ValueError(msg) - - def find_or_create_block_for_array(self, arr): - """ - For a given array, looks for an existing block containing its - underlying data. If not found, adds a new block to the block - list. Returns the index in the block list to the array. - - Parameters - ---------- - arr : numpy.ndarray - - Returns - ------- - block : Block - """ - from asdf.tags.core import ndarray - - if isinstance(arr, ndarray.NDArrayType) and arr.block is not None and arr.block in self.blocks: - return arr.block - - base = util.get_array_base(arr) - block = self._data_to_block_mapping.get(id(base)) - if block is not None: - return block - - block = Block(base) - self.add(block) - self._handle_global_block_settings(block) - return block - - def find_or_create_block(self, key): - """ - For a given hashable key, looks for an existing block. If not - found, adds a new block to the block list. Returns the index - in the block list to the array. 
- - Parameters - ---------- - key : hashable - - Returns - ------- - block : Block - """ - block = self._data_to_block_mapping.get(key) - if block is not None: - return block - - block = Block() - self.add(block, key=key) - self._handle_global_block_settings(block) - self._data_to_block_mapping[key] = block - - return block - - def get_streamed_block(self): - """ - Get the streamed block, which is always the last one. A - streamed block, on writing, does not manage data of its own, - but the user is expected to stream it to disk directly. - """ - block = self.streamed_block - if block is None: - block = Block(array_storage="streamed") - self.add(block) - return block - - def add_inline(self, array): - """ - Add an inline block for ``array`` to the block set. - """ - block = Block(array, array_storage="inline") - self.add(block) - return block - - def get_output_compressions(self): - """ - Get the list of unique compressions used on blocks. - """ - return list({b.output_compression for b in self.blocks}) - - def get_output_compression_extensions(self): - """ - Infer the compression extensions used on blocks. - Note that this is somewhat indirect and could be fooled if a new extension - for the same compression label is loaded after the compression of the block. - """ - ext = [] - for label in self.get_output_compressions(): - compressor = mcompression._get_compressor_from_extensions(label, return_extension=True) - if compressor is not None: - ext += [compressor[1]] # second item is the extension - return ext - - def __getitem__(self, arr): - return self.find_or_create_block_for_array(arr) - - def close(self): - for block in self.blocks: - block.close() diff --git a/asdf/_block/util.py b/asdf/_block/util.py deleted file mode 100644 index 7319e2751..000000000 --- a/asdf/_block/util.py +++ /dev/null @@ -1,87 +0,0 @@ -from collections import namedtuple - -from asdf import util - - -def calculate_updated_layout(blocks, tree_size, pad_blocks, block_size): - """ - Calculates a block layout that will try to use as many blocks as - possible in their original locations, though at this point the - algorithm is fairly naive. The result will be stored in the - offsets of the blocks. - - Parameters - ---------- - blocks : Blocks instance - - tree_size : int - The amount of space to reserve for the tree at the beginning. - - Returns - ------- - Returns `False` if no good layout can be found and one is best off - rewriting the file serially, otherwise, returns `True`. - """ - - def unfix_block(i): - # If this algorithm gets more sophisticated we could carefully - # move memmapped blocks around without clobbering other ones. - - # TODO: Copy to a tmpfile on disk and memmap it from there. 
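The layout strategy implemented in calculate_updated_layout is essentially first-fit over the gaps left between blocks that keep their original offsets. Reduced to plain intervals, the idea is roughly the following (names and sizes are illustrative only):

    from collections import namedtuple

    Interval = namedtuple("Interval", ["start", "end"])

    def first_fit(fixed, size, reserved):
        # ``fixed``: sorted, non-overlapping occupied [start, end) intervals.
        # ``reserved``: space kept free at the start of the file (the tree).
        # Return an offset where ``size`` bytes fit, appending after the
        # last interval when no gap is large enough.
        last_end = reserved
        for interval in fixed:
            if interval.start - last_end >= size:
                return last_end
            last_end = interval.end
        return last_end

    fixed = [Interval(100, 150), Interval(200, 300)]
    assert first_fit(fixed, 40, 90) == 150   # fits in the 150-200 gap
    assert first_fit(fixed, 80, 90) == 300   # no gap fits; append at the end

The real routine additionally pads each placement and copies the data of any block it has to displace, since a memmapped block cannot simply be handed a new offset.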
- entry = fixed[i] - copy = entry.block.data.copy() - entry.block.close() - entry.block._data = copy - del fixed[i] - free.append(entry.block) - - def fix_block(block, offset): - block.offset = offset - fixed.append(Entry(block.offset, block.offset + block.size, block)) - fixed.sort() - - Entry = namedtuple("Entry", ["start", "end", "block"]) - - fixed = [] - free = [] - for block in blocks._internal_blocks: - if block.offset is not None: - block.update_size() - fixed.append(Entry(block.offset, block.offset + block.size, block)) - else: - free.append(block) - - if not len(fixed): - return False - - fixed.sort() - - # Make enough room at the beginning for the tree, by popping off - # blocks at the beginning - while len(fixed) and fixed[0].start < tree_size: - unfix_block(0) - - if not len(fixed): - return False - - # This algorithm is pretty basic at this point -- it just looks - # for the first open spot big enough for the free block to fit. - while len(free): - block = free.pop() - last_end = tree_size - for entry in fixed: - if entry.start - last_end >= block.size: - fix_block(block, last_end) - break - last_end = entry.end - else: - padding = util.calculate_padding(entry.block.size, pad_blocks, block_size) - fix_block(block, last_end + padding) - - if blocks.streamed_block is not None: - padding = util.calculate_padding(fixed[-1].block.size, pad_blocks, block_size) - blocks.streamed_block.offset = fixed[-1].end + padding - - blocks._sort_blocks_by_offset() - - return True diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py index c3236dc08..4f73cfb30 100644 --- a/asdf/_tests/test_array_blocks.py +++ b/asdf/_tests/test_array_blocks.py @@ -8,7 +8,6 @@ from numpy.testing import assert_array_equal import asdf -from asdf import _block as block from asdf import constants, generic_io from asdf._block import io as bio @@ -734,33 +733,6 @@ def test_open_no_memmap(tmp_path): assert not isinstance(array.base, np.memmap) -def test_fd_not_seekable(): - data = np.ones(1024) - b = block.Block(data=data) - fd = io.BytesIO() - - seekable = lambda: False # noqa: E731 - fd.seekable = seekable - - write_array = lambda arr: fd.write(arr.tobytes()) # noqa: E731 - fd.write_array = write_array - - read_blocks = lambda us: [fd.read(us)] # noqa: E731 - fd.read_blocks = read_blocks - - fast_forward = lambda offset: fd.seek(offset, 1) # noqa: E731 - fd.fast_forward = fast_forward - - b.output_compression = "zlib" - b.write(fd) - fd.seek(0) - b = block.Block() - b.read(fd) - # We lost the information about the underlying array type, - # but still can compare the bytes. 
- assert b.data.tobytes() == data.tobytes() - - def test_add_block_before_fully_loaded(tmp_path): """ This test covers a subtle case where a block is added @@ -844,50 +816,6 @@ def assert_result(ff): assert_result(ff2) -@pytest.mark.parametrize("memmap", [True, False]) -@pytest.mark.parametrize("lazy_load", [True, False]) -def test_data_callback(tmp_path, memmap, lazy_load): - class Callback: - def __init__(self, data): - self.n_calls = 0 - self.data = data - - def __call__(self): - self.n_calls += 1 - return self.data - - arr = np.array([1, 2, 3], dtype="uint8") - callback = Callback(arr) - b = block.Block(memmap=memmap, lazy_load=lazy_load, data_callback=callback) - - assert callback.n_calls == 0 - assert b.data is arr - assert callback.n_calls == 1 - assert b._data is None - assert b.data is arr - assert callback.n_calls == 2 - - fn = tmp_path / "test.b" - with generic_io.get_file(fn, mode="w") as f: - b.write(f) - assert callback.n_calls == 3 - - with generic_io.get_file(fn, mode="r") as f: - rb = block.Block(memmap=memmap, lazy_load=lazy_load) - rb.read(f, past_magic=False) - assert_array_equal(rb.data, arr) - - with pytest.raises(ValueError, match=r"Block.__init__ cannot contain non-None data and a non-None data_callback"): - b = block.Block(data=arr, memmap=memmap, lazy_load=lazy_load, data_callback=callback) - - rb = block.Block(memmap=memmap, lazy_load=lazy_load, data_callback=callback) - with pytest.raises(RuntimeError, match=r"read called on a Block with a data_callback"), generic_io.get_file( - fn, - mode="r", - ) as f: - rb.read(f, past_magic=False) - - def test_remove_blocks(tmp_path): """Test that writing to a new file""" fn1 = tmp_path / "test.asdf" diff --git a/asdf/commands/edit.py b/asdf/commands/edit.py index 61fc28d12..08b7a254d 100644 --- a/asdf/commands/edit.py +++ b/asdf/commands/edit.py @@ -16,7 +16,6 @@ import yaml from asdf import constants, generic_io, schema, util -from asdf._block import BlockManager from asdf.asdf import AsdfFile, open_asdf from .main import Command @@ -130,20 +129,14 @@ def write_edited_yaml_larger(path, new_content, version): pad_length = util.calculate_padding(len(new_content), True, fd.block_size) fd.fast_forward(pad_length) + # copy blocks from original_fd to fd + fd.tell() with generic_io.get_file(path) as original_fd: - # Consume the file up to the first block, which must exist - # as a precondition to using this method. 
- original_fd.seek_until( - constants.BLOCK_MAGIC, - len(constants.BLOCK_MAGIC), - ) - ctx = AsdfFile(version=version) - blocks = BlockManager(ctx, copy_arrays=False, lazy_load=False) - blocks.read_internal_blocks(original_fd, past_magic=True, validate_checksums=False) - blocks.finish_reading_internal_blocks() - blocks.write_internal_blocks_serial(fd) - blocks.write_block_index(fd, ctx) - blocks.close() + original_fd.seek_until(constants.BLOCK_MAGIC, len(constants.BLOCK_MAGIC)) + fd.write(constants.BLOCK_MAGIC) + block_size = min(fd.block_size, original_fd.block_size) + while bs := original_fd.read(block_size): + fd.write(bs) # the file needs to be closed here to release all memmaps original_fd.close() From 3d9259781e9d303ce5e6c9c7e55ca02b6005405e Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 2 May 2023 11:26:11 -0400 Subject: [PATCH 029/154] increasing _block test coverage --- asdf/_block/callback.py | 7 +++++-- asdf/_block/manager.py | 14 ++++---------- asdf/_block/reader.py | 9 --------- asdf/asdf.py | 4 +--- 4 files changed, 10 insertions(+), 24 deletions(-) diff --git a/asdf/_block/callback.py b/asdf/_block/callback.py index 8c07d526b..00475b1d9 100644 --- a/asdf/_block/callback.py +++ b/asdf/_block/callback.py @@ -3,8 +3,7 @@ class DataCallback: def __init__(self, index, read_blocks): - self._index = index - self._read_blocks_ref = weakref.ref(read_blocks) + self.reassign(index, read_blocks) def __call__(self, _attr=None): read_blocks = self._read_blocks_ref() @@ -17,3 +16,7 @@ def __call__(self, _attr=None): # _attr allows NDArrayType to have low level block access for things # like reading the header and cached_data return getattr(read_blocks[self._index], _attr) + + def reassign(self, index, read_blocks): + self._index = index + self._read_blocks_ref = weakref.ref(read_blocks) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 6c1af18f9..f34d986d4 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -56,12 +56,7 @@ class BlockOptions(store.Store): def __init__(self, read_blocks=None): super().__init__() - if read_blocks is None: - self._read_blocks = ReadBlocks([]) - elif isinstance(read_blocks, ReadBlocks): - self._read_blocks = read_blocks - else: - self._read_blocks = ReadBlocks(read_blocks) + self._read_blocks = read_blocks def get_options(self, array): base = util.get_array_base(array) @@ -144,11 +139,10 @@ def resolve_external_uri(uri, relative): class Manager: def __init__(self, read_blocks=None, uri=None): - self.options = BlockOptions(read_blocks) if read_blocks is None: - self.blocks = self.options._read_blocks - else: - self.blocks = read_blocks + read_blocks = ReadBlocks([]) + self.options = BlockOptions(read_blocks) + self.blocks = read_blocks self._data_callbacks = store.Store() self._write_blocks = store.LinearStore() self._external_write_blocks = [] diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py index 99d03df37..a5fa7662f 100644 --- a/asdf/_block/reader.py +++ b/asdf/_block/reader.py @@ -56,15 +56,6 @@ def header(self): self.load() return self._header - def reset(self, fd, offset): - self._fd = weakref.ref(fd) - self.offset = offset - self.header = None - self.data_offset = None - self._data = None - if not self.lazy_load: - self.load() - def read_blocks_serially(fd, memmap=False, lazy_load=False): blocks = [] diff --git a/asdf/asdf.py b/asdf/asdf.py index 40eb1a8dc..5a620e471 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -5,7 +5,6 @@ import pathlib import time import warnings -import weakref from packaging.version 
import Version @@ -1243,8 +1242,7 @@ def update( # update data callbacks to point to new blocks cb = self._blocks._data_callbacks.lookup_by_object(obj) if cb is not None: - cb._index = new_index - cb._read_blocks_ref = weakref.ref(new_read_blocks) + cb.reassign(new_index, new_read_blocks) # update read blocks to reflect new state self._blocks.blocks = new_read_blocks From 17d9d51e89251caf258dd18b42354d4fdb1849bf Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 2 May 2023 12:51:28 -0400 Subject: [PATCH 030/154] cleaning up _blocks usage in AsdfFile.write_to --- asdf/_block/manager.py | 43 +++++++++++++++++++++++ asdf/asdf.py | 77 +++++++++++------------------------------- 2 files changed, 63 insertions(+), 57 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index f34d986d4..70ccae1e6 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -1,3 +1,5 @@ +import contextlib +import copy import os import weakref @@ -15,6 +17,13 @@ class ReadBlocks(store.LinearStore): [block_0, block_1, ...] """ + def set_blocks(self, blocks): + self._items = blocks + # TODO should this invalidate the associations? + + def append_block(self, block): + self._items.append(block) + # def get_block_for_array(self, array): # base = util.get_array_base(array) # block_index = self.lookup_by_object(base) @@ -231,3 +240,37 @@ def _get_array_compression_kwargs(self, arr): def get_output_compressions(self): return self.options.get_output_compressions() + + @contextlib.contextmanager + def options_context(self): + previous_options = copy.deepcopy(self.options) + yield + self.options = previous_options + self.options._read_blocks = self.blocks + + @contextlib.contextmanager + def write_context(self, fd, copy_options=True): + self._write_fd = fd + self._clear_write() + if copy_options: + with self.options_context(): + yield + else: + yield + self._clear_write() + self._write_fd = None + + def write(self, fd, pad_blocks, include_block_index): + if self._write_fd is None or fd is not self._write_fd: + msg = "Write called outside of valid write_context" + raise OSError(msg) + if len(self._write_blocks) or self._streamed_block: + write_blocks( + fd, + self._write_blocks, + pad_blocks, + streamed_block=self._streamed_block, + write_index=include_block_index, + ) + if len(self._external_write_blocks): + self._write_external_blocks() diff --git a/asdf/asdf.py b/asdf/asdf.py index 5a620e471..ee3635c0f 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -15,9 +15,9 @@ from . import constants, generic_io, reference, schema, treeutil, util, versioning, yamlutil from ._block import io as bio from ._block import reader as block_reader -from ._block import store as block_store from ._block import writer as block_writer from ._block.manager import Manager as BlockManager +from ._block.manager import ReadBlocks from ._block.options import Options as BlockOptions from ._helpers import validate_version from .config import config_context, get_config @@ -603,19 +603,15 @@ def comments(self): return self._comments def _validate(self, tree, custom=True, reading=False): - previous_options = copy.deepcopy(self._blocks.options) + with self._blocks.options_context(): + # If we're validating on read then the tree + # is already guaranteed to be in tagged form. + tagged_tree = tree if reading else yamlutil.custom_tree_to_tagged_tree(tree, self) - # If we're validating on read then the tree - # is already guaranteed to be in tagged form. 
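options_context captures the snapshot-and-restore idiom that _validate and write_to previously spelled out by hand. A stripped-down standalone version of the pattern (illustrative class, not the asdf API) is:

    import contextlib
    import copy

    class OptionsHolder:
        def __init__(self):
            self.options = {"compression": "input"}

        @contextlib.contextmanager
        def options_context(self):
            # Snapshot the current options, let the caller mutate them
            # freely, then put the snapshot back on exit.
            previous = copy.deepcopy(self.options)
            try:
                yield
            finally:
                self.options = previous

    holder = OptionsHolder()
    with holder.options_context():
        holder.options["compression"] = "zlib"
    assert holder.options["compression"] == "input"

The try/finally in this sketch restores the snapshot even if the body raises; that is a choice of the sketch, not a claim about the code above.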
- tagged_tree = tree if reading else yamlutil.custom_tree_to_tagged_tree(tree, self) - - schema.validate(tagged_tree, self, reading=reading) - # Perform secondary validation pass if requested - if custom and self._custom_schema: - schema.validate(tagged_tree, self, self._custom_schema, reading=reading) - - self._blocks.options = previous_options - self._blocks.options._read_blocks = self._blocks.blocks + schema.validate(tagged_tree, self, reading=reading) + # Perform secondary validation pass if requested + if custom and self._custom_schema: + schema.validate(tagged_tree, self, self._custom_schema, reading=reading) def validate(self): """ @@ -863,7 +859,7 @@ def _open_asdf( msg = "ASDF file appears to contain garbage after header." raise OSError(msg) - self._blocks.blocks._items = read_blocks + self._blocks.blocks.set_blocks(read_blocks) if tree is None: # At this point the tree should be tagged, but we want it to be @@ -982,46 +978,15 @@ def _pre_write(self, fd): if len(self._tree): self._run_hook("pre_write") - # This is where we'd do some more sophisticated block - # reorganization, if necessary - # self._blocks.finalize(self) - - # self._tree["asdf_library"] = get_asdf_library_info() - def _serial_write(self, fd, pad_blocks, include_block_index): - self._blocks._clear_write() - # prep a tree for a writing tree = copy.copy(self._tree) tree["asdf_library"] = get_asdf_library_info() if "history" in self._tree: tree["history"] = copy.deepcopy(self._tree["history"]) - # TODO copy block options - previous_options = copy.deepcopy(self._blocks.options) - self._write_tree(tree, fd, pad_blocks) - if len(self._blocks._write_blocks) or self._blocks._streamed_block: - block_writer.write_blocks( - fd, - self._blocks._write_blocks, - pad_blocks, - streamed_block=self._blocks._streamed_block, - write_index=include_block_index, - ) - if len(self._blocks._external_write_blocks): - self._blocks._write_external_blocks() - self._blocks._clear_write() - self._blocks.options = previous_options - self._blocks.options._read_blocks = self._blocks.blocks - - def _random_write(self, fd, pad_blocks, include_block_index): - self._write_tree(self._tree, fd, False) - self._blocks.write_internal_blocks_random_access(fd) - self._blocks.write_external_blocks(fd.uri, pad_blocks) - if include_block_index: - self._blocks.write_block_index(fd, self) - fd.truncate() + self._blocks.write(fd, pad_blocks, include_block_index) def _post_write(self, fd): if len(self._tree): @@ -1199,7 +1164,7 @@ def update( end_of_file = self._fd.tell() # map new blocks to old blocks - new_read_blocks = block_store.LinearStore() + new_read_blocks = ReadBlocks() for i, (offset, header) in enumerate(zip(offsets, headers)): if i == len(self._blocks._write_blocks): # this is a streamed block obj = self._blocks._streamed_obj() @@ -1235,7 +1200,7 @@ def update( new_read_block = block_reader.ReadBlock( offset + 4, self._fd, memmap, True, header=header, data=data ) - new_read_blocks._items.append(new_read_block) + new_read_blocks.append_block(new_read_block) new_index = len(new_read_blocks) - 1 new_read_blocks.assign_object(obj, new_read_block) @@ -1347,15 +1312,13 @@ def write_to( self.version = version with generic_io.get_file(fd, mode="w") as fd: - self._blocks._write_fd = fd - self._pre_write(fd) - - try: - self._serial_write(fd, pad_blocks, include_block_index) - fd.flush() - finally: - self._post_write(fd) - self._blocks._write_fd = None + with self._blocks.write_context(fd): + self._pre_write(fd) + try: + self._serial_write(fd, pad_blocks, 
include_block_index) + fd.flush() + finally: + self._post_write(fd) def find_references(self): """ From 517cb147211684f488d9752c1696cddaf0b3756d Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 2 May 2023 13:53:58 -0400 Subject: [PATCH 031/154] drop asdffile from __init__ args to NDArrayType --- asdf/tags/core/ndarray.py | 56 +++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index 36ff041e0..3fe37a5ca 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -1,5 +1,6 @@ import mmap import sys +import weakref import numpy as np from numpy import ma @@ -234,8 +235,7 @@ class NDArrayType(_types._AsdfType): supported_versions = {"1.0.0", "1.1.0"} types = [np.ndarray, ma.MaskedArray] - def __init__(self, source, shape, dtype, offset, strides, order, mask, asdffile): - self._asdffile = asdffile + def __init__(self, source, shape, dtype, offset, strides, order, mask): # source can be a: # - list of numbers for an inline block # - string for an external block @@ -258,8 +258,6 @@ def __init__(self, source, shape, dtype, offset, strides, order, mask, asdffile) self._offset = offset self._strides = strides self._order = order - if not asdffile._blocks.lazy_load: - self._make_array() def _make_array(self): # If the ASDF file has been updated in-place, then there's @@ -273,14 +271,12 @@ def _make_array(self): self._array = None if self._array is None: - if isinstance(self._source, str): - data = ( - self._asdffile.open_external(self._source, lazy_load=False, copy_arrays=True)._blocks.blocks[0].data - ) - else: + if callable(self._source): # cached data is used here so that multiple NDArrayTypes will all use # the same base array data = self._source(_attr="cached_data") + else: + data = self._source if hasattr(data, "base") and isinstance(data.base, mmap.mmap) and data.base.closed: raise OSError("Attempt to read data from a closed file") @@ -428,7 +424,7 @@ def __getattribute__(self, name): @classmethod def from_tree(cls, node, ctx): if isinstance(node, list): - instance = cls(node, None, None, None, None, None, None, ctx) + instance = cls(node, None, None, None, None, None, None) ctx._blocks._set_array_storage(instance, "inline") return instance @@ -448,17 +444,29 @@ def from_tree(cls, node, ctx): mask = node.get("mask", None) if isinstance(source, int): - block_index = source - source = ctx._blocks._get_data_callback(source) + data = ctx._blocks._get_data_callback(source) + instance = cls(data, shape, dtype, offset, strides, "A", mask) + ctx._blocks.blocks.assign_object(instance, ctx._blocks.blocks[source]) + ctx._blocks._data_callbacks.assign_object(instance, data) + elif isinstance(source, str): + # external + def data(_attr=None, _ref=weakref.ref(ctx)): + ctx = _ref() + if ctx is None: + msg = "Failed to resolve reference to AsdfFile to read external block" + raise OSError(msg) + array = ctx.open_external(source)._blocks.blocks[0].cached_data + ctx._blocks._set_array_storage(array, "external") + return array + + instance = cls(data, shape, dtype, offset, strides, "A", mask) else: - block_index = None - instance = cls(source, shape, dtype, offset, strides, "A", mask, ctx) - if block_index is not None: - ctx._blocks.blocks.assign_object(instance, ctx._blocks.blocks[block_index]) - ctx._blocks._data_callbacks.assign_object(instance, source) - else: - if not isinstance(source, str): - ctx._blocks._set_array_storage(instance, "inline") + # inline + instance = cls(source, shape, 
dtype, offset, strides, "A", mask) + ctx._blocks._set_array_storage(instance, "inline") + + if not ctx._blocks.lazy_load: + instance._make_array() return instance msg = "Invalid ndarray description." @@ -601,14 +609,6 @@ def assert_allclose(cls, old, new): else: cls._assert_equality(old, new, assert_allclose) - # @classmethod - # def copy_to_new_asdf(cls, node, asdffile): - # if isinstance(node, NDArrayType): - # array = node._make_array() - # asdffile._blocks.set_array_storage(asdffile._blocks[array], node.block.array_storage) - # return node._make_array() - # return node - def _make_operation(name): def operation(self, *args): From f7cc999bf7c31979c6f4ff9eadba00b133720b8e Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 2 May 2023 14:24:06 -0400 Subject: [PATCH 032/154] cleanup some comments --- asdf/_block/manager.py | 38 -------------------------------------- asdf/_block/options.py | 1 + asdf/tags/core/ndarray.py | 9 +-------- 3 files changed, 2 insertions(+), 46 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 70ccae1e6..6187a82c2 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -24,38 +24,6 @@ def set_blocks(self, blocks): def append_block(self, block): self._items.append(block) - # def get_block_for_array(self, array): - # base = util.get_array_base(array) - # block_index = self.lookup_by_object(base) - # if block_index is None: - # return self.get_block_with_data(base) - # return self[block_index] - - # def get_block_with_data(self, array): - # base = util.get_array_base(array) - # for (block_index, block) in enumerate(self): - # if block._data is not None and not callable(block._data): - # if block._data is base: - # if self.lookup_by_object(base) is None: - # self.assign_array_to_block_index(base, block_index) - # return block - # return None - - # def assign_read_blocks(self): - # for (block_index, block) in enumerate(self): - # if block._data is not None and not callabale(block._data): - # self.assign_array_to_block_index(block._data, block_index) - # base = util.get_array_base(block._data) - - # def assign_array_to_block_index(self, array, block_index): - # base = util.get_array_base(array) - # self.assign_object(base, block_index) - - # def assign_array_to_block(self, array, block): - # block_index = self.index(block) - # self.assign_array_to_block_index(array, block_index) - pass - class BlockOptions(store.Store): """ @@ -101,9 +69,6 @@ def set_options(self, array, options): base = util.get_array_base(array) self.assign_object(base, options) - # TODO copy to allow for changing settings on write - # TODO make an 'update_options' - def get_output_compressions(self): compressions = set() cfg = config.get_config() @@ -130,9 +95,6 @@ def make_external_uri(uri, index): dirname, filename = os.path.split(path) filename = os.path.splitext(filename)[0] + f"{index:04d}.asdf" return filename - # path = os.path.join(dirname, filename) - # parts[2] = path - # return util.patched_urllib_parse.urlunparse(parts) def resolve_external_uri(uri, relative): diff --git a/asdf/_block/options.py b/asdf/_block/options.py index 3425e60a9..ff7c6a2a8 100644 --- a/asdf/_block/options.py +++ b/asdf/_block/options.py @@ -15,6 +15,7 @@ def __init__(self, storage_type=None, compression_type=None, compression_kwargs= # set kwargs first to avoid overwrite when compression type changes self.compression_kwargs = compression_kwargs self.compression = compression_type + # set storage type last to possibly overwrite compression/compression_kwargs self.storage_type = 
storage_type diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index 3fe37a5ca..792dfe185 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -238,8 +238,7 @@ class NDArrayType(_types._AsdfType): def __init__(self, source, shape, dtype, offset, strides, order, mask): # source can be a: # - list of numbers for an inline block - # - string for an external block - # - a data callback for an internal block + # - a data callback for an internal or externalblock self._source = source self._array = None self._mask = mask @@ -521,7 +520,6 @@ def to_tree(cls, obj, ctx): strides = None if data.flags.c_contiguous else data.strides dtype, byteorder = numpy_dtype_to_asdf_datatype( data.dtype, - # include_byteorder=(block.array_storage != "inline"), include_byteorder=(options.storage_type != "inline"), ) @@ -531,7 +529,6 @@ def to_tree(cls, obj, ctx): if options.storage_type == "streamed": result["shape"][0] = "*" - # if block.array_storage == "inline": if options.storage_type == "inline": listdata = numpy_array_to_list(data) result["data"] = listdata @@ -542,8 +539,6 @@ def to_tree(cls, obj, ctx): if options.storage_type == "streamed": result["shape"][0] = "*" - # result["source"] = ctx._blocks.get_source(block) - # convert data to byte array if options.storage_type == "streamed": ctx._blocks.set_streamed_block(base, data) result["source"] = -1 @@ -559,9 +554,7 @@ def to_tree(cls, obj, ctx): result["strides"] = list(strides) if isinstance(data, ma.MaskedArray) and np.any(data.mask): - # if block.array_storage == "inline": if options.storage_type == "inline": - # ctx._blocks.set_array_storage(ctx._blocks[data.mask], "inline") ctx._blocks._set_array_storage(data.mask, "inline") result["mask"] = data.mask From f68f5eec316f40a833012f1f14b581f8afd5beaf Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 2 May 2023 15:08:54 -0400 Subject: [PATCH 033/154] enable block only files --- asdf/_block/reader.py | 26 ++++++++++++++------------ asdf/_tests/test_file_format.py | 2 -- asdf/asdf.py | 6 +++--- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py index a5fa7662f..274bbd17c 100644 --- a/asdf/_block/reader.py +++ b/asdf/_block/reader.py @@ -57,23 +57,24 @@ def header(self): return self._header -def read_blocks_serially(fd, memmap=False, lazy_load=False): +def read_blocks_serially(fd, memmap=False, lazy_load=False, after_magic=False): blocks = [] buff = b"" while True: # the expectation is that this will begin PRIOR to the block magic # read 4 bytes - buff += fd.read(4 - len(buff)) - if len(buff) < 4: - # we are done, there are no more blocks and no index - # TODO error? we shouldn't have extra bytes, the old code allows this - break + if not after_magic: + buff += fd.read(4 - len(buff)) + if len(buff) < 4: + # we are done, there are no more blocks and no index + # TODO error? 
we shouldn't have extra bytes, the old code allows this + break if buff == constants.INDEX_HEADER[:4]: # we hit the block index, which is not useful here break - if buff == constants.BLOCK_MAGIC: + if after_magic or buff == constants.BLOCK_MAGIC: # this is another block offset, header, data_offset, data = bio.read_block(fd, memmap=memmap, lazy_load=lazy_load) blocks.append(ReadBlock(offset, fd, memmap, lazy_load, header=header, data_offset=data_offset, data=data)) @@ -82,6 +83,7 @@ def read_blocks_serially(fd, memmap=False, lazy_load=False): # can stop looking for more blocks break buff = b"" + after_magic = False else: if len(blocks) or buff[0] != 0: # if this is not the first block or we haven't found any @@ -93,10 +95,10 @@ def read_blocks_serially(fd, memmap=False, lazy_load=False): return blocks -def read_blocks(fd, memmap=False, lazy_load=False): +def read_blocks(fd, memmap=False, lazy_load=False, after_magic=False): if not lazy_load or not fd.seekable(): # load all blocks serially - return read_blocks_serially(fd, memmap, lazy_load) + return read_blocks_serially(fd, memmap, lazy_load, after_magic) # try to find block index starting_offset = fd.tell() @@ -104,7 +106,7 @@ def read_blocks(fd, memmap=False, lazy_load=False): if index_offset is None: # if failed, load all blocks serially fd.seek(starting_offset) - return read_blocks_serially(fd, memmap, lazy_load) + return read_blocks_serially(fd, memmap, lazy_load, after_magic) # setup empty blocks try: @@ -112,7 +114,7 @@ def read_blocks(fd, memmap=False, lazy_load=False): except OSError: # failed to read block index, fall back to serial reading fd.seek(starting_offset) - return read_blocks_serially(fd, memmap, lazy_load) + return read_blocks_serially(fd, memmap, lazy_load, after_magic) # skip magic for each block blocks = [ReadBlock(offset + 4, fd, memmap, lazy_load) for offset in block_index] try: @@ -126,5 +128,5 @@ def read_blocks(fd, memmap=False, lazy_load=False): blocks[index].load() except (OSError, ValueError): fd.seek(starting_offset) - return read_blocks_serially(fd, memmap, lazy_load) + return read_blocks_serially(fd, memmap, lazy_load, after_magic) return blocks diff --git a/asdf/_tests/test_file_format.py b/asdf/_tests/test_file_format.py index 36b315c72..6c9cc593b 100644 --- a/asdf/_tests/test_file_format.py +++ b/asdf/_tests/test_file_format.py @@ -136,7 +136,6 @@ def test_invalid_header_version(): pass -@pytest.mark.xfail(reason="block only file support is broken") def test_block_mismatch(): # This is a file with a single small block, followed by something # that has an invalid block magic number. @@ -151,7 +150,6 @@ def test_block_mismatch(): pass -@pytest.mark.xfail(reason="block only file support is broken") def test_block_header_too_small(): # The block header size must be at least 40 diff --git a/asdf/asdf.py b/asdf/asdf.py index ee3635c0f..6577808f6 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -852,9 +852,9 @@ def _open_asdf( read_blocks = block_reader.read_blocks(fd, self._blocks.memmap, self._blocks.lazy_load) elif yaml_token == constants.BLOCK_MAGIC: # this file has only blocks - raise NotImplementedError("Support for block only file does not yet exist") - # since we're after the magic, if seekable, just reset - # if not seekable, read first block, then read the reset serially, add them all up + read_blocks = block_reader.read_blocks( + fd, self._blocks.memmap, self._blocks.lazy_load, after_magic=True + ) elif yaml_token != b"": msg = "ASDF file appears to contain garbage after header." 
raise OSError(msg) From f9da319d5cbf70b73d8e9fe0d0bf58cdc1c4bf89 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 2 May 2023 16:15:58 -0400 Subject: [PATCH 034/154] clear _write_fd during block manager _clear_write --- asdf/_block/manager.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 6187a82c2..9bea82f04 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -127,7 +127,7 @@ def _clear_write(self): self._external_write_blocks = [] self._streamed_block = None self._streamed_obj = None - # self._write_fd = None + self._write_fd = None def _write_external_blocks(self): from asdf import AsdfFile @@ -212,15 +212,14 @@ def options_context(self): @contextlib.contextmanager def write_context(self, fd, copy_options=True): - self._write_fd = fd self._clear_write() + self._write_fd = fd if copy_options: with self.options_context(): yield else: yield self._clear_write() - self._write_fd = None def write(self, fd, pad_blocks, include_block_index): if self._write_fd is None or fd is not self._write_fd: From ef038f826e8dffe3bb68779768af050f842e6f9f Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 2 May 2023 16:52:22 -0400 Subject: [PATCH 035/154] make serialization context aware of current object allowing the removal of reserve_blocks and BlockKey --- asdf/_tests/test_block_converter.py | 20 +----- asdf/_tests/test_util.py | 15 ----- asdf/extension/_serialization_context.py | 81 ++++++++++++------------ asdf/util.py | 22 ------- asdf/yamlutil.py | 10 ++- 5 files changed, 51 insertions(+), 97 deletions(-) diff --git a/asdf/_tests/test_block_converter.py b/asdf/_tests/test_block_converter.py index a63a06a4b..76ef4758d 100644 --- a/asdf/_tests/test_block_converter.py +++ b/asdf/_tests/test_block_converter.py @@ -11,8 +11,6 @@ class BlockData: def __init__(self, payload): self.payload = payload - # generate a unique id - self._asdf_key = asdf.util.BlockKey() class BlockConverter(Converter): @@ -22,8 +20,7 @@ class BlockConverter(Converter): def to_yaml_tree(self, obj, tag, ctx): # lookup source for obj - block_index = ctx.find_block_index( - obj._asdf_key, + block_index = ctx.find_available_block_index( lambda: np.ndarray(len(obj.payload), dtype="uint8", buffer=obj.payload), ) return { @@ -34,16 +31,8 @@ def from_yaml_tree(self, node, tag, ctx): block_index = node["block_index"] data = ctx.get_block_data_callback(block_index)() obj = BlockData(data.tobytes()) - ctx.assign_block_key(block_index, obj._asdf_key) return obj - def reserve_blocks(self, obj, tag): - if self._return_invalid_keys: - # return something unhashable - self._return_invalid_keys = False - return [[]] - return [obj._asdf_key] - class BlockExtension(Extension): tags = ["asdf://somewhere.org/tags/block_data-1.0.0"] @@ -100,7 +89,6 @@ class BlockDataCallback: def __init__(self, callback): self.callback = callback - self._asdf_key = asdf.util.BlockKey() @property def data(self): @@ -112,7 +100,7 @@ class BlockDataCallbackConverter(Converter): types = [BlockDataCallback] def to_yaml_tree(self, obj, tag, ctx): - block_index = ctx.find_block_index(obj._asdf_key, obj.callback) + block_index = ctx.find_available_block_index(obj.callback) return { "block_index": block_index, } @@ -121,12 +109,8 @@ def from_yaml_tree(self, node, tag, ctx): block_index = node["block_index"] obj = BlockDataCallback(ctx.get_block_data_callback(block_index)) - ctx.assign_block_key(block_index, obj._asdf_key) return obj - def reserve_blocks(self, obj, tag): - return [obj._asdf_key] 
- class BlockDataCallbackExtension(Extension): tags = ["asdf://somewhere.org/tags/block_data_callback-1.0.0"] diff --git a/asdf/_tests/test_util.py b/asdf/_tests/test_util.py index 944631941..112221390 100644 --- a/asdf/_tests/test_util.py +++ b/asdf/_tests/test_util.py @@ -1,4 +1,3 @@ -import copy import io import pytest @@ -118,17 +117,3 @@ def test_minversion(): assert util.minversion(yaml, "3.1") assert util.minversion("yaml", "3.1") - - -def test_block_key(): - bk = util.BlockKey() - # make sure block key is hashable and can serve as a dictionary key - hash(bk) - d = {bk: 1} - # a new key should produce a different hash than the first - bk2 = util.BlockKey() - d[bk2] = 2 - assert len(d) == 2 - # check that equality and copying a key works - assert copy.copy(bk) == bk - assert bk != hash(bk) diff --git a/asdf/extension/_serialization_context.py b/asdf/extension/_serialization_context.py index eba47ccc5..0697925e2 100644 --- a/asdf/extension/_serialization_context.py +++ b/asdf/extension/_serialization_context.py @@ -1,3 +1,5 @@ +import contextlib + from asdf._helpers import validate_version from asdf.extension import ExtensionProxy @@ -77,7 +79,22 @@ def _extensions_used(self): """ return self.__extensions_used - def get_block_data_callback(self, index): + @contextlib.contextmanager + def _deserialization(self): + self._obj = None + self._blk = None + self._cb = None + yield self + if self._blk is not None: + self._blocks.blocks.assign_object(self._obj, self._blk) + self._blocks._data_callbacks.assign_object(self._obj, self._cb) + + @contextlib.contextmanager + def _serialization(self, obj): + self._obj = obj + yield self + + def get_block_data_callback(self, index, key=None): """ Generate a callable that when called will read data from a block at the provided index @@ -87,6 +104,9 @@ def get_block_data_callback(self, index): index : int Block index + key : BlockKey + TODO + Returns ------- callback : callable @@ -94,56 +114,39 @@ def get_block_data_callback(self, index): the block data as a one dimensional array of uint8 """ blk = self._blocks.blocks[index] + cb = self._blocks._get_data_callback(index) - def callback(blk=blk): - return blk.data - - return callback - - def assign_block_key(self, block_index, key): - """ - Associate a unique hashable key with a block. - - This is used during Converter.from_yaml_tree and allows - the AsdfFile to be aware of which blocks belong to the - object handled by the converter and allows load_block - to locate the block using the key instead of the index - (which might change if a file undergoes an AsdfFile.update). - - If the block index is later needed (like during to_yaml_tree) - the key can be used with find_block_index to lookup the - block index. - - Parameters - ---------- - - block_index : int - The index of the block to associate with the key + if key is None: + if self._blk is not None: + msg = "Converters accessing >1 block must provide a key for each block" + raise OSError(msg) + self._blk = blk + self._cb = cb + else: + self._blocks.blocks.assign_object(key, blk) + self._blocks._data_callbacks.assign_object(key, cb) - key : hashable - A unique hashable key to associate with a block - """ - self._blocks.blocks.assign_object(key, self._blocks.blocks[block_index]) + return cb - def find_block_index(self, lookup_key, data_callback=None): + def find_available_block_index(self, data_callback, lookup_key=None): """ - Find the index of a previously allocated or reserved block. + Find the index of an available block to write data. 
This is typically used inside asdf.extension.Converter.to_yaml_tree Parameters ---------- - lookup_key : hashable - Unique key used to retrieve the index of a block that was - previously allocated or reserved. For ndarrays this is - typically the id of the base ndarray. - - data_callback: callable, optional + data_callback: callable Callable that when called will return data (ndarray) that will be written to a block. At the moment, this is only assigned if a new block is created to avoid circular references during AsdfFile.update. + lookup_key : hashable, optional + Unique key used to retrieve the index of a block that was + previously allocated or reserved. For ndarrays this is + typically the id of the base ndarray. + Returns ------- block_index: int @@ -151,6 +154,6 @@ def find_block_index(self, lookup_key, data_callback=None): will be written. """ - # TODO eventually this will need to map memmap blocks to not rewrite data - # TODO lookup options from previous block + if lookup_key is None: + lookup_key = self._obj return self._blocks.make_write_block(data_callback, BlockOptions(), lookup_key) diff --git a/asdf/util.py b/asdf/util.py index 58623d84e..7d6dc1d69 100644 --- a/asdf/util.py +++ b/asdf/util.py @@ -529,25 +529,3 @@ class FileType(enum.Enum): ASDF = 1 FITS = 2 UNKNOWN = 3 - - -class BlockKey: - """ - Helper class that generates a unique hashable value for every instance - useful for associates blocks and objects during serialization and - deserialization - """ - - _next = 0 - - def __init__(self): - self._key = BlockKey._next - BlockKey._next += 1 - - def __hash__(self): - return self._key - - def __eq__(self, other): - if not isinstance(other, BlockKey): - return NotImplemented - return self._key == other._key diff --git a/asdf/yamlutil.py b/asdf/yamlutil.py index 8b2252dd6..e114ed40a 100644 --- a/asdf/yamlutil.py +++ b/asdf/yamlutil.py @@ -228,7 +228,8 @@ def _convert_obj(obj): # object which will be handled by a different converter while tag is None: converters.add(converter) - obj = converter.to_yaml_tree(obj, tag, _serialization_context) + with _serialization_context._serialization(obj): + obj = converter.to_yaml_tree(obj, tag, _serialization_context) try: converter = extension_manager.get_converter_for_type(type(obj)) except KeyError: @@ -239,7 +240,8 @@ def _convert_obj(obj): msg = "Conversion cycle detected" raise TypeError(msg) tag = converter.select_tag(obj, _serialization_context) - node = converter.to_yaml_tree(obj, tag, _serialization_context) + with _serialization_context._serialization(obj): + node = converter.to_yaml_tree(obj, tag, _serialization_context) if isinstance(node, GeneratorType): generator = node @@ -310,7 +312,9 @@ def _walker(node): if extension_manager.handles_tag(tag): converter = extension_manager.get_converter_for_tag(tag) - obj = converter.from_yaml_tree(node.data, tag, _serialization_context) + with _serialization_context._deserialization() as sctx: + obj = converter.from_yaml_tree(node.data, tag, sctx) + sctx._obj = obj _serialization_context._mark_extension_used(converter.extension) return obj From 27df3a79382e06b72c0f1d7fc967e2713235875a Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 2 May 2023 16:53:40 -0400 Subject: [PATCH 036/154] add _block/callback tests --- asdf/_tests/_block/test_callback.py | 56 +++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 asdf/_tests/_block/test_callback.py diff --git a/asdf/_tests/_block/test_callback.py b/asdf/_tests/_block/test_callback.py new file mode 100644 index 
000000000..1b4d438bd --- /dev/null +++ b/asdf/_tests/_block/test_callback.py @@ -0,0 +1,56 @@ +import pytest + +from asdf._block.callback import DataCallback +from asdf._block.store import LinearStore + + +def test_default_attribute(): + class Data: + def __init__(self, value): + self.data = value + + blks = LinearStore([Data("a"), Data("b")]) + cbs = [DataCallback(0, blks), DataCallback(1, blks)] + + assert cbs[0]() == "a" + assert cbs[1]() == "b" + + +def test_attribute_access(): + class Foo: + def __init__(self, attr, value): + setattr(self, attr, value) + + blks = LinearStore([Foo("a", "foo"), Foo("a", "bar")]) + cb = DataCallback(0, blks) + + assert cb(_attr="a") == "foo" + + +def test_weakref(): + class Data: + def __init__(self, value): + self.data = value + + blks = LinearStore([Data("a"), Data("b")]) + cb = DataCallback(0, blks) + del blks + + with pytest.raises(OSError, match="Attempt to read block data from missing block"): + cb() + + +def test_reassign(): + class Data: + def __init__(self, value): + self.data = value + + blks = LinearStore([Data("a"), Data("b")]) + cb = DataCallback(0, blks) + + assert cb() == "a" + + blks2 = LinearStore([Data("c"), Data("d")]) + cb.reassign(1, blks2) + + assert cb() == "d" From df704d770e3d40042dbed9c90193d970b236229b Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 2 May 2023 17:35:27 -0400 Subject: [PATCH 037/154] add ndarray converter (external blocks broken) --- asdf/__init__.py | 21 +++- asdf/_tests/_helpers.py | 5 +- asdf/_tests/test_stream.py | 16 +-- asdf/core/_converters/ndarray.py | 182 ++++++++++++++++++++++++++++++ asdf/core/_extensions.py | 2 + asdf/tags/core/__init__.py | 2 + asdf/tags/core/ndarray.py | 188 +------------------------------ asdf/{ => tags/core}/stream.py | 30 +---- 8 files changed, 223 insertions(+), 223 deletions(-) create mode 100644 asdf/core/_converters/ndarray.py rename asdf/{ => tags/core}/stream.py (50%) diff --git a/asdf/__init__.py b/asdf/__init__.py index 92bbcf0fc..9e13d4185 100644 --- a/asdf/__init__.py +++ b/asdf/__init__.py @@ -23,6 +23,23 @@ from .asdf import open_asdf as open from .config import config_context, get_config from .exceptions import ValidationError -from .stream import Stream -from .tags.core import IntegerType +from .tags.core import IntegerType, Stream from .tags.core.external_reference import ExternalArrayReference + + +def __getattr__(name): + if name == "stream": + import warnings + + import asdf.tags.core.stream + from asdf.exceptions import AsdfDeprecationWarning + + warnings.warn( + "asdf.stream is deprecated. 
Please use asdf.tags.core.stream", + AsdfDeprecationWarning, + ) + + return asdf.tags.core.stream + + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/asdf/_tests/_helpers.py b/asdf/_tests/_helpers.py index 75a48dd37..f24386f17 100644 --- a/asdf/_tests/_helpers.py +++ b/asdf/_tests/_helpers.py @@ -19,6 +19,7 @@ except ImportError: CartesianDifferential = None +import numpy as np import yaml import asdf @@ -148,6 +149,8 @@ def recurse(old, new): elif ICRS is not None and isinstance(old, ICRS): assert old.ra == new.ra assert old.dec == new.dec + elif all([isinstance(obj, (np.ndarray, asdf.tags.core.NDArrayType)) for obj in (old, new)]): + np.testing.assert_array_equal(old, new) else: assert old == new @@ -440,7 +443,7 @@ def _assert_extension_type_correctness(extension, extension_type, resolver): if extension_type.yaml_tag is not None and extension_type.yaml_tag.startswith(YAML_TAG_PREFIX): return - if extension_type == asdf.stream.Stream: + if extension_type == asdf.Stream: # Stream is a special case. It was implemented as a subclass of NDArrayType, # but shares a tag with that class, so it isn't really a distinct type. return diff --git a/asdf/_tests/test_stream.py b/asdf/_tests/test_stream.py index 65b83431e..3b66cec8c 100644 --- a/asdf/_tests/test_stream.py +++ b/asdf/_tests/test_stream.py @@ -6,13 +6,13 @@ from numpy.testing import assert_array_equal import asdf -from asdf import generic_io, stream +from asdf import Stream, generic_io def test_stream(): buff = io.BytesIO() - tree = {"stream": stream.Stream([6, 2], np.float64)} + tree = {"stream": Stream([6, 2], np.float64)} ff = asdf.AsdfFile(tree) ff.write_to(buff) @@ -35,7 +35,7 @@ def test_stream_write_nothing(): buff = io.BytesIO() - tree = {"stream": stream.Stream([6, 2], np.float64)} + tree = {"stream": Stream([6, 2], np.float64)} ff = asdf.AsdfFile(tree) ff.write_to(buff) @@ -54,7 +54,7 @@ def test_stream_twice(): buff = io.BytesIO() - tree = {"stream": stream.Stream([6, 2], np.uint8), "stream2": stream.Stream([12, 2], np.uint8)} + tree = {"stream": Stream([6, 2], np.uint8), "stream2": Stream([12, 2], np.uint8)} ff = asdf.AsdfFile(tree) ff.write_to(buff) @@ -72,7 +72,7 @@ def test_stream_twice(): def test_stream_with_nonstream(): buff = io.BytesIO() - tree = {"nonstream": np.array([1, 2, 3, 4], np.int64), "stream": stream.Stream([6, 2], np.float64)} + tree = {"nonstream": np.array([1, 2, 3, 4], np.int64), "stream": Stream([6, 2], np.float64)} ff = asdf.AsdfFile(tree) # Since we're testing with small arrays, force this array to be stored in @@ -95,7 +95,7 @@ def test_stream_with_nonstream(): def test_stream_real_file(tmp_path): path = os.path.join(str(tmp_path), "test.asdf") - tree = {"nonstream": np.array([1, 2, 3, 4], np.int64), "stream": stream.Stream([6, 2], np.float64)} + tree = {"nonstream": np.array([1, 2, 3, 4], np.int64), "stream": Stream([6, 2], np.float64)} with open(path, "wb") as fd: ff = asdf.AsdfFile(tree) @@ -116,7 +116,7 @@ def test_stream_real_file(tmp_path): def test_stream_to_stream(): - tree = {"nonstream": np.array([1, 2, 3, 4], np.int64), "stream": stream.Stream([6, 2], np.float64)} + tree = {"nonstream": np.array([1, 2, 3, 4], np.int64), "stream": Stream([6, 2], np.float64)} buff = io.BytesIO() fd = generic_io.OutputStream(buff) @@ -179,7 +179,7 @@ def test_too_many_streams(): def test_stream_repr_and_str(): - tree = {"stream": stream.Stream([16], np.int64)} + tree = {"stream": Stream([16], np.int64)} ff = asdf.AsdfFile(tree) repr(ff.tree["stream"]) diff --git 
a/asdf/core/_converters/ndarray.py b/asdf/core/_converters/ndarray.py new file mode 100644 index 000000000..64470f056 --- /dev/null +++ b/asdf/core/_converters/ndarray.py @@ -0,0 +1,182 @@ +from asdf.extension import Converter + + +class NDArrayConverter(Converter): + tags = [ + "tag:stsci.edu:asdf/core/ndarray-1.0.0", + "tag:stsci.edu:asdf/core/ndarray-1.1.0", + ] + types = [ + "numpy.ndarray", + "numpy.ma.core.MaskedArray", + "asdf.tags.core.ndarray.NDArrayType", + "asdf.tags.core.stream.Stream", + ] + + def to_yaml_tree(self, obj, tag, ctx): + import numpy as np + from numpy import ma + + from asdf import util + from asdf._block.options import Options + from asdf.config import config_context + from asdf.tags.core.ndarray import NDArrayType, numpy_array_to_list, numpy_dtype_to_asdf_datatype + from asdf.tags.core.stream import Stream + + data = obj + + if isinstance(obj, Stream): + # TODO previously, stream never passed on data? + ctx._blocks.set_streamed_block(data._array, data) + + result = {} + result["source"] = -1 + result["shape"] = ["*", *data._shape] + result["datatype"] = data._datatype + result["byteorder"] = data._byteorder + if data._strides is not None: + result["strides"] = data._strides + return result + + # The ndarray-1.0.0 schema does not permit 0 valued strides. + # Perhaps we'll want to allow this someday, to efficiently + # represent an array of all the same value. + if any(stride == 0 for stride in data.strides): + data = np.ascontiguousarray(data) + + # The view computations that follow assume that the base array + # is contiguous. If not, we need to make a copy to avoid + # writing a nonsense view. + base = util.get_array_base(data) + if not base.flags.forc: + data = np.ascontiguousarray(data) + base = util.get_array_base(data) + + shape = data.shape + + if isinstance(obj, NDArrayType) and isinstance(obj._source, str): + # this is an external block, if we have no other settings, keep it as external + options = ctx._blocks.options.lookup_by_object(data) + if options is None: + options = Options("external") + else: + options = ctx._blocks.options.get_options(data) + + with config_context() as cfg: + if cfg.all_array_storage is not None: + options.storage_type = cfg.all_array_storage + if cfg.all_array_compression != "input": + options.compression = cfg.all_array_compression + options.compression_kwargs = cfg.all_array_compression_kwargs + inline_threshold = cfg.array_inline_threshold + + if inline_threshold is not None and options.storage_type in ("inline", "internal"): + if data.size < inline_threshold: + options.storage_type = "inline" + else: + options.storage_type = "internal" + ctx._blocks.options.set_options(data, options) + + # Compute the offset relative to the base array and not the + # block data, in case the block is compressed. 
+ offset = data.ctypes.data - base.ctypes.data + + strides = None if data.flags.c_contiguous else data.strides + dtype, byteorder = numpy_dtype_to_asdf_datatype( + data.dtype, + include_byteorder=(options.storage_type != "inline"), + ) + + result = {} + + result["shape"] = list(shape) + if options.storage_type == "streamed": + result["shape"][0] = "*" + + if options.storage_type == "inline": + listdata = numpy_array_to_list(data) + result["data"] = listdata + result["datatype"] = dtype + + else: + result["shape"] = list(shape) + if options.storage_type == "streamed": + result["shape"][0] = "*" + + if options.storage_type == "streamed": + ctx._blocks.set_streamed_block(base, data) + result["source"] = -1 + else: + result["source"] = ctx._blocks.make_write_block(base, options, obj) + result["datatype"] = dtype + result["byteorder"] = byteorder + + if offset > 0: + result["offset"] = offset + + if strides is not None: + result["strides"] = list(strides) + + if isinstance(data, ma.MaskedArray) and np.any(data.mask): + if options.storage_type == "inline": + ctx._blocks._set_array_storage(data.mask, "inline") + + result["mask"] = data.mask + + return result + + def from_yaml_tree(self, node, tag, ctx): + import sys + import weakref + + from asdf.tags.core import NDArrayType + from asdf.tags.core.ndarray import asdf_datatype_to_numpy_dtype + + if isinstance(node, list): + instance = NDArrayType(node, None, None, None, None, None, None) + ctx._blocks._set_array_storage(instance, "inline") + return instance + + if isinstance(node, dict): + source = node.get("source") + data = node.get("data") + if source and data: + msg = "Both source and data may not be provided at the same time" + raise ValueError(msg) + if data: + source = data + shape = node.get("shape", None) + byteorder = sys.byteorder if data is not None else node["byteorder"] + dtype = asdf_datatype_to_numpy_dtype(node["datatype"], byteorder) if "datatype" in node else None + offset = node.get("offset", 0) + strides = node.get("strides", None) + mask = node.get("mask", None) + + if isinstance(source, int): + data = ctx.get_block_data_callback(source) + instance = NDArrayType(data, shape, dtype, offset, strides, "A", mask) + elif isinstance(source, str): + raise NotImplementedError("external blocks broken... again") + + # external + def data(_attr=None, _ref=weakref.ref(ctx)): + ctx = _ref() + if ctx is None: + msg = "Failed to resolve reference to AsdfFile to read external block" + raise OSError(msg) + array = ctx.open_external(source)._blocks.blocks[0].cached_data + ctx._blocks._set_array_storage(array, "external") + return array + + instance = NDArrayType(data, shape, dtype, offset, strides, "A", mask) + else: + # inline + instance = NDArrayType(source, shape, dtype, offset, strides, "A", mask) + ctx._blocks._set_array_storage(instance, "inline") + + if not ctx._blocks.lazy_load: + instance._make_array() + return instance + + msg = "Invalid ndarray description." 
+ raise TypeError(msg) diff --git a/asdf/core/_extensions.py b/asdf/core/_extensions.py index 685994e27..f7493af7c 100644 --- a/asdf/core/_extensions.py +++ b/asdf/core/_extensions.py @@ -4,6 +4,7 @@ from ._converters.constant import ConstantConverter from ._converters.external_reference import ExternalArrayReferenceConverter from ._converters.reference import ReferenceConverter +from ._converters.ndarray import NDArrayConverter from ._converters.tree import ( AsdfObjectConverter, ExtensionMetadataConverter, @@ -23,6 +24,7 @@ SoftwareConverter(), SubclassMetadataConverter(), ReferenceConverter(), + NDArrayConverter(), ] diff --git a/asdf/tags/core/__init__.py b/asdf/tags/core/__init__.py index bf9e6138f..f075e6fce 100644 --- a/asdf/tags/core/__init__.py +++ b/asdf/tags/core/__init__.py @@ -2,6 +2,7 @@ from .external_reference import ExternalArrayReference from .integer import IntegerType from .ndarray import NDArrayType +from .stream import Stream __all__ = [ "AsdfObject", @@ -13,6 +14,7 @@ "NDArrayType", "IntegerType", "ExternalArrayReference", + "Stream", ] diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index 792dfe185..eb6d4de18 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -1,6 +1,5 @@ import mmap import sys -import weakref import numpy as np from numpy import ma @@ -229,7 +228,7 @@ def ascii_to_unicode(x): return ascii_to_unicode(tolist(array)) -class NDArrayType(_types._AsdfType): +class NDArrayType: name = "core/ndarray" version = "1.0.0" supported_versions = {"1.0.0", "1.1.0"} @@ -420,188 +419,6 @@ def __getattribute__(self, name): return _types._AsdfType.__getattribute__(self, name) - @classmethod - def from_tree(cls, node, ctx): - if isinstance(node, list): - instance = cls(node, None, None, None, None, None, None) - ctx._blocks._set_array_storage(instance, "inline") - return instance - - if isinstance(node, dict): - source = node.get("source") - data = node.get("data") - if source and data: - msg = "Both source and data may not be provided at the same time" - raise ValueError(msg) - if data: - source = data - shape = node.get("shape", None) - byteorder = sys.byteorder if data is not None else node["byteorder"] - dtype = asdf_datatype_to_numpy_dtype(node["datatype"], byteorder) if "datatype" in node else None - offset = node.get("offset", 0) - strides = node.get("strides", None) - mask = node.get("mask", None) - - if isinstance(source, int): - data = ctx._blocks._get_data_callback(source) - instance = cls(data, shape, dtype, offset, strides, "A", mask) - ctx._blocks.blocks.assign_object(instance, ctx._blocks.blocks[source]) - ctx._blocks._data_callbacks.assign_object(instance, data) - elif isinstance(source, str): - # external - def data(_attr=None, _ref=weakref.ref(ctx)): - ctx = _ref() - if ctx is None: - msg = "Failed to resolve reference to AsdfFile to read external block" - raise OSError(msg) - array = ctx.open_external(source)._blocks.blocks[0].cached_data - ctx._blocks._set_array_storage(array, "external") - return array - - instance = cls(data, shape, dtype, offset, strides, "A", mask) - else: - # inline - instance = cls(source, shape, dtype, offset, strides, "A", mask) - ctx._blocks._set_array_storage(instance, "inline") - - if not ctx._blocks.lazy_load: - instance._make_array() - return instance - - msg = "Invalid ndarray description." - raise TypeError(msg) - - @classmethod - def to_tree(cls, obj, ctx): - data = obj - # The ndarray-1.0.0 schema does not permit 0 valued strides. 
- # Perhaps we'll want to allow this someday, to efficiently - # represent an array of all the same value. - if any(stride == 0 for stride in data.strides): - data = np.ascontiguousarray(data) - - # The view computations that follow assume that the base array - # is contiguous. If not, we need to make a copy to avoid - # writing a nonsense view. - base = util.get_array_base(data) - if not base.flags.forc: - data = np.ascontiguousarray(data) - base = util.get_array_base(data) - - shape = data.shape - - if isinstance(obj, NDArrayType) and isinstance(obj._source, str): - # this is an external block, if we have no other settings, keep it as external - options = ctx._blocks.options.lookup_by_object(data) - if options is None: - options = Options("external") - else: - options = ctx._blocks.options.get_options(data) - - with config_context() as cfg: - if cfg.all_array_storage is not None: - options.storage_type = cfg.all_array_storage - if cfg.all_array_compression != "input": - options.compression = cfg.all_array_compression - options.compression_kwargs = cfg.all_array_compression_kwargs - inline_threshold = cfg.array_inline_threshold - - if inline_threshold is not None and options.storage_type in ("inline", "internal"): - if data.size < inline_threshold: - options.storage_type = "inline" - else: - options.storage_type = "internal" - ctx._blocks.options.set_options(data, options) - - # Compute the offset relative to the base array and not the - # block data, in case the block is compressed. - offset = data.ctypes.data - base.ctypes.data - - strides = None if data.flags.c_contiguous else data.strides - dtype, byteorder = numpy_dtype_to_asdf_datatype( - data.dtype, - include_byteorder=(options.storage_type != "inline"), - ) - - result = {} - - result["shape"] = list(shape) - if options.storage_type == "streamed": - result["shape"][0] = "*" - - if options.storage_type == "inline": - listdata = numpy_array_to_list(data) - result["data"] = listdata - result["datatype"] = dtype - - else: - result["shape"] = list(shape) - if options.storage_type == "streamed": - result["shape"][0] = "*" - - if options.storage_type == "streamed": - ctx._blocks.set_streamed_block(base, data) - result["source"] = -1 - else: - result["source"] = ctx._blocks.make_write_block(base, options, obj) - result["datatype"] = dtype - result["byteorder"] = byteorder - - if offset > 0: - result["offset"] = offset - - if strides is not None: - result["strides"] = list(strides) - - if isinstance(data, ma.MaskedArray) and np.any(data.mask): - if options.storage_type == "inline": - ctx._blocks._set_array_storage(data.mask, "inline") - - result["mask"] = data.mask - - return result - - @classmethod - def _assert_equality(cls, old, new, func): - if old.dtype.fields: - if not new.dtype.fields: - # This line is safe because this is actually a piece of test - # code, even though it lives in this file: - msg = "arrays not equal" - raise AssertionError(msg) - for a, b in zip(old, new): - cls._assert_equality(a, b, func) - else: - old = old.__array__() - new = new.__array__() - if old.dtype.char in "SU": - if old.dtype.char == "S": - old = old.astype("U") - if new.dtype.char == "S": - new = new.astype("U") - old = old.tolist() - new = new.tolist() - # This line is safe because this is actually a piece of test - # code, even though it lives in this file: - assert old == new # noqa: S101 - else: - func(old, new) - - @classmethod - def assert_equal(cls, old, new): - from numpy.testing import assert_array_equal - - cls._assert_equality(old, new, 
assert_array_equal) - - @classmethod - def assert_allclose(cls, old, new): - from numpy.testing import assert_allclose, assert_array_equal - - if old.dtype.kind in "iu" and new.dtype.kind in "iu": - cls._assert_equality(old, new, assert_array_equal) - else: - cls._assert_equality(old, new, assert_allclose) - def _make_operation(name): def operation(self, *args): @@ -610,7 +427,8 @@ def operation(self, *args): return operation -classes_to_modify = [*NDArrayType.__versioned_siblings, NDArrayType] +# classes_to_modify = [*NDArrayType.__versioned_siblings, NDArrayType] +classes_to_modify = [NDArrayType] for op in [ "__neg__", "__pos__", diff --git a/asdf/stream.py b/asdf/tags/core/stream.py similarity index 50% rename from asdf/stream.py rename to asdf/tags/core/stream.py index 29fe68735..1133b1c85 100644 --- a/asdf/stream.py +++ b/asdf/tags/core/stream.py @@ -1,7 +1,7 @@ -from .tags.core import ndarray +from .ndarray import NDArrayType, numpy_dtype_to_asdf_datatype -class Stream(ndarray.NDArrayType): +class Stream(NDArrayType): """ Used to put a streamed array into the tree. @@ -26,37 +26,13 @@ class Stream(ndarray.NDArrayType): def __init__(self, shape, dtype, strides=None): self._shape = shape - self._datatype, self._byteorder = ndarray.numpy_dtype_to_asdf_datatype(dtype) + self._datatype, self._byteorder = numpy_dtype_to_asdf_datatype(dtype) self._strides = strides self._array = None def _make_array(self): self._array = None - @classmethod - def reserve_blocks(cls, data, ctx): - if isinstance(data, Stream): - yield ctx._blocks.get_streamed_block() - - @classmethod - def from_tree(cls, data, ctx): - # this is never called because tags always trigger loading with NDArrayType - raise NotImplementedError("never called") - - @classmethod - def to_tree(cls, data, ctx): - # TODO previously, stream never passed on data? 
- ctx._blocks.set_streamed_block(data._array, data) - - result = {} - result["source"] = -1 - result["shape"] = ["*", *data._shape] - result["datatype"] = data._datatype - result["byteorder"] = data._byteorder - if data._strides is not None: - result["strides"] = data._strides - return result - def __repr__(self): return f"Stream({self._shape}, {self._datatype}, strides={self._strides})" From a3b4085c7ee8278886b8d951b031eb87ff679e3e Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 3 May 2023 09:58:57 -0400 Subject: [PATCH 038/154] external blocks working again --- asdf/_block/external.py | 23 +++++++++++++++++++++++ asdf/_block/manager.py | 19 ++++++++++++++++--- asdf/asdf.py | 4 ++++ asdf/core/_converters/ndarray.py | 14 +++++++------- 4 files changed, 50 insertions(+), 10 deletions(-) create mode 100644 asdf/_block/external.py diff --git a/asdf/_block/external.py b/asdf/_block/external.py new file mode 100644 index 000000000..2fb232f26 --- /dev/null +++ b/asdf/_block/external.py @@ -0,0 +1,23 @@ +from asdf import generic_io, util + + +class UseInternal: + pass + + +class ExternalBlockCache: + def __init__(self): + self._cache = {} + + def load(self, base_uri, uri): + key = util.get_base_uri(uri) + if key not in self._cache: + resolved_uri = generic_io.resolve_uri(base_uri, uri) + if resolved_uri == "" or resolved_uri == base_uri: + return UseInternal + + from asdf import open as asdf_open + + with asdf_open(resolved_uri, lazy_load=False, copy_arrays=True) as af: + self._cache[key] = af._blocks.blocks[0].cached_data + return self._cache[key] diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 9bea82f04..3d57a4eeb 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -7,6 +7,7 @@ from . import store from .callback import DataCallback +from .external import ExternalBlockCache, UseInternal from .options import Options from .writer import WriteBlock, write_blocks @@ -97,7 +98,9 @@ def make_external_uri(uri, index): return filename -def resolve_external_uri(uri, relative): +def relative_uri_to_full(uri, relative): + # file://foo/bar, bam -> file://foo/bam + # TODO replace with generic_io.resolve_uri if uri is None: uri = "" parts = list(util.patched_urllib_parse.urlparse(uri)) @@ -121,6 +124,13 @@ def __init__(self, read_blocks=None, uri=None): self._streamed_obj = None self._write_fd = None self._uri = uri + self._external_block_cache = ExternalBlockCache() + + def _load_external(self, uri): + value = self._external_block_cache.load(self._uri, uri) + if value is UseInternal: + return self._blocks.blocks[0].data + return value def _clear_write(self): self._write_blocks = store.LinearStore() @@ -139,7 +149,7 @@ def _write_external_blocks(self): raise ValueError("Can't write external blocks, since URI of main file is unknown.") for blk in self._external_write_blocks: - uri = resolve_external_uri(self._write_fd.uri, blk._uri) + uri = relative_uri_to_full(self._write_fd.uri, blk._uri) af = AsdfFile() with generic_io.get_file(uri, mode="w") as f: af.write_to(f, include_block_index=False) @@ -157,7 +167,10 @@ def make_write_block(self, data, options, obj): # need to set up new external block index = len(self._external_write_blocks) blk = WriteBlock(data, options.compression, options.compression_kwargs) - base_uri = self._uri or self._write_fd.uri + if self._write_fd is not None: + base_uri = self._write_fd.uri or self._uri + else: + base_uri = self._uri blk._uri = make_external_uri(base_uri, index) self._external_write_blocks.append(blk) return blk._uri diff --git 
a/asdf/asdf.py b/asdf/asdf.py index 6577808f6..826e689f7 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -172,6 +172,7 @@ def __init__( msg = "Can not copy AsdfFile and change active extensions" raise ValueError(msg) self._uri = tree.uri + self._blocks._uri = self._uri # Set directly to self._tree (bypassing property), since # we can assume the other AsdfFile is already valid. self._tree = tree.tree @@ -804,6 +805,9 @@ def _open_asdf( with config_context(): self._mode = fd.mode self._fd = fd + if self._fd._uri: + self._uri = self._fd._uri + self._blocks._uri = self._fd._uri # The filename is currently only used for tracing warning information self._fname = self._fd._uri if self._fd._uri else "" try: diff --git a/asdf/core/_converters/ndarray.py b/asdf/core/_converters/ndarray.py index 64470f056..1598442e3 100644 --- a/asdf/core/_converters/ndarray.py +++ b/asdf/core/_converters/ndarray.py @@ -156,18 +156,18 @@ def from_yaml_tree(self, node, tag, ctx): data = ctx.get_block_data_callback(source) instance = NDArrayType(data, shape, dtype, offset, strides, "A", mask) elif isinstance(source, str): - raise NotImplementedError("external blocks broken... again") - # external - def data(_attr=None, _ref=weakref.ref(ctx)): - ctx = _ref() - if ctx is None: + def data(_attr=None, _ref=weakref.ref(ctx._blocks)): + blks = _ref() + if blks is None: msg = "Failed to resolve reference to AsdfFile to read external block" raise OSError(msg) - array = ctx.open_external(source)._blocks.blocks[0].cached_data - ctx._blocks._set_array_storage(array, "external") + array = blks._load_external(source) + blks._set_array_storage(array, "external") return array + # data = ctx._blocks._load_external(source) + # ctx._blocks._set_array_storage(data, "external") instance = NDArrayType(data, shape, dtype, offset, strides, "A", mask) else: # inline From 2c389007940dcf5616506286c2783683ccd00976 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 3 May 2023 10:14:10 -0400 Subject: [PATCH 039/154] update diff to not use NDArrayType directly --- asdf/commands/diff.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/asdf/commands/diff.py b/asdf/commands/diff.py index 1b9d19e97..d1e2f70c1 100644 --- a/asdf/commands/diff.py +++ b/asdf/commands/diff.py @@ -31,7 +31,6 @@ import asdf from asdf.tagged import Tagged -from asdf.tags.core.ndarray import NDArrayType from asdf.util import human_list from .main import Command @@ -259,8 +258,13 @@ def compare_ndarrays(diff_ctx, array0, array1, keys): if array0.get(field) != array1.get(field): differences.append(field) - array0 = NDArrayType.from_tree(array0, diff_ctx.asdf0) - array1 = NDArrayType.from_tree(array1, diff_ctx.asdf1) + def get_flat(af, keys): + for k in keys: + af = af[k] + return af + + array0 = get_flat(diff_ctx.asdf0, keys) + array1 = get_flat(diff_ctx.asdf1, keys) if not array_equal(array0, array1): differences.append("contents") From 4a26d7a09cfe554bb5a622c1d138f128c8d0a5ac Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 3 May 2023 10:33:02 -0400 Subject: [PATCH 040/154] clean up NDArrayType, xfail 1530 test fixing 1530 might require having NDArrayType subclass ndarray (to allow it to check the the memmap on every access). This would likely involve major code changes to NDArrayType which would make reviewing the already extensive changes more difficult. 
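Roughly, the staleness check such a subclass would need on every access looks like the
following (a sketch only, not part of this commit; _RemappedArray and _data_callback are
placeholder names for whatever reload hook the real class would carry):

    import mmap

    import numpy as np


    class _RemappedArray(np.ndarray):
        # carry the reload hook through to any views that get created
        def __array_finalize__(self, obj):
            self._data_callback = getattr(obj, "_data_callback", None)

        def _memmap_closed(self):
            # walk to the ultimate base and see whether it is a closed mmap,
            # mirroring the check _make_array already performs once
            base = self
            while getattr(base, "base", None) is not None:
                base = base.base
            return isinstance(base, mmap.mmap) and base.closed

A check like this would have to be wired into every data-access path (item access,
ufuncs, views), which is the invasive change referred to above; for now the 1530
test is simply marked xfail.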
--- asdf/_tests/_issues/test_1520.py | 1 - asdf/_tests/_issues/test_1530.py | 2 ++ asdf/tags/core/ndarray.py | 23 ++--------------------- 3 files changed, 4 insertions(+), 22 deletions(-) diff --git a/asdf/_tests/_issues/test_1520.py b/asdf/_tests/_issues/test_1520.py index 649d3c857..8b99b0c26 100644 --- a/asdf/_tests/_issues/test_1520.py +++ b/asdf/_tests/_issues/test_1520.py @@ -27,7 +27,6 @@ def test_1520(tmp_path): af[i][:] = np.random.randint(255, size=array_size) af[i][0] = i + 1 # this no longer causes update to fail - assert False af.update() with asdf.open(fn, mode="r") as af: diff --git a/asdf/_tests/_issues/test_1530.py b/asdf/_tests/_issues/test_1530.py index cdd82a169..0d7b65778 100644 --- a/asdf/_tests/_issues/test_1530.py +++ b/asdf/_tests/_issues/test_1530.py @@ -1,8 +1,10 @@ import numpy as np +import pytest import asdf +@pytest.mark.xfail(reason="fixing this may require subclassing ndarray") def test_1530(tmp_path): """ Calling update with memmapped data can create invalid data in memmap views diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index eb6d4de18..d678d5c2e 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -4,7 +4,7 @@ import numpy as np from numpy import ma -from asdf import _types, util +from asdf import util from asdf._jsonschema import ValidationError from asdf._block.options import Options from asdf.config import config_context @@ -229,11 +229,6 @@ def ascii_to_unicode(x): class NDArrayType: - name = "core/ndarray" - version = "1.0.0" - supported_versions = {"1.0.0", "1.1.0"} - types = [np.ndarray, ma.MaskedArray] - def __init__(self, source, shape, dtype, offset, strides, order, mask): # source can be a: # - list of numbers for an inline block @@ -408,17 +403,6 @@ def __setitem__(self, *args): raise - def __getattribute__(self, name): - # The presence of these attributes on an NDArrayType instance - # can cause problems when the array is passed to other - # libraries. 
- # See https://github.com/asdf-format/asdf/issues/1015 - if name in ("name", "version", "supported_versions"): - msg = f"'{self.__class__.name}' object has no attribute '{name}'" - raise AttributeError(msg) - - return _types._AsdfType.__getattribute__(self, name) - def _make_operation(name): def operation(self, *args): @@ -427,8 +411,6 @@ def operation(self, *args): return operation -# classes_to_modify = [*NDArrayType.__versioned_siblings, NDArrayType] -classes_to_modify = [NDArrayType] for op in [ "__neg__", "__pos__", @@ -493,8 +475,7 @@ def operation(self, *args): "__delitem__", "__contains__", ]: - [setattr(cls, op, _make_operation(op)) for cls in classes_to_modify] -del classes_to_modify + setattr(NDArrayType, op, _make_operation(op)) def _get_ndim(instance): From 91024bb0ad64a928d609ef10523a74fd5890a91e Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 3 May 2023 10:40:12 -0400 Subject: [PATCH 041/154] fix failing test from rebase --- asdf/_tests/tags/core/tests/test_ndarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asdf/_tests/tags/core/tests/test_ndarray.py b/asdf/_tests/tags/core/tests/test_ndarray.py index 2334c319e..29a7b651a 100644 --- a/asdf/_tests/tags/core/tests/test_ndarray.py +++ b/asdf/_tests/tags/core/tests/test_ndarray.py @@ -183,7 +183,7 @@ def test_array_inline_threshold_recursive(tmpdir): af = asdf.AsdfFile(tree) af.write_to(fn) with asdf.open(fn) as af: - assert len(list(af._blocks.internal_blocks)) == 0 + assert len(list(af._blocks.blocks)) == 0 def test_copy_inline(): From f955c93fd689e232992910b32d47386ad90c53db Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 3 May 2023 11:13:31 -0400 Subject: [PATCH 042/154] remove Block usage in pytest plugin --- pytest_asdf/plugin.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pytest_asdf/plugin.py b/pytest_asdf/plugin.py index c4a94241c..469cf7fe9 100644 --- a/pytest_asdf/plugin.py +++ b/pytest_asdf/plugin.py @@ -218,7 +218,7 @@ def from_parent( return result def runtest(self): - from asdf import AsdfFile, block, util + from asdf import AsdfFile, _block, generic_io, util from asdf._tests import _helpers as helpers from asdf.exceptions import AsdfDeprecationWarning @@ -239,12 +239,11 @@ def runtest(self): util.filepath_to_url(os.path.abspath(os.path.join(os.path.dirname(self.filename), "external.asdf"))) ] = ff2 - # Add some dummy blocks so that the ndarray examples work - for _ in range(3): - b = block.Block(np.zeros((1024 * 1024 * 8), dtype=np.uint8)) - b._used = True - ff._blocks.add(b) - b._array_storage = "streamed" + wb = _block.writer.WriteBlock(np.zeros(1024 * 1024 * 8, dtype=np.uint8)) + with generic_io.get_file(buff, mode="rw") as f: + f.seek(0, 2) + _block.writer.write_blocks(f, [wb, wb], streamed_block=wb) + f.seek(0) try: # Do not tolerate any warnings that occur during schema validation From bcbd30dcb53568b7a50cf462426920d15e918a7b Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 3 May 2023 14:37:32 -0400 Subject: [PATCH 043/154] Allow converter to handle subclasses l --- asdf/core/_converters/ndarray.py | 14 +++++++------- asdf/extension/_manager.py | 9 +++++++++ asdf/tags/core/ndarray.py | 14 ++++++++++---- asdf/yamlutil.py | 2 ++ 4 files changed, 28 insertions(+), 11 deletions(-) diff --git a/asdf/core/_converters/ndarray.py b/asdf/core/_converters/ndarray.py index 1598442e3..fa21ccd76 100644 --- a/asdf/core/_converters/ndarray.py +++ b/asdf/core/_converters/ndarray.py @@ -1,3 +1,5 @@ +import numpy as np + from asdf.extension import Converter @@ 
-7,7 +9,7 @@ class NDArrayConverter(Converter): "tag:stsci.edu:asdf/core/ndarray-1.1.0", ] types = [ - "numpy.ndarray", + np.ndarray, # we use the type here so the extension can find the sub-classes "numpy.ma.core.MaskedArray", "asdf.tags.core.ndarray.NDArrayType", "asdf.tags.core.stream.Stream", @@ -153,11 +155,11 @@ def from_yaml_tree(self, node, tag, ctx): mask = node.get("mask", None) if isinstance(source, int): - data = ctx.get_block_data_callback(source) - instance = NDArrayType(data, shape, dtype, offset, strides, "A", mask) + data_callback = ctx.get_block_data_callback(source) + instance = NDArrayType(source, shape, dtype, offset, strides, "A", mask, data_callback) elif isinstance(source, str): # external - def data(_attr=None, _ref=weakref.ref(ctx._blocks)): + def data_callback(_attr=None, _ref=weakref.ref(ctx._blocks)): blks = _ref() if blks is None: msg = "Failed to resolve reference to AsdfFile to read external block" @@ -166,9 +168,7 @@ def data(_attr=None, _ref=weakref.ref(ctx._blocks)): blks._set_array_storage(array, "external") return array - # data = ctx._blocks._load_external(source) - # ctx._blocks._set_array_storage(data, "external") - instance = NDArrayType(data, shape, dtype, offset, strides, "A", mask) + instance = NDArrayType(source, shape, dtype, offset, strides, "A", mask, data_callback) else: # inline instance = NDArrayType(source, shape, dtype, offset, strides, "A", mask) diff --git a/asdf/extension/_manager.py b/asdf/extension/_manager.py index 8094ef701..68b6459d3 100644 --- a/asdf/extension/_manager.py +++ b/asdf/extension/_manager.py @@ -92,6 +92,15 @@ def handles_type(self, typ): """ return typ in self._converters_by_type or get_class_name(typ, instance=False) in self._converters_by_type + def handles_subtype(self, typ): + for ctyp in self._converters_by_type: + if isinstance(ctyp, str): + continue + if issubclass(typ, ctyp): + self._converters_by_type[typ] = self._converters_by_type[ctyp] + return True + return False + def handles_tag_definition(self, tag): """ Return `True` if the specified tag has a definition. 
diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index d678d5c2e..d1a171221 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -229,11 +229,12 @@ def ascii_to_unicode(x): class NDArrayType: - def __init__(self, source, shape, dtype, offset, strides, order, mask): + def __init__(self, source, shape, dtype, offset, strides, order, mask, data_callback=None): # source can be a: # - list of numbers for an inline block # - a data callback for an internal or externalblock self._source = source + self._data_callback = data_callback self._array = None self._mask = mask @@ -264,11 +265,16 @@ def _make_array(self): self._array = None if self._array is None: - if callable(self._source): + if isinstance(self._source, str): + # we need to keep _source as a str to allow stdatamodels to + # support AsdfInFits + data = self._data_callback() + elif isinstance(self._source, int): # cached data is used here so that multiple NDArrayTypes will all use # the same base array - data = self._source(_attr="cached_data") + data = self._data_callback(_attr="cached_data") else: + # inline data data = self._source if hasattr(data, "base") and isinstance(data.base, mmap.mmap) and data.base.closed: @@ -351,7 +357,7 @@ def shape(self): if "*" in self._shape: if isinstance(self._source, str): return self._make_array().shape - data_size = self._source(_attr="header")["data_size"] + data_size = self._data_callback(_attr="header")["data_size"] if not data_size: return self._make_array().shape return tuple( diff --git a/asdf/yamlutil.py b/asdf/yamlutil.py index e114ed40a..96bf39165 100644 --- a/asdf/yamlutil.py +++ b/asdf/yamlutil.py @@ -269,6 +269,8 @@ def _convert_obj(obj): def _walker(obj): if extension_manager.handles_type(type(obj)): return _convert_obj(obj) + if extension_manager.handles_subtype(type(obj)): + return _convert_obj(obj) tag = ctx._type_index.from_custom_type( type(obj), From 17ae69f3c08b55b5aa675c990140edf0d661c441 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 3 May 2023 16:14:22 -0400 Subject: [PATCH 044/154] fixes for remote_data tests --- asdf/_block/io.py | 5 ++++- asdf/_tests/test_generic_io.py | 4 +--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/asdf/_block/io.py b/asdf/_block/io.py index 7066f5ea0..dda2af815 100644 --- a/asdf/_block/io.py +++ b/asdf/_block/io.py @@ -96,7 +96,10 @@ def read_block(fd, offset=None, memmap=False, lazy_load=False): if offset is None and fd.seekable(): offset = fd.tell() header = read_block_header(fd, offset) - data_offset = fd.tell() + if fd.seekable(): + data_offset = fd.tell() + else: + data_offset = None if lazy_load and fd.seekable(): # setup a callback to later load the data fd_ref = weakref.ref(fd) diff --git a/asdf/_tests/test_generic_io.py b/asdf/_tests/test_generic_io.py index 6bb3c929f..1101f9f05 100644 --- a/asdf/_tests/test_generic_io.py +++ b/asdf/_tests/test_generic_io.py @@ -274,7 +274,6 @@ def get_read_fd(): with _roundtrip(tree, get_write_fd, get_read_fd) as ff: assert len(ff._blocks.blocks) == 2 - assert isinstance(ff._blocks.blocks[0]._data, np.ndarray) assert (ff.tree["science_data"] == tree["science_data"]).all() @@ -322,8 +321,7 @@ def get_read_fd(): return generic_io.get_file(httpserver.url + "test.asdf") with _roundtrip(tree, get_write_fd, get_read_fd, write_options={"all_array_storage": "external"}) as ff: - assert len(list(ff._blocks.internal_blocks)) == 0 - assert len(list(ff._blocks.external_blocks)) == 2 + assert len(list(ff._blocks.blocks)) == 0 def 
test_exploded_stream_write(small_tree): From ff8dbb3684009b4ed05bb6ec60382ab44889fcff Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 4 May 2023 11:57:32 -0400 Subject: [PATCH 045/154] restrict subclass conversion to ndarray To provide backwards compability, the ndarray converter is allowed to convert subclasses of ndarray that are not handled by other converters. This subclass conversion results in a AsdfConversionWarning as a future ASDF version will not support this behavior (as converters require explicit definition of what types are handled). To aid in testing, a configuration option is added `convert_unknown_ndarray_subclasses` that is True by default but can be turned off to not allow conversion of subclasses (to more closely match the behavior of a future version of ASDF). --- asdf/_tests/test_yaml.py | 18 +++++++++++++++++- asdf/config.py | 28 ++++++++++++++++++++++++++++ asdf/extension/_manager.py | 15 +++++++++++++-- asdf/yamlutil.py | 22 +++++++++++++++++----- 4 files changed, 75 insertions(+), 8 deletions(-) diff --git a/asdf/_tests/test_yaml.py b/asdf/_tests/test_yaml.py index 74c30128c..a4f9f7e31 100644 --- a/asdf/_tests/test_yaml.py +++ b/asdf/_tests/test_yaml.py @@ -8,7 +8,7 @@ import asdf from asdf import tagged, treeutil, yamlutil -from asdf.exceptions import AsdfWarning +from asdf.exceptions import AsdfConversionWarning, AsdfWarning from . import _helpers as helpers @@ -293,3 +293,19 @@ def test_numpy_scalar(numpy_value, expected_value): assert abs_diff < eps, abs_diff else: assert loaded_value == expected_value + + +def test_ndarray_subclass_conversion(tmp_path): + class MyNDArray(np.ndarray): + pass + + fn = tmp_path / "test.asdf" + af = asdf.AsdfFile() + af["a"] = MyNDArray([1, 2, 3]) + with pytest.warns(AsdfConversionWarning, match=r"A ndarray subclass .*"): + af.write_to(fn) + + with asdf.config.config_context() as cfg: + cfg.convert_unknown_ndarray_subclasses = False + with pytest.raises(yaml.representer.RepresenterError, match=r".*cannot represent.*"): + af.write_to(fn) diff --git a/asdf/config.py b/asdf/config.py index cb78d0642..fd82629c4 100644 --- a/asdf/config.py +++ b/asdf/config.py @@ -23,6 +23,7 @@ DEFAULT_ALL_ARRAY_STORAGE = None DEFAULT_ALL_ARRAY_COMPRESSION = "input" DEFAULT_ALL_ARRAY_COMPRESSION_KWARGS = None +DEFAULT_CONVERT_UNKNOWN_NDARRAY_SUBCLASSES = True class AsdfConfig: @@ -44,6 +45,7 @@ def __init__(self): self._all_array_storage = DEFAULT_ALL_ARRAY_STORAGE self._all_array_compression = DEFAULT_ALL_ARRAY_COMPRESSION self._all_array_compression_kwargs = DEFAULT_ALL_ARRAY_COMPRESSION_KWARGS + self._convert_unknown_ndarray_subclasses = DEFAULT_CONVERT_UNKNOWN_NDARRAY_SUBCLASSES self._lock = threading.RLock() @@ -413,6 +415,30 @@ def validate_on_read(self, value): """ self._validate_on_read = value + @property + def convert_unknown_ndarray_subclasses(self): + """ + Get configuration that controls if ndarray subclasses + (subclasses that aren't otherwise handled by a specific + converter) are serialized as ndarray. If `True`, instances + of these subclasses will appear in ASDF files as ndarrays + and when loaded, will load as ndarrays. + + Note that these conversions will result in an + AsdfConversionWarning being issued as this support for + converting subclasses will be removed in a future version + of ASDF. 
+ + Returns + ------- + bool + """ + return self._convert_unknown_ndarray_subclasses + + @convert_unknown_ndarray_subclasses.setter + def convert_unknown_ndarray_subclasses(self, value): + self._convert_unknown_ndarray_subclasses = value + def __repr__(self): return ( "" ).format( self.array_inline_threshold, @@ -434,6 +461,7 @@ def __repr__(self): self.io_block_size, self.legacy_fill_schema_defaults, self.validate_on_read, + self.convert_unknown_ndarray_subclasses, ) diff --git a/asdf/extension/_manager.py b/asdf/extension/_manager.py index 68b6459d3..a8f0691ce 100644 --- a/asdf/extension/_manager.py +++ b/asdf/extension/_manager.py @@ -92,12 +92,11 @@ def handles_type(self, typ): """ return typ in self._converters_by_type or get_class_name(typ, instance=False) in self._converters_by_type - def handles_subtype(self, typ): + def _handles_subtype(self, typ): for ctyp in self._converters_by_type: if isinstance(ctyp, str): continue if issubclass(typ, ctyp): - self._converters_by_type[typ] = self._converters_by_type[ctyp] return True return False @@ -194,6 +193,18 @@ def get_converter_for_type(self, typ): ) raise KeyError(msg) from None + def _get_converter_for_subtype(self, typ): + for ctyp in self._converters_by_type: + if isinstance(ctyp, str): + continue + if issubclass(typ, ctyp): + return self._converters_by_type[ctyp] + msg = ( + f"No support available for Python type '{get_class_name(typ, instance=False)}'. " + "You may need to install or enable an extension." + ) + raise KeyError(msg) from None + @property def validator_manager(self): return self._validator_manager diff --git a/asdf/yamlutil.py b/asdf/yamlutil.py index 96bf39165..c225c7f93 100644 --- a/asdf/yamlutil.py +++ b/asdf/yamlutil.py @@ -5,7 +5,7 @@ import numpy as np import yaml -from . import schema, tagged, treeutil, util +from . import config, schema, tagged, treeutil, util from .constants import STSCI_SCHEMA_TAG_BASE, YAML_TAG_PREFIX from .exceptions import AsdfConversionWarning from .tags.core import AsdfObject @@ -220,8 +220,11 @@ def custom_tree_to_tagged_tree(tree, ctx, _serialization_context=None): extension_manager = _serialization_context.extension_manager - def _convert_obj(obj): - converter = extension_manager.get_converter_for_type(type(obj)) + def _convert_obj(obj, subtype=False): + if subtype: + converter = extension_manager._get_converter_for_subtype(type(obj)) + else: + converter = extension_manager.get_converter_for_type(type(obj)) tag = converter.select_tag(obj, _serialization_context) converters = set() # if select_tag returns None, converter.to_yaml_tree should return a new @@ -266,11 +269,20 @@ def _convert_obj(obj): if generator is not None: yield from generator + cfg = config.get_config() + convert_ndarray_subclasses = cfg.convert_unknown_ndarray_subclasses + def _walker(obj): if extension_manager.handles_type(type(obj)): return _convert_obj(obj) - if extension_manager.handles_subtype(type(obj)): - return _convert_obj(obj) + if convert_ndarray_subclasses and isinstance(obj, np.ndarray) and extension_manager._handles_subtype(type(obj)): + warnings.warn( + f"A ndarray subclass ({type(obj)}) was converted as a ndarray. " + "This behavior will be removed from a future version of ASDF. 
" + "See TODO some link", + AsdfConversionWarning, + ) + return _convert_obj(obj, subtype=True) tag = ctx._type_index.from_custom_type( type(obj), From 4f412464965e284ccccf8e4bac1b369805debdd3 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 4 May 2023 13:33:03 -0400 Subject: [PATCH 046/154] fix loading of empty inline arrays --- asdf/_tests/_issues/test_1538.py | 18 ++++++++++++++++++ asdf/core/_converters/ndarray.py | 15 ++++++++------- 2 files changed, 26 insertions(+), 7 deletions(-) create mode 100644 asdf/_tests/_issues/test_1538.py diff --git a/asdf/_tests/_issues/test_1538.py b/asdf/_tests/_issues/test_1538.py new file mode 100644 index 000000000..b79418722 --- /dev/null +++ b/asdf/_tests/_issues/test_1538.py @@ -0,0 +1,18 @@ +import numpy as np + +import asdf + + +def test_1538(tmp_path): + """ + ASDF unable to read empty inline array + + https://github.com/asdf-format/asdf/issues/1538 + """ + fn = tmp_path / "test.asdf" + a = np.array([]) + af = asdf.AsdfFile({"a": a}) + af.set_array_storage(a, "inline") + af.write_to(fn) + with asdf.open(fn) as af: + np.testing.assert_array_equal(af["a"], a) diff --git a/asdf/core/_converters/ndarray.py b/asdf/core/_converters/ndarray.py index fa21ccd76..dc93289ca 100644 --- a/asdf/core/_converters/ndarray.py +++ b/asdf/core/_converters/ndarray.py @@ -140,15 +140,16 @@ def from_yaml_tree(self, node, tag, ctx): return instance if isinstance(node, dict): - source = node.get("source") - data = node.get("data") - if source and data: + shape = node.get("shape", None) + if "source" in node and "data" in node: msg = "Both source and data may not be provided at the same time" raise ValueError(msg) - if data: - source = data - shape = node.get("shape", None) - byteorder = sys.byteorder if data is not None else node["byteorder"] + if "source" in node: + source = node["source"] + byteorder = node["byteorder"] + else: + source = node["data"] + byteorder = sys.byteorder dtype = asdf_datatype_to_numpy_dtype(node["datatype"], byteorder) if "datatype" in node else None offset = node.get("offset", 0) strides = node.get("strides", None) From f29597e4cd384c09ac6a3f7b10a11c738ad9a70d Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 4 May 2023 13:38:09 -0400 Subject: [PATCH 047/154] fix config doctests --- asdf/config.py | 4 ++-- docs/asdf/config.rst | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/asdf/config.py b/asdf/config.py index fd82629c4..9789c3913 100644 --- a/asdf/config.py +++ b/asdf/config.py @@ -446,22 +446,22 @@ def __repr__(self): " all_array_storage: {}\n" " all_array_compression: {}\n" " all_array_compression_kwargs: {}\n" + " convert_unknown_ndarray_subclasses: {}\n" " default_version: {}\n" " io_block_size: {}\n" " legacy_fill_schema_defaults: {}\n" " validate_on_read: {}\n" - " convert_unknown_ndarray_subclasses: {}\n" ">" ).format( self.array_inline_threshold, self.all_array_storage, self.all_array_compression, self.all_array_compression_kwargs, + self.convert_unknown_ndarray_subclasses, self.default_version, self.io_block_size, self.legacy_fill_schema_defaults, self.validate_on_read, - self.convert_unknown_ndarray_subclasses, ) diff --git a/docs/asdf/config.rst b/docs/asdf/config.rst index 30c889fa0..ab3ed3f96 100644 --- a/docs/asdf/config.rst +++ b/docs/asdf/config.rst @@ -39,6 +39,7 @@ the currently active config: all_array_storage: None all_array_compression: input all_array_compression_kwargs: None + convert_unknown_ndarray_subclasses: True default_version: 1.5.0 io_block_size: -1 legacy_fill_schema_defaults: True @@ 
-62,6 +63,7 @@ This allows for short-lived configuration changes that do not impact other code: all_array_storage: None all_array_compression: input all_array_compression_kwargs: None + convert_unknown_ndarray_subclasses: True default_version: 1.5.0 io_block_size: -1 legacy_fill_schema_defaults: True @@ -73,6 +75,7 @@ This allows for short-lived configuration changes that do not impact other code: all_array_storage: None all_array_compression: input all_array_compression_kwargs: None + convert_unknown_ndarray_subclasses: True default_version: 1.5.0 io_block_size: -1 legacy_fill_schema_defaults: True From c3404bffc438e422da64366558df1355c7894d64 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 4 May 2023 15:46:55 -0400 Subject: [PATCH 048/154] catch old FutureWarning from numpy in tests --- asdf/_tests/_helpers.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/asdf/_tests/_helpers.py b/asdf/_tests/_helpers.py index f24386f17..a988f619c 100644 --- a/asdf/_tests/_helpers.py +++ b/asdf/_tests/_helpers.py @@ -150,7 +150,12 @@ def recurse(old, new): assert old.ra == new.ra assert old.dec == new.dec elif all([isinstance(obj, (np.ndarray, asdf.tags.core.NDArrayType)) for obj in (old, new)]): - np.testing.assert_array_equal(old, new) + with warnings.catch_warnings(): + # The oldest deps job tests against versions of numpy where this + # testing function raised a FutureWarning but still functioned + # as expected + warnings.filterwarnings("ignore", category=FutureWarning) + np.testing.assert_array_equal(old, new) else: assert old == new From 933619ea7eb0f2af69e557af423ec4fa29c5367b Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 4 May 2023 16:30:50 -0400 Subject: [PATCH 049/154] seek to 0 before update shortcuts --- asdf/_tests/test_array_blocks.py | 6 +++++- asdf/asdf.py | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py index 4f73cfb30..bfdf5a869 100644 --- a/asdf/_tests/test_array_blocks.py +++ b/asdf/_tests/test_array_blocks.py @@ -804,9 +804,10 @@ def assert_result(ff): ) # then reuse the file to check update + arr2 = np.ones((8, 8)) * 42 with asdf.open(fn, mode="rw") as ff2: assert_result(ff2) - arr2 = np.ones((8, 8)) * 42 + np.testing.assert_array_equal(arr1, ff2["array"]) ff2["array"] = arr2 ff2.update( all_array_storage=all_array_storage, @@ -814,6 +815,9 @@ def assert_result(ff): compression_kwargs=compression_kwargs, ) assert_result(ff2) + with asdf.open(fn) as ff3: + assert_result(ff3) + np.testing.assert_array_equal(arr2, ff3["array"]) def test_remove_blocks(tmp_path): diff --git a/asdf/asdf.py b/asdf/asdf.py index 826e689f7..4d87623e3 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -1094,6 +1094,7 @@ def update( # TODO shortcuts for # - no read blocks if len(self._blocks.blocks) == 0 and self._blocks._streamed_block is None: + self._fd.seek(0) self.write_to(self._fd) if self._fd.can_memmap(): self._fd.close_memmap() @@ -1101,6 +1102,7 @@ def update( return # - all external if config.all_array_storage == "external": + self._fd.seek(0) self.write_to(self._fd) if self._fd.can_memmap(): self._fd.close_memmap() From 0dada44702bdd7fa570ecd3723c4560697bf6313 Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 5 May 2023 12:28:47 -0400 Subject: [PATCH 050/154] allow file truncation on windows --- asdf/generic_io.py | 38 +++++--------------------------------- 1 file changed, 5 insertions(+), 33 deletions(-) diff --git a/asdf/generic_io.py b/asdf/generic_io.py index 1907946a9..b7be72309 
100644 --- a/asdf/generic_io.py +++ b/asdf/generic_io.py @@ -738,41 +738,13 @@ def fast_forward(self, size): self.seek(0, SEEK_END) self.seek(size, SEEK_CUR) - if sys.platform.startswith("win"): # pragma: no cover - - def truncate(self, size=None): - # ftruncate doesn't work on an open file in Windows. The - # best we can do is clear the extra bytes or add extra - # bytes to the end. - if size is None: - size = self.tell() - - self.seek(0, SEEK_END) - file_size = self.tell() - if size < file_size: - self.seek(size, SEEK_SET) - nbytes = file_size - size - elif size > file_size: - nbytes = size - file_size - else: - nbytes = 0 - - block = b"\0" * self.block_size - while nbytes > 0: - self.write(block[: min(nbytes, self.block_size)]) - nbytes -= self.block_size - + def truncate(self, size=None): + if size is None: + self._fd.truncate() + else: + self._fd.truncate(size) self.seek(size, SEEK_SET) - else: - - def truncate(self, size=None): - if size is None: - self._fd.truncate() - else: - self._fd.truncate(size) - self.seek(size, SEEK_SET) - class RealFile(RandomAccessFile): """ From 169482f96ab8c1ce1a06713c6eead29a633c4460 Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 5 May 2023 12:51:41 -0400 Subject: [PATCH 051/154] update array equality testing in tests update to use old checks for arrays with fields --- asdf/_tests/_helpers.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/asdf/_tests/_helpers.py b/asdf/_tests/_helpers.py index a988f619c..c2aed6c44 100644 --- a/asdf/_tests/_helpers.py +++ b/asdf/_tests/_helpers.py @@ -155,7 +155,14 @@ def recurse(old, new): # testing function raised a FutureWarning but still functioned # as expected warnings.filterwarnings("ignore", category=FutureWarning) - np.testing.assert_array_equal(old, new) + if old.dtype.fields: + if not new.dtype.fields: + msg = "arrays not equal" + raise AssertionError(msg) + for a, b in zip(old, new): + np.testing.assert_array_equal(a, b) + else: + np.testing.assert_array_equal(old, new) else: assert old == new From 7259e01237f88bc4ac376d4af241b4a26dd5524d Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 5 May 2023 13:09:33 -0400 Subject: [PATCH 052/154] reset file position after memmapping on windows --- asdf/generic_io.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/asdf/generic_io.py b/asdf/generic_io.py index b7be72309..1c7ede99b 100644 --- a/asdf/generic_io.py +++ b/asdf/generic_io.py @@ -777,8 +777,10 @@ def memmap_array(self, offset, size): acc = mmap.ACCESS_WRITE if "w" in self._mode else mmap.ACCESS_READ self._fd.seek(0, 2) nbytes = self._fd.tell() - self._fd.seek(loc, 0) self._mmap = mmap.mmap(self._fd.fileno(), nbytes, access=acc) + # on windows mmap seeks to the start of the file so return the file + # pointer to this previous location + self._fd.seek(loc, 0) return np.ndarray.__new__(np.memmap, shape=size, offset=offset, dtype="uint8", buffer=self._mmap) def close_memmap(self): From 18d0bf790d4e36158db478882bb1f617ea9cd107 Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 5 May 2023 13:52:32 -0400 Subject: [PATCH 053/154] attempt to fix array equality for older numpy --- asdf/_tests/_helpers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/asdf/_tests/_helpers.py b/asdf/_tests/_helpers.py index c2aed6c44..f2d92c8ff 100644 --- a/asdf/_tests/_helpers.py +++ b/asdf/_tests/_helpers.py @@ -159,10 +159,10 @@ def recurse(old, new): if not new.dtype.fields: msg = "arrays not equal" raise AssertionError(msg) - for a, b in zip(old, new): - 
np.testing.assert_array_equal(a, b) + for f in old.dtype.fields: + np.testing.assert_array_equal(old[f], new[f]) else: - np.testing.assert_array_equal(old, new) + np.testing.assert_array_equal(old.__array__(), new.__array__()) else: assert old == new From 4c388f5522c53fbcb97d996123c213bc514233ba Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 5 May 2023 15:54:41 -0400 Subject: [PATCH 054/154] fix reading for inline structured arrays fixes #1540 --- asdf/_tests/_issues/test_1540.py | 18 ++++++++++++++++++ asdf/tags/core/ndarray.py | 3 ++- 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 asdf/_tests/_issues/test_1540.py diff --git a/asdf/_tests/_issues/test_1540.py b/asdf/_tests/_issues/test_1540.py new file mode 100644 index 000000000..f739db7f4 --- /dev/null +++ b/asdf/_tests/_issues/test_1540.py @@ -0,0 +1,18 @@ +import numpy as np + +import asdf + + +def test_1540(tmp_path): + """ + ASDF writes but fails to read inline structured array + + https://github.com/asdf-format/asdf/issues/1540 + """ + x = np.array((0, 1.0, [2, 3]), dtype=[("MINE", "i1"), ("f1", " Date: Mon, 8 May 2023 09:56:59 -0400 Subject: [PATCH 055/154] add xfailed test for 1539 --- asdf/_tests/_issues/test_1539.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 asdf/_tests/_issues/test_1539.py diff --git a/asdf/_tests/_issues/test_1539.py b/asdf/_tests/_issues/test_1539.py new file mode 100644 index 000000000..bcea762c1 --- /dev/null +++ b/asdf/_tests/_issues/test_1539.py @@ -0,0 +1,22 @@ +import io + +import pytest + +import asdf + + +@pytest.mark.xfail(reason="Fix will require more major changes to generic_io") +def test_1539(): + """ + Seek and read from closed MemoryIO + + https://github.com/asdf-format/asdf/issues/1539 + """ + b = io.BytesIO() + b.write(b"\0" * 10) + b.seek(0) + f = asdf.generic_io.get_file(b) + f.close() + with pytest.raises(IOError, match="I/O operation on closed file."): + f.read_into_array(10) + assert b.tell() == 0 From 43c57b8754799321b06d6b5cb0e6a381b5bea59e Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 8 May 2023 09:58:11 -0400 Subject: [PATCH 056/154] increase number of tries to get reused memory --- asdf/_tests/_block/test_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asdf/_tests/_block/test_store.py b/asdf/_tests/_block/test_store.py index 469ced244..29a6f83d5 100644 --- a/asdf/_tests/_block/test_store.py +++ b/asdf/_tests/_block/test_store.py @@ -99,7 +99,7 @@ def test_get_memory_reused(): s.assign_object(f, v) fid = id(f) del f - for _ in range(100): + for _ in range(1000): f = Foo() if id(f) == fid: break @@ -115,7 +115,7 @@ def test_set_memory_reused(): s.assign_object(f, v) fid = id(f) del f - for _ in range(100): + for _ in range(1000): f = Foo() if id(f) == fid: break From 717363194230d420d2a35f281863686c96757f89 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 8 May 2023 14:11:02 -0400 Subject: [PATCH 057/154] fix validation of checksums on read fixes #1541 --- asdf/_block/reader.py | 34 ++++++++++++++++++++++--------- asdf/_tests/_block/test_reader.py | 5 +++-- asdf/_tests/_issues/test_1541.py | 34 +++++++++++++++++++++++++++++++ asdf/asdf.py | 8 +++++--- 4 files changed, 66 insertions(+), 15 deletions(-) create mode 100644 asdf/_tests/_issues/test_1541.py diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py index 274bbd17c..43f9187d6 100644 --- a/asdf/_block/reader.py +++ b/asdf/_block/reader.py @@ -6,7 +6,7 @@ class ReadBlock: - def __init__(self, offset, fd, memmap, lazy_load, header=None, 
data_offset=None, data=None): + def __init__(self, offset, fd, memmap, lazy_load, validate_checksum, header=None, data_offset=None, data=None): self.offset = offset # after magic self._fd = weakref.ref(fd) self._header = header @@ -16,6 +16,7 @@ def __init__(self, offset, fd, memmap, lazy_load, header=None, data_offset=None, # TODO alternative to passing these down? self.memmap = memmap self.lazy_load = lazy_load + self.validate_checksum = validate_checksum if not lazy_load: self.load() @@ -41,8 +42,17 @@ def data(self): if not self.loaded: self.load() if callable(self._data): - return self._data() - return self._data + data = self._data() + else: + data = self._data + if self.validate_checksum: + checksum = bio.calculate_block_checksum(data) + if checksum != self._header["checksum"]: + msg = f"Block at {self.offset} does not match given checksum" + raise ValueError(msg) + # only validate data the first time it's read + self.validate_checksum = False + return data @property def cached_data(self): @@ -57,7 +67,7 @@ def header(self): return self._header -def read_blocks_serially(fd, memmap=False, lazy_load=False, after_magic=False): +def read_blocks_serially(fd, memmap=False, lazy_load=False, validate_checksums=False, after_magic=False): blocks = [] buff = b"" while True: @@ -77,7 +87,11 @@ def read_blocks_serially(fd, memmap=False, lazy_load=False, after_magic=False): if after_magic or buff == constants.BLOCK_MAGIC: # this is another block offset, header, data_offset, data = bio.read_block(fd, memmap=memmap, lazy_load=lazy_load) - blocks.append(ReadBlock(offset, fd, memmap, lazy_load, header=header, data_offset=data_offset, data=data)) + blocks.append( + ReadBlock( + offset, fd, memmap, lazy_load, validate_checksums, header=header, data_offset=data_offset, data=data + ) + ) if blocks[-1].header["flags"] & constants.BLOCK_FLAG_STREAMED: # a file can only have 1 streamed block and it must be at the end so we # can stop looking for more blocks @@ -95,10 +109,10 @@ def read_blocks_serially(fd, memmap=False, lazy_load=False, after_magic=False): return blocks -def read_blocks(fd, memmap=False, lazy_load=False, after_magic=False): +def read_blocks(fd, memmap=False, lazy_load=False, validate_checksums=False, after_magic=False): if not lazy_load or not fd.seekable(): # load all blocks serially - return read_blocks_serially(fd, memmap, lazy_load, after_magic) + return read_blocks_serially(fd, memmap, lazy_load, validate_checksums, after_magic) # try to find block index starting_offset = fd.tell() @@ -106,7 +120,7 @@ def read_blocks(fd, memmap=False, lazy_load=False, after_magic=False): if index_offset is None: # if failed, load all blocks serially fd.seek(starting_offset) - return read_blocks_serially(fd, memmap, lazy_load, after_magic) + return read_blocks_serially(fd, memmap, lazy_load, validate_checksums, after_magic) # setup empty blocks try: @@ -114,9 +128,9 @@ def read_blocks(fd, memmap=False, lazy_load=False, after_magic=False): except OSError: # failed to read block index, fall back to serial reading fd.seek(starting_offset) - return read_blocks_serially(fd, memmap, lazy_load, after_magic) + return read_blocks_serially(fd, memmap, lazy_load, validate_checksums, after_magic) # skip magic for each block - blocks = [ReadBlock(offset + 4, fd, memmap, lazy_load) for offset in block_index] + blocks = [ReadBlock(offset + 4, fd, memmap, lazy_load, validate_checksums) for offset in block_index] try: # load first and last blocks to check if the index looks correct for index in (0, -1): diff --git 
a/asdf/_tests/_block/test_reader.py b/asdf/_tests/_block/test_reader.py index e08173947..62cce9c1c 100644 --- a/asdf/_tests/_block/test_reader.py +++ b/asdf/_tests/_block/test_reader.py @@ -41,13 +41,14 @@ def check(blocks): @pytest.mark.parametrize("lazy_load", [True, False]) @pytest.mark.parametrize("memmap", [True, False]) @pytest.mark.parametrize("with_index", [True, False]) +@pytest.mark.parametrize("validate_checksums", [True, False]) @pytest.mark.parametrize("padding", [0, 3, 4, 5]) -def test_read(tmp_path, lazy_load, memmap, with_index, padding): +def test_read(tmp_path, lazy_load, memmap, with_index, validate_checksums, padding): fn = tmp_path / "test.bin" n = 5 size = 10 with gen_blocks(fn=fn, n=n, size=size, padding=padding, with_index=with_index) as (fd, check): - r = read_blocks(fd, memmap=memmap, lazy_load=lazy_load) + r = read_blocks(fd, memmap=memmap, lazy_load=lazy_load, validate_checksums=validate_checksums) if lazy_load and with_index: assert r[0].loaded assert r[-1].loaded diff --git a/asdf/_tests/_issues/test_1541.py b/asdf/_tests/_issues/test_1541.py new file mode 100644 index 000000000..fc16c78aa --- /dev/null +++ b/asdf/_tests/_issues/test_1541.py @@ -0,0 +1,34 @@ +import numpy +import pytest + +import asdf + + +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("include_block_index", [True, False]) +@pytest.mark.parametrize("index", [0, 1, 2]) +def test_1541(tmp_path, index, include_block_index, lazy_load): + """ + Block checksums are only checked for first block if a block index is present + + https://github.com/asdf-format/asdf/issues/1541 + """ + fn = tmp_path / "test.asdf" + arrs = [numpy.zeros(1) + i for i in range(3)] + asdf.AsdfFile({"arrs": arrs}).write_to(fn, include_block_index=include_block_index) + + # read file to get block offset + with asdf.open(fn, lazy_load=False) as af: + checksum_offset = af._blocks.blocks[index].offset + 2 + 4 + 4 + 8 + 8 + 8 + + # now modify the block checksum + with open(fn, "r+b") as f: + f.seek(checksum_offset) + v = f.read(1)[0] + f.seek(checksum_offset) + f.write(bytes([v + 1])) + + # and check that it raises an error + with pytest.raises(ValueError, match=r".* does not match given checksum"): + with asdf.open(fn, lazy_load=lazy_load, validate_checksums=True) as af: + sum([a[0] for a in af["arrs"]]) diff --git a/asdf/asdf.py b/asdf/asdf.py index 4d87623e3..dbf758efc 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -853,11 +853,13 @@ def _open_asdf( # after the blocks have been read tree = yamlutil.load_tree(reader) # has_blocks = fd.seek_until(constants.BLOCK_MAGIC, 4, include=True, exception=False) - read_blocks = block_reader.read_blocks(fd, self._blocks.memmap, self._blocks.lazy_load) + read_blocks = block_reader.read_blocks( + fd, self._blocks.memmap, self._blocks.lazy_load, validate_checksums + ) elif yaml_token == constants.BLOCK_MAGIC: # this file has only blocks read_blocks = block_reader.read_blocks( - fd, self._blocks.memmap, self._blocks.lazy_load, after_magic=True + fd, self._blocks.memmap, self._blocks.lazy_load, validate_checksums, after_magic=True ) elif yaml_token != b"": msg = "ASDF file appears to contain garbage after header." 
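As background for the checksum handling added in this patch: the 16-byte checksum stored in each block header is compared against a digest recomputed from the (decompressed) block data the first time that data is read. A minimal, self-contained sketch of that comparison, assuming the MD5-based checksum asdf uses for block headers (the helper name below is illustrative, not an asdf API):

    import hashlib

    import numpy as np


    def block_checksum_matches(data, header_checksum):
        # recompute the digest of the block data; MD5 is used purely as a checksum here
        digest = hashlib.new("md5")  # noqa: S324
        digest.update(data)
        return digest.digest() == header_checksum


    # hypothetical usage with a 1-d uint8 array standing in for block data
    data = np.arange(10, dtype="uint8")
    stored = hashlib.new("md5", data).digest()  # noqa: S324
    assert block_checksum_matches(data, stored)
    assert not block_checksum_matches(data, b"\0" * 16)

When the recomputed digest differs, ReadBlock.data raises a ValueError naming the block offset, which is what test_1541 exercises by flipping a single checksum byte in the file.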
@@ -1204,7 +1206,7 @@ def update( # we have to be lazy here as the current memmap is invalid new_read_block = block_reader.ReadBlock( - offset + 4, self._fd, memmap, True, header=header, data=data + offset + 4, self._fd, memmap, True, False, header=header, data=data ) new_read_blocks.append_block(new_read_block) new_index = len(new_read_blocks) - 1 From e7018add736148b49df1d65bad7413434c425a04 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 8 May 2023 14:40:15 -0400 Subject: [PATCH 058/154] add test for and fix asdf.stream deprecation also clean up old unused deprecation related code and test configuration --- asdf/__init__.py | 18 ------------------ asdf/_tests/test_deprecated.py | 9 +++++++-- asdf/conftest.py | 2 +- asdf/stream.py | 9 +++++++++ 4 files changed, 17 insertions(+), 21 deletions(-) create mode 100644 asdf/stream.py diff --git a/asdf/__init__.py b/asdf/__init__.py index 9e13d4185..6950a4bdf 100644 --- a/asdf/__init__.py +++ b/asdf/__init__.py @@ -25,21 +25,3 @@ from .exceptions import ValidationError from .tags.core import IntegerType, Stream from .tags.core.external_reference import ExternalArrayReference - - -def __getattr__(name): - if name == "stream": - import warnings - - import asdf.tags.core.stream - from asdf.exceptions import AsdfDeprecationWarning - - warnings.warn( - "asdf.stream is deprecated. Please use asdf.tags.core.stream", - AsdfDeprecationWarning, - ) - - return asdf.tags.core.stream - - msg = f"module {__name__!r} has no attribute {name!r}" - raise AttributeError(msg) diff --git a/asdf/_tests/test_deprecated.py b/asdf/_tests/test_deprecated.py index d51b78c04..45df227f9 100644 --- a/asdf/_tests/test_deprecated.py +++ b/asdf/_tests/test_deprecated.py @@ -9,8 +9,6 @@ from asdf._types import CustomType from asdf.exceptions import AsdfDeprecationWarning -from .test_entry_points import _monkeypatch_entry_points, mock_entry_points # noqa: F401 - def test_custom_type_warning(): with pytest.warns(AsdfDeprecationWarning, match=r"^.* subclasses the deprecated CustomType .*$"): @@ -29,3 +27,10 @@ def test_asdf_type_format_tag(): with pytest.warns(AsdfDeprecationWarning, match="asdf.types.format_tag is deprecated"): asdf._types.format_tag asdf.testing.helpers.format_tag + + +def test_asdf_stream_deprecation(): + with pytest.warns(AsdfDeprecationWarning, match="asdf.stream is deprecated"): + if "asdf.stream" in sys.modules: + del sys.modules["asdf.stream"] + import asdf.stream # noqa: F401 diff --git a/asdf/conftest.py b/asdf/conftest.py index f76e1778a..0fc13eb20 100644 --- a/asdf/conftest.py +++ b/asdf/conftest.py @@ -1,4 +1,4 @@ # We ignore these files because these modules create deprecation warnings on # import. When warnings are turned into errors this will completely prevent # test collection -collect_ignore = ["asdftypes.py", "fits_embed.py", "resolver.py", "type_index.py", "types.py"] +collect_ignore = ["stream.py"] diff --git a/asdf/stream.py b/asdf/stream.py new file mode 100644 index 000000000..b66952c65 --- /dev/null +++ b/asdf/stream.py @@ -0,0 +1,9 @@ +import warnings + +from .exceptions import AsdfDeprecationWarning +from .tags.core.stream import Stream # noqa: F401 + +warnings.warn( + "asdf.stream is deprecated. 
Please use asdf.tags.core.stream", + AsdfDeprecationWarning, +) From 5af81982280614eb526365dfd84db8c5984bf42f Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 8 May 2023 15:08:50 -0400 Subject: [PATCH 059/154] remove fd from generate_write_header args instead just pass in the filesystem block size --- asdf/_block/io.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/asdf/_block/io.py b/asdf/_block/io.py index dda2af815..9f19f8a6c 100644 --- a/asdf/_block/io.py +++ b/asdf/_block/io.py @@ -8,6 +8,7 @@ from asdf import compression as mcompression from asdf import constants, util +from asdf.config import get_config BLOCK_HEADER = util.BinaryStruct( [ @@ -66,8 +67,6 @@ def read_block_data(fd, header, offset=None, memmap=False): else: offset = fd.tell() - # load and possibly decompress the data - # read the raw bytes if header["flags"] & constants.BLOCK_FLAG_STREAMED: used_size = -1 else: @@ -76,7 +75,7 @@ def read_block_data(fd, header, offset=None, memmap=False): # if no compression, just read data compression = mcompression.validate(header["compression"]) if compression: - # the old code ignored memmapping for compressed data + # compressed data will not be memmapped data = mcompression.decompress(fd, used_size, header["data_size"], compression) fd.fast_forward(header["allocated_size"] - header["used_size"]) else: @@ -121,14 +120,12 @@ def callback(): return offset, header, data_offset, data -def validate_write_data(data): +def generate_write_header( + data, stream=False, compression_kwargs=None, padding=False, fs_block_size=None, **header_kwargs +): if data.ndim != 1 or data.dtype != "uint8": msg = "Data must be of ndim==1 and dtype==uint8" raise ValueError(msg) - - -def generate_write_header(fd, data, stream=False, compression_kwargs=None, padding=False, **header_kwargs): - validate_write_data(data) if stream: header_kwargs["flags"] = header_kwargs.get("flags", 0) | constants.BLOCK_FLAG_STREAMED header_kwargs["data_size"] = 0 @@ -151,7 +148,9 @@ def generate_write_header(fd, data, stream=False, compression_kwargs=None, paddi header_kwargs["allocated_size"] = 0 else: header_kwargs["used_size"] = used_size - padding = util.calculate_padding(used_size, padding, fd.block_size) + if fs_block_size is None: + fs_block_size = get_config().io_block_size + padding = util.calculate_padding(used_size, padding, fs_block_size) header_kwargs["allocated_size"] = header_kwargs.get("allocated_size", used_size + padding) if header_kwargs["allocated_size"] < header_kwargs["used_size"]: @@ -166,8 +165,9 @@ def generate_write_header(fd, data, stream=False, compression_kwargs=None, paddi def write_block(fd, data, offset=None, stream=False, compression_kwargs=None, padding=False, **header_kwargs): - # TODO fd is only used for padding calculation, bring this out - header, buff, padding_bytes = generate_write_header(fd, data, stream, compression_kwargs, padding, **header_kwargs) + header, buff, padding_bytes = generate_write_header( + data, stream, compression_kwargs, padding, fd.block_size, **header_kwargs + ) if offset is not None: if fd.seekable(): From 3a3c5bb20b60587d6014f68ae1d352f90936c20d Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 8 May 2023 15:31:24 -0400 Subject: [PATCH 060/154] compute len of block magic from value instead of hard coding --- asdf/_block/io.py | 5 +---- asdf/_block/key.py | 1 - asdf/_block/reader.py | 12 +++++++----- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/asdf/_block/io.py b/asdf/_block/io.py index 9f19f8a6c..619a785a3 
100644 --- a/asdf/_block/io.py +++ b/asdf/_block/io.py @@ -188,7 +188,6 @@ def write_block(fd, data, offset=None, stream=False, compression_kwargs=None, pa def candidate_offsets(min_offset, max_offset, block_size): offset = (max_offset // block_size) * block_size if offset == max_offset: - # don't include the max_offset offset -= block_size while offset > min_offset: yield offset @@ -217,9 +216,7 @@ def find_block_index(fd, min_offset=None, max_offset=None): return None break buff = buff[: len(pattern)] - if block_index_offset is not None and block_index_offset < max_offset: - return block_index_offset - return None + return block_index_offset def read_block_index(fd, offset=None): diff --git a/asdf/_block/key.py b/asdf/_block/key.py index 5615d7ea6..b7463d5bb 100644 --- a/asdf/_block/key.py +++ b/asdf/_block/key.py @@ -20,7 +20,6 @@ def is_valid(self): r = self._ref() if r is None: return False - del r return True def __hash__(self): diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py index 43f9187d6..64e591607 100644 --- a/asdf/_block/reader.py +++ b/asdf/_block/reader.py @@ -70,17 +70,18 @@ def header(self): def read_blocks_serially(fd, memmap=False, lazy_load=False, validate_checksums=False, after_magic=False): blocks = [] buff = b"" + magic_len = len(constants.BLOCK_MAGIC) while True: # the expectation is that this will begin PRIOR to the block magic # read 4 bytes if not after_magic: - buff += fd.read(4 - len(buff)) - if len(buff) < 4: + buff += fd.read(magic_len - len(buff)) + if len(buff) < magic_len: # we are done, there are no more blocks and no index # TODO error? we shouldn't have extra bytes, the old code allows this break - if buff == constants.INDEX_HEADER[:4]: + if buff == constants.INDEX_HEADER[:magic_len]: # we hit the block index, which is not useful here break @@ -130,12 +131,13 @@ def read_blocks(fd, memmap=False, lazy_load=False, validate_checksums=False, aft fd.seek(starting_offset) return read_blocks_serially(fd, memmap, lazy_load, validate_checksums, after_magic) # skip magic for each block - blocks = [ReadBlock(offset + 4, fd, memmap, lazy_load, validate_checksums) for offset in block_index] + magic_len = len(constants.BLOCK_MAGIC) + blocks = [ReadBlock(offset + magic_len, fd, memmap, lazy_load, validate_checksums) for offset in block_index] try: # load first and last blocks to check if the index looks correct for index in (0, -1): fd.seek(block_index[index]) - buff = fd.read(4) + buff = fd.read(magic_len) if buff != constants.BLOCK_MAGIC: msg = "Invalid block magic" raise OSError(msg) From 6fc6a4eaef0ddf27779314309bb4713c1b04f370 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 8 May 2023 16:26:30 -0400 Subject: [PATCH 061/154] fix write for non-seekable files fixes #1542 --- asdf/_block/writer.py | 14 +++++++++++--- asdf/_tests/_issues/test_1542.py | 22 ++++++++++++++++++++++ 2 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 asdf/_tests/_issues/test_1542.py diff --git a/asdf/_block/writer.py b/asdf/_block/writer.py index e362b4c89..ad21dd7e0 100644 --- a/asdf/_block/writer.py +++ b/asdf/_block/writer.py @@ -29,7 +29,11 @@ def write_blocks(fd, blocks, padding=False, streamed_block=None, write_index=Tru offsets = [] headers = [] for blk in blocks: - offsets.append(fd.tell()) + if fd.seekable(): + offset = fd.tell() + else: + offset = None + offsets.append(offset) fd.write(constants.BLOCK_MAGIC) headers.append( bio.write_block( @@ -41,9 +45,13 @@ def write_blocks(fd, blocks, padding=False, streamed_block=None, write_index=Tru ) ) if 
streamed_block is not None: - offsets.append(fd.tell()) + if fd.seekable(): + offset = fd.tell() + else: + offset = None + offsets.append(offset) fd.write(constants.BLOCK_MAGIC) headers.append(bio.write_block(fd, streamed_block.data_bytes, stream=True)) - elif len(blocks) and write_index: + elif len(offsets) and write_index and fd.seekable(): bio.write_block_index(fd, offsets) return offsets, headers diff --git a/asdf/_tests/_issues/test_1542.py b/asdf/_tests/_issues/test_1542.py new file mode 100644 index 000000000..bce7a375d --- /dev/null +++ b/asdf/_tests/_issues/test_1542.py @@ -0,0 +1,22 @@ +import os + +import numpy as np + +import asdf + + +def test_1542(): + """ + ASDF fails to write blocks to non-seekable file + + https://github.com/asdf-format/asdf/issues/1542 + """ + r, w = os.pipe() + with os.fdopen(r, "rb") as rf: + with os.fdopen(w, "wb") as wf: + arrs = [np.zeros(1, dtype="uint8") + i for i in range(10)] + af = asdf.AsdfFile({"arrs": arrs}) + af.write_to(wf) + with asdf.open(rf) as raf: + for a, ra in zip(arrs, raf["arrs"]): + np.testing.assert_array_equal(a, ra) From 5141cc9c4eb1a45f84e63011c5ea3fd82daabde5 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 8 May 2023 17:29:48 -0400 Subject: [PATCH 062/154] mock id for _block.store testing --- asdf/_tests/_block/test_key.py | 16 +++----------- asdf/_tests/_block/test_store.py | 38 ++++++++++++++++++-------------- 2 files changed, 25 insertions(+), 29 deletions(-) diff --git a/asdf/_tests/_block/test_key.py b/asdf/_tests/_block/test_key.py index 2c9934252..a284816c6 100644 --- a/asdf/_tests/_block/test_key.py +++ b/asdf/_tests/_block/test_key.py @@ -37,20 +37,10 @@ def test_is_valid(): assert not bk.is_valid() -def test_memory_reuse(): +def test_same_class(): f = Foo() bk = Key(f) - fid = id(f) del f - objs = [] - for _ in range(100): - f = Foo() - objs.append(f) - if fid == id(f): - break - else: - raise AssertionError("Failed to find reused memory address") - - assert fid == id(f) + f2 = Foo() assert not bk.is_valid() - assert not bk.matches(f) + assert not bk.matches(f2) diff --git a/asdf/_tests/_block/test_store.py b/asdf/_tests/_block/test_store.py index 29a6f83d5..05ede0faf 100644 --- a/asdf/_tests/_block/test_store.py +++ b/asdf/_tests/_block/test_store.py @@ -1,3 +1,5 @@ +from unittest.mock import patch + import pytest from asdf._block.key import Key @@ -99,13 +101,15 @@ def test_get_memory_reused(): s.assign_object(f, v) fid = id(f) del f - for _ in range(1000): - f = Foo() - if id(f) == fid: - break - else: - raise AssertionError("Failed to trigger memory reuse") - assert s.lookup_by_object(f) is None + f2 = Foo() + + def mock_id(obj): + if obj is f2: + return fid + return id(obj) + + with patch("asdf._block.store.id", mock_id): + assert s.lookup_by_object(f2) is None def test_set_memory_reused(): @@ -115,15 +119,17 @@ def test_set_memory_reused(): s.assign_object(f, v) fid = id(f) del f - for _ in range(1000): - f = Foo() - if id(f) == fid: - break - else: - raise AssertionError("Failed to trigger memory reuse") - nv = 26 - s.assign_object(f, nv) - assert s.lookup_by_object(f) is nv + f2 = Foo() + + def mock_id(obj): + if obj is f2: + return fid + return id(obj) + + with patch("asdf._block.store.id", mock_id): + nv = 26 + s.assign_object(f2, nv) + assert s.lookup_by_object(f2) is nv def test_cleanup(): From e4fb1811c82a589c9ff8bddcba5898c53274f897 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 10 May 2023 10:50:57 -0400 Subject: [PATCH 063/154] add _Operations for SerializationContext to make the context aware of 
the state (serialization, deserialization, etc) so that it can adapt to handle blocks differently. --- asdf/_block/key.py | 15 ++- asdf/_block/manager.py | 7 -- asdf/_serialization_context.py | 217 +++++++++++++++++++++++++++++++++ asdf/asdf.py | 2 +- asdf/yamlutil.py | 6 +- 5 files changed, 234 insertions(+), 13 deletions(-) create mode 100644 asdf/_serialization_context.py diff --git a/asdf/_block/key.py b/asdf/_block/key.py index b7463d5bb..6ec7d8038 100644 --- a/asdf/_block/key.py +++ b/asdf/_block/key.py @@ -1,6 +1,10 @@ import weakref +class UndefinedRef: + pass + + class Key: _next = 0 @@ -10,13 +14,20 @@ def _next_key(cls): cls._next += 1 return key - def __init__(self, obj, key=None): + def __init__(self, obj=None, key=None): if key is None: key = Key._next_key() self._key = key + self._ref = UndefinedRef + if obj is not None: + self.assign_object(obj) + + def assign_object(self, obj): self._ref = weakref.ref(obj) def is_valid(self): + if self._ref is UndefinedRef: + return False r = self._ref() if r is None: return False @@ -26,6 +37,8 @@ def __hash__(self): return self._key def matches(self, obj): + if self._ref is UndefinedRef: + return False r = self._ref() if r is None: return False diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 3d57a4eeb..5433e7b27 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -20,7 +20,6 @@ class ReadBlocks(store.LinearStore): def set_blocks(self, blocks): self._items = blocks - # TODO should this invalidate the associations? def append_block(self, block): self._items.append(block) @@ -142,9 +141,6 @@ def _clear_write(self): def _write_external_blocks(self): from asdf import AsdfFile - if not len(self._external_write_blocks): - return - if self._write_fd.uri is None: raise ValueError("Can't write external blocks, since URI of main file is unknown.") @@ -156,9 +152,6 @@ def _write_external_blocks(self): write_blocks(f, [blk]) def make_write_block(self, data, options, obj): - # if we're not actually writing just return a junk index - # if self._write_fd is None: - # return constants.MAX_BLOCKS + 1 if options.storage_type == "external": for index, blk in enumerate(self._external_write_blocks): if blk._data is data: diff --git a/asdf/_serialization_context.py b/asdf/_serialization_context.py new file mode 100644 index 000000000..b8abe0a16 --- /dev/null +++ b/asdf/_serialization_context.py @@ -0,0 +1,217 @@ +import contextlib +import weakref + +from ._block.key import Key as BlockKey +from ._block.options import Options as BlockOptions +from ._helpers import validate_version +from .extension import ExtensionProxy + + +class SerializationContext: + """ + Container for parameters of the current (de)serialization. + """ + + def __init__(self, version, extension_manager, url, blocks): + self._version = validate_version(version) + self._extension_manager = extension_manager + self._url = url + self._blocks = blocks + + self.__extensions_used = set() + + @property + def url(self): + """ + The URL (if any) of the file being read or written. + + Used to compute relative locations of external files referenced by this + ASDF file. The URL will not exist in some cases (e.g. when the file is + written to an `io.BytesIO`). + + Returns + -------- + str or None + """ + return self._url + + @property + def version(self): + """ + Get the ASDF Standard version. + + Returns + ------- + str + """ + return self._version + + @property + def extension_manager(self): + """ + Get the ExtensionManager for enabled extensions. 
+ + Returns + ------- + asdf.extension.ExtensionManager + """ + return self._extension_manager + + def _mark_extension_used(self, extension): + """ + Note that an extension was used when reading or writing the file. + + Parameters + ---------- + extension : asdf.extension.Extension + """ + self.__extensions_used.add(ExtensionProxy.maybe_wrap(extension)) + + @property + def _extensions_used(self): + """ + Get the set of extensions that were used when reading or writing the file. + + Returns + ------- + set of asdf.extension.Extension + """ + return self.__extensions_used + + def get_block_data_callback(self, index, key=None): + """ + Generate a callable that when called will read data + from a block at the provided index + + Parameters + ---------- + index : int + Block index + + key : BlockKey + TODO + + Returns + ------- + callback : callable + A callable that when called (with no arguments) returns + the block data as a one dimensional array of uint8 + """ + raise NotImplementedError("abstract") + + def find_available_block_index(self, data_callback, lookup_key=None): + """ + Find the index of an available block to write data. + + This is typically used inside asdf.extension.Converter.to_yaml_tree + + Parameters + ---------- + data_callback: callable + Callable that when called will return data (ndarray) that will + be written to a block. + At the moment, this is only assigned if a new block + is created to avoid circular references during AsdfFile.update. + + lookup_key : hashable, optional + Unique key used to retrieve the index of a block that was + previously allocated or reserved. For ndarrays this is + typically the id of the base ndarray. + + Returns + ------- + block_index: int + Index of the block where data returned from data_callback + will be written. 
+ """ + raise NotImplementedError("abstract") + + def generate_block_key(self): + raise NotImplementedError("abstract") + + @contextlib.contextmanager + def _serialization(self, obj): + with _Serialization(self, obj) as op: + yield op + + @contextlib.contextmanager + def _deserialization(self): + with _Deserialization(self) as op: + yield op + + +class _Operation(SerializationContext): + def __init__(self, ctx): + self._ctx = weakref.ref(ctx) + super().__init__(ctx.version, ctx.extension_manager, ctx.url, ctx._blocks) + + def _mark_extension_used(self, extension): + ctx = self._ctx() + ctx._mark_extension_used(extension) + + @property + def _extensions_used(self): + ctx = self._ctx() + return ctx._extensions_used + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + pass + + +class _Deserialization(_Operation): + def __init__(self, ctx): + super().__init__(ctx) + self._obj = None + self._blk = None + self._cb = None + self._keys = set() + + def __exit__(self, exc_type, exc_value, traceback): + # TODO check exception here + if self._blk is not None: + self._blocks.blocks.assign_object(self._obj, self._blk) + self._blocks._data_callbacks.assign_object(self._obj, self._cb) + for k in self._keys: + k.assign_object(self._obj) + + def get_block_data_callback(self, index, key=None): + blk = self._blocks.blocks[index] + cb = self._blocks._get_data_callback(index) + + if key is None: + if self._blk is not None: + msg = "Converters accessing >1 block must provide a key for each block" + raise OSError(msg) + self._blk = blk + self._cb = cb + else: + self._blocks.blocks.assign_object(key, blk) + self._blocks._data_callbacks.assign_object(key, cb) + + return cb + + def generate_block_key(self): + key = BlockKey() + self._keys.add(key) + return key + + +class _Serialization(_Operation): + def __init__(self, ctx, obj): + super().__init__(ctx) + self._obj = obj + + def find_available_block_index(self, data_callback, lookup_key=None): + if lookup_key is None: + lookup_key = self._obj + return self._blocks.make_write_block(data_callback, BlockOptions(), lookup_key) + + def generate_block_key(self): + return BlockKey(self._obj) + + +class _IgnoreBlocks(_Operation): + pass diff --git a/asdf/asdf.py b/asdf/asdf.py index dbf758efc..5a2143e6e 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -18,8 +18,8 @@ from ._block import writer as block_writer from ._block.manager import Manager as BlockManager from ._block.manager import ReadBlocks -from ._block.options import Options as BlockOptions from ._helpers import validate_version +from ._serialization_context import SerializationContext from .config import config_context, get_config from .exceptions import ( AsdfConversionWarning, diff --git a/asdf/yamlutil.py b/asdf/yamlutil.py index c225c7f93..02644faa2 100644 --- a/asdf/yamlutil.py +++ b/asdf/yamlutil.py @@ -231,8 +231,8 @@ def _convert_obj(obj, subtype=False): # object which will be handled by a different converter while tag is None: converters.add(converter) - with _serialization_context._serialization(obj): - obj = converter.to_yaml_tree(obj, tag, _serialization_context) + with _serialization_context._serialization(obj) as sctx: + obj = converter.to_yaml_tree(obj, tag, sctx) try: converter = extension_manager.get_converter_for_type(type(obj)) except KeyError: @@ -243,8 +243,6 @@ def _convert_obj(obj, subtype=False): msg = "Conversion cycle detected" raise TypeError(msg) tag = converter.select_tag(obj, _serialization_context) - with 
_serialization_context._serialization(obj): - node = converter.to_yaml_tree(obj, tag, _serialization_context) if isinstance(node, GeneratorType): generator = node From 06550cb7a1ede976740f72440d2cd1cc559565ae Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 10 May 2023 11:15:03 -0400 Subject: [PATCH 064/154] don't assign blocks if conversion raised error --- asdf/_serialization_context.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/asdf/_serialization_context.py b/asdf/_serialization_context.py index b8abe0a16..752777aed 100644 --- a/asdf/_serialization_context.py +++ b/asdf/_serialization_context.py @@ -170,7 +170,8 @@ def __init__(self, ctx): self._keys = set() def __exit__(self, exc_type, exc_value, traceback): - # TODO check exception here + if exc_type is not None: + return if self._blk is not None: self._blocks.blocks.assign_object(self._obj, self._blk) self._blocks._data_callbacks.assign_object(self._obj, self._cb) From 6918225e173d937e7c8dfe2f388eeef81af071b4 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 10 May 2023 11:40:57 -0400 Subject: [PATCH 065/154] add multi-block example update converter docs --- asdf/_tests/test_block_converter.py | 41 ++++++++++ docs/asdf/extending/converters.rst | 123 ++++++++++++++++++---------- 2 files changed, 123 insertions(+), 41 deletions(-) diff --git a/asdf/_tests/test_block_converter.py b/asdf/_tests/test_block_converter.py index 76ef4758d..a7f4ce3cf 100644 --- a/asdf/_tests/test_block_converter.py +++ b/asdf/_tests/test_block_converter.py @@ -183,3 +183,44 @@ def test_block_with_callback_removal(tmp_path): af[remove_key] = None af.update() af[check_key] = b.data + + +class MultiBlockData: + def __init__(self, data): + self.data = data + self.keys = [] + + +class MultiBlockConverter(Converter): + tags = ["asdf://somewhere.org/tags/multi_block_data-1.0.0"] + types = [MultiBlockData] + + def to_yaml_tree(self, obj, tag, ctx): + if not len(obj.keys): + obj.keys = [ctx.generate_block_key() for _ in obj.data] + indices = [ctx.find_available_block_index(d, k) for d, k in zip(obj.data, obj.keys)] + return { + "indices": indices, + } + + def from_yaml_tree(self, node, tag, ctx): + indices = node["indices"] + keys = [ctx.generate_block_key() for _ in indices] + cbs = [ctx.get_block_data_callback(i, k) for i, k in zip(indices, keys)] + obj = MultiBlockData([cb() for cb in cbs]) + obj.keys = keys + return obj + + +class MultiBlockExtension(Extension): + tags = ["asdf://somewhere.org/tags/multi_block_data-1.0.0"] + converters = [MultiBlockConverter()] + extension_uri = "asdf://somewhere.org/extensions/multi_block_data-1.0.0" + + +@with_extension(MultiBlockExtension) +def test_mutli_block(): + a = MultiBlockData([np.arange(3, dtype="uint8") for i in range(3)]) + b = helpers.roundtrip_object(a) + assert len(a.data) == len(b.data) + assert [np.testing.assert_array_equal(aa, ab) for aa, ab in zip(a.data, b.data)] diff --git a/docs/asdf/extending/converters.rst b/docs/asdf/extending/converters.rst index 999b0b559..d0e3a3e9e 100644 --- a/docs/asdf/extending/converters.rst +++ b/docs/asdf/extending/converters.rst @@ -351,7 +351,7 @@ Block storage ============= As described above :ref:`extending_converters` can return complex objects that will -be passed to other Converters. If a Converter returns a ndarray, ASDF will recognize this +be passed to other Converters. If a Converter returns a ndarray, asdf will recognize this array and store it in an ASDF block. This is the easiest and preferred means of storing data in ASDF blocks. 
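Before the block-level API is introduced below, the simplest case is worth restating in code: a converter that just returns the ndarray in its tree and lets asdf manage block storage. A short sketch under that assumption (the ``Ticket`` class and its tag/extension URIs are invented for illustration):

    import numpy as np

    from asdf.extension import Converter, Extension


    class Ticket:
        def __init__(self, samples):
            self.samples = samples  # a numpy array


    class TicketConverter(Converter):
        tags = ["asdf://somewhere.org/tags/ticket-1.0.0"]
        types = [Ticket]

        def to_yaml_tree(self, obj, tag, ctx):
            # returning the ndarray directly lets asdf decide how to store it
            return {"samples": obj.samples}

        def from_yaml_tree(self, node, tag, ctx):
            return Ticket(np.asarray(node["samples"]))


    class TicketExtension(Extension):
        tags = TicketConverter.tags
        converters = [TicketConverter()]
        extension_uri = "asdf://somewhere.org/extensions/ticket-1.0.0"

No block index bookkeeping is needed in this pattern; the rest of the section covers converters that want explicit control over block storage.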
@@ -359,19 +359,18 @@ For applications that require more flexibility, Converters can control block storage through use of the `asdf.extension.SerializationContext` provided as an argument to `Converter.to_yaml_tree` `Converter.from_yaml_tree` and `Converter.select_tag`. -It is helpful to first review some details of how ASDF +It is helpful to first review some details of how asdf :ref:`stores block `. Blocks are stored sequentially within a -ASDF file following the YAML tree. During reads and writes, ASDF will need to know +ASDF file following the YAML tree. During reads and writes, asdf will need to know the index of the block a Converter would like to use to read or write the correct block. However, the index used for reading might not be the same index for writing -if the tree was modified or the file is being written to a new location. To allow -ASDF to track the relationship between blocks and objects, Converters will need -to generate unique hashable keys for each block used and associate these keys with -block indices during read and write (more on this below). +if the tree was modified or the file is being written to a new location. During +serialization and deserialization, asdf will associate each object with the +accessed block during `from_yaml_tree` and `to_yaml_tree`. .. note:: - Use of ``id(obj)`` will not generate a unique key as it returns the memory address - which might be reused after the object is garbage collected. + Converters using multiple blocks are slightly more complicated. + See: :ref:`extending_converter_multiple_block_storage` A simple example of a Converter using block storage to store the ``payload`` for ``BlockData`` object instances is as follows: @@ -385,7 +384,6 @@ A simple example of a Converter using block storage to store the ``payload`` for class BlockData: def __init__(self, payload): self.payload = payload - self._asdf_key = asdf.util.BlockKey() class BlockConverter(Converter): @@ -393,22 +391,17 @@ A simple example of a Converter using block storage to store the ``payload`` for types = [BlockData] def to_yaml_tree(self, obj, tag, ctx): - block_index = ctx.find_block_index( - obj._asdf_key, + block_index = ctx.find_available_block_index( lambda: np.ndarray(len(obj.payload), dtype="uint8", buffer=obj.payload), ) return {"block_index": block_index} def from_yaml_tree(self, node, tag, ctx): block_index = node["block_index"] - obj = BlockData(b"") - ctx.assign_block_key(block_index, obj._asdf_key) - obj.payload = ctx.get_block_data_callback(block_index)() + data_callback = ctx.get_block_data_callback(block_index) + obj = BlockData(data_callback()) return obj - def reserve_blocks(self, obj, tag): - return [obj._asdf_key] - class BlockExtension(Extension): tags = ["asdf://somewhere.org/tags/block_data-1.0.0"] converters = [BlockConverter()] @@ -422,29 +415,77 @@ A simple example of a Converter using block storage to store the ``payload`` for .. asdf:: block_converter_example.asdf During read, ``Converter.from_yaml_tree`` will be called. Within this method -the Converter should associate any used blocks with unique hashable keys by calling -`asdf.extension.SerializationContext.assign_block_key` and can generate (and use) a callable -function that will return block data using `asdf.extension.SerializationContext.get_block_data_callback`. -A callback for reading the data is provided to support lazy loading without -keeping a reference to the `asdf.extension.SerializationContext` (which is meant to be -a short lived and lightweight object). 
- -During write, ``Converter.to_yaml_tree`` will be called. The Converter should -use `asdf.extension.SerializationContext.find_block_index` to find the location of an -available block by providing a hashable key unique to this object (this should -be the same key used during reading to allow ASDF to associate blocks and objects -during in-place updates). The second argument to `asdf.extension.SerializationContext.find_block_index` -must be a callable function (returning a ndarray) that ASDF will call when it -is time to write data to the portion of the file corresponding to this block. -Note that it's possible this callback will be called multiple times during a -write and ASDF will not cache the result. If the data is coming from a non-repeatable -source (such as a non-seekable stream of bytes) the data should be cached prior -to providing it to ASDF to allow ASDF to call the callback multiple times. - -A Converter that uses block storage must also define ``Converter.reserve_blocks``. -``Converter.reserve_blocks`` will be called during memory management to free -resources for unused blocks. ``Converter.reserve_blocks`` must -return a list of keys associated with an object. +the Converter can prepare to access a block by calling +``SerializationContext.get_block_data_callback``. This will return a function +that when called will return the contents of the block (to support lazy +loading without keeping a reference to the ``SerializationContext`` (which is meant +to be a short lived and lightweight object). + +During write, ``Converter.to_yaml_tree`` will be called. The Converter can +use ``SerializationContext.find_available_block_index`` to find the location of an +available block for writing. The data to be written to the block can be provided +as an ``ndarray`` or a callable function that will return a ``ndarray`` (note that +it is possible this callable function will be called multiple times and the +developer should cache results from any non-repeatable sources). + +.. _extending_converter_multiple_block_storage: + +Converters using multiple blocks +-------------------------------- + +As discussed above, while serializing and deserializing objects that use +one block, asdf will watch which block is accessed by ``find_available_block_index`` +and ``get_block_data_callback`` and associate the block with the converted object. +This association allows asdf to map read and write blocks during updates of ASDF +files. An object that uses multiple blocks must provide a unique key for each +block it uses. These keys are generated using ``SerializationContext.generate_block_key`` +and must be stored by the extension code. These keys must be resupplied to the converter +when writing an object that was read from an ASDF file. + +.. 
runcode:: + import asdf + import numpy as np + from asdf.extension import Converter, Extension + + class MultiBlockData: + def __init__(self, data): + self.data = data + self.keys = [] + + + class MultiBlockConverter(Converter): + tags = ["asdf://somewhere.org/tags/multi_block_data-1.0.0"] + types = [MultiBlockData] + + def to_yaml_tree(self, obj, tag, ctx): + if not len(obj.keys): + obj.keys = [ctx.generate_block_key() for _ in obj.data] + indices = [ctx.find_available_block_index(d, k) for d, k in zip(obj.data, obj.keys)] + return { + "indices": indices, + } + + def from_yaml_tree(self, node, tag, ctx): + indices = node["indices"] + keys = [ctx.generate_block_key() for _ in indices] + cbs = [ctx.get_block_data_callback(i, k) for i, k in zip(indices, keys)] + obj = MultiBlockData([cb() for cb in cbs]) + obj.keys = keys + return obj + + + class MultiBlockExtension(Extension): + tags = ["asdf://somewhere.org/tags/multi_block_data-1.0.0"] + converters = [MultiBlockConverter()] + extension_uri = "asdf://somewhere.org/extensions/multi_block_data-1.0.0" + + with asdf.config_context() as cfg: + cfg.add_extension(MultiBlockExtension()) + obj = MultiBlockData([np.arange(3, dtype="uint8") + i for i in range(3)]) + ff = asdf.AsdfFile({"example": obj}) + ff.write_to("multi_block_converter_example.asdf") + +.. asdf:: block_converter_example.asdf .. _extending_converters_performance: From 70e441c635c5f1862a2eb89a9b571a100b6e2dad Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 10 May 2023 13:43:51 -0400 Subject: [PATCH 066/154] fix multi-block converter example in docs --- docs/asdf/extending/converters.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/asdf/extending/converters.rst b/docs/asdf/extending/converters.rst index d0e3a3e9e..15de139ca 100644 --- a/docs/asdf/extending/converters.rst +++ b/docs/asdf/extending/converters.rst @@ -443,6 +443,7 @@ and must be stored by the extension code. These keys must be resupplied to the c when writing an object that was read from an ASDF file. .. runcode:: + import asdf import numpy as np from asdf.extension import Converter, Extension @@ -485,7 +486,7 @@ when writing an object that was read from an ASDF file. ff = asdf.AsdfFile({"example": obj}) ff.write_to("multi_block_converter_example.asdf") -.. asdf:: block_converter_example.asdf +.. asdf:: multi_block_converter_example.asdf .. 
_extending_converters_performance: From 42b6f89ee4efe08841840c9737ff392a82bea654 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 10 May 2023 17:17:33 -0400 Subject: [PATCH 067/154] add try/finally to update and copy blocks in block_size increments also clean up a bunch of comments --- asdf/_tests/_helpers.py | 2 - asdf/asdf.py | 253 ++++++++++++++++--------------- asdf/core/_converters/ndarray.py | 9 +- asdf/generic_io.py | 3 + asdf/tags/core/ndarray.py | 9 +- 5 files changed, 142 insertions(+), 134 deletions(-) diff --git a/asdf/_tests/_helpers.py b/asdf/_tests/_helpers.py index f2d92c8ff..a3fcd15a7 100644 --- a/asdf/_tests/_helpers.py +++ b/asdf/_tests/_helpers.py @@ -278,7 +278,6 @@ def _assert_roundtrip_tree( ff = asdf.open(buff, extensions=extensions, copy_arrays=True, lazy_load=False) # Ensure that all the blocks are loaded for block in ff._blocks.blocks: - # assert isinstance(block, Block) assert block._data is not None and not callable(block._data) # The underlying file is closed at this time and everything should still work assert_tree_match(tree, ff.tree, ff, funcname=tree_match_func) @@ -289,7 +288,6 @@ def _assert_roundtrip_tree( AsdfFile(tree, extensions=extensions, **init_options).write_to(fname, **write_options) with asdf.open(fname, mode="rw", extensions=extensions, copy_arrays=False, lazy_load=False) as ff: for block in ff._blocks.blocks: - # assert isinstance(block, Block) assert block._data is not None and not callable(block._data) assert_tree_match(tree, ff.tree, ff, funcname=tree_match_func) if asdf_check_func: diff --git a/asdf/asdf.py b/asdf/asdf.py index 5a2143e6e..db72813b5 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -1093,8 +1093,7 @@ def update( if fd.can_memmap(): fd.flush_memmap() - # TODO shortcuts for - # - no read blocks + # if we have no read blocks, we can just call write_to as no intrnal blocks are reused if len(self._blocks.blocks) == 0 and self._blocks._streamed_block is None: self._fd.seek(0) self.write_to(self._fd) @@ -1102,7 +1101,7 @@ def update( self._fd.close_memmap() self._fd.truncate() return - # - all external + # if we have all external blocks, we can just call write_to as no internal blocks are reused if config.all_array_storage == "external": self._fd.seek(0) self.write_to(self._fd) @@ -1110,131 +1109,137 @@ def update( self._fd.close_memmap() self._fd.truncate() return - # - no write blocks self._pre_write(fd) - self._tree["asdf_library"] = get_asdf_library_info() - - # TODO wrap a sensible try/finally - # prepare block manager for writing - self._blocks._clear_write() - - # write out tree to temporary buffer - tree_fd = generic_io.get_file(io.BytesIO(), mode="rw") - self._write_tree(self._tree, tree_fd, False) - new_tree_size = tree_fd.tell() - end_of_file = new_tree_size - - # TODO short circuit here if no blocks are used - - # find where to start writing blocks (either end of new tree or end of last 'free' block) - last_block = None - for blk in self._blocks.blocks[::-1]: - if not blk.memmap and (blk._cached_data is not None or not callable(blk._data)): - continue - last_block = blk - break - if last_block is None: - new_block_start = new_tree_size - else: - new_block_start = max( - last_block.data_offset + last_block.header["allocated_size"], - new_tree_size, - ) + try: + self._tree["asdf_library"] = get_asdf_library_info() + + # prepare block manager for writing + self._blocks._clear_write() + + # write out tree to temporary buffer + tree_fd = generic_io.get_file(io.BytesIO(), mode="rw") + self._write_tree(self._tree, tree_fd, 
False) + new_tree_size = tree_fd.tell() + end_of_file = new_tree_size + + # find where to start writing blocks (either end of new tree or end of last 'free' block) + last_block = None + for blk in self._blocks.blocks[::-1]: + if not blk.memmap and (blk._cached_data is not None or not callable(blk._data)): + continue + last_block = blk + break + if last_block is None: + new_block_start = new_tree_size + else: + new_block_start = max( + last_block.data_offset + last_block.header["allocated_size"], + new_tree_size, + ) - if len(self._blocks._external_write_blocks): - self._blocks._write_external_blocks() - - # do we have any blocks to write? - if len(self._blocks._write_blocks) or self._blocks._streamed_block: - self._fd.seek(new_block_start) - offsets, headers = block_writer.write_blocks( - self._fd, - self._blocks._write_blocks, - pad_blocks, - streamed_block=self._blocks._streamed_block, - write_index=False, # don't write an index as we will modify the offsets - ) - new_block_end = self._fd.tell() - end_of_file = new_block_end - - # move blocks to start TODO as 'chunks' - self._fd.seek(new_block_start) - block_data = self._fd.read(new_block_end - new_block_start) - self._fd.seek(new_tree_size) - self._fd.write(block_data) - # update offset to point at correct locations - offsets = [o - (new_block_start - new_tree_size) for o in offsets] - - # write index if no streamed block - if include_block_index and self._blocks._streamed_block is None: - bio.write_block_index(self._fd, offsets) - end_of_file = self._fd.tell() - - # map new blocks to old blocks - new_read_blocks = ReadBlocks() - for i, (offset, header) in enumerate(zip(offsets, headers)): - if i == len(self._blocks._write_blocks): # this is a streamed block - obj = self._blocks._streamed_obj() - wblk = self._blocks._streamed_block - else: - wblk = self._blocks._write_blocks[i] - # find object associated with wblk - obj = None - for oid, by_key in self._blocks._write_blocks._by_id.items(): - for key, index in by_key.items(): - if self._blocks._write_blocks[index] is wblk: - obj = key._ref() - break - if obj is None: - msg = "Update failed to associate blocks" - raise OSError(msg) - - # does the obj have an old read block? - rblk = self._blocks.blocks.lookup_by_object(obj) - if rblk is not None: - memmap = rblk.memmap - data = None - if not rblk.memmap: - if rblk._cached_data is not None: - data = rblk._cached_data - elif not callable(rblk._data): - data = rblk._data - else: - memmap = self._blocks.memmap - data = None - - # we have to be lazy here as the current memmap is invalid - new_read_block = block_reader.ReadBlock( - offset + 4, self._fd, memmap, True, False, header=header, data=data + if len(self._blocks._external_write_blocks): + self._blocks._write_external_blocks() + + # do we have any blocks to write? 
+ if len(self._blocks._write_blocks) or self._blocks._streamed_block: + self._fd.seek(new_block_start) + offsets, headers = block_writer.write_blocks( + self._fd, + self._blocks._write_blocks, + pad_blocks, + streamed_block=self._blocks._streamed_block, + write_index=False, # don't write an index as we will modify the offsets ) - new_read_blocks.append_block(new_read_block) - new_index = len(new_read_blocks) - 1 - new_read_blocks.assign_object(obj, new_read_block) - - # update data callbacks to point to new blocks - cb = self._blocks._data_callbacks.lookup_by_object(obj) - if cb is not None: - cb.reassign(new_index, new_read_blocks) - - # update read blocks to reflect new state - self._blocks.blocks = new_read_blocks - self._blocks.options._read_blocks = new_read_blocks - - # now write the tree - self._fd.seek(0) - tree_fd.seek(0) - self._fd.write(tree_fd.read()) - - # TODO post_write - # close memmap to trigger arrays to reload themselves - if self._fd.can_memmap(): - self._fd.close_memmap() - self._fd.seek(end_of_file) - self._fd.truncate() - self._post_write(fd) - - self._blocks._clear_write() + new_block_end = self._fd.tell() + end_of_file = new_block_end + + # move blocks to start in increments of block_size + n_bytes = new_block_end - new_block_start + src, dst = new_block_start, new_tree_size + block_size = self._fd.block_size + while n_bytes > 0: + self._fd.seek(src) + bs = self._fd.read(min(n_bytes, block_size)) + self._fd.seek(dst) + self._fd.write(bs) + n = len(bs) + n_bytes -= n + src += n + dst += n + + # update offset to point at correct locations + offsets = [o - (new_block_start - new_tree_size) for o in offsets] + + # write index if no streamed block + if include_block_index and self._blocks._streamed_block is None: + bio.write_block_index(self._fd, offsets) + end_of_file = self._fd.tell() + + # map new blocks to old blocks + new_read_blocks = ReadBlocks() + for i, (offset, header) in enumerate(zip(offsets, headers)): + if i == len(self._blocks._write_blocks): # this is a streamed block + obj = self._blocks._streamed_obj() + wblk = self._blocks._streamed_block + else: + wblk = self._blocks._write_blocks[i] + # find object associated with wblk + obj = None + for oid, by_key in self._blocks._write_blocks._by_id.items(): + for key, index in by_key.items(): + if self._blocks._write_blocks[index] is wblk: + obj = key._ref() + break + if obj is None: + msg = "Update failed to associate blocks" + raise OSError(msg) + + # does the obj have an old read block? 
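
Because the blocks are first written at new_block_start and only then shifted down to new_tree_size, the offsets returned by write_blocks still describe the old location; they are rebased by the distance the data moved before the optional block index is written. A small worked example with made-up numbers:

new_block_start = 6000   # where the blocks were initially written
new_tree_size = 2500     # where they end up after the chunked move
offsets = [6000, 6100, 6300]
shift = new_block_start - new_tree_size
rebased = [o - shift for o in offsets]
assert rebased == [2500, 2600, 2800]
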
+ rblk = self._blocks.blocks.lookup_by_object(obj) + if rblk is not None: + memmap = rblk.memmap + data = None + if not rblk.memmap: + if rblk._cached_data is not None: + data = rblk._cached_data + elif not callable(rblk._data): + data = rblk._data + else: + memmap = self._blocks.memmap + data = None + + # we have to be lazy here as the current memmap is invalid + new_read_block = block_reader.ReadBlock( + offset + 4, self._fd, memmap, True, False, header=header, data=data + ) + new_read_blocks.append_block(new_read_block) + new_index = len(new_read_blocks) - 1 + new_read_blocks.assign_object(obj, new_read_block) + + # update data callbacks to point to new blocks + cb = self._blocks._data_callbacks.lookup_by_object(obj) + if cb is not None: + cb.reassign(new_index, new_read_blocks) + + # update read blocks to reflect new state + self._blocks.blocks = new_read_blocks + self._blocks.options._read_blocks = new_read_blocks + + # now write the tree + self._fd.seek(0) + tree_fd.seek(0) + self._fd.write(tree_fd.read()) + self._fd.flush() + # close memmap to trigger arrays to reload themselves + self._fd.seek(end_of_file) + if self._fd.can_memmap(): + self._fd.close_memmap() + self._fd.truncate() + + finally: + self._post_write(fd) + self._blocks._clear_write() def write_to( self, diff --git a/asdf/core/_converters/ndarray.py b/asdf/core/_converters/ndarray.py index dc93289ca..eca8dfc42 100644 --- a/asdf/core/_converters/ndarray.py +++ b/asdf/core/_converters/ndarray.py @@ -28,7 +28,7 @@ def to_yaml_tree(self, obj, tag, ctx): data = obj if isinstance(obj, Stream): - # TODO previously, stream never passed on data? + # previously, stream never passed on data, we can do that here ctx._blocks.set_streamed_block(data._array, data) result = {} @@ -56,6 +56,7 @@ def to_yaml_tree(self, obj, tag, ctx): shape = data.shape + # sort out block writing options if isinstance(obj, NDArrayType) and isinstance(obj._source, str): # this is an external block, if we have no other settings, keep it as external options = ctx._blocks.options.lookup_by_object(data) @@ -64,6 +65,7 @@ def to_yaml_tree(self, obj, tag, ctx): else: options = ctx._blocks.options.get_options(data) + # possibly override options based on config settings with config_context() as cfg: if cfg.all_array_storage is not None: options.storage_type = cfg.all_array_storage @@ -104,10 +106,8 @@ def to_yaml_tree(self, obj, tag, ctx): result["shape"] = list(shape) if options.storage_type == "streamed": result["shape"][0] = "*" - - if options.storage_type == "streamed": - ctx._blocks.set_streamed_block(base, data) result["source"] = -1 + ctx._blocks.set_streamed_block(base, data) else: result["source"] = ctx._blocks.make_write_block(base, options, obj) result["datatype"] = dtype @@ -156,6 +156,7 @@ def from_yaml_tree(self, node, tag, ctx): mask = node.get("mask", None) if isinstance(source, int): + # internal block data_callback = ctx.get_block_data_callback(source) instance = NDArrayType(source, shape, dtype, offset, strides, "A", mask, data_callback) elif isinstance(source, str): diff --git a/asdf/generic_io.py b/asdf/generic_io.py index 1c7ede99b..af6bd7253 100644 --- a/asdf/generic_io.py +++ b/asdf/generic_io.py @@ -739,6 +739,9 @@ def fast_forward(self, size): self.seek(size, SEEK_CUR) def truncate(self, size=None): + # windows supports truncating as long as the file not opened + # more than once. 
So this must be called after closing all + # memmaps if size is None: self._fd.truncate() else: diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index f39d09f8d..d7be1bbff 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -230,9 +230,6 @@ def ascii_to_unicode(x): class NDArrayType: def __init__(self, source, shape, dtype, offset, strides, order, mask, data_callback=None): - # source can be a: - # - list of numbers for an inline block - # - a data callback for an internal or externalblock self._source = source self._data_callback = data_callback self._array = None @@ -241,6 +238,8 @@ def __init__(self, source, shape, dtype, offset, strides, order, mask, data_call if isinstance(source, list): self._array = inline_data_asarray(source, dtype) self._array = self._apply_mask(self._array, self._mask) + # single element structured arrays can have shape == () + # https://github.com/asdf-format/asdf/issues/1540 if shape is not None and ( self._array.shape != tuple(shape) or (len(shape) and shape[0] == "*" and self._array.shape[1:] != tuple(shape[1:])) @@ -281,7 +280,7 @@ def _make_array(self): if hasattr(data, "base") and isinstance(data.base, mmap.mmap) and data.base.closed: raise OSError("Attempt to read data from a closed file") - # streaming blocks have 0 data size + # compute shape (streaming blocks have '0' data size in the block header) shape = self.get_actual_shape( self._shape, self._strides, @@ -360,6 +359,8 @@ def shape(self): return self._make_array().shape data_size = self._data_callback(_attr="header")["data_size"] if not data_size: + # streamed blocks have a '0' data_size in the header so we + # need to make the array to get the shape return self._make_array().shape return tuple( self.get_actual_shape( From 244522e179e77b558e2551ee58a615d7dae599e4 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 11 May 2023 10:38:49 -0400 Subject: [PATCH 068/154] skip 1542 test on windows os.pipe returns files that windows says are seekable (but really aren't) https://bugs.python.org/issue42602 --- asdf/_tests/_issues/test_1542.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/asdf/_tests/_issues/test_1542.py b/asdf/_tests/_issues/test_1542.py index bce7a375d..96247a27f 100644 --- a/asdf/_tests/_issues/test_1542.py +++ b/asdf/_tests/_issues/test_1542.py @@ -1,10 +1,15 @@ import os +import sys import numpy as np +import pytest import asdf +@pytest.mark.skipif( + sys.platform.startswith("win"), reason="os.pipe.seek noop on windows: https://bugs.python.org/issue42602" +) def test_1542(): """ ASDF fails to write blocks to non-seekable file From 2eb07c9908c09d735ad790915be4aba6109e4af5 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 11 May 2023 11:51:02 -0400 Subject: [PATCH 069/154] add eq and copy to _block.Key --- asdf/_block/key.py | 12 ++++++ asdf/_tests/_block/test_key.py | 70 ++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/asdf/_block/key.py b/asdf/_block/key.py index 6ec7d8038..fdad39ec7 100644 --- a/asdf/_block/key.py +++ b/asdf/_block/key.py @@ -43,3 +43,15 @@ def matches(self, obj): if r is None: return False return r is obj + + def __eq__(self, other): + if not isinstance(other, Key): + return NotImplemented + if self._key != other._key: + return False + if not self.is_valid(): + return False + return other.matches(self._ref()) + + def __copy__(self): + return self.__class__(self._ref(), self._key) diff --git a/asdf/_tests/_block/test_key.py b/asdf/_tests/_block/test_key.py index a284816c6..f38780a27 100644 --- 
a/asdf/_tests/_block/test_key.py +++ b/asdf/_tests/_block/test_key.py @@ -1,3 +1,5 @@ +import copy + from asdf._block.key import Key @@ -29,6 +31,11 @@ def test_matches_obj(): assert bk.matches(f) +def test_undefined_no_match(): + bk = Key() + assert not bk.matches(Foo()) + + def test_is_valid(): f = Foo() bk = Key(f) @@ -44,3 +51,66 @@ def test_same_class(): f2 = Foo() assert not bk.is_valid() assert not bk.matches(f2) + + +def test_undefined(): + k = Key() + assert not k.is_valid() + + +def test_equal(): + key_value = 42 + f = Foo() + k1 = Key(f, key_value) + k2 = Key(f, key_value) + assert k1 == k2 + + +def test_key_mismatch_not_equal(): + f = Foo() + k1 = Key(f) + k2 = Key(f) + assert k1 != k2 + + +def test_obj_not_equal(): + f = Foo() + k = Key(f) + assert k != f + + +def test_undefined_not_equal(): + key_value = 42 + k1 = Key(key=key_value) + k2 = Key(key=key_value) + assert k1 != k2 + + +def test_deleted_object_not_equal(): + key_value = 42 + f = Foo() + k1 = Key(f, key_value) + k2 = Key(f, key_value) + del f + assert k1 != k2 + + +def test_copy(): + f = Foo() + k1 = Key(f) + k2 = copy.copy(k1) + assert k1 == k2 + + +def test_copy_undefined_not_equal(): + k1 = Key() + k2 = copy.copy(k1) + assert k1 != k2 + + +def test_copy_deleted_object_not_equal(): + f = Foo() + k1 = Key(f) + k2 = copy.copy(k1) + del f + assert k1 != k2 From 21e3c9b86114d2bffa876333564b1d4892cd4d3b Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 11 May 2023 12:28:33 -0400 Subject: [PATCH 070/154] move block reading to Manager.read --- asdf/_block/manager.py | 12 ++++++++++-- asdf/asdf.py | 24 +++++++++--------------- asdf/core/_converters/ndarray.py | 2 +- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 5433e7b27..6bd2ca7da 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -5,7 +5,7 @@ from asdf import config, constants, generic_io, util -from . import store +from . 
import reader, store from .callback import DataCallback from .external import ExternalBlockCache, UseInternal from .options import Options @@ -111,7 +111,7 @@ def relative_uri_to_full(uri, relative): class Manager: - def __init__(self, read_blocks=None, uri=None): + def __init__(self, read_blocks=None, uri=None, lazy_load=False, memmap=False, validate_checksums=False): if read_blocks is None: read_blocks = ReadBlocks([]) self.options = BlockOptions(read_blocks) @@ -124,6 +124,14 @@ def __init__(self, read_blocks=None, uri=None): self._write_fd = None self._uri = uri self._external_block_cache = ExternalBlockCache() + self._lazy_load = lazy_load + self._memmap = memmap + self._validate_checksums = validate_checksums + + def read(self, fd, after_magic=False): + self.blocks.set_blocks( + reader.read_blocks(fd, self._memmap, self._lazy_load, self._validate_checksums, after_magic=after_magic) + ) def _load_external(self, uri): value = self._external_block_cache.load(self._uri, uri) diff --git a/asdf/asdf.py b/asdf/asdf.py index db72813b5..211b8dc17 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -157,9 +157,7 @@ def __init__( self._fd = None self._closed = False self._external_asdf_by_uri = {} - self._blocks = BlockManager(uri=uri) - self._blocks.lazy_load = lazy_load - self._blocks.memmap = not copy_arrays + self._blocks = BlockManager(uri=uri, lazy_load=lazy_load, memmap=not copy_arrays) self._uri = uri if tree is None: # Bypassing the tree property here, to avoid validating @@ -803,6 +801,10 @@ def _open_asdf( raise ValueError(msg) with config_context(): + + # validate_checksums (unlike memmap and lazy_load) is provided + # here instead of in __init__ + self._blocks._validate_checksums = validate_checksums self._mode = fd.mode self._fd = fd if self._fd._uri: @@ -832,7 +834,6 @@ def _open_asdf( yaml_token = fd.read(4) tree = None - read_blocks = [] if yaml_token == b"%YAM": reader = fd.reader_until( constants.YAML_END_MARKER_REGEX, @@ -852,21 +853,14 @@ def _open_asdf( # now, but we don't do anything special with it until # after the blocks have been read tree = yamlutil.load_tree(reader) - # has_blocks = fd.seek_until(constants.BLOCK_MAGIC, 4, include=True, exception=False) - read_blocks = block_reader.read_blocks( - fd, self._blocks.memmap, self._blocks.lazy_load, validate_checksums - ) + self._blocks.read(fd) elif yaml_token == constants.BLOCK_MAGIC: - # this file has only blocks - read_blocks = block_reader.read_blocks( - fd, self._blocks.memmap, self._blocks.lazy_load, validate_checksums, after_magic=True - ) + # this file has only blocks and we're already read the first block magic + self._blocks.read(fd, after_magic=True) elif yaml_token != b"": msg = "ASDF file appears to contain garbage after header." 
raise OSError(msg) - self._blocks.blocks.set_blocks(read_blocks) - if tree is None: # At this point the tree should be tagged, but we want it to be # tagged with the core/asdf version appropriate to this file's @@ -1206,7 +1200,7 @@ def update( elif not callable(rblk._data): data = rblk._data else: - memmap = self._blocks.memmap + memmap = self._blocks._memmap data = None # we have to be lazy here as the current memmap is invalid diff --git a/asdf/core/_converters/ndarray.py b/asdf/core/_converters/ndarray.py index eca8dfc42..6f26beaa6 100644 --- a/asdf/core/_converters/ndarray.py +++ b/asdf/core/_converters/ndarray.py @@ -176,7 +176,7 @@ def data_callback(_attr=None, _ref=weakref.ref(ctx._blocks)): instance = NDArrayType(source, shape, dtype, offset, strides, "A", mask) ctx._blocks._set_array_storage(instance, "inline") - if not ctx._blocks.lazy_load: + if not ctx._blocks._lazy_load: instance._make_array() return instance From 0f47eecc6e2dcbf705869fed16694ce0161d4032 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 11 May 2023 12:39:05 -0400 Subject: [PATCH 071/154] store uri in _block.Manager instead of AsdfFile as the uri is needed for handling external blocks having AsdfFile as the uri provider means _block.Manager needs to have a synchronized uri (or a reference back up to the AsdfFile). As every AsdfFile now has one and only _block.Manager, the uri can be stored in the _block.Manager._uri attribute. --- asdf/asdf.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/asdf/asdf.py b/asdf/asdf.py index 211b8dc17..c209d51f2 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -158,7 +158,6 @@ def __init__( self._closed = False self._external_asdf_by_uri = {} self._blocks = BlockManager(uri=uri, lazy_load=lazy_load, memmap=not copy_arrays) - self._uri = uri if tree is None: # Bypassing the tree property here, to avoid validating # an empty tree. @@ -169,8 +168,7 @@ def __init__( # of copying the file? msg = "Can not copy AsdfFile and change active extensions" raise ValueError(msg) - self._uri = tree.uri - self._blocks._uri = self._uri + self._blocks._uri = tree.uri # Set directly to self._tree (bypassing property), since # we can assume the other AsdfFile is already valid. self._tree = tree.tree @@ -471,7 +469,7 @@ def close(self): def copy(self): return self.__class__( copy.deepcopy(self._tree), - self._uri, + self._blocks._uri, self._user_extensions, ) @@ -485,11 +483,7 @@ def uri(self): In many cases, it is automatically determined from the file handle used to read or write the file. 
""" - if self._uri is not None: - return self._uri - if self._fd is not None: - return self._fd._uri - return None + return self._blocks._uri @property def _tag_to_schema_resolver(self): @@ -808,7 +802,6 @@ def _open_asdf( self._mode = fd.mode self._fd = fd if self._fd._uri: - self._uri = self._fd._uri self._blocks._uri = self._fd._uri # The filename is currently only used for tracing warning information self._fname = self._fd._uri if self._fd._uri else "" From 07f168575b3d19ed9a9891d9eeb9a0e79c1db94b Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 11 May 2023 12:58:37 -0400 Subject: [PATCH 072/154] reduce code duplication in AsdfFile.update --- asdf/asdf.py | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/asdf/asdf.py b/asdf/asdf.py index c209d51f2..1b1aafd40 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -972,14 +972,19 @@ def _pre_write(self, fd): self._run_hook("pre_write") def _serial_write(self, fd, pad_blocks, include_block_index): - # prep a tree for a writing - tree = copy.copy(self._tree) - tree["asdf_library"] = get_asdf_library_info() - if "history" in self._tree: - tree["history"] = copy.deepcopy(self._tree["history"]) - - self._write_tree(tree, fd, pad_blocks) - self._blocks.write(fd, pad_blocks, include_block_index) + with self._blocks.write_context(fd): + self._pre_write(fd) + try: + # prep a tree for a writing + tree = copy.copy(self._tree) + tree["asdf_library"] = get_asdf_library_info() + if "history" in self._tree: + tree["history"] = copy.deepcopy(self._tree["history"]) + + self._write_tree(tree, fd, pad_blocks) + self._blocks.write(fd, pad_blocks, include_block_index) + finally: + self._post_write(fd) def _post_write(self, fd): if len(self._tree): @@ -1080,21 +1085,21 @@ def update( if fd.can_memmap(): fd.flush_memmap() - # if we have no read blocks, we can just call write_to as no intrnal blocks are reused - if len(self._blocks.blocks) == 0 and self._blocks._streamed_block is None: + def rewrite(): self._fd.seek(0) - self.write_to(self._fd) + self._serial_write(self._fd, pad_blocks, include_block_index) if self._fd.can_memmap(): self._fd.close_memmap() self._fd.truncate() + + # if we have no read blocks, we can just call write_to as no internal blocks are reused + if len(self._blocks.blocks) == 0 and self._blocks._streamed_block is None: + rewrite() return + # if we have all external blocks, we can just call write_to as no internal blocks are reused if config.all_array_storage == "external": - self._fd.seek(0) - self.write_to(self._fd) - if self._fd.can_memmap(): - self._fd.close_memmap() - self._fd.truncate() + rewrite() return self._pre_write(fd) @@ -1312,13 +1317,7 @@ def write_to( self.version = version with generic_io.get_file(fd, mode="w") as fd: - with self._blocks.write_context(fd): - self._pre_write(fd) - try: - self._serial_write(fd, pad_blocks, include_block_index) - fd.flush() - finally: - self._post_write(fd) + self._serial_write(fd, pad_blocks, include_block_index) def find_references(self): """ From ef30d8335b0850ce638df0caf37df47e87dc8e9c Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 11 May 2023 13:23:13 -0400 Subject: [PATCH 073/154] move bulk of AsdfFile.update into _blocks.Manager.update --- asdf/_block/manager.py | 131 +++++++++++++++++++++++++++++++++++++---- asdf/asdf.py | 125 ++++----------------------------------- 2 files changed, 130 insertions(+), 126 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 6bd2ca7da..d59d4c12f 100644 --- 
a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -5,11 +5,11 @@ from asdf import config, constants, generic_io, util -from . import reader, store +from . import io as bio +from . import reader, store, writer from .callback import DataCallback from .external import ExternalBlockCache, UseInternal from .options import Options -from .writer import WriteBlock, write_blocks class ReadBlocks(store.LinearStore): @@ -157,7 +157,7 @@ def _write_external_blocks(self): af = AsdfFile() with generic_io.get_file(uri, mode="w") as f: af.write_to(f, include_block_index=False) - write_blocks(f, [blk]) + writer.write_blocks(f, [blk]) def make_write_block(self, data, options, obj): if options.storage_type == "external": @@ -167,7 +167,7 @@ def make_write_block(self, data, options, obj): return blk._uri # need to set up new external block index = len(self._external_write_blocks) - blk = WriteBlock(data, options.compression, options.compression_kwargs) + blk = writer.WriteBlock(data, options.compression, options.compression_kwargs) if self._write_fd is not None: base_uri = self._write_fd.uri or self._uri else: @@ -181,7 +181,7 @@ def make_write_block(self, data, options, obj): self._write_blocks.assign_object(obj, blk) return index # if no block is found, make a new block - blk = WriteBlock(data, options.compression, options.compression_kwargs) + blk = writer.WriteBlock(data, options.compression, options.compression_kwargs) self._write_blocks._items.append(blk) self._write_blocks.assign_object(obj, blk) return len(self._write_blocks) - 1 @@ -189,7 +189,7 @@ def make_write_block(self, data, options, obj): def set_streamed_block(self, data, obj): if self._streamed_block is not None and data is not self._streamed_block.data: raise ValueError("Can not add second streaming block") - self._streamed_block = WriteBlock(data) + self._streamed_block = writer.WriteBlock(data) self._streamed_obj = weakref.ref(obj) def _get_data_callback(self, index): @@ -227,6 +227,10 @@ def options_context(self): @contextlib.contextmanager def write_context(self, fd, copy_options=True): self._clear_write() + # this is required for setting up external blocks + # during serialization we will need to know the uri of + # the file being written to (unless a different uri was + # supplied). 
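
The body of write_context continues just below: it clears any leftover per-write state, records the destination file (whose URI is needed to resolve external blocks), yields for the duration of the write, and clears everything again afterwards. A stripped-down sketch of the same context-manager pattern; the class and attribute names here are illustrative, and the try/finally is a defensive touch rather than a claim about the real implementation:

import contextlib

class MiniBlockManager:
    def __init__(self):
        self._write_fd = None
        self._write_blocks = []

    def _clear_write(self):
        self._write_fd = None
        self._write_blocks = []

    @contextlib.contextmanager
    def write_context(self, fd):
        self._clear_write()
        self._write_fd = fd  # kept so external block URIs can be resolved
        try:
            yield
        finally:
            self._clear_write()

mgr = MiniBlockManager()
with mgr.write_context(object()):
    assert mgr._write_fd is not None
assert mgr._write_fd is None
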
self._write_fd = fd if copy_options: with self.options_context(): @@ -235,13 +239,13 @@ def write_context(self, fd, copy_options=True): yield self._clear_write() - def write(self, fd, pad_blocks, include_block_index): - if self._write_fd is None or fd is not self._write_fd: - msg = "Write called outside of valid write_context" + def write(self, pad_blocks, include_block_index): + if self._write_fd is None: + msg = "write called outside of valid write_context" raise OSError(msg) if len(self._write_blocks) or self._streamed_block: - write_blocks( - fd, + writer.write_blocks( + self._write_fd, self._write_blocks, pad_blocks, streamed_block=self._streamed_block, @@ -249,3 +253,108 @@ def write(self, fd, pad_blocks, include_block_index): ) if len(self._external_write_blocks): self._write_external_blocks() + + def update(self, new_tree_size, pad_blocks, include_block_index): + if self._write_fd is None: + msg = "update called outside of valid write_context" + raise OSError(msg) + # find where to start writing blocks (either end of new tree or end of last 'free' block) + last_block = None + for blk in self.blocks[::-1]: + if not blk.memmap and (blk._cached_data is not None or not callable(blk._data)): + continue + last_block = blk + break + if last_block is None: + new_block_start = new_tree_size + else: + new_block_start = max( + last_block.data_offset + last_block.header["allocated_size"], + new_tree_size, + ) + + if len(self._external_write_blocks): + self._write_external_blocks() + + # do we have any blocks to write? + if len(self._write_blocks) or self._streamed_block: + self._write_fd.seek(new_block_start) + offsets, headers = writer.write_blocks( + self._write_fd, + self._write_blocks, + pad_blocks, + streamed_block=self._streamed_block, + write_index=False, # don't write an index as we will modify the offsets + ) + new_block_end = self._write_fd.tell() + + # move blocks to start in increments of block_size + n_bytes = new_block_end - new_block_start + src, dst = new_block_start, new_tree_size + block_size = self._write_fd.block_size + while n_bytes > 0: + self._write_fd.seek(src) + bs = self._write_fd.read(min(n_bytes, block_size)) + self._write_fd.seek(dst) + self._write_fd.write(bs) + n = len(bs) + n_bytes -= n + src += n + dst += n + + # update offset to point at correct locations + offsets = [o - (new_block_start - new_tree_size) for o in offsets] + + # write index if no streamed block + if include_block_index and self._streamed_block is None: + bio.write_block_index(self._write_fd, offsets) + + # map new blocks to old blocks + new_read_blocks = ReadBlocks() + for i, (offset, header) in enumerate(zip(offsets, headers)): + if i == len(self._write_blocks): # this is a streamed block + obj = self._streamed_obj() + wblk = self._streamed_block + else: + wblk = self._write_blocks[i] + # find object associated with wblk + obj = None + for oid, by_key in self._write_blocks._by_id.items(): + for key, index in by_key.items(): + if self._write_blocks[index] is wblk: + obj = key._ref() + break + if obj is None: + msg = "Update failed to associate blocks" + raise OSError(msg) + + # does the obj have an old read block? 
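
The association loop above walks keys that hold only weak references to the user objects they were created for (the _block.Key type extended earlier in this series), so the block manager never keeps arrays alive on its own. A compact sketch of how such keys behave; the names are hypothetical and the immediate collection after del relies on CPython's reference counting:

import weakref

class MiniKey:
    _next_key = 0

    def __init__(self, obj=None, key=None):
        if key is None:
            key = MiniKey._next_key
            MiniKey._next_key += 1
        self._key = key
        self._ref = weakref.ref(obj) if obj is not None else None

    def matches_object(self, obj):
        return self._ref is not None and self._ref() is obj

    def __hash__(self):
        return self._key

    def __eq__(self, other):
        if not isinstance(other, MiniKey):
            return NotImplemented
        if self._key != other._key or self._ref is None:
            return False
        target = self._ref()
        # keys only compare equal while the referenced object is alive
        return target is not None and other.matches_object(target)

class Thing:
    pass

t = Thing()
k1, k2 = MiniKey(t, key=42), MiniKey(t, key=42)
assert k1 == k2
del t
assert k1 != k2
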
+ rblk = self.blocks.lookup_by_object(obj) + if rblk is not None: + memmap = rblk.memmap + data = None + if not rblk.memmap: + if rblk._cached_data is not None: + data = rblk._cached_data + elif not callable(rblk._data): + data = rblk._data + else: + memmap = self._memmap + data = None + + # we have to be lazy here as the current memmap is invalid + new_read_block = reader.ReadBlock( + offset + 4, self._write_fd, memmap, True, False, header=header, data=data + ) + new_read_blocks.append_block(new_read_block) + new_index = len(new_read_blocks) - 1 + new_read_blocks.assign_object(obj, new_read_block) + + # update data callbacks to point to new blocks + cb = self._data_callbacks.lookup_by_object(obj) + if cb is not None: + cb.reassign(new_index, new_read_blocks) + + # update read blocks to reflect new state + self.blocks = new_read_blocks + self.options._read_blocks = new_read_blocks diff --git a/asdf/asdf.py b/asdf/asdf.py index 1b1aafd40..eadcbbd72 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -13,11 +13,7 @@ from . import _version as version from . import compression as mcompression from . import constants, generic_io, reference, schema, treeutil, util, versioning, yamlutil -from ._block import io as bio -from ._block import reader as block_reader -from ._block import writer as block_writer from ._block.manager import Manager as BlockManager -from ._block.manager import ReadBlocks from ._helpers import validate_version from ._serialization_context import SerializationContext from .config import config_context, get_config @@ -982,7 +978,7 @@ def _serial_write(self, fd, pad_blocks, include_block_index): tree["history"] = copy.deepcopy(self._tree["history"]) self._write_tree(tree, fd, pad_blocks) - self._blocks.write(fd, pad_blocks, include_block_index) + self._blocks.write(pad_blocks, include_block_index) finally: self._post_write(fd) @@ -1107,122 +1103,22 @@ def rewrite(): self._tree["asdf_library"] = get_asdf_library_info() # prepare block manager for writing - self._blocks._clear_write() - - # write out tree to temporary buffer - tree_fd = generic_io.get_file(io.BytesIO(), mode="rw") - self._write_tree(self._tree, tree_fd, False) - new_tree_size = tree_fd.tell() - end_of_file = new_tree_size - - # find where to start writing blocks (either end of new tree or end of last 'free' block) - last_block = None - for blk in self._blocks.blocks[::-1]: - if not blk.memmap and (blk._cached_data is not None or not callable(blk._data)): - continue - last_block = blk - break - if last_block is None: - new_block_start = new_tree_size - else: - new_block_start = max( - last_block.data_offset + last_block.header["allocated_size"], - new_tree_size, - ) + with self._blocks.write_context(self._fd, copy_options=False): + # write out tree to temporary buffer + tree_fd = generic_io.get_file(io.BytesIO(), mode="rw") + self._write_tree(self._tree, tree_fd, False) + new_tree_size = tree_fd.tell() - if len(self._blocks._external_write_blocks): - self._blocks._write_external_blocks() - - # do we have any blocks to write? 
- if len(self._blocks._write_blocks) or self._blocks._streamed_block: - self._fd.seek(new_block_start) - offsets, headers = block_writer.write_blocks( - self._fd, - self._blocks._write_blocks, - pad_blocks, - streamed_block=self._blocks._streamed_block, - write_index=False, # don't write an index as we will modify the offsets - ) - new_block_end = self._fd.tell() - end_of_file = new_block_end - - # move blocks to start in increments of block_size - n_bytes = new_block_end - new_block_start - src, dst = new_block_start, new_tree_size - block_size = self._fd.block_size - while n_bytes > 0: - self._fd.seek(src) - bs = self._fd.read(min(n_bytes, block_size)) - self._fd.seek(dst) - self._fd.write(bs) - n = len(bs) - n_bytes -= n - src += n - dst += n - - # update offset to point at correct locations - offsets = [o - (new_block_start - new_tree_size) for o in offsets] - - # write index if no streamed block - if include_block_index and self._blocks._streamed_block is None: - bio.write_block_index(self._fd, offsets) - end_of_file = self._fd.tell() - - # map new blocks to old blocks - new_read_blocks = ReadBlocks() - for i, (offset, header) in enumerate(zip(offsets, headers)): - if i == len(self._blocks._write_blocks): # this is a streamed block - obj = self._blocks._streamed_obj() - wblk = self._blocks._streamed_block - else: - wblk = self._blocks._write_blocks[i] - # find object associated with wblk - obj = None - for oid, by_key in self._blocks._write_blocks._by_id.items(): - for key, index in by_key.items(): - if self._blocks._write_blocks[index] is wblk: - obj = key._ref() - break - if obj is None: - msg = "Update failed to associate blocks" - raise OSError(msg) - - # does the obj have an old read block? - rblk = self._blocks.blocks.lookup_by_object(obj) - if rblk is not None: - memmap = rblk.memmap - data = None - if not rblk.memmap: - if rblk._cached_data is not None: - data = rblk._cached_data - elif not callable(rblk._data): - data = rblk._data - else: - memmap = self._blocks._memmap - data = None - - # we have to be lazy here as the current memmap is invalid - new_read_block = block_reader.ReadBlock( - offset + 4, self._fd, memmap, True, False, header=header, data=data - ) - new_read_blocks.append_block(new_read_block) - new_index = len(new_read_blocks) - 1 - new_read_blocks.assign_object(obj, new_read_block) - - # update data callbacks to point to new blocks - cb = self._blocks._data_callbacks.lookup_by_object(obj) - if cb is not None: - cb.reassign(new_index, new_read_blocks) - - # update read blocks to reflect new state - self._blocks.blocks = new_read_blocks - self._blocks.options._read_blocks = new_read_blocks + # update blocks + self._blocks.update(new_tree_size, pad_blocks, include_block_index) + end_of_file = self._fd.tell() # now write the tree self._fd.seek(0) tree_fd.seek(0) self._fd.write(tree_fd.read()) self._fd.flush() + # close memmap to trigger arrays to reload themselves self._fd.seek(end_of_file) if self._fd.can_memmap(): @@ -1231,7 +1127,6 @@ def rewrite(): finally: self._post_write(fd) - self._blocks._clear_write() def write_to( self, From 0eaf18119101e043ee00f0137b5216614b780053 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 11 May 2023 13:29:35 -0400 Subject: [PATCH 074/154] rename _streamed_block to _streamed_write_block --- asdf/_block/manager.py | 22 +++++++++++----------- asdf/asdf.py | 2 +- asdf/core/_converters/ndarray.py | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 
d59d4c12f..1f3d9e1c4 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -119,7 +119,7 @@ def __init__(self, read_blocks=None, uri=None, lazy_load=False, memmap=False, va self._data_callbacks = store.Store() self._write_blocks = store.LinearStore() self._external_write_blocks = [] - self._streamed_block = None + self._streamed_write_block = None self._streamed_obj = None self._write_fd = None self._uri = uri @@ -142,7 +142,7 @@ def _load_external(self, uri): def _clear_write(self): self._write_blocks = store.LinearStore() self._external_write_blocks = [] - self._streamed_block = None + self._streamed_write_block = None self._streamed_obj = None self._write_fd = None @@ -186,10 +186,10 @@ def make_write_block(self, data, options, obj): self._write_blocks.assign_object(obj, blk) return len(self._write_blocks) - 1 - def set_streamed_block(self, data, obj): - if self._streamed_block is not None and data is not self._streamed_block.data: + def set_streamed_write_block(self, data, obj): + if self._streamed_write_block is not None and data is not self._streamed_write_block.data: raise ValueError("Can not add second streaming block") - self._streamed_block = writer.WriteBlock(data) + self._streamed_write_block = writer.WriteBlock(data) self._streamed_obj = weakref.ref(obj) def _get_data_callback(self, index): @@ -243,12 +243,12 @@ def write(self, pad_blocks, include_block_index): if self._write_fd is None: msg = "write called outside of valid write_context" raise OSError(msg) - if len(self._write_blocks) or self._streamed_block: + if len(self._write_blocks) or self._streamed_write_block: writer.write_blocks( self._write_fd, self._write_blocks, pad_blocks, - streamed_block=self._streamed_block, + streamed_block=self._streamed_write_block, write_index=include_block_index, ) if len(self._external_write_blocks): @@ -277,13 +277,13 @@ def update(self, new_tree_size, pad_blocks, include_block_index): self._write_external_blocks() # do we have any blocks to write? 
- if len(self._write_blocks) or self._streamed_block: + if len(self._write_blocks) or self._streamed_write_block: self._write_fd.seek(new_block_start) offsets, headers = writer.write_blocks( self._write_fd, self._write_blocks, pad_blocks, - streamed_block=self._streamed_block, + streamed_block=self._streamed_write_block, write_index=False, # don't write an index as we will modify the offsets ) new_block_end = self._write_fd.tell() @@ -306,7 +306,7 @@ def update(self, new_tree_size, pad_blocks, include_block_index): offsets = [o - (new_block_start - new_tree_size) for o in offsets] # write index if no streamed block - if include_block_index and self._streamed_block is None: + if include_block_index and self._streamed_write_block is None: bio.write_block_index(self._write_fd, offsets) # map new blocks to old blocks @@ -314,7 +314,7 @@ def update(self, new_tree_size, pad_blocks, include_block_index): for i, (offset, header) in enumerate(zip(offsets, headers)): if i == len(self._write_blocks): # this is a streamed block obj = self._streamed_obj() - wblk = self._streamed_block + wblk = self._streamed_write_block else: wblk = self._write_blocks[i] # find object associated with wblk diff --git a/asdf/asdf.py b/asdf/asdf.py index eadcbbd72..426753c01 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -1089,7 +1089,7 @@ def rewrite(): self._fd.truncate() # if we have no read blocks, we can just call write_to as no internal blocks are reused - if len(self._blocks.blocks) == 0 and self._blocks._streamed_block is None: + if len(self._blocks.blocks) == 0: rewrite() return diff --git a/asdf/core/_converters/ndarray.py b/asdf/core/_converters/ndarray.py index 6f26beaa6..808a0eaa3 100644 --- a/asdf/core/_converters/ndarray.py +++ b/asdf/core/_converters/ndarray.py @@ -29,7 +29,7 @@ def to_yaml_tree(self, obj, tag, ctx): if isinstance(obj, Stream): # previously, stream never passed on data, we can do that here - ctx._blocks.set_streamed_block(data._array, data) + ctx._blocks.set_streamed_write_block(data._array, data) result = {} result["source"] = -1 @@ -107,7 +107,7 @@ def to_yaml_tree(self, obj, tag, ctx): if options.storage_type == "streamed": result["shape"][0] = "*" result["source"] = -1 - ctx._blocks.set_streamed_block(base, data) + ctx._blocks.set_streamed_write_block(base, data) else: result["source"] = ctx._blocks.make_write_block(base, options, obj) result["datatype"] = dtype From b64f982c4df5a9a079e123cdfe423413a4eb130b Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 11 May 2023 13:37:30 -0400 Subject: [PATCH 075/154] restore AsdfFile.version after write_to --- asdf/_tests/test_api.py | 10 ++++++++++ asdf/asdf.py | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/asdf/_tests/test_api.py b/asdf/_tests/test_api.py index 3e0f4cb57..6f9083eef 100644 --- a/asdf/_tests/test_api.py +++ b/asdf/_tests/test_api.py @@ -508,3 +508,13 @@ def test_write_to_no_tree_modification(tmp_path): tree = copy.deepcopy(af.tree) af.write_to(fn2) assert af.tree == tree + + +def test_write_to_no_version_modification(tmp_path): + fn = tmp_path / "test.asdf" + tree = {"foo": None} + af = asdf.AsdfFile(tree.copy(), version="1.0.0") + af.write_to(fn, version="1.1.0") + assert af.version_string == "1.0.0" + with asdf.open(fn) as af: + assert af.version_string == "1.1.0" diff --git a/asdf/asdf.py b/asdf/asdf.py index 426753c01..51b60d7a1 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -1209,11 +1209,15 @@ def write_to( config.all_array_compression_kwargs = compression_kwargs if version is not None: + previous_version = 
self.version self.version = version with generic_io.get_file(fd, mode="w") as fd: self._serial_write(fd, pad_blocks, include_block_index) + if version is not None: + self.version = previous_version + def find_references(self): """ Finds all external "JSON References" in the tree and converts From e8712007214ec85f538e0528f20acfeeceb2bedb Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 11 May 2023 14:27:18 -0400 Subject: [PATCH 076/154] change _block.key.Key matches to matches_object --- asdf/_block/key.py | 10 +++++----- asdf/_block/store.py | 4 ++-- asdf/_tests/_block/test_key.py | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/asdf/_block/key.py b/asdf/_block/key.py index fdad39ec7..66134fa7d 100644 --- a/asdf/_block/key.py +++ b/asdf/_block/key.py @@ -22,9 +22,6 @@ def __init__(self, obj=None, key=None): if obj is not None: self.assign_object(obj) - def assign_object(self, obj): - self._ref = weakref.ref(obj) - def is_valid(self): if self._ref is UndefinedRef: return False @@ -36,7 +33,10 @@ def is_valid(self): def __hash__(self): return self._key - def matches(self, obj): + def assign_object(self, obj): + self._ref = weakref.ref(obj) + + def matches_object(self, obj): if self._ref is UndefinedRef: return False r = self._ref() @@ -51,7 +51,7 @@ def __eq__(self, other): return False if not self.is_valid(): return False - return other.matches(self._ref()) + return other.matches_object(self._ref()) def __copy__(self): return self.__class__(self._ref(), self._key) diff --git a/asdf/_block/store.py b/asdf/_block/store.py index 5355d833c..86c2a5dd9 100644 --- a/asdf/_block/store.py +++ b/asdf/_block/store.py @@ -29,7 +29,7 @@ def lookup_by_object(self, obj, default=None): # look for a matching key: O(N) for key, value in by_key.items(): - if key.matches(obj): + if key.matches_object(obj): return value # no match, return default @@ -56,7 +56,7 @@ def assign_object(self, obj, value): # look for a matching matching key if obj_key is None: for key in by_key: - if key.matches(obj): + if key.matches_object(obj): by_key[key] = value return # we didn't find a matching key, so make one diff --git a/asdf/_tests/_block/test_key.py b/asdf/_tests/_block/test_key.py index f38780a27..1e807345a 100644 --- a/asdf/_tests/_block/test_key.py +++ b/asdf/_tests/_block/test_key.py @@ -28,12 +28,12 @@ def test_unique_same_object(): def test_matches_obj(): f = Foo() bk = Key(f) - assert bk.matches(f) + assert bk.matches_object(f) def test_undefined_no_match(): bk = Key() - assert not bk.matches(Foo()) + assert not bk.matches_object(Foo()) def test_is_valid(): @@ -50,7 +50,7 @@ def test_same_class(): del f f2 = Foo() assert not bk.is_valid() - assert not bk.matches(f2) + assert not bk.matches_object(f2) def test_undefined(): From 10f3b57b48387a4bd1311826cc16155e685bf234 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 11 May 2023 15:03:45 -0400 Subject: [PATCH 077/154] fix handling of io_block_size this setting is resolved in generic_io not in config so only generic_io.SomeFile.block_size is valid the config will return -1 if the io_block size was not defined and this setting will be 'frozen' at the time of file opening --- asdf/_block/io.py | 7 +------ asdf/_tests/_block/test_io.py | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/asdf/_block/io.py b/asdf/_block/io.py index 619a785a3..793484260 100644 --- a/asdf/_block/io.py +++ b/asdf/_block/io.py @@ -8,7 +8,6 @@ from asdf import compression as mcompression from asdf import constants, util -from 
asdf.config import get_config BLOCK_HEADER = util.BinaryStruct( [ @@ -120,9 +119,7 @@ def callback(): return offset, header, data_offset, data -def generate_write_header( - data, stream=False, compression_kwargs=None, padding=False, fs_block_size=None, **header_kwargs -): +def generate_write_header(data, stream=False, compression_kwargs=None, padding=False, fs_block_size=1, **header_kwargs): if data.ndim != 1 or data.dtype != "uint8": msg = "Data must be of ndim==1 and dtype==uint8" raise ValueError(msg) @@ -148,8 +145,6 @@ def generate_write_header( header_kwargs["allocated_size"] = 0 else: header_kwargs["used_size"] = used_size - if fs_block_size is None: - fs_block_size = get_config().io_block_size padding = util.calculate_padding(used_size, padding, fs_block_size) header_kwargs["allocated_size"] = header_kwargs.get("allocated_size", used_size + padding) diff --git a/asdf/_tests/_block/test_io.py b/asdf/_tests/_block/test_io.py index 636da86bc..6d44bffad 100644 --- a/asdf/_tests/_block/test_io.py +++ b/asdf/_tests/_block/test_io.py @@ -340,6 +340,20 @@ def test_read_block_index_no_header(tmp_path): assert bio.read_block_index(fd) == values +def test_read_block_index_invalid_yaml(): + bs = io.BytesIO(constants.INDEX_HEADER + b"][") + with generic_io.get_file(bs, "r") as fd: + with pytest.raises(OSError, match="Failed to parse block index as yaml"): + bio.read_block_index(fd) + + +def test_read_block_index_valid_yaml_invalid_contents(): + bs = io.BytesIO(constants.INDEX_HEADER + b"['a', 'b']") + with generic_io.get_file(bs, "r") as fd: + with pytest.raises(OSError, match="Invalid block index"): + bio.read_block_index(fd) + + def test_write_block_index_with_offset(tmp_path): fn = tmp_path / "test" offset = 50 @@ -349,7 +363,3 @@ def test_write_block_index_with_offset(tmp_path): bio.write_block_index(fd, [1, 2, 3], offset=offset) with generic_io.get_file(fn, "r") as fd: assert bio.find_block_index(fd) == offset - - -# TODO test that file pointer is always at the end of a block after a read -# for all possible block types From bfed1565fea7c104e33325df3dc65b2aa97586c1 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 11 May 2023 16:11:03 -0400 Subject: [PATCH 078/154] skip checksum validation for streamed blocks --- asdf/_block/io.py | 2 +- asdf/_block/reader.py | 3 +- asdf/_tests/_block/test_reader.py | 69 ++++++++++++++++++++++++------- asdf/_tests/_block/test_writer.py | 17 ++++---- 4 files changed, 67 insertions(+), 24 deletions(-) diff --git a/asdf/_block/io.py b/asdf/_block/io.py index 793484260..e15cc021f 100644 --- a/asdf/_block/io.py +++ b/asdf/_block/io.py @@ -223,7 +223,7 @@ def read_block_index(fd, offset=None): raise OSError(msg) try: block_index = yaml.load(fd.read(-1), yaml.SafeLoader) - except yaml.parser.ParserError: + except yaml.error.YAMLError: raise OSError("Failed to parse block index as yaml") if ( not isinstance(block_index, list) diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py index 64e591607..aae12976a 100644 --- a/asdf/_block/reader.py +++ b/asdf/_block/reader.py @@ -13,7 +13,6 @@ def __init__(self, offset, fd, memmap, lazy_load, validate_checksum, header=None self.data_offset = data_offset self._data = data self._cached_data = None - # TODO alternative to passing these down? 
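
The guard added a little further down is the point of this commit: a streamed block's header is written before its data exists, so the stored checksum is all zeros and comparing it against the bytes actually read would always fail. A minimal sketch of that check; STREAMED_FLAG is a stand-in for constants.BLOCK_FLAG_STREAMED and the header is just a dict here:

import hashlib

STREAMED_FLAG = 1  # stand-in for the real streamed-block flag bit

def checksum_ok(header, data):
    # streamed blocks are always accepted: their checksum was never
    # computed, so the header holds sixteen zero bytes
    if header["flags"] & STREAMED_FLAG:
        return True
    return hashlib.md5(data).digest() == header["checksum"]

assert checksum_ok({"flags": STREAMED_FLAG, "checksum": b"\0" * 16}, b"streamed bytes")
assert not checksum_ok({"flags": 0, "checksum": b"\0" * 16}, b"regular bytes")
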
self.memmap = memmap self.lazy_load = lazy_load self.validate_checksum = validate_checksum @@ -47,7 +46,7 @@ def data(self): data = self._data if self.validate_checksum: checksum = bio.calculate_block_checksum(data) - if checksum != self._header["checksum"]: + if not self._header["flags"] & constants.BLOCK_FLAG_STREAMED and checksum != self._header["checksum"]: msg = f"Block at {self.offset} does not match given checksum" raise ValueError(msg) # only validate data the first time it's read diff --git a/asdf/_tests/_block/test_reader.py b/asdf/_tests/_block/test_reader.py index 62cce9c1c..9698af1c4 100644 --- a/asdf/_tests/_block/test_reader.py +++ b/asdf/_tests/_block/test_reader.py @@ -12,7 +12,9 @@ @contextlib.contextmanager -def gen_blocks(fn=None, n=5, size=10, padding=0, padding_byte=b"\0", with_index=False, block_padding=False): +def gen_blocks( + fn=None, n=5, size=10, padding=0, padding_byte=b"\0", with_index=False, block_padding=False, streamed=False +): offsets = [] if fn is not None: with generic_io.get_file(fn, mode="w") as fd: @@ -30,8 +32,8 @@ def check(blocks): offsets.append(fd.tell()) fd.write(constants.BLOCK_MAGIC) data = np.ones(size, dtype="uint8") * i - bio.write_block(fd, data, padding=block_padding) - if with_index: + bio.write_block(fd, data, stream=streamed and (i == n - 1), padding=block_padding) + if with_index and not streamed: bio.write_block_index(fd, offsets) fd.seek(0) yield fd, check @@ -43,17 +45,21 @@ def check(blocks): @pytest.mark.parametrize("with_index", [True, False]) @pytest.mark.parametrize("validate_checksums", [True, False]) @pytest.mark.parametrize("padding", [0, 3, 4, 5]) -def test_read(tmp_path, lazy_load, memmap, with_index, validate_checksums, padding): +@pytest.mark.parametrize("streamed", [True, False]) +def test_read(tmp_path, lazy_load, memmap, with_index, validate_checksums, padding, streamed): fn = tmp_path / "test.bin" n = 5 size = 10 - with gen_blocks(fn=fn, n=n, size=size, padding=padding, with_index=with_index) as (fd, check): + with gen_blocks(fn=fn, n=n, size=size, padding=padding, with_index=with_index, streamed=streamed) as (fd, check): r = read_blocks(fd, memmap=memmap, lazy_load=lazy_load, validate_checksums=validate_checksums) - if lazy_load and with_index: + if lazy_load and with_index and not streamed: assert r[0].loaded assert r[-1].loaded for blk in r[1:-1]: assert not blk.loaded + # getting the header should load the block + blk.header + assert blk.loaded else: for blk in r: assert blk.loaded @@ -62,6 +68,13 @@ def test_read(tmp_path, lazy_load, memmap, with_index, validate_checksums, paddi base = util.get_array_base(blk.data) assert isinstance(base.base, mmap.mmap) check(r) + if lazy_load: + # if lazy loaded, each call to data should re-read the data + assert r[0].data is not r[0].data + else: + assert r[0].data is r[0].data + # getting cached_data should always return the same array + assert r[0].cached_data is r[0].cached_data def test_read_invalid_padding(): @@ -79,24 +92,31 @@ def test_read_post_padding(): check(read_blocks(fd)) -# TODO non-seekable - - -@pytest.mark.parametrize("invalid_block_index", [0, 1, -1]) +@pytest.mark.parametrize("invalid_block_index", [0, 1, -1, "junk"]) def test_invalid_block_index(tmp_path, invalid_block_index): fn = tmp_path / "test.bin" with gen_blocks(fn=fn, with_index=True) as (fd, check): + # trash the block index offset = bio.find_block_index(fd) assert offset is not None - block_index = bio.read_block_index(fd, offset) - block_index[invalid_block_index] += 4 - fd.seek(offset) - 
bio.write_block_index(fd, block_index) + if invalid_block_index == "junk": + # trash the whole index + fd.seek(-4, 2) + fd.write(b"junk") + else: # mess up one entry of the index + block_index = bio.read_block_index(fd, offset) + block_index[invalid_block_index] += 4 + fd.seek(offset) + bio.write_block_index(fd, block_index) fd.seek(0) + # when the block index is read, only the first and last blocks # are check, so any other invalid entry should result in failure if invalid_block_index in (0, -1): check(read_blocks(fd, lazy_load=True)) + elif invalid_block_index == "junk": + # read_blocks should fall back to reading serially + check(read_blocks(fd, lazy_load=True)) else: with pytest.raises(ValueError, match="Header size.*"): check(read_blocks(fd, lazy_load=True)) @@ -130,3 +150,24 @@ def test_closed_file(tmp_path): blk = blocks[1] with pytest.raises(OSError, match="Attempt to load block from closed file"): blk.load() + + +@pytest.mark.parametrize("validate_checksums", [True, False]) +def test_bad_checksum(validate_checksums): + buff = io.BytesIO( + constants.BLOCK_MAGIC + + b"\x000" # header size = 2 + + b"\0\0\0\0" # flags = 4 + + b"\0\0\0\0" # compression = 4 + + b"\0\0\0\0\0\0\0\0" # allocated size = 8 + + b"\0\0\0\0\0\0\0\0" # used size = 8 + + b"\0\0\0\0\0\0\0\0" # data size = 8 + + b"\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" # invalid checksum = 16 + ) + + with generic_io.get_file(buff, mode="r") as fd: + if validate_checksums: + with pytest.raises(ValueError, match=".* does not match given checksum"): + read_blocks(fd, lazy_load=False, validate_checksums=validate_checksums)[0].data + else: + read_blocks(fd, lazy_load=False, validate_checksums=validate_checksums)[0].data diff --git a/asdf/_tests/_block/test_writer.py b/asdf/_tests/_block/test_writer.py index b0d1dc82e..3434782ee 100644 --- a/asdf/_tests/_block/test_writer.py +++ b/asdf/_tests/_block/test_writer.py @@ -5,17 +5,15 @@ from asdf import constants, generic_io from asdf._block import reader, writer -# TODO write blocks, with compression_kwargs: how to check this worked? 
-# TODO invalid inputs - @pytest.mark.parametrize("lazy", [True, False]) @pytest.mark.parametrize("index", [True, False]) @pytest.mark.parametrize("padding", [True, False, 0.1, 0.9]) @pytest.mark.parametrize("compression", [None, b"zlib"]) @pytest.mark.parametrize("stream", [True, False]) -def test_write_blocks(tmp_path, lazy, index, padding, compression, stream): - data = [np.ones(10, dtype=np.uint8), np.zeros(5, dtype=np.uint8)] +@pytest.mark.parametrize("seekable", [True, False]) +def test_write_blocks(tmp_path, lazy, index, padding, compression, stream, seekable): + data = [np.ones(10, dtype=np.uint8), np.zeros(5, dtype=np.uint8), None] if lazy: blocks = [writer.WriteBlock(lambda bd=d: bd, compression=compression) for d in data] else: @@ -26,9 +24,11 @@ def test_write_blocks(tmp_path, lazy, index, padding, compression, stream): streamed_block = None fn = tmp_path / "test.bin" with generic_io.get_file(fn, mode="w") as fd: + if not seekable: + fd.seekable = lambda: False writer.write_blocks(fd, blocks, padding=padding, streamed_block=streamed_block, write_index=index) with generic_io.get_file(fn, mode="r") as fd: - if index and not stream: + if index and not stream and seekable: assert bio.find_block_index(fd) is not None else: assert bio.find_block_index(fd) is None @@ -39,7 +39,10 @@ def test_write_blocks(tmp_path, lazy, index, padding, compression, stream): else: assert len(read_blocks) == len(data) for r, d in zip(read_blocks, data): - np.testing.assert_array_equal(r.data, d) + if d is None: + assert r.data.size == 0 + else: + np.testing.assert_array_equal(r.data, d) if compression is not None: assert r.header["compression"] == compression if padding: From 17b86526edff54f504a72b811396c5821cb1748b Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 11 May 2023 16:27:59 -0400 Subject: [PATCH 079/154] add unit tests for _block.external --- asdf/_tests/_block/test_external.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 asdf/_tests/_block/test_external.py diff --git a/asdf/_tests/_block/test_external.py b/asdf/_tests/_block/test_external.py new file mode 100644 index 000000000..978a116ba --- /dev/null +++ b/asdf/_tests/_block/test_external.py @@ -0,0 +1,18 @@ +import numpy as np + +import asdf +from asdf._block import external + + +def test_cache(tmp_path): + efn = tmp_path / "test.asdf" + arr = np.arange(3, dtype="uint8") + asdf.AsdfFile({"data": arr}).write_to(efn) + + cache = external.ExternalBlockCache() + base_uri = f"file://{tmp_path}/" + data = cache.load(base_uri, "test.asdf") + np.testing.assert_array_equal(data, arr) + assert cache.load(base_uri, "test.asdf") is data + assert cache.load(base_uri, "#") is external.UseInternal + assert cache.load(base_uri, "") is external.UseInternal From 27ddde422aca1ba2bf15b2cb51d18ef61c55aeea Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 11 May 2023 17:12:00 -0400 Subject: [PATCH 080/154] move external block uri resolution to _block.external --- asdf/_block/external.py | 10 +++++++++ asdf/_block/manager.py | 35 +++++------------------------ asdf/_tests/_block/test_external.py | 8 +++++++ 3 files changed, 23 insertions(+), 30 deletions(-) diff --git a/asdf/_block/external.py b/asdf/_block/external.py index 2fb232f26..9f16da276 100644 --- a/asdf/_block/external.py +++ b/asdf/_block/external.py @@ -1,3 +1,5 @@ +import os + from asdf import generic_io, util @@ -21,3 +23,11 @@ def load(self, base_uri, uri): with asdf_open(resolved_uri, lazy_load=False, copy_arrays=True) as af: self._cache[key] = 
af._blocks.blocks[0].cached_data return self._cache[key] + + +def uri_for_index(uri, index): + parts = list(util.patched_urllib_parse.urlparse(uri)) + path = parts[2] + dirname, filename = os.path.split(path) + filename = os.path.splitext(filename)[0] + f"{index:04d}.asdf" + return filename diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 1f3d9e1c4..20f24e090 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -1,14 +1,12 @@ import contextlib import copy -import os import weakref from asdf import config, constants, generic_io, util +from . import external, reader, store, writer from . import io as bio -from . import reader, store, writer from .callback import DataCallback -from .external import ExternalBlockCache, UseInternal from .options import Options @@ -87,29 +85,6 @@ def get_output_compressions(self): return compressions -def make_external_uri(uri, index): - if uri is None: - uri = "" - parts = list(util.patched_urllib_parse.urlparse(uri)) - path = parts[2] - dirname, filename = os.path.split(path) - filename = os.path.splitext(filename)[0] + f"{index:04d}.asdf" - return filename - - -def relative_uri_to_full(uri, relative): - # file://foo/bar, bam -> file://foo/bam - # TODO replace with generic_io.resolve_uri - if uri is None: - uri = "" - parts = list(util.patched_urllib_parse.urlparse(uri)) - path = parts[2] - dirname, filename = os.path.split(path) - path = os.path.join(dirname, relative) - parts[2] = path - return util.patched_urllib_parse.urlunparse(parts) - - class Manager: def __init__(self, read_blocks=None, uri=None, lazy_load=False, memmap=False, validate_checksums=False): if read_blocks is None: @@ -123,7 +98,7 @@ def __init__(self, read_blocks=None, uri=None, lazy_load=False, memmap=False, va self._streamed_obj = None self._write_fd = None self._uri = uri - self._external_block_cache = ExternalBlockCache() + self._external_block_cache = external.ExternalBlockCache() self._lazy_load = lazy_load self._memmap = memmap self._validate_checksums = validate_checksums @@ -135,7 +110,7 @@ def read(self, fd, after_magic=False): def _load_external(self, uri): value = self._external_block_cache.load(self._uri, uri) - if value is UseInternal: + if value is external.UseInternal: return self._blocks.blocks[0].data return value @@ -153,7 +128,7 @@ def _write_external_blocks(self): raise ValueError("Can't write external blocks, since URI of main file is unknown.") for blk in self._external_write_blocks: - uri = relative_uri_to_full(self._write_fd.uri, blk._uri) + uri = generic_io.resolve_uri(self._write_fd.uri, blk._uri) af = AsdfFile() with generic_io.get_file(uri, mode="w") as f: af.write_to(f, include_block_index=False) @@ -172,7 +147,7 @@ def make_write_block(self, data, options, obj): base_uri = self._write_fd.uri or self._uri else: base_uri = self._uri - blk._uri = make_external_uri(base_uri, index) + blk._uri = external.uri_for_index(base_uri, index) self._external_write_blocks.append(blk) return blk._uri # first, look for an existing block diff --git a/asdf/_tests/_block/test_external.py b/asdf/_tests/_block/test_external.py index 978a116ba..5392c1e56 100644 --- a/asdf/_tests/_block/test_external.py +++ b/asdf/_tests/_block/test_external.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import asdf from asdf._block import external @@ -16,3 +17,10 @@ def test_cache(tmp_path): assert cache.load(base_uri, "test.asdf") is data assert cache.load(base_uri, "#") is external.UseInternal assert cache.load(base_uri, "") is external.UseInternal + + 
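
uri_for_index, exercised by the parametrized test that follows, builds an external block filename from the parent file's URI by keeping only the filename stem and appending a zero-padded index. A simplified re-statement that skips the URI parsing step, shown here only to make the naming scheme concrete:

import os

def stem_plus_index(uri, index):
    # keep the filename stem and append a four-digit block index
    _, filename = os.path.split(uri or "")
    return os.path.splitext(filename)[0] + f"{index:04d}.asdf"

assert stem_plus_index("foo/test.asdf", 2) == "test0002.asdf"
assert stem_plus_index("test.asdf", 100) == "test0100.asdf"
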
+@pytest.mark.parametrize("uri", ["test.asdf", "foo/test.asdf"]) +@pytest.mark.parametrize("index", [0, 1, 100]) +def test_uri_for_index(uri, index): + match = f"test{index:04d}.asdf" + assert external.uri_for_index(uri, index) == match From 52d0e5d945c1f66a06b2a654f8b13a050ac92f57 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 11 May 2023 17:30:28 -0400 Subject: [PATCH 081/154] fix missing uri check for external blocks --- asdf/_block/manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 20f24e090..58aa2dfe5 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -147,6 +147,8 @@ def make_write_block(self, data, options, obj): base_uri = self._write_fd.uri or self._uri else: base_uri = self._uri + if base_uri is None: + raise ValueError("Can't write external blocks, since URI of main file is unknown.") blk._uri = external.uri_for_index(base_uri, index) self._external_write_blocks.append(blk) return blk._uri From aede9f327642eba903f7ac6a18ea51000b835f89 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 11 May 2023 17:39:33 -0400 Subject: [PATCH 082/154] remove reserve_blocks --- asdf/_tests/tags/core/tests/test_ndarray.py | 2 - asdf/extension/_converter.py | 48 --------------------- 2 files changed, 50 deletions(-) diff --git a/asdf/_tests/tags/core/tests/test_ndarray.py b/asdf/_tests/tags/core/tests/test_ndarray.py index 29a7b651a..b83ec0e4b 100644 --- a/asdf/_tests/tags/core/tests/test_ndarray.py +++ b/asdf/_tests/tags/core/tests/test_ndarray.py @@ -130,8 +130,6 @@ def test_dont_load_data(): buff.seek(0) with asdf.open(buff) as ff: - ff._run_hook("reserve_blocks") - # repr and str shouldn't load data str(ff.tree["science_data"]) repr(ff.tree) diff --git a/asdf/extension/_converter.py b/asdf/extension/_converter.py index 1966df0a6..12546d87c 100644 --- a/asdf/extension/_converter.py +++ b/asdf/extension/_converter.py @@ -152,29 +152,6 @@ def from_yaml_tree(self, node, tag, ctx): or a generator that yields such an instance. """ - def reserve_blocks(self, obj, tag): - """ - Reserve any number of blocks in which data (ndarrays) can be - stored. - - Parameters - ---------- - obj : object - Instance of a custom type to be serialized. Guaranteed to - be an instance of one of the types listed in the `types` - property. - tag : str - The tag identifying the YAML type that ``obj`` should be - converted into. Selected by a call to this converter's - select_tag method. - - Returns - ------- - keys : list of unique hashable keys - These keys will be used to reserve blocks for later use - """ - return [] - class ConverterProxy(Converter): """ @@ -308,31 +285,6 @@ def from_yaml_tree(self, node, tag, ctx): """ return self._delegate.from_yaml_tree(node, tag, ctx) - def reserve_blocks(self, obj, tag): - """ - Reserve blocks to be used during conversion of this object - - Parameters - ---------- - obj : object - Instance of a custom type to be serialized. Guaranteed to - be an instance of one of the types listed in the `types` - property. - tag : str - The tag identifying the YAML type that ``obj`` should be - converted into. Selected by a call to this converter's - select_tag method. 
- - Returns - ------- - keys : list of unique hashable keys - These keys will be used to reserve blocks for later use - - """ - if hasattr(self._delegate, "reserve_blocks"): - return self._delegate.reserve_blocks(obj, tag) - return [] - @property def delegate(self): """ From c0d75a4f0aaecca6e7d37af9dea0126d32522444 Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 12 May 2023 10:08:02 -0400 Subject: [PATCH 083/154] simplify ndarray shape loading and fix error message --- asdf/_block/io.py | 2 +- asdf/_tests/_block/test_io.py | 2 +- asdf/_tests/tags/core/tests/test_ndarray.py | 12 +++++++++ asdf/_tests/test_api.py | 2 +- asdf/_tests/test_array_blocks.py | 23 +++++++++++++++++ asdf/tags/core/ndarray.py | 28 +++++---------------- 6 files changed, 44 insertions(+), 25 deletions(-) diff --git a/asdf/_block/io.py b/asdf/_block/io.py index e15cc021f..89307c4b6 100644 --- a/asdf/_block/io.py +++ b/asdf/_block/io.py @@ -105,7 +105,7 @@ def read_block(fd, offset=None, memmap=False, lazy_load=False): def callback(): fd = fd_ref() if fd is None or fd.is_closed(): - msg = "Attempt to read data from closed file" + msg = "ASDF file has already been closed. Can not get the data." raise OSError(msg) position = fd.tell() data = read_block_data(fd, header, offset=data_offset, memmap=memmap) diff --git a/asdf/_tests/_block/test_io.py b/asdf/_tests/_block/test_io.py index 6d44bffad..4a76065c7 100644 --- a/asdf/_tests/_block/test_io.py +++ b/asdf/_tests/_block/test_io.py @@ -222,7 +222,7 @@ def test_read_from_closed(tmp_path): bio.write_block(fd, data, stream=True) with generic_io.get_file(fn, mode="rw") as fd: _, _, _, callback = bio.read_block(fd, offset=0, lazy_load=True) - with pytest.raises(OSError, match="Attempt to read data from closed file"): + with pytest.raises(OSError, match="ASDF file has already been closed. Can not get the data."): callback() diff --git a/asdf/_tests/tags/core/tests/test_ndarray.py b/asdf/_tests/tags/core/tests/test_ndarray.py index b83ec0e4b..2aa6052fb 100644 --- a/asdf/_tests/tags/core/tests/test_ndarray.py +++ b/asdf/_tests/tags/core/tests/test_ndarray.py @@ -966,3 +966,15 @@ def test_problematic_class_attributes(tmp_path): with pytest.raises(AttributeError, match=r".* object has no attribute 'version'"): af["arr"].version + + +def test_shape_does_not_load_array(tmp_path): + file_path = tmp_path / "test.asdf" + with asdf.AsdfFile() as af: + af["arr"] = np.arange(100) + af.write_to(file_path) + + with asdf.open(file_path, lazy_load=True) as af: + assert af["arr"]._array is None + assert af["arr"].shape == (100,) + assert af["arr"]._array is None diff --git a/asdf/_tests/test_api.py b/asdf/_tests/test_api.py index 6f9083eef..5127e30e7 100644 --- a/asdf/_tests/test_api.py +++ b/asdf/_tests/test_api.py @@ -457,7 +457,7 @@ def test_array_access_after_file_close(tmp_path): # the file has been closed: with asdf.open(path) as af: tree = af.tree - with pytest.raises(OSError, match=r"Attempt to read data from closed file"): + with pytest.raises(OSError, match=r"ASDF file has already been closed. 
Can not get the data."): tree["data"][0] # With memory mapping disabled and copying arrays enabled, diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py index bfdf5a869..3d685aba7 100644 --- a/asdf/_tests/test_array_blocks.py +++ b/asdf/_tests/test_array_blocks.py @@ -842,3 +842,26 @@ def test_remove_blocks(tmp_path): for fn in (fn1, fn2): with asdf.open(fn) as af: assert len(af._blocks.blocks) == 1 + + +def test_open_memmap_from_closed_file(tmp_path): + fn = tmp_path / "test.asdf" + arr = np.zeros(100) + arr2 = np.ones(100) + tree = {"base": arr, "view": arr[:50], "base2": arr2} + af = asdf.AsdfFile(tree) + af.write_to(fn) + + with asdf.open(fn, lazy_load=True, copy_arrays=False) as af: + # load the base so we can test if accessing the view after the + # file is closed will trigger an error + af["base"][:] + view = af["view"] + base2 = af["base2"] + + msg = r"ASDF file has already been closed. Can not get the data." + with pytest.raises(OSError, match=msg): + view[:] + + with pytest.raises(OSError, match=msg): + base2[:] diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index d7be1bbff..53ab5535f 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -269,16 +269,14 @@ def _make_array(self): # we need to keep _source as a str to allow stdatamodels to # support AsdfInFits data = self._data_callback() - elif isinstance(self._source, int): + else: # cached data is used here so that multiple NDArrayTypes will all use # the same base array data = self._data_callback(_attr="cached_data") - else: - # inline data - data = self._source if hasattr(data, "base") and isinstance(data.base, mmap.mmap) and data.base.closed: - raise OSError("Attempt to read data from a closed file") + msg = "ASDF file has already been closed. Can not get the data." 
+ raise OSError(msg) # compute shape (streaming blocks have '0' data size in the block header) shape = self.get_actual_shape( @@ -352,24 +350,10 @@ def get_actual_shape(self, shape, strides, dtype, block_size): @property def shape(self): - if self._shape is None or self._array is not None: + if self._shape is None or self._array is not None or "*" in self._shape: + # streamed blocks have a '0' data_size in the header so we + # need to make the array to get the shape return self.__array__().shape - if "*" in self._shape: - if isinstance(self._source, str): - return self._make_array().shape - data_size = self._data_callback(_attr="header")["data_size"] - if not data_size: - # streamed blocks have a '0' data_size in the header so we - # need to make the array to get the shape - return self._make_array().shape - return tuple( - self.get_actual_shape( - self._shape, - self._strides, - self._dtype, - data_size, - ) - ) return tuple(self._shape) @property From 6942107935b953863556fdc7db97c753d756d9b3 Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 12 May 2023 10:15:36 -0400 Subject: [PATCH 084/154] move SerializationContext tests --- asdf/_tests/test_asdf.py | 28 ++-------------------- asdf/_tests/test_serialization_context.py | 29 +++++++++++++++++++++++ 2 files changed, 31 insertions(+), 26 deletions(-) create mode 100644 asdf/_tests/test_serialization_context.py diff --git a/asdf/_tests/test_asdf.py b/asdf/_tests/test_asdf.py index 65bbf8797..dfa0b86a9 100644 --- a/asdf/_tests/test_asdf.py +++ b/asdf/_tests/test_asdf.py @@ -3,12 +3,12 @@ import fsspec import pytest -from asdf import config_context, get_config +from asdf import config_context from asdf._tests._helpers import assert_no_warnings, assert_tree_match, yaml_to_asdf from asdf.asdf import AsdfFile, open_asdf from asdf.entry_points import get_extensions from asdf.exceptions import AsdfWarning -from asdf.extension import ExtensionManager, ExtensionProxy, SerializationContext +from asdf.extension import ExtensionProxy, SerializationContext from asdf.extension._legacy import AsdfExtensionList from asdf.versioning import AsdfVersion @@ -188,30 +188,6 @@ def test_open_asdf_extensions(tmp_path): pass -def test_serialization_context(): - extension_manager = ExtensionManager([]) - context = SerializationContext("1.4.0", extension_manager, "file://test.asdf", None) - assert context.version == "1.4.0" - assert context.extension_manager is extension_manager - assert context._extensions_used == set() - - extension = get_config().extensions[0] - context._mark_extension_used(extension) - assert context._extensions_used == {extension} - context._mark_extension_used(extension) - assert context._extensions_used == {extension} - context._mark_extension_used(extension.delegate) - assert context._extensions_used == {extension} - - assert context.url == context._url == "file://test.asdf" - - with pytest.raises(TypeError, match=r"Extension must implement the Extension interface"): - context._mark_extension_used(object()) - - with pytest.raises(ValueError, match=r"ASDF Standard version .* is not supported by asdf==.*"): - SerializationContext("0.5.4", extension_manager, None, None) - - def test_reading_extension_metadata(): extension_with_uri = ExtensionProxy( TestExtension(extension_uri="asdf://somewhere.org/extensions/foo-1.0"), diff --git a/asdf/_tests/test_serialization_context.py b/asdf/_tests/test_serialization_context.py new file mode 100644 index 000000000..188f382c1 --- /dev/null +++ b/asdf/_tests/test_serialization_context.py @@ -0,0 +1,29 @@ 
+import pytest + +from asdf import get_config +from asdf._serialization_context import SerializationContext +from asdf.extension import ExtensionManager + + +def test_serialization_context(): + extension_manager = ExtensionManager([]) + context = SerializationContext("1.4.0", extension_manager, "file://test.asdf", None) + assert context.version == "1.4.0" + assert context.extension_manager is extension_manager + assert context._extensions_used == set() + + extension = get_config().extensions[0] + context._mark_extension_used(extension) + assert context._extensions_used == {extension} + context._mark_extension_used(extension) + assert context._extensions_used == {extension} + context._mark_extension_used(extension.delegate) + assert context._extensions_used == {extension} + + assert context.url == context._url == "file://test.asdf" + + with pytest.raises(TypeError, match=r"Extension must implement the Extension interface"): + context._mark_extension_used(object()) + + with pytest.raises(ValueError, match=r"ASDF Standard version .* is not supported by asdf==.*"): + SerializationContext("0.5.4", extension_manager, None, None) From 1bd9988e55d77fa94cf837fe20e6d3c9ef54c4d8 Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 12 May 2023 12:01:10 -0400 Subject: [PATCH 085/154] update SerializationContext tests --- asdf/_serialization_context.py | 20 ++-- asdf/_tests/test_api.py | 17 +++- asdf/_tests/test_extension.py | 9 ++ asdf/_tests/test_serialization_context.py | 115 ++++++++++++++++++++++ 4 files changed, 151 insertions(+), 10 deletions(-) diff --git a/asdf/_serialization_context.py b/asdf/_serialization_context.py index 752777aed..dde874160 100644 --- a/asdf/_serialization_context.py +++ b/asdf/_serialization_context.py @@ -180,18 +180,24 @@ def __exit__(self, exc_type, exc_value, traceback): def get_block_data_callback(self, index, key=None): blk = self._blocks.blocks[index] - cb = self._blocks._get_data_callback(index) - if key is None: + if blk is self._blk: + # return callback for a previously access block + return self._cb if self._blk is not None: + # for attempts to access a second block without a key msg = "Converters accessing >1 block must provide a key for each block" raise OSError(msg) self._blk = blk - self._cb = cb - else: + self._cb = self._blocks._get_data_callback(index) + return self._cb + + # for key accesses try to find a previous use of this key + cb = self._blocks._data_callbacks.lookup_by_object(key) + if cb is None: self._blocks.blocks.assign_object(key, blk) + cb = self._blocks._get_data_callback(index) self._blocks._data_callbacks.assign_object(key, cb) - return cb def generate_block_key(self): @@ -212,7 +218,3 @@ def find_available_block_index(self, data_callback, lookup_key=None): def generate_block_key(self): return BlockKey(self._obj) - - -class _IgnoreBlocks(_Operation): - pass diff --git a/asdf/_tests/test_api.py b/asdf/_tests/test_api.py index 5127e30e7..3a52fe6ab 100644 --- a/asdf/_tests/test_api.py +++ b/asdf/_tests/test_api.py @@ -488,13 +488,28 @@ def test_asdf_standard_version_tag_selection(): assert b"!core/asdf-1.1.0" not in content buff.seek(0) - af.write_to(buff, version="1.2.0") + af.write_to(buff, version="1.2.0") # asdf-standard 1.2 uses asdf-object 1.1 tag buff.seek(0) content = buff.read() assert b"!core/asdf-1.0.0" not in content assert b"!core/asdf-1.1.0" in content +def test_update_asdf_standard_version_tag_selection(): + buff = io.BytesIO() + + af = asdf.AsdfFile() + af.write_to(buff, version="1.0.0") + + buff.seek(0) + with asdf.open(buff, 
mode="rw") as af: + af.update(version="1.2.0") # asdf-standard 1.2 uses asdf-object 1.1 tag + buff.seek(0) + content = buff.read() + assert b"!core/asdf-1.1.0" in content + assert b"!core/asdf-1.0.0" not in content + + def test_write_to_no_tree_modification(tmp_path): fn = tmp_path / "test.asdf" fn2 = tmp_path / "test2.asdf" diff --git a/asdf/_tests/test_extension.py b/asdf/_tests/test_extension.py index d92d0041b..813a2cfce 100644 --- a/asdf/_tests/test_extension.py +++ b/asdf/_tests/test_extension.py @@ -149,6 +149,10 @@ class FooType: pass +class SubFooType(FooType): + pass + + class BarType: pass @@ -412,6 +416,8 @@ def test_extension_manager(): assert manager.handles_tag("asdf://somewhere.org/extensions/full/tags/baz-1.0") is True assert manager.handles_type(FooType) is True + assert manager.handles_type(SubFooType) is False + assert manager._handles_subtype(SubFooType) is True # This should return True even though BarType was listed # as string class name: assert manager.handles_type(BarType) is True @@ -434,10 +440,13 @@ def test_extension_manager(): manager.get_converter_for_tag("asdf://somewhere.org/extensions/full/tags/bar-1.0") assert manager.get_converter_for_type(FooType).delegate is converter1 + assert manager._get_converter_for_subtype(SubFooType).delegate is converter1 assert manager.get_converter_for_type(BarType).delegate is converter1 assert manager.get_converter_for_type(BazType).delegate is converter2 with pytest.raises(KeyError, match=r"\"No support available for Python type .*\""): manager.get_converter_for_type(object) + with pytest.raises(KeyError, match=r"\"No support available for Python type .*\""): + manager.get_converter_for_type(SubFooType) def test_get_cached_extension_manager(): diff --git a/asdf/_tests/test_serialization_context.py b/asdf/_tests/test_serialization_context.py index 188f382c1..9e76bb026 100644 --- a/asdf/_tests/test_serialization_context.py +++ b/asdf/_tests/test_serialization_context.py @@ -1,5 +1,7 @@ +import numpy as np import pytest +import asdf from asdf import get_config from asdf._serialization_context import SerializationContext from asdf.extension import ExtensionManager @@ -27,3 +29,116 @@ def test_serialization_context(): with pytest.raises(ValueError, match=r"ASDF Standard version .* is not supported by asdf==.*"): SerializationContext("0.5.4", extension_manager, None, None) + + +@pytest.mark.parametrize("operation", ["_deserialization", "_serialization"]) +def test_extension_used_in_operation(operation): + extension_manager = ExtensionManager([]) + context = SerializationContext("1.4.0", extension_manager, "file://test.asdf", None) + + if operation == "_serialization": + args = [object()] + else: + args = [] + extension = get_config().extensions[0] + with getattr(context, operation)(*args) as op_ctx: + op_ctx._mark_extension_used(extension) + assert extension in op_ctx._extensions_used + # check this persists in the parent context + assert extension in context._extensions_used + + +def test_get_block_data_callback(tmp_path): + fn = tmp_path / "test.asdf" + + # make a file with 2 blocks + arr0 = np.arange(3, dtype="uint8") + arr1 = np.arange(10, dtype="uint8") + asdf.AsdfFile({"arr0": arr0, "arr1": arr1}).write_to(fn) + + with asdf.open(fn) as af: + context = af._create_serialization_context() + with pytest.raises(NotImplementedError, match="abstract"): + context.get_block_data_callback(0) + + with context._deserialization() as op_ctx: + cb0 = op_ctx.get_block_data_callback(0) + + # getting the same callback should pass and 
return the same object + assert op_ctx.get_block_data_callback(0) is cb0 + + # since we accessed block 0 we shouldn't be allowed to access block 1 + with pytest.raises(OSError, match=r"Converters accessing >1.*"): + op_ctx.get_block_data_callback(1) + + # unless we use a key + key = op_ctx.generate_block_key() + cb1 = op_ctx.get_block_data_callback(1, key) + assert op_ctx.get_block_data_callback(1, key) is cb1 + + # we don't know the order of blocks, so find which block + # was used for which array by looking at the size + d0 = cb0() + d1 = cb1() + if d0.size == arr1.size: + arr0, arr1 = arr1, arr0 + np.testing.assert_array_equal(d0, arr0) + np.testing.assert_array_equal(d1, arr1) + + class Foo: + pass + + # assign a deserialized object as we accessed blocks and the context + # will expect this object to be available + op_ctx._obj = Foo() + + with context._serialization(object()) as op_ctx: + with pytest.raises(NotImplementedError, match="abstract"): + op_ctx.get_block_data_callback(0) + + +def test_find_available_block_index(): + af = asdf.AsdfFile() + context = af._create_serialization_context() + + def cb(): + return np.arange(3, dtype="uint8") + + with pytest.raises(NotImplementedError, match="abstract"): + context.find_available_block_index(cb) + + class Foo: + pass + + with context._serialization(Foo()) as op_ctx: + assert op_ctx.find_available_block_index(cb) == 0 + + with context._deserialization() as op_ctx: + with pytest.raises(NotImplementedError, match="abstract"): + op_ctx.find_available_block_index(cb) + + +def test_generate_block_key(): + af = asdf.AsdfFile() + context = af._create_serialization_context() + + with pytest.raises(NotImplementedError, match="abstract"): + context.generate_block_key() + + class Foo: + pass + + obj = Foo() + with context._serialization(obj) as op_ctx: + key = op_ctx.generate_block_key() + assert key.is_valid() + assert key.matches_object(obj) + + obj = Foo() + with context._deserialization() as op_ctx: + key = op_ctx.generate_block_key() + # the key does not yet have an assigned object + assert not key.is_valid() + op_ctx._obj = obj + assert key.is_valid() + assert key.matches_object(obj) From c44409366b07cd60078bcbb3d9b331a3c444d17f Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 12 May 2023 15:33:27 -0400 Subject: [PATCH 086/154] allow >1 stream if the same block increase test coverage for update usage --- asdf/_block/manager.py | 80 +++++++--------- asdf/_tests/test_array_blocks.py | 143 +++++++++++++++++++--------- asdf/_tests/test_block_converter.py | 68 +++++++++++++ 3 files changed, 205 insertions(+), 86 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 58aa2dfe5..f3c423b7f 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -1,12 +1,12 @@ import contextlib import copy -import weakref from asdf import config, constants, generic_io, util from . import external, reader, store, writer from . 
import io as bio from .callback import DataCallback +from .key import Key as BlockKey from .options import Options @@ -23,7 +23,7 @@ def append_block(self, block): self._items.append(block) -class BlockOptions(store.Store): +class OptionsStore(store.Store): """ {array_base: options} read_blocks (instance of ReadBlocks) @@ -63,7 +63,8 @@ def set_options(self, array, options): if opt.storage_type == "streamed": if opt is options: continue - raise ValueError("Can not add second streaming block") + msg = "Can not add second streaming block" + raise ValueError(msg) base = util.get_array_base(array) self.assign_object(base, options) @@ -89,13 +90,13 @@ class Manager: def __init__(self, read_blocks=None, uri=None, lazy_load=False, memmap=False, validate_checksums=False): if read_blocks is None: read_blocks = ReadBlocks([]) - self.options = BlockOptions(read_blocks) + self.options = OptionsStore(read_blocks) self.blocks = read_blocks self._data_callbacks = store.Store() self._write_blocks = store.LinearStore() self._external_write_blocks = [] self._streamed_write_block = None - self._streamed_obj = None + self._streamed_obj_keys = set() self._write_fd = None self._uri = uri self._external_block_cache = external.ExternalBlockCache() @@ -111,21 +112,22 @@ def read(self, fd, after_magic=False): def _load_external(self, uri): value = self._external_block_cache.load(self._uri, uri) if value is external.UseInternal: - return self._blocks.blocks[0].data + return self.blocks[0].data return value def _clear_write(self): self._write_blocks = store.LinearStore() self._external_write_blocks = [] self._streamed_write_block = None - self._streamed_obj = None + self._streamed_obj_keys = set() self._write_fd = None def _write_external_blocks(self): from asdf import AsdfFile - if self._write_fd.uri is None: - raise ValueError("Can't write external blocks, since URI of main file is unknown.") + if self._write_fd is None or self._write_fd.uri is None: + msg = "Can't write external blocks, since URI of main file is unknown." + raise ValueError(msg) for blk in self._external_write_blocks: uri = generic_io.resolve_uri(self._write_fd.uri, blk._uri) @@ -148,7 +150,8 @@ def make_write_block(self, data, options, obj): else: base_uri = self._uri if base_uri is None: - raise ValueError("Can't write external blocks, since URI of main file is unknown.") + msg = "Can't write external blocks, since URI of main file is unknown." 
+ raise ValueError(msg) blk._uri = external.uri_for_index(base_uri, index) self._external_write_blocks.append(blk) return blk._uri @@ -165,9 +168,11 @@ def make_write_block(self, data, options, obj): def set_streamed_write_block(self, data, obj): if self._streamed_write_block is not None and data is not self._streamed_write_block.data: - raise ValueError("Can not add second streaming block") - self._streamed_write_block = writer.WriteBlock(data) - self._streamed_obj = weakref.ref(obj) + msg = "Can not add second streaming block" + raise ValueError(msg) + if self._streamed_write_block is None: + self._streamed_write_block = writer.WriteBlock(data) + self._streamed_obj_keys.add(BlockKey(obj)) def _get_data_callback(self, index): return DataCallback(index, self.blocks) @@ -289,48 +294,37 @@ def update(self, new_tree_size, pad_blocks, include_block_index): # map new blocks to old blocks new_read_blocks = ReadBlocks() for i, (offset, header) in enumerate(zip(offsets, headers)): + # find all objects that assigned themselves to + # the write block (wblk) at index i if i == len(self._write_blocks): # this is a streamed block - obj = self._streamed_obj() + obj_keys = self._streamed_obj_keys wblk = self._streamed_write_block else: wblk = self._write_blocks[i] # find object associated with wblk - obj = None + obj_keys = set() for oid, by_key in self._write_blocks._by_id.items(): for key, index in by_key.items(): if self._write_blocks[index] is wblk: - obj = key._ref() - break - if obj is None: - msg = "Update failed to associate blocks" - raise OSError(msg) - - # does the obj have an old read block? - rblk = self.blocks.lookup_by_object(obj) - if rblk is not None: - memmap = rblk.memmap - data = None - if not rblk.memmap: - if rblk._cached_data is not None: - data = rblk._cached_data - elif not callable(rblk._data): - data = rblk._data - else: - memmap = self._memmap - data = None + obj_keys.add(key) - # we have to be lazy here as the current memmap is invalid - new_read_block = reader.ReadBlock( - offset + 4, self._write_fd, memmap, True, False, header=header, data=data - ) + # we have to be lazy here as any current memmap is invalid + new_read_block = reader.ReadBlock(offset + 4, self._write_fd, self._memmap, True, False, header=header) new_read_blocks.append_block(new_read_block) new_index = len(new_read_blocks) - 1 - new_read_blocks.assign_object(obj, new_read_block) - # update data callbacks to point to new blocks - cb = self._data_callbacks.lookup_by_object(obj) - if cb is not None: - cb.reassign(new_index, new_read_blocks) + # update all callbacks + for obj_key in obj_keys: + obj = obj_key._ref() + if obj is None: + # this object no longer exists so don't both assigning it + continue + new_read_blocks.assign_object(obj, new_read_block) + + # update data callbacks to point to new block + cb = self._data_callbacks.lookup_by_object(obj) + if cb is not None: + cb.reassign(new_index, new_read_blocks) # update read blocks to reflect new state self.blocks = new_read_blocks diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py index 3d685aba7..baeba9f2d 100644 --- a/asdf/_tests/test_array_blocks.py +++ b/asdf/_tests/test_array_blocks.py @@ -107,7 +107,9 @@ def test_pad_blocks(tmp_path): assert_array_equal(ff.tree["my_array2"], my_array2) -def test_update_expand_tree(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_expand_tree(tmp_path, lazy_load, copy_arrays): tmp_path = str(tmp_path) 
testpath = os.path.join(tmp_path, "test.asdf") @@ -119,7 +121,7 @@ def test_update_expand_tree(tmp_path): ff = asdf.AsdfFile(tree) ff.set_array_storage(tree["arrays"][2], "inline") ff.write_to(testpath, pad_blocks=True) - with asdf.open(testpath, mode="rw") as ff: + with asdf.open(testpath, lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: assert len(list(ff._blocks.blocks)) == 2 assert_array_equal(ff.tree["arrays"][0], my_array) ff.tree["extra"] = [0] * 6000 @@ -134,7 +136,7 @@ def test_update_expand_tree(tmp_path): ff = asdf.AsdfFile(tree) ff.set_array_storage(tree["arrays"][2], "inline") ff.write_to(os.path.join(tmp_path, "test2.asdf"), pad_blocks=True) - with asdf.open(os.path.join(tmp_path, "test2.asdf"), mode="rw") as ff: + with asdf.open(os.path.join(tmp_path, "test2.asdf"), lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: ff.tree["extra"] = [0] * 2 ff.update() @@ -144,7 +146,9 @@ def test_update_expand_tree(tmp_path): assert_array_equal(ff.tree["arrays"][1], my_array2) -def test_update_all_external(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_all_external(tmp_path, lazy_load, copy_arrays): fn = tmp_path / "test.asdf" my_array = np.arange(64) * 1 @@ -157,18 +161,40 @@ def test_update_all_external(tmp_path): with asdf.config.config_context() as cfg: cfg.array_inline_threshold = 10 cfg.all_array_storage = "external" - with asdf.open(fn, mode="rw") as af: + with asdf.open(fn, lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as af: af.update() assert "test0000.asdf" in os.listdir(tmp_path) assert "test0001.asdf" in os.listdir(tmp_path) +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_some_external(tmp_path, lazy_load, copy_arrays): + fn = tmp_path / "test.asdf" + + my_array = np.arange(64) * 1 + my_array2 = np.arange(64) * 2 + tree = {"arrays": [my_array, my_array2]} + + af = asdf.AsdfFile(tree) + af.write_to(fn) + + with asdf.open(fn, lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as af: + af.set_array_storage(af["arrays"][0], "external") + af.update() + + assert "test0000.asdf" in os.listdir(tmp_path) + assert "test0001.asdf" not in os.listdir(tmp_path) + + def _get_update_tree(): return {"arrays": [np.arange(64) * 1, np.arange(64) * 2, np.arange(64) * 3]} -def test_update_delete_first_array(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_delete_first_array(tmp_path, lazy_load, copy_arrays): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -180,7 +206,7 @@ def test_update_delete_first_array(tmp_path): original_size = os.stat(path).st_size - with asdf.open(os.path.join(tmp_path, "test.asdf"), mode="rw") as ff: + with asdf.open(os.path.join(tmp_path, "test.asdf"), lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: del ff.tree["arrays"][0] ff.update() @@ -191,7 +217,9 @@ def test_update_delete_first_array(tmp_path): assert_array_equal(ff.tree["arrays"][1], tree["arrays"][2]) -def test_update_delete_last_array(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_delete_last_array(tmp_path, lazy_load, copy_arrays): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -203,7 +231,7 @@ def test_update_delete_last_array(tmp_path): original_size = 
os.stat(path).st_size - with asdf.open(os.path.join(tmp_path, "test.asdf"), mode="rw") as ff: + with asdf.open(os.path.join(tmp_path, "test.asdf"), lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: del ff.tree["arrays"][-1] ff.update() @@ -214,7 +242,9 @@ def test_update_delete_last_array(tmp_path): assert_array_equal(ff.tree["arrays"][1], tree["arrays"][1]) -def test_update_delete_middle_array(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_delete_middle_array(tmp_path, lazy_load, copy_arrays): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -226,7 +256,7 @@ def test_update_delete_middle_array(tmp_path): original_size = os.stat(path).st_size - with asdf.open(os.path.join(tmp_path, "test.asdf"), mode="rw") as ff: + with asdf.open(os.path.join(tmp_path, "test.asdf"), lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: del ff.tree["arrays"][1] ff.update() assert len(ff._blocks.blocks) == 2 @@ -239,7 +269,9 @@ def test_update_delete_middle_array(tmp_path): assert_array_equal(ff.tree["arrays"][1], tree["arrays"][2]) -def test_update_replace_first_array(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_replace_first_array(tmp_path, lazy_load, copy_arrays): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -251,7 +283,7 @@ def test_update_replace_first_array(tmp_path): original_size = os.stat(path).st_size - with asdf.open(os.path.join(tmp_path, "test.asdf"), mode="rw") as ff: + with asdf.open(os.path.join(tmp_path, "test.asdf"), lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: ff.tree["arrays"][0] = np.arange(32) ff.update() @@ -263,7 +295,9 @@ def test_update_replace_first_array(tmp_path): assert_array_equal(ff.tree["arrays"][2], tree["arrays"][2]) -def test_update_replace_last_array(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_replace_last_array(tmp_path, lazy_load, copy_arrays): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -275,7 +309,7 @@ def test_update_replace_last_array(tmp_path): original_size = os.stat(path).st_size - with asdf.open(os.path.join(tmp_path, "test.asdf"), mode="rw") as ff: + with asdf.open(os.path.join(tmp_path, "test.asdf"), lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: ff.tree["arrays"][2] = np.arange(32) ff.update() @@ -287,7 +321,9 @@ def test_update_replace_last_array(tmp_path): assert_array_equal(ff.tree["arrays"][2], np.arange(32)) -def test_update_replace_middle_array(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_replace_middle_array(tmp_path, lazy_load, copy_arrays): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -299,7 +335,7 @@ def test_update_replace_middle_array(tmp_path): original_size = os.stat(path).st_size - with asdf.open(os.path.join(tmp_path, "test.asdf"), mode="rw") as ff: + with asdf.open(os.path.join(tmp_path, "test.asdf"), lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: ff.tree["arrays"][1] = np.arange(32) ff.update() @@ -311,7 +347,9 @@ def test_update_replace_middle_array(tmp_path): assert_array_equal(ff.tree["arrays"][2], tree["arrays"][2]) -def test_update_add_array(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, 
False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_add_array(tmp_path, lazy_load, copy_arrays): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -321,7 +359,7 @@ def test_update_add_array(tmp_path): ff = asdf.AsdfFile(tree) ff.write_to(path, pad_blocks=True) - with asdf.open(os.path.join(tmp_path, "test.asdf"), mode="rw") as ff: + with asdf.open(os.path.join(tmp_path, "test.asdf"), lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: ff.tree["arrays"].append(np.arange(32)) ff.update() @@ -332,7 +370,9 @@ def test_update_add_array(tmp_path): assert_array_equal(ff.tree["arrays"][3], np.arange(32)) -def test_update_add_array_at_end(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_add_array_at_end(tmp_path, lazy_load, copy_arrays): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -344,7 +384,7 @@ def test_update_add_array_at_end(tmp_path): original_size = os.stat(path).st_size - with asdf.open(os.path.join(tmp_path, "test.asdf"), mode="rw") as ff: + with asdf.open(os.path.join(tmp_path, "test.asdf"), lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: ff.tree["arrays"].append(np.arange(65536, dtype=" Date: Fri, 12 May 2023 15:58:22 -0400 Subject: [PATCH 087/154] fix external cache test for windows --- asdf/_tests/_block/test_external.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asdf/_tests/_block/test_external.py b/asdf/_tests/_block/test_external.py index 5392c1e56..d5600044c 100644 --- a/asdf/_tests/_block/test_external.py +++ b/asdf/_tests/_block/test_external.py @@ -11,7 +11,7 @@ def test_cache(tmp_path): asdf.AsdfFile({"data": arr}).write_to(efn) cache = external.ExternalBlockCache() - base_uri = f"file://{tmp_path}/" + base_uri = asdf.util.filepath_to_url(f"{tmp_path}/") data = cache.load(base_uri, "test.asdf") np.testing.assert_array_equal(data, arr) assert cache.load(base_uri, "test.asdf") is data From 5bb7d7a2bbc90c8fdab3b4af3ac40bbe4ac20c3b Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 12 May 2023 18:38:12 -0400 Subject: [PATCH 088/154] adjust key assignment during deserialization don't assign keys to callbacks and blocks until after they are assigned an object --- asdf/_serialization_context.py | 31 +++++++++++++++-------- asdf/_tests/test_serialization_context.py | 15 ++++++----- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/asdf/_serialization_context.py b/asdf/_serialization_context.py index dde874160..8cbbae00d 100644 --- a/asdf/_serialization_context.py +++ b/asdf/_serialization_context.py @@ -167,7 +167,7 @@ def __init__(self, ctx): self._obj = None self._blk = None self._cb = None - self._keys = set() + self._keys_to_assign = {} def __exit__(self, exc_type, exc_value, traceback): if exc_type is not None: @@ -175,8 +175,19 @@ def __exit__(self, exc_type, exc_value, traceback): if self._blk is not None: self._blocks.blocks.assign_object(self._obj, self._blk) self._blocks._data_callbacks.assign_object(self._obj, self._cb) - for k in self._keys: - k.assign_object(self._obj) + for key, cb in self._keys_to_assign.items(): + if cb is None: + msg = "Converter generated a key that was never used" + raise OSError(msg) + # now that we have an object, make the key valid + key.assign_object(self._obj) + + # assign the key to the callback + self._blocks._data_callbacks.assign_object(key, cb) + + # and the block + blk = self._blocks.blocks[cb._index] + 
self._blocks.blocks.assign_object(key, blk) def get_block_data_callback(self, index, key=None): blk = self._blocks.blocks[index] @@ -192,17 +203,17 @@ def get_block_data_callback(self, index, key=None): self._cb = self._blocks._get_data_callback(index) return self._cb - # for key accesses try to find a previous use of this key - cb = self._blocks._data_callbacks.lookup_by_object(key) - if cb is None: - self._blocks.blocks.assign_object(key, blk) - cb = self._blocks._get_data_callback(index) - self._blocks._data_callbacks.assign_object(key, cb) + if self._keys_to_assign.get(key, None) is not None: + return self._keys_to_assign[key] + + cb = self._blocks._get_data_callback(index) + # mark this as a key to later assign + self._keys_to_assign[key] = cb return cb def generate_block_key(self): key = BlockKey() - self._keys.add(key) + self._keys_to_assign[key] = None return key diff --git a/asdf/_tests/test_serialization_context.py b/asdf/_tests/test_serialization_context.py index 9e76bb026..9eaeac868 100644 --- a/asdf/_tests/test_serialization_context.py +++ b/asdf/_tests/test_serialization_context.py @@ -135,10 +135,11 @@ class Foo: assert key.matches_object(obj) obj = Foo() - with context._deserialization() as op_ctx: - key = op_ctx.generate_block_key() - # the key does not yet have an assigned object - assert not key.is_valid() - op_ctx._obj = obj - assert key.is_valid() - assert key.matches_object(obj) + # because this test generates but does not assign a key + # it should raise an exception + with pytest.raises(OSError, match=r"Converter generated a key.*"): + with context._deserialization() as op_ctx: + key = op_ctx.generate_block_key() + # the key does not yet have an assigned object + assert not key.is_valid() + op_ctx._obj = obj From d3045757b495d097381e6e68ebd472fca12f2631 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 15 May 2023 09:50:21 -0400 Subject: [PATCH 089/154] add _block.manager unit tests --- asdf/_tests/_block/test_manager.py | 76 ++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 asdf/_tests/_block/test_manager.py diff --git a/asdf/_tests/_block/test_manager.py b/asdf/_tests/_block/test_manager.py new file mode 100644 index 000000000..a917a2c2c --- /dev/null +++ b/asdf/_tests/_block/test_manager.py @@ -0,0 +1,76 @@ +import numpy as np +import pytest + +import asdf +from asdf._block import manager +from asdf._block.options import Options + + +def test_set_streamed_block_via_options(): + options = manager.OptionsStore(manager.ReadBlocks()) + arr1 = np.arange(10, dtype="uint8") + arr2 = np.arange(5, dtype="uint8") + options.set_options(arr1, Options("streamed")) + with pytest.raises(ValueError, match=r"Can not add second streaming block"): + options.set_options(arr2, Options("streamed")) + del arr1 + options.set_options(arr2, Options("streamed")) + + +def test_set_streamed_block_via_manager(): + af = asdf.AsdfFile() + m = af._blocks + + class Foo: + pass + + arr = np.arange(10, dtype="uint8") + obj = Foo() + m.set_streamed_write_block(arr, obj) + + # setting again with the same data is ok + m.set_streamed_write_block(arr, obj) + + # using a different array is not allowed + arr2 = np.arange(3, dtype="uint8") + with pytest.raises(ValueError, match="Can not add second streaming block"): + m.set_streamed_write_block(arr2, obj) + + # a different object is ok as long as the array matches + obj2 = Foo() + m.set_streamed_write_block(arr, obj2) + + +def test_load_external_internal(tmp_path): + fn = tmp_path / "test.asdf" + asdf.AsdfFile({"arr": 
np.arange(10, dtype="uint8")}).write_to(fn) + with asdf.open(fn) as af: + m = af._blocks + np.testing.assert_array_equal(m._load_external("#"), m.blocks[0].data) + + +def test_write_no_uri(tmp_path): + fn = tmp_path / "test.asdf" + asdf.AsdfFile({"arr": np.arange(10, dtype="uint8")}).write_to(fn) + with asdf.open(fn) as af: + m = af._blocks + with pytest.raises(ValueError, match=r"Can't write external blocks.*"): + m._write_external_blocks() + + +def test_write_outside_context(tmp_path): + fn = tmp_path / "test.asdf" + asdf.AsdfFile({"arr": np.arange(10, dtype="uint8")}).write_to(fn) + with asdf.open(fn) as af: + m = af._blocks + with pytest.raises(OSError, match=r"write called outside of valid write_context"): + m.write(False, False) + + +def test_update_outside_context(tmp_path): + fn = tmp_path / "test.asdf" + asdf.AsdfFile({"arr": np.arange(10, dtype="uint8")}).write_to(fn) + with asdf.open(fn) as af: + m = af._blocks + with pytest.raises(OSError, match=r"update called outside of valid write_context"): + m.update(0, False, False) From 8ba435ec7b9e4c688b4113a6e459d2e6ebeec0f3 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 15 May 2023 11:20:26 -0400 Subject: [PATCH 090/154] make DataCallback.reassign private --- asdf/_block/callback.py | 9 ++- asdf/_block/manager.py | 2 +- asdf/_serialization_context.py | 90 +++++++++++++++++++++++------ asdf/_tests/_block/test_callback.py | 2 +- 4 files changed, 80 insertions(+), 23 deletions(-) diff --git a/asdf/_block/callback.py b/asdf/_block/callback.py index 00475b1d9..01155df64 100644 --- a/asdf/_block/callback.py +++ b/asdf/_block/callback.py @@ -2,8 +2,13 @@ class DataCallback: + """ + A callable object used to read data from an ASDF block + read from an ASDF file. + """ + def __init__(self, index, read_blocks): - self.reassign(index, read_blocks) + self._reassign(index, read_blocks) def __call__(self, _attr=None): read_blocks = self._read_blocks_ref() @@ -17,6 +22,6 @@ def __call__(self, _attr=None): # like reading the header and cached_data return getattr(read_blocks[self._index], _attr) - def reassign(self, index, read_blocks): + def _reassign(self, index, read_blocks): self._index = index self._read_blocks_ref = weakref.ref(read_blocks) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index f3c423b7f..0d6e4282d 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -324,7 +324,7 @@ def update(self, new_tree_size, pad_blocks, include_block_index): # update data callbacks to point to new block cb = self._data_callbacks.lookup_by_object(obj) if cb is not None: - cb.reassign(new_index, new_read_blocks) + cb._reassign(new_index, new_read_blocks) # update read blocks to reflect new state self.blocks = new_read_blocks diff --git a/asdf/_serialization_context.py b/asdf/_serialization_context.py index 8cbbae00d..a94fefc8b 100644 --- a/asdf/_serialization_context.py +++ b/asdf/_serialization_context.py @@ -81,15 +81,16 @@ def _extensions_used(self): def get_block_data_callback(self, index, key=None): """ Generate a callable that when called will read data - from a block at the provided index + from an ASDF block at the provided index. Parameters ---------- index : int - Block index + Index of ASDF block. - key : BlockKey - TODO + key : BlockKey, optional + BlockKey generated using self.generate_block_key. Only + needed for a Converter that uses multiple blocks. 
Returns ------- @@ -99,34 +100,42 @@ def get_block_data_callback(self, index, key=None): """ raise NotImplementedError("abstract") - def find_available_block_index(self, data_callback, lookup_key=None): + def find_available_block_index(self, data_callback, key=None): """ - Find the index of an available block to write data. + Find the index of an available ASDF block to write data. - This is typically used inside asdf.extension.Converter.to_yaml_tree + This is typically used inside asdf.extension.Converter.to_yaml_tree. Parameters ---------- data_callback: callable Callable that when called will return data (ndarray) that will be written to a block. - At the moment, this is only assigned if a new block - is created to avoid circular references during AsdfFile.update. - lookup_key : hashable, optional - Unique key used to retrieve the index of a block that was - previously allocated or reserved. For ndarrays this is - typically the id of the base ndarray. + key : BlockKey, optional + BlockKey generated using self.generate_block_key. Only + needed for a Converter that uses multiple blocks. Returns ------- block_index: int - Index of the block where data returned from data_callback - will be written. + Index of the ASDF block where data returned from + data_callback will be written. """ raise NotImplementedError("abstract") def generate_block_key(self): + """ + Generate a BlockKey used for Converters that wish to use + multiple blocks + + Returns + ------- + key : BlockKey + A hashable object that will be associated with the + serialized/deserialized object and can be used to + access multiple blocks within a Converter + """ raise NotImplementedError("abstract") @contextlib.contextmanager @@ -141,6 +150,23 @@ def _deserialization(self): class _Operation(SerializationContext): + """ + `SerializationContext` is used for multiple operations + including serialization and deserialization. The `_Operation` class + allows the SerializationContext to have different behavior during these + operations (for example allowing block reading during deserialization) + and allows the context to be used with a python ``with`` statement to + allow setup and teardown operations (such as associating a + deserialized object with the blocks accessed during deserialization). + + `_Operation` subclasses should not be instantiated directly but instead + should be accessible via private methods on a `SerializationContext`. + This allows the `SerializationContext` to provide itself to the `_Operation` + which can chose to implement abstract methods in `SerializationContext` + (such as `SerializationContext.find_available_block_index` during + `_Serialization` created via `SerializationContext._serialization`). + """ + def __init__(self, ctx): self._ctx = weakref.ref(ctx) super().__init__(ctx.version, ctx.extension_manager, ctx.url, ctx._blocks) @@ -162,6 +188,22 @@ def __exit__(self, exc_type, exc_value, traceback): class _Deserialization(_Operation): + """ + Perform deserialization (reading) with a `SerializationContext`. + + To allow for block access, `_Deserialization` implements: + - `SerializationContext.generate_block_key` + - `SerializationContext.get_block_data_callback` + and tracks which blocks (and keys) are accessed, assigning them + to the deserialized object at the end of the + `SerializationContext._deserialization`. + + Code that uses `_Deserialization` and accesses any blocks + or generates keys must assign an object to + `_Deserialization._obj` prior to exiting the `_Deserialization` + context manager. 
+ """ + def __init__(self, ctx): super().__init__(ctx) self._obj = None @@ -218,14 +260,24 @@ def generate_block_key(self): class _Serialization(_Operation): + """ + Perform serialization (writing) with a `SerializationContext`. + + To allow for block access, `_Serialization` implements: + - `SerializationContext.generate_block_key` + - `SerializationContext.find_available_block_index` + and assigns any accessed blocks (and keys) to the object + being serialized. + """ + def __init__(self, ctx, obj): super().__init__(ctx) self._obj = obj - def find_available_block_index(self, data_callback, lookup_key=None): - if lookup_key is None: - lookup_key = self._obj - return self._blocks.make_write_block(data_callback, BlockOptions(), lookup_key) + def find_available_block_index(self, data_callback, key=None): + if key is None: + key = self._obj + return self._blocks.make_write_block(data_callback, BlockOptions(), key) def generate_block_key(self): return BlockKey(self._obj) diff --git a/asdf/_tests/_block/test_callback.py b/asdf/_tests/_block/test_callback.py index 1b4d438bd..f0f59d63d 100644 --- a/asdf/_tests/_block/test_callback.py +++ b/asdf/_tests/_block/test_callback.py @@ -51,6 +51,6 @@ def __init__(self, value): assert cb() == "a" blks2 = LinearStore([Data("c"), Data("d")]) - cb.reassign(1, blks2) + cb._reassign(1, blks2) assert cb() == "d" From a0ddcce7d77b5e93e812ceca5a43cf3258beceed Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 15 May 2023 14:18:16 -0400 Subject: [PATCH 091/154] update docs off _block submodule --- asdf/_block/callback.py | 16 ++ asdf/_block/external.py | 9 ++ asdf/_block/io.py | 272 +++++++++++++++++++++++++++++++++- asdf/_tests/_block/test_io.py | 2 +- 4 files changed, 296 insertions(+), 3 deletions(-) diff --git a/asdf/_block/callback.py b/asdf/_block/callback.py index 01155df64..946de0b03 100644 --- a/asdf/_block/callback.py +++ b/asdf/_block/callback.py @@ -1,3 +1,19 @@ +""" +A `DataCallback` class is implemented here to allow +for reassignment of the index of an ASDF block corresponding +to a callback. + +This is needed so that extension code can generate a callback +during deserialization of an ASDF file that will continue +to be valid even after an `AsdfFile.update` which might +reorder blocks. + +To allow for 'low-level' block access needed for ndarray +`DataCallback` can be called with an optional ``_attr`` +argument to cache data, access the block header and other +operations that we generally do not want to expose to +extension code. +""" import weakref diff --git a/asdf/_block/external.py b/asdf/_block/external.py index 9f16da276..c011f8064 100644 --- a/asdf/_block/external.py +++ b/asdf/_block/external.py @@ -1,3 +1,12 @@ +""" +For external blocks, the previous block management +would cache data opened from external files (to return the +same underlying ndarray if the same external block +was referenced more than once). `ExternalBlockCache` is +used here to allow for the same behavior without requiring +the block manager to have a reference to the `AsdfFile` +(that references the block manager). +""" import os from asdf import generic_io, util diff --git a/asdf/_block/io.py b/asdf/_block/io.py index 89307c4b6..517c67fc3 100644 --- a/asdf/_block/io.py +++ b/asdf/_block/io.py @@ -1,3 +1,7 @@ +""" +Low-level functions for reading and writing ASDF blocks +and other block related file contents (like the block index). 
+""" import hashlib import io import os @@ -32,6 +36,21 @@ def calculate_block_checksum(data): def validate_block_header(header): + """ + Check that they key value pairs in header contain consistent + information about the ASDF block ``compression``, ``flags``, + ``used_size`` and ``data_size`` (otherwise raise an exception). + + Parameters + ---------- + header : dict + ASDF block header information. + + Raises + ------ + ValueError + If the key value pairs in header contain inconsistent information + """ compression = mcompression.validate(header["compression"]) if header["flags"] & constants.BLOCK_FLAG_STREAMED: if compression is not None: @@ -45,6 +64,30 @@ def validate_block_header(header): def read_block_header(fd, offset=None): + """ + Read an ASDF block header + + Parameters + ---------- + fd : file or generic_io.GenericIO + File to read. + + offset : int, optional + Offset within the file where the start of the ASDF block + header is located. If provided, the file will be seeked prior + to reading. + + Returns + ------- + header : dict + Dictionary containing the read ASDF header as parsed by the + `BLOCK_HEADER` `asdf.util.BinaryStruct`. + + Raises + ------ + ValueError + If the read header is inconsistent (see `validate_block_header`). + """ if offset is not None: fd.seek(offset) @@ -60,6 +103,33 @@ def read_block_header(fd, offset=None): def read_block_data(fd, header, offset=None, memmap=False): + """ + Read (or memory map) data for an ASDF block. + + Parameters + ---------- + fd : file or generic_io.GenericIO + File to read. + + header : dict + ASDF block header dictionary (as read from `read_block_header`). + + offset : int, optional + Offset within the file where the start of the ASDF block data + is located. If provided, the file will be seeked prior to reading. + + memmap : bool, optional, default False + Memory map the block data using `generic_io.GenericIO.memmap_array`. + A compressed block will never be memmapped and if the file ``fd`` + does not support memmapping the data will not be memmapped (and + no error will be raised). + + Returns + ------- + data : ndarray or memmap + A one-dimensional ndarray of dtype uint8 + """ + if fd.seekable(): if offset is not None: fd.seek(offset) @@ -90,6 +160,45 @@ def read_block_data(fd, header, offset=None, memmap=False): def read_block(fd, offset=None, memmap=False, lazy_load=False): + """ + Read a block (header and data) from an ASDF file. + + Parameters + ---------- + fd : file or generic_io.GenericIO + File to read. + + offset : int, optional + Offset within the file where the start of the ASDF block header + is located. If provided, the file will be seeked prior to reading. + Note this is the start of the block header not the start of the + block magic. + + memmap : bool, optional, default False + Memory map the block data see `read_block_data` for more + details. + + lazy_load : bool, optional, default False + Return a callable that when called will read the block data. This + option is ignored for a non-seekable file. + + Returns + ------- + offset : int + The offset within the file where the block was read (equal to offset + argument if it was provided). + + header : dict + ASDF block header as read with `read_block_header`. + + data_offset : int + The offset within the file where the block data begins. + + data : ndarray, memmap or callable + ASDF block data (one-dimensional ndarray of dtype uint8). 
If lazy_load + (and the file is seekable) data will be a callable that when executed + will seek the file and read the block data. + """ # expects the fd or offset is past the block magic if offset is None and fd.seekable(): offset = fd.tell() @@ -120,6 +229,56 @@ def callback(): def generate_write_header(data, stream=False, compression_kwargs=None, padding=False, fs_block_size=1, **header_kwargs): + """ + Generate a binary representation of a ASDF block header that can be + used for writing a block. + + Note that if a compression key is provided in ``header_kwargs`` this + function will compress ``data`` to determine the used_size (the + compressed data will be returned via the ``buff`` result to avoid + needing to re-compress the data before writing). + + Parameters + ---------- + + data : ndarray + A one-dimensional ndarray of dtype uint8. + + stream : bool, optional, default False + If True, generate a header for a streamed block. + + compression_kwargs : dict, optional + If provided, these will be passed on to `asdf.compression.compress` + if the data is compressed (see header_kwargs). + + padding : bool or float, optional, default False + If the block should contain additional padding bytes. See the + `asdf.util.calculate_padding` argument ``pad_blocks`` for more + details. + + fs_block_size : int, optional, default 1 + The filesystem block size. See the `asdf.util.calculate_padding` + ``block_size`` argument for more details. + + **header_kwargs : dict, optional + Block header settings that will be read, updated, and used + to generate the binary block header representation by packing + with `BLOCK_HEADER`. + + Returns + ------- + + header : bytes + Packed binary representation of the ASDF block header. + + buff : bytes or None + If this block is compressed buff will contained the compressed + representation of data or None if the data is uncompressed. + + padding_bytes: int + The number of padding bytes that must be written after + the block data. + """ if data.ndim != 1 or data.dtype != "uint8": msg = "Data must be of ndim==1 and dtype==uint8" raise ValueError(msg) @@ -160,6 +319,37 @@ def generate_write_header(data, stream=False, compression_kwargs=None, padding=F def write_block(fd, data, offset=None, stream=False, compression_kwargs=None, padding=False, **header_kwargs): + """ + Write an ASDF block. + + Parameters + ---------- + fd : file or generic_io.GenericIO + File to write to. + + offset : int, optional + If provided, seek to this offset before writing. + + stream : bool, optional, default False + If True, write this as a streamed block. + + compression_kwargs : dict, optional + If block is compressed, use these additional arguments during + compression. See `generate_write_header`. + + padding : bool, optional, default False + Optionally pad the block data. See `generate_write_header`. + + **header_kwargs : dict + Block header settings. See `generate_write_header`. + + Returns + ------- + + header : dict + The ASDF block header as unpacked from the `BLOCK_HEADER` used + for writing. 
+ """ header, buff, padding_bytes = generate_write_header( data, stream, compression_kwargs, padding, fd.block_size, **header_kwargs ) @@ -180,7 +370,7 @@ def write_block(fd, data, offset=None, stream=False, compression_kwargs=None, pa return BLOCK_HEADER.unpack(header) -def candidate_offsets(min_offset, max_offset, block_size): +def _candidate_offsets(min_offset, max_offset, block_size): offset = (max_offset // block_size) * block_size if offset == max_offset: offset -= block_size @@ -192,6 +382,35 @@ def candidate_offsets(min_offset, max_offset, block_size): def find_block_index(fd, min_offset=None, max_offset=None): + """ + Find the location of an ASDF block index within a seekable file. + + Searching will begin at the end of the file (or max_offset + if it is provided). + + Parameters + ---------- + + fd : file or generic_io.GenericIO + A seekable file that will be searched to try and find + the start of an ASDF block index within the file. + + min_offset : int, optional + The minimum search offset. A block index will not be + found before this point. + + max_offset : int, optional + The maximum search offset. A block index will not be + found after this point. + + Returns + ------- + + offset : int or None + Index of start of ASDF block index. This is the location of the + ASDF block index header. + + """ if min_offset is None: min_offset = fd.tell() if max_offset is None: @@ -201,7 +420,7 @@ def find_block_index(fd, min_offset=None, max_offset=None): block_index_offset = None buff = b"" pattern = constants.INDEX_HEADER - for offset in candidate_offsets(min_offset, max_offset, block_size): + for offset in _candidate_offsets(min_offset, max_offset, block_size): fd.seek(offset) buff = fd.read(block_size) + buff index = buff.find(pattern) @@ -215,6 +434,35 @@ def find_block_index(fd, min_offset=None, max_offset=None): def read_block_index(fd, offset=None): + """ + Read an ASDF block index from a file. + + Parameters + ---------- + + fd : file or generic_io.GenericIO + File to read the block index from. + + offset : int, optional + Offset within the file where the block index starts + (the start of the ASDF block index header). If not provided + reading will start at the current position of the file + pointer. See `find_block_index` to locate the block + index prior to calling this function. + + Returns + ------- + + block_index : list of ints + A list of ASDF block offsets read and parsed from the + block index. + + Raises + ------ + OSError + The data read from the file did not contain a valid + block index. + """ if offset is not None: fd.seek(offset) buff = fd.read(len(constants.INDEX_HEADER)) @@ -235,6 +483,26 @@ def read_block_index(fd, offset=None): def write_block_index(fd, offsets, offset=None, yaml_version=None): + """ + Write a list of ASDF block offsets to a file in the form + of an ASDF block index. + + Parameters + ---------- + fd : file or generic_io.GenericIO + File to write to. + + offsets : list of ints + List of byte offsets (from the start of the file) where + ASDF blocks are located. + + offset : int, optional + If provided, seek to this offset before writing. + + yaml_version : tuple, optional, default (1, 1) + YAML version to use when writing the block index. This + will be passed to ``yaml.dump`` as the version argument. 
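To make the index helpers concrete, a minimal sketch that locates and reads the block index of an existing file could look like this (internal API; the module path is an assumption and the file name is a placeholder):

    import numpy as np

    import asdf
    from asdf import generic_io
    from asdf._block import io as bio  # module path assumed

    # create a small file that contains one block and a block index
    asdf.AsdfFile({"arr": np.arange(10, dtype="uint8")}).write_to("example.asdf")

    with generic_io.get_file("example.asdf", mode="r") as fd:
        index_offset = bio.find_block_index(fd)  # searches backward from the end of the file
        if index_offset is not None:
            block_offsets = bio.read_block_index(fd, index_offset)
            print(block_offsets)  # byte offsets of each block listed in the index
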
+ """ if yaml_version is None: yaml_version = (1, 1) if offset is not None: diff --git a/asdf/_tests/_block/test_io.py b/asdf/_tests/_block/test_io.py index 4a76065c7..0ba772df6 100644 --- a/asdf/_tests/_block/test_io.py +++ b/asdf/_tests/_block/test_io.py @@ -246,7 +246,7 @@ def test_invalid_data(data): ) def test_candidate_offsets(options): min_offset, max_offset, size, targets = options - for offset, target in zip(bio.candidate_offsets(min_offset, max_offset, size), targets): + for offset, target in zip(bio._candidate_offsets(min_offset, max_offset, size), targets): assert offset == target From 69cd693374e99aa35db7b1decbd40ef54ace68a8 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 15 May 2023 14:24:02 -0400 Subject: [PATCH 092/154] make all block Key methods private --- asdf/_block/key.py | 12 ++++++------ asdf/_block/manager.py | 4 ++-- asdf/_block/store.py | 6 +++--- asdf/_serialization_context.py | 2 +- asdf/_tests/_block/test_key.py | 14 +++++++------- asdf/_tests/test_serialization_context.py | 6 +++--- 6 files changed, 22 insertions(+), 22 deletions(-) diff --git a/asdf/_block/key.py b/asdf/_block/key.py index 66134fa7d..21a8cfbfc 100644 --- a/asdf/_block/key.py +++ b/asdf/_block/key.py @@ -20,9 +20,9 @@ def __init__(self, obj=None, key=None): self._key = key self._ref = UndefinedRef if obj is not None: - self.assign_object(obj) + self._assign_object(obj) - def is_valid(self): + def _is_valid(self): if self._ref is UndefinedRef: return False r = self._ref() @@ -33,10 +33,10 @@ def is_valid(self): def __hash__(self): return self._key - def assign_object(self, obj): + def _assign_object(self, obj): self._ref = weakref.ref(obj) - def matches_object(self, obj): + def _matches_object(self, obj): if self._ref is UndefinedRef: return False r = self._ref() @@ -49,9 +49,9 @@ def __eq__(self, other): return NotImplemented if self._key != other._key: return False - if not self.is_valid(): + if not self._is_valid(): return False - return other.matches_object(self._ref()) + return other._matches_object(self._ref()) def __copy__(self): return self.__class__(self._ref(), self._key) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 0d6e4282d..4dcd0b0ea 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -58,7 +58,7 @@ def set_options(self, array, options): if options.storage_type == "streamed": for oid, by_key in self._by_id.items(): for key, opt in by_key.items(): - if not key.is_valid(): + if not key._is_valid(): continue if opt.storage_type == "streamed": if opt is options: @@ -79,7 +79,7 @@ def get_output_compressions(self): compressions.add(cfg.all_array_compression) for _, by_key in self._by_id.items(): for key, opts in by_key.items(): - if not key.is_valid(): + if not key._is_valid(): continue if opts.compression: compressions.add(opts.compression) diff --git a/asdf/_block/store.py b/asdf/_block/store.py index 86c2a5dd9..386bd5702 100644 --- a/asdf/_block/store.py +++ b/asdf/_block/store.py @@ -29,7 +29,7 @@ def lookup_by_object(self, obj, default=None): # look for a matching key: O(N) for key, value in by_key.items(): - if key.matches_object(obj): + if key._matches_object(obj): return value # no match, return default @@ -56,7 +56,7 @@ def assign_object(self, obj, value): # look for a matching matching key if obj_key is None: for key in by_key: - if key.matches_object(obj): + if key._matches_object(obj): by_key[key] = value return # we didn't find a matching key, so make one @@ -76,7 +76,7 @@ def _cleanup(self, object_id=None): self._cleanup(oid) return 
by_key = self._by_id[object_id] - keys_to_remove = [k for k in by_key if not k.is_valid()] + keys_to_remove = [k for k in by_key if not k._is_valid()] for key in keys_to_remove: del by_key[key] if not len(by_key): diff --git a/asdf/_serialization_context.py b/asdf/_serialization_context.py index a94fefc8b..a31d3c59f 100644 --- a/asdf/_serialization_context.py +++ b/asdf/_serialization_context.py @@ -222,7 +222,7 @@ def __exit__(self, exc_type, exc_value, traceback): msg = "Converter generated a key that was never used" raise OSError(msg) # now that we have an object, make the key valid - key.assign_object(self._obj) + key._assign_object(self._obj) # assign the key to the callback self._blocks._data_callbacks.assign_object(key, cb) diff --git a/asdf/_tests/_block/test_key.py b/asdf/_tests/_block/test_key.py index 1e807345a..4c038d03f 100644 --- a/asdf/_tests/_block/test_key.py +++ b/asdf/_tests/_block/test_key.py @@ -28,20 +28,20 @@ def test_unique_same_object(): def test_matches_obj(): f = Foo() bk = Key(f) - assert bk.matches_object(f) + assert bk._matches_object(f) def test_undefined_no_match(): bk = Key() - assert not bk.matches_object(Foo()) + assert not bk._matches_object(Foo()) def test_is_valid(): f = Foo() bk = Key(f) - assert bk.is_valid() + assert bk._is_valid() del f - assert not bk.is_valid() + assert not bk._is_valid() def test_same_class(): @@ -49,13 +49,13 @@ def test_same_class(): bk = Key(f) del f f2 = Foo() - assert not bk.is_valid() - assert not bk.matches_object(f2) + assert not bk._is_valid() + assert not bk._matches_object(f2) def test_undefined(): k = Key() - assert not k.is_valid() + assert not k._is_valid() def test_equal(): diff --git a/asdf/_tests/test_serialization_context.py b/asdf/_tests/test_serialization_context.py index 9eaeac868..941896522 100644 --- a/asdf/_tests/test_serialization_context.py +++ b/asdf/_tests/test_serialization_context.py @@ -131,8 +131,8 @@ class Foo: obj = Foo() with context._serialization(obj) as op_ctx: key = op_ctx.generate_block_key() - assert key.is_valid() - assert key.matches_object(obj) + assert key._is_valid() + assert key._matches_object(obj) obj = Foo() # because this test generates but does not assign a key @@ -141,5 +141,5 @@ class Foo: with context._deserialization() as op_ctx: key = op_ctx.generate_block_key() # the key does not yet have an assigned object - assert not key.is_valid() + assert not key._is_valid() op_ctx._obj = obj From 2ef0dbb36ea82e3e85eb33c42a8cd4c2f476bbf8 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 15 May 2023 16:25:44 -0400 Subject: [PATCH 093/154] support 'input' compression after overwriting compression also update docs --- asdf/_block/key.py | 47 ++++++++++--- asdf/_block/manager.py | 39 +++++++---- asdf/_block/options.py | 6 ++ asdf/_block/reader.py | 107 +++++++++++++++++++++++++++-- asdf/_block/store.py | 20 +++++- asdf/_block/writer.py | 48 +++++++++++++ asdf/_tests/_block/test_key.py | 4 +- asdf/_tests/_block/test_manager.py | 14 ++++ 8 files changed, 253 insertions(+), 32 deletions(-) diff --git a/asdf/_block/key.py b/asdf/_block/key.py index 21a8cfbfc..e0af2f5a6 100644 --- a/asdf/_block/key.py +++ b/asdf/_block/key.py @@ -1,8 +1,32 @@ -import weakref +""" +A hashable Key class that provides a means for tracking +the lifetime of objects to associate objects with +blocks, options and other parts of an asdf file. + +This Key is meant to replace uses of id(obj) which in +previous code was used to store settings (like block +array storage). 
The use of id was problematic as +an object might be deallocated (if it is removed from +the tree and all other references freed) and a new +object of the same type might occupy the same location +in memory and result in the same id. This could result +in options originally associated with the first object +being incorrectly assigned to the new object. +At it's core, Key, uses a weak reference to the object +which can be checked to see if the object is still +in memory. -class UndefinedRef: - pass +Instances of this class will be provided to extension +code (see ``SerializationContext.generate_block_key``) +as Converters will need to resupply these keys +on rewrites (updates) to allow asdf to reassociate +objects and blocks. To discourage modifications +of these Key instances all methods and attributes +are private. +""" + +import weakref class Key: @@ -14,16 +38,16 @@ def _next_key(cls): cls._next += 1 return key - def __init__(self, obj=None, key=None): - if key is None: - key = Key._next_key() - self._key = key - self._ref = UndefinedRef + def __init__(self, obj=None, _key=None): + if _key is None: + _key = Key._next_key() + self._key = _key + self._ref = None if obj is not None: self._assign_object(obj) def _is_valid(self): - if self._ref is UndefinedRef: + if self._ref is None: return False r = self._ref() if r is None: @@ -37,7 +61,7 @@ def _assign_object(self, obj): self._ref = weakref.ref(obj) def _matches_object(self, obj): - if self._ref is UndefinedRef: + if self._ref is None: return False r = self._ref() if r is None: @@ -54,4 +78,5 @@ def __eq__(self, other): return other._matches_object(self._ref()) def __copy__(self): - return self.__class__(self._ref(), self._key) + obj = self._ref if self._ref is None else self._ref() + return self.__class__(obj, self._key) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 4dcd0b0ea..99c4bf957 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -33,22 +33,31 @@ def __init__(self, read_blocks=None): super().__init__() self._read_blocks = read_blocks + def has_options(self, array): + base = util.get_array_base(array) + return self.lookup_by_object(base) is not None + + def get_options_from_block(self, array): + base = util.get_array_base(array) + # look up by block with matching _data + for block in self._read_blocks: + if block._cached_data is base or block._data is base: + # init options + if block.header["flags"] & constants.BLOCK_FLAG_STREAMED: + storage_type = "streamed" + else: + storage_type = "internal" + options = Options(storage_type, block.header["compression"]) + return options + return None + def get_options(self, array): base = util.get_array_base(array) options = self.lookup_by_object(base) if options is None: - # look up by block with matching _data - for block in self._read_blocks: - if block._cached_data is base or block._data is base: - # init options - if block.header["flags"] & constants.BLOCK_FLAG_STREAMED: - storage_type = "streamed" - else: - storage_type = "internal" - options = Options(storage_type, block.header["compression"]) - # set options - self.set_options(base, options) - break + options = self.get_options_from_block(base) + if options is not None: + self.set_options(base, options) if options is None: options = Options() self.set_options(base, options) @@ -186,6 +195,12 @@ def _get_array_storage(self, data): return self.options.get_options(data).storage_type def _set_array_compression(self, arr, compression, **compression_kwargs): + # if this is input compression but we 
already have defined options + # we need to re-lookup the options based off the block + if compression == "input" and self.options.has_options(arr): + from_block_options = self.options.get_options_from_block(arr) + if from_block_options is not None: + compression = from_block_options.compression options = self.options.get_options(arr) options.compression = compression options.compression_kwargs = compression_kwargs diff --git a/asdf/_block/options.py b/asdf/_block/options.py index ff7c6a2a8..bc2ebd198 100644 --- a/asdf/_block/options.py +++ b/asdf/_block/options.py @@ -3,6 +3,10 @@ class Options: + """ + Storage and compression options useful when reading or writing ASDF blocks. + """ + def __init__(self, storage_type=None, compression_type=None, compression_kwargs=None): if storage_type is None: with config_context() as cfg: @@ -41,6 +45,8 @@ def compression(self, compression): # "input" compression will validate as the ASDF compression module made # some assumptions about availability of information (that the input block # is known). The Options here do not have the same assumption. + # For a block read from a file, it's options will be initialized with + # the compression value read from the block header raise ValueError(msg) try: compression = mcompression.validate(compression) diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py index aae12976a..69def2128 100644 --- a/asdf/_block/reader.py +++ b/asdf/_block/reader.py @@ -6,8 +6,12 @@ class ReadBlock: + """ + Represents an ASDF block read from a file. + """ + def __init__(self, offset, fd, memmap, lazy_load, validate_checksum, header=None, data_offset=None, data=None): - self.offset = offset # after magic + self.offset = offset # after block magic bytes self._fd = weakref.ref(fd) self._header = header self.data_offset = data_offset @@ -24,6 +28,14 @@ def loaded(self): return self._data is not None def load(self): + """ + Load the block data (if it is not already loaded). + + Raises + ------ + OSError + If attempting to load from a closed file. + """ if self.loaded: return fd = self._fd() @@ -38,6 +50,20 @@ def load(self): @property def data(self): + """ + Read, parse and return data for an ASDF block. + + Returns + ------- + data : ndarray + A one-dimensional ndarray of dypte uint8 read from an ASDF block + + Raises + ------ + ValueError + If the header checksum does not match the checksum of the data + and validate_checksums was set to True. + """ if not self.loaded: self.load() if callable(self._data): @@ -55,18 +81,40 @@ def data(self): @property def cached_data(self): + """ + Return cached data for an ASDF block. + + The first time this is called it may read data from the file + (if lazy loaded). Subsequent calls will return the same + ndarray. + """ if self._cached_data is None: self._cached_data = self.data return self._cached_data @property def header(self): + """ + Get the block header. For a lazy loaded block the first time + this is called the header will be read from the file and + cached. + + Returns + ------- + header : dict + Dictionary containing the read ASDF header. + """ if not self.loaded: self.load() return self._header -def read_blocks_serially(fd, memmap=False, lazy_load=False, validate_checksums=False, after_magic=False): +def _read_blocks_serially(fd, memmap=False, lazy_load=False, validate_checksums=False, after_magic=False): + """ + Read blocks serially from a file without looking for a block index. + + For parameter and return value descriptions see `read_blocks`. 
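A short sketch of how the ReadBlock properties above are typically exercised (the module path is taken from the diff; scanning for the block magic is only a shortcut for this example, a real reader would already know the offset from parsing the file):

    import numpy as np

    import asdf
    from asdf import constants, generic_io
    from asdf._block.reader import read_blocks

    asdf.AsdfFile({"arr": np.arange(10, dtype="uint8")}).write_to("example.asdf")

    with generic_io.get_file("example.asdf", mode="r") as fd:
        # crude shortcut: seek to the first block magic instead of parsing the YAML tree
        fd.seek(fd.read().find(constants.BLOCK_MAGIC))
        blocks = read_blocks(fd, lazy_load=True)
        print(blocks[0].header["data_size"])  # header is read (and cached) on first access
        arr = blocks[0].cached_data           # uint8 ndarray, loaded once and reused
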
+ """ blocks = [] buff = b"" magic_len = len(constants.BLOCK_MAGIC) @@ -110,9 +158,56 @@ def read_blocks_serially(fd, memmap=False, lazy_load=False, validate_checksums=F def read_blocks(fd, memmap=False, lazy_load=False, validate_checksums=False, after_magic=False): + """ + Read a sequence of ASDF blocks from a file. + + If the file is seekable (and lazy_load is False) an attempt will + made to find, read and parse a block index. If this fails, the + blocks will be read serially. If parsing the block index + succeeds, the first first and last blocks will be read (to + confirm that those portions of the index are correct). All + other blocks will not be read until they are accessed. + + Parameters + ---------- + fd : file or generic_io.GenericIO + File to read. Reading will start at the current position. + + memmap : bool, optional, default False + If true, memory map block data. + + lazy_load : bool, optional, default False + If true, block data will be a callable that when executed + will return the block data. See the ``lazy_load`` argument + to ``asdf._block.io.read_block`` for more details. + + validate_checksums : bool, optional, default False + When reading blocks compute the block data checksum and + compare it to the checksum read from the block header. + Note that this comparison will occur when the data is + accessed if ``lazy_load`` was set to True. + + after_magic : bool, optional, default False + If True don't expect block magic bytes for the first block + read from the file. + + Returns + ------- + + read_blocks : list of ReadBlock + A list of ReadBlock instances. + + Raises + ------ + OSError + Invalid bytes encountered while reading blocks. + + ValueError + A read block has an invalid checksum. + """ if not lazy_load or not fd.seekable(): # load all blocks serially - return read_blocks_serially(fd, memmap, lazy_load, validate_checksums, after_magic) + return _read_blocks_serially(fd, memmap, lazy_load, validate_checksums, after_magic) # try to find block index starting_offset = fd.tell() @@ -120,7 +215,7 @@ def read_blocks(fd, memmap=False, lazy_load=False, validate_checksums=False, aft if index_offset is None: # if failed, load all blocks serially fd.seek(starting_offset) - return read_blocks_serially(fd, memmap, lazy_load, validate_checksums, after_magic) + return _read_blocks_serially(fd, memmap, lazy_load, validate_checksums, after_magic) # setup empty blocks try: @@ -128,7 +223,7 @@ def read_blocks(fd, memmap=False, lazy_load=False, validate_checksums=False, aft except OSError: # failed to read block index, fall back to serial reading fd.seek(starting_offset) - return read_blocks_serially(fd, memmap, lazy_load, validate_checksums, after_magic) + return _read_blocks_serially(fd, memmap, lazy_load, validate_checksums, after_magic) # skip magic for each block magic_len = len(constants.BLOCK_MAGIC) blocks = [ReadBlock(offset + magic_len, fd, memmap, lazy_load, validate_checksums) for offset in block_index] @@ -143,5 +238,5 @@ def read_blocks(fd, memmap=False, lazy_load=False, validate_checksums=False, aft blocks[index].load() except (OSError, ValueError): fd.seek(starting_offset) - return read_blocks_serially(fd, memmap, lazy_load, after_magic) + return _read_blocks_serially(fd, memmap, lazy_load, after_magic) return blocks diff --git a/asdf/_block/store.py b/asdf/_block/store.py index 386bd5702..7b9aeac93 100644 --- a/asdf/_block/store.py +++ b/asdf/_block/store.py @@ -4,13 +4,23 @@ class Store: + """ + A key-value store that uses ``asdf._block.key.Key`` + to allow use of 
keys that: + - are not hashable (so any object can be used) + - when the key is garbage collected, the value + will be unretrievable + """ + def __init__(self): # store contains 2 layers of lookup: id(obj), Key self._by_id = {} def lookup_by_object(self, obj, default=None): if isinstance(obj, Key): + # if obj is a Key, look up the object obj_id = id(obj._ref()) + # and use the Key obj_key = obj else: obj_id = id(obj) @@ -23,10 +33,13 @@ def lookup_by_object(self, obj, default=None): # first, lookup by id: O(1) by_key = self._by_id[obj_id] - # if we have a key, use it + # if we have a key if obj_key: + # use the key to get an existing value + # or default if this Key is unknown return by_key.get(obj_key, default) + # we have seen this id(obj) before # look for a matching key: O(N) for key, value in by_key.items(): if key._matches_object(obj): @@ -84,6 +97,11 @@ def _cleanup(self, object_id=None): class LinearStore(Store, collections.abc.Sequence): + """ + A collections.abc.Sequence that can also be accessed + like a Store (by using any object as a key). + """ + def __init__(self, init=None): super().__init__() if init is None: diff --git a/asdf/_block/writer.py b/asdf/_block/writer.py index ad21dd7e0..265287d44 100644 --- a/asdf/_block/writer.py +++ b/asdf/_block/writer.py @@ -6,6 +6,10 @@ class WriteBlock: + """ + Data and compression options needed to write and ASDF block. + """ + def __init__(self, data, compression=None, compression_kwargs=None): self._data = data self.compression = compression @@ -26,6 +30,50 @@ def data_bytes(self): def write_blocks(fd, blocks, padding=False, streamed_block=None, write_index=True): + """ + Write a list of WriteBlocks to a file + + Parameters + ---------- + fd : file or generic_io.GenericIO + File to write to. Writing will start at the current position. + + blocks : list of WriteBlock + List of WriteBlock instances used to get the data and options + to write to each ASDF block. + + padding : bool or float, optional, default False + If False, add no padding bytes between blocks. If True + add some default amount of padding. If a float, add + a number of padding bytes based off a ratio of the data + size. + See ``asdf._block.io.write_block`` ``padding`` argument for + more details. + + streamed_block : WriteBlock, optional + If provided (not None) include this WriteBlock as + the final block in the file and mark it as a streamed + block. + + write_index : bool, optional, default True + If True, include a block index at the end of the file. + If a streamed_block is provided (or the file is not + seekable) no block index will be written. + + Returns + ------- + offsets : list of int + Byte offsets (from the start of the file) where each + block was written (this is the start of the block magic + bytes for each block). This list includes the offset of + the streamed_block if it was provided. + If the file written to is not seekable these offsets + will all be None. + + headers : list of dict + Headers written for each block (including the streamed_block + if it was provided). 
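A compact sketch of the writer API described here (internal API; the module path comes from the diff, and the compression label is assumed to pass through asdf's usual compression validation):

    import numpy as np

    from asdf import generic_io
    from asdf._block.writer import WriteBlock, write_blocks

    blocks = [
        WriteBlock(np.arange(256, dtype="uint8")),
        WriteBlock(np.zeros(1024, dtype="uint8"), compression="zlib"),
    ]

    with generic_io.get_file("blocks.bin", mode="w") as fd:
        offsets, headers = write_blocks(fd, blocks, write_index=True)

    print(offsets)  # start of the block magic for each written block
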
+ """ offsets = [] headers = [] for blk in blocks: diff --git a/asdf/_tests/_block/test_key.py b/asdf/_tests/_block/test_key.py index 4c038d03f..204a761c4 100644 --- a/asdf/_tests/_block/test_key.py +++ b/asdf/_tests/_block/test_key.py @@ -81,8 +81,8 @@ def test_obj_not_equal(): def test_undefined_not_equal(): key_value = 42 - k1 = Key(key=key_value) - k2 = Key(key=key_value) + k1 = Key(_key=key_value) + k2 = Key(_key=key_value) assert k1 != k2 diff --git a/asdf/_tests/_block/test_manager.py b/asdf/_tests/_block/test_manager.py index a917a2c2c..6c43d086f 100644 --- a/asdf/_tests/_block/test_manager.py +++ b/asdf/_tests/_block/test_manager.py @@ -74,3 +74,17 @@ def test_update_outside_context(tmp_path): m = af._blocks with pytest.raises(OSError, match=r"update called outside of valid write_context"): m.update(0, False, False) + + +def test_input_compression(tmp_path): + fn = tmp_path / "test.asdf" + af = asdf.AsdfFile({"arr": np.arange(10, dtype="uint8")}) + af.set_array_compression(af["arr"], "zlib") + af.write_to(fn) + + with asdf.open(fn) as af: + assert af.get_array_compression(af["arr"]) == "zlib" + af.set_array_compression(af["arr"], "bzp2") + assert af.get_array_compression(af["arr"]) == "bzp2" + af.set_array_compression(af["arr"], "input") + assert af.get_array_compression(af["arr"]) == "zlib" From a32bffb8dd4bfd7d32d0765ae5f63075f9e468c0 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 15 May 2023 16:43:02 -0400 Subject: [PATCH 094/154] update comment about input compression --- asdf/_block/options.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/asdf/_block/options.py b/asdf/_block/options.py index bc2ebd198..641ca6b26 100644 --- a/asdf/_block/options.py +++ b/asdf/_block/options.py @@ -46,7 +46,8 @@ def compression(self, compression): # some assumptions about availability of information (that the input block # is known). The Options here do not have the same assumption. # For a block read from a file, it's options will be initialized with - # the compression value read from the block header + # the compression value read from the block header so we should never + # see 'input' at this point. raise ValueError(msg) try: compression = mcompression.validate(compression) From 70f99cf5b8573005a14d92bd19977f45cc157cf2 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 15 May 2023 17:42:39 -0400 Subject: [PATCH 095/154] simplify ReadBlocks to a subclass of UserList --- asdf/_block/manager.py | 144 ++++++++++++++++++++++++++++----- asdf/_serialization_context.py | 20 ++--- 2 files changed, 130 insertions(+), 34 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 99c4bf957..dca8f94c5 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -1,3 +1,4 @@ +import collections import contextlib import copy @@ -10,23 +11,15 @@ from .options import Options -class ReadBlocks(store.LinearStore): - """ - {obj: block_index} : where obj is NDArrayType or other high level object - [block_0, block_1, ...] - """ - - def set_blocks(self, blocks): - self._items = blocks - - def append_block(self, block): - self._items.append(block) +class ReadBlocks(collections.UserList): + # workaround inability to weakref a list + pass class OptionsStore(store.Store): """ - {array_base: options} - read_blocks (instance of ReadBlocks) + A Store of Options that can be accessed by Key + (see ``asdf._block.store.Store``). 
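A small sketch of how this store (together with Options from asdf._block.options) is used; lookups resolve through the base array, as the methods documented below describe (internal API, module paths taken from the diffs):

    import numpy as np

    from asdf._block.manager import OptionsStore, ReadBlocks
    from asdf._block.options import Options

    store = OptionsStore(ReadBlocks([]))  # no blocks read from a file in this sketch

    arr = np.arange(10, dtype="uint8")
    store.set_options(arr, Options("internal", "zlib"))

    view = arr[2:5]
    print(store.get_options(view).compression)  # views resolve to the base array: zlib
    print(store.get_output_compressions())      # the compression labels currently in use
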
""" def __init__(self, read_blocks=None): @@ -34,10 +27,45 @@ def __init__(self, read_blocks=None): self._read_blocks = read_blocks def has_options(self, array): + """ + Check of Options have been defined for this array + without falling back to generating Options from + a ReadBlock. + + Parameters + ---------- + array : ndarray + The base of this array (see `asdf.util.get_array_base`) will + be used to lookup any Options in the Store. + + Returns + ------- + has_options : bool + True if Options were previously defined for this array. + """ base = util.get_array_base(array) return self.lookup_by_object(base) is not None def get_options_from_block(self, array): + """ + Get Options for some array using only settings read from a + corresponding ReadBlock (one that shares the same base array). + Any Options defined using previous calls to set_options will + be ignored (use ``get_options`` if you would like these previously + set options to be considered). + + Parameters + ---------- + array : ndarray + The base of this array (see `asdf.util.get_array_base`) will + be used to lookup a corresponding ReadBlock. + + Returns + ------- + options : Options or None + Options initialized from settings read from a ReadBlock + or None if no corresponding block was found. + """ base = util.get_array_base(array) # look up by block with matching _data for block in self._read_blocks: @@ -52,6 +80,27 @@ def get_options_from_block(self, array): return None def get_options(self, array): + """ + Get Options for some array using either previously defined + options (as set by ``set_options``) or settings read from a + corresponding ReadBlock (one that shares the same base array). + + Note that if no options are found in the Store and options + are found from a ReadBlock the resulting Options will be added + to the Store. + + Parameters + ---------- + array : ndarray + The base of this array (see `asdf.util.get_array_base`) will + be used to lookup any Options in the Store. + + Returns + ------- + options : Options or None + Options read from the Store or ReadBlocks or None if + no options were found. + """ base = util.get_array_base(array) options = self.lookup_by_object(base) if options is None: @@ -64,6 +113,23 @@ def get_options(self, array): return options def set_options(self, array, options): + """ + Set Options for an array. + + Parameters + ---------- + array : ndarray + The base of this array (see `asdf.util.get_array_base`) will + be used to add options to the Store. + + options : Options + The Options to add to the Store for this array. + + Raises + ------ + ValueError + If more than one block is set as a streamed block. + """ if options.storage_type == "streamed": for oid, by_key in self._by_id.items(): for key, opt in by_key.items(): @@ -78,6 +144,15 @@ def set_options(self, array, options): self.assign_object(base, options) def get_output_compressions(self): + """ + Get all output compression types used for this Store of + Options. + + Returns + ------- + compressions : list of bytes + List of 4 byte compression labels used for this OptionsStore. + """ compressions = set() cfg = config.get_config() if cfg.all_array_compression == "input": @@ -96,26 +171,57 @@ def get_output_compressions(self): class Manager: + """ + Manager for reading, writing and storing options for ASDF blocks. 
+ """ + def __init__(self, read_blocks=None, uri=None, lazy_load=False, memmap=False, validate_checksums=False): if read_blocks is None: read_blocks = ReadBlocks([]) self.options = OptionsStore(read_blocks) - self.blocks = read_blocks + + self._blocks = read_blocks + self._external_block_cache = external.ExternalBlockCache() self._data_callbacks = store.Store() + self._write_blocks = store.LinearStore() self._external_write_blocks = [] self._streamed_write_block = None self._streamed_obj_keys = set() self._write_fd = None + self._uri = uri - self._external_block_cache = external.ExternalBlockCache() + + # general block settings self._lazy_load = lazy_load self._memmap = memmap self._validate_checksums = validate_checksums + @property + def blocks(self): + """ + Get any ReadBlocks that were read from an ASDF file + + Returns + ------- + read_blocks : list of ReadBlock + List of ReadBlock instances created during a call to read + or update. + """ + return self._blocks + + @blocks.setter + def blocks(self, new_blocks): + if not isinstance(new_blocks, ReadBlocks): + new_blocks = ReadBlocks(new_blocks) + self._blocks = new_blocks + # we propagate these blocks to options so that + # options lookups can fallback to the new read blocks + self.options._read_blocks = new_blocks + def read(self, fd, after_magic=False): - self.blocks.set_blocks( - reader.read_blocks(fd, self._memmap, self._lazy_load, self._validate_checksums, after_magic=after_magic) + self.blocks = reader.read_blocks( + fd, self._memmap, self._lazy_load, self._validate_checksums, after_magic=after_magic ) def _load_external(self, uri): @@ -325,7 +431,7 @@ def update(self, new_tree_size, pad_blocks, include_block_index): # we have to be lazy here as any current memmap is invalid new_read_block = reader.ReadBlock(offset + 4, self._write_fd, self._memmap, True, False, header=header) - new_read_blocks.append_block(new_read_block) + new_read_blocks.append(new_read_block) new_index = len(new_read_blocks) - 1 # update all callbacks @@ -334,7 +440,6 @@ def update(self, new_tree_size, pad_blocks, include_block_index): if obj is None: # this object no longer exists so don't both assigning it continue - new_read_blocks.assign_object(obj, new_read_block) # update data callbacks to point to new block cb = self._data_callbacks.lookup_by_object(obj) @@ -343,4 +448,3 @@ def update(self, new_tree_size, pad_blocks, include_block_index): # update read blocks to reflect new state self.blocks = new_read_blocks - self.options._read_blocks = new_read_blocks diff --git a/asdf/_serialization_context.py b/asdf/_serialization_context.py index a31d3c59f..0925d04be 100644 --- a/asdf/_serialization_context.py +++ b/asdf/_serialization_context.py @@ -207,15 +207,13 @@ class _Deserialization(_Operation): def __init__(self, ctx): super().__init__(ctx) self._obj = None - self._blk = None self._cb = None self._keys_to_assign = {} def __exit__(self, exc_type, exc_value, traceback): if exc_type is not None: return - if self._blk is not None: - self._blocks.blocks.assign_object(self._obj, self._blk) + if self._cb is not None: self._blocks._data_callbacks.assign_object(self._obj, self._cb) for key, cb in self._keys_to_assign.items(): if cb is None: @@ -227,21 +225,15 @@ def __exit__(self, exc_type, exc_value, traceback): # assign the key to the callback self._blocks._data_callbacks.assign_object(key, cb) - # and the block - blk = self._blocks.blocks[cb._index] - self._blocks.blocks.assign_object(key, blk) - def get_block_data_callback(self, index, key=None): - blk = 
self._blocks.blocks[index] if key is None: - if blk is self._blk: - # return callback for a previously access block - return self._cb - if self._blk is not None: - # for attempts to access a second block without a key + if self._cb is not None: + # this operation has already accessed a block without using + # a key so check if the same index was accessed + if self._cb._index == index: + return self._cb msg = "Converters accessing >1 block must provide a key for each block" raise OSError(msg) - self._blk = blk self._cb = self._blocks._get_data_callback(index) return self._cb From dfb0d49ac4e00c9139ecca057c32c0bf38b8bab0 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 16 May 2023 10:50:32 -0400 Subject: [PATCH 096/154] update _block.manager docstrings --- asdf/_block/manager.py | 132 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 128 insertions(+), 4 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index dca8f94c5..be62e3b57 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -220,6 +220,18 @@ def blocks(self, new_blocks): self.options._read_blocks = new_blocks def read(self, fd, after_magic=False): + """ + Read blocks from an ASDF file and update the manager read_blocks. + + Parameters + ---------- + fd : file or generic_io.GenericIO + File to read from. Reading starts at the current file position. + + after_magic : bool, optional, default False + If True, the file pointer is past the block magic bytes of the + first block. + """ self.blocks = reader.read_blocks( fd, self._memmap, self._lazy_load, self._validate_checksums, after_magic=after_magic ) @@ -252,6 +264,37 @@ def _write_external_blocks(self): writer.write_blocks(f, [blk]) def make_write_block(self, data, options, obj): + """ + Make a WriteBlock with data and options and + associate it with an object (obj). + + Parameters + ---------- + data : npdarray or callable + Data to be written to an ASDF block. Can be provided as + a callable function that when evaluated will return the + data. + options : Options + Options instance used to define the ASDF block compression + and storage type. + obj : object + An object in the ASDF tree that will be associated + with the new WriteBlock so that `AsdfFile.update` can + map newly created blocks to blocks read from the original + file. + + Returns + ------- + block_source : int or str + The relative uri (str) if an external block was created + or the index of the block (int) for an internal block. + + Raises + ------ + ValueError + If a external block was created without a URI for the main + file. + """ if options.storage_type == "external": for index, blk in enumerate(self._external_write_blocks): if blk._data is data: @@ -282,6 +325,22 @@ def make_write_block(self, data, options, obj): return len(self._write_blocks) - 1 def set_streamed_write_block(self, data, obj): + """ + Create a WriteBlock that will be written as an ASDF + streamed block. + + Parameters + ---------- + data : ndarray or callable + Data to be written to an ASDF block. Can be provided as + a callable function that when evaluated will return the + data. + obj : object + An object in the ASDF tree that will be associated + with the new WriteBlock so that `AsdfFile.update` can + map newly created blocks to blocks read from the original + file. 
+ """ if self._streamed_write_block is not None and data is not self._streamed_write_block.data: msg = "Can not add second streaming block" raise ValueError(msg) @@ -322,6 +381,10 @@ def get_output_compressions(self): @contextlib.contextmanager def options_context(self): + """ + Context manager that copies block options on + entrance and restores the options when exited. + """ previous_options = copy.deepcopy(self.options) yield self.options = previous_options @@ -329,11 +392,22 @@ def options_context(self): @contextlib.contextmanager def write_context(self, fd, copy_options=True): + """ + Context manager that copies block options on + entrance and restores the options when exited. + + Parameters + ---------- + fd : file or generic_io.GenericIO + File to be written to. This is required on + entrance to this context so that any external + blocks can resolve relative uris. + + copy_options : bool, optional, default True + Copy options on entrance and restore them on + exit (See `options_context`). + """ self._clear_write() - # this is required for setting up external blocks - # during serialization we will need to know the uri of - # the file being written to (unless a different uri was - # supplied). self._write_fd = fd if copy_options: with self.options_context(): @@ -343,6 +417,28 @@ def write_context(self, fd, copy_options=True): self._clear_write() def write(self, pad_blocks, include_block_index): + """ + Write blocks that were set up during the current + `write_context`. + + Parameters + ---------- + pad_blocks : bool, None or float + If False, add no padding bytes between blocks. If True + add some default amount of padding. If a float, add + a number of padding bytes based off a ratio of the data + size. + + include_block_index : bool + If True, include a block index at the end of the file. + If a streamed_block is provided (or the file is not + seekable) no block index will be written. + + Raises + ------ + OSError + If called outside a `write_context`. + """ if self._write_fd is None: msg = "write called outside of valid write_context" raise OSError(msg) @@ -358,6 +454,34 @@ def write(self, pad_blocks, include_block_index): self._write_external_blocks() def update(self, new_tree_size, pad_blocks, include_block_index): + """ + Perform an update-in-place of ASDF blocks set up during + a `write_context`. + + Parameters + ---------- + new_tree_size : int + Size (in bytes) of the serialized ASDF tree (and any + header bytes) that will be written at the start of the + file being updated. + + pad_blocks : bool, None or float + If False, add no padding bytes between blocks. If True + add some default amount of padding. If a float, add + a number of padding bytes based off a ratio of the data + size. + + include_block_index : bool + If True, include a block index at the end of the file. + If a streamed_block is provided (or the file is not + seekable) no block index will be written. + + + Raises + ------ + OSError + If called outside a `write_context`. 
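For context, the public calls that ultimately exercise this write and update machinery look roughly like the following (public API only):

    import numpy as np

    import asdf

    af = asdf.AsdfFile({"arr": np.zeros(1000, dtype="uint8")})
    af.set_array_compression(af["arr"], "zlib")
    af.write_to("example.asdf")  # serializes the tree and writes the blocks

    with asdf.open("example.asdf", mode="rw") as af:
        af["extra"] = np.zeros(10, dtype="uint8")
        af.update()  # rewrites the file in place, reusing existing blocks where possible
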
+ """ if self._write_fd is None: msg = "update called outside of valid write_context" raise OSError(msg) From 6676e3106cae9c07aa432e5c235b787b60440e17 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 16 May 2023 11:30:11 -0400 Subject: [PATCH 097/154] update docs fix examples and references for Stream move fix converter references in new block storage docs --- docs/asdf/arrays.rst | 8 +++++--- docs/asdf/deprecations.rst | 10 ---------- docs/asdf/extending/converters.rst | 2 +- docs/asdf/user_api.rst | 1 + 4 files changed, 7 insertions(+), 14 deletions(-) diff --git a/docs/asdf/arrays.rst b/docs/asdf/arrays.rst index ba9f79f5c..34c4cbf2e 100644 --- a/docs/asdf/arrays.rst +++ b/docs/asdf/arrays.rst @@ -158,14 +158,15 @@ implicitly determined to include all of the remaining contents of the file. By definition, it must be the last block in the file. To use streaming, rather than including a Numpy array object in the -tree, you include a `asdf.Stream` object which sets up the structure +tree, you include a `asdf.tags.core.Stream` object which sets up the structure of the streamed data, but will not write out the actual content. The file handle's ``write`` method is then used to manually write out the binary data. .. runcode:: - from asdf import AsdfFile, Stream + from asdf import AsdfFile + from asdf.tags.core import Stream import numpy as np tree = { @@ -194,7 +195,8 @@ to numpy arrays stored in ASDF: import csv import numpy as np - from asdf import AsdfFile, Stream + from asdf import AsdfFile + from asdf.tags.core import Stream tree = { # We happen to know in advance that each row in the CSV has 100 ints diff --git a/docs/asdf/deprecations.rst b/docs/asdf/deprecations.rst index 8428ef66c..6a82c38b5 100644 --- a/docs/asdf/deprecations.rst +++ b/docs/asdf/deprecations.rst @@ -66,16 +66,6 @@ Without support for ``fits_embed.AsdfInFits`` the ``extract`` and ``remove-hdu`` commands for :ref:`asdftool ` are no longer usable and are deprecated. -.. _asdffile_blocks_deprecation: - -AsdfFile.blocks Deprecation -=========================== - -Direct usage of the ASDF block manager through `asdf.AsdfFile.blocks` is deprecated. -The BlockManager api was not previously included in the documentation and -was unused by the legacy and new style extensions. Planned features for ASDF 3.0 -include adding block storage support to :ref:`converters `. - .. _tests_helpers_deprecation: asdf.tests.helpers Deprecation diff --git a/docs/asdf/extending/converters.rst b/docs/asdf/extending/converters.rst index 15de139ca..4e5d12c9c 100644 --- a/docs/asdf/extending/converters.rst +++ b/docs/asdf/extending/converters.rst @@ -366,7 +366,7 @@ the index of the block a Converter would like to use to read or write the correc block. However, the index used for reading might not be the same index for writing if the tree was modified or the file is being written to a new location. During serialization and deserialization, asdf will associate each object with the -accessed block during `from_yaml_tree` and `to_yaml_tree`. +accessed block during `Converter.from_yaml_tree` and `Converter.to_yaml_tree`. .. note:: Converters using multiple blocks are slightly more complicated. diff --git a/docs/asdf/user_api.rst b/docs/asdf/user_api.rst index 2b2647eb3..214d3de18 100644 --- a/docs/asdf/user_api.rst +++ b/docs/asdf/user_api.rst @@ -9,6 +9,7 @@ User API :inherited-members: :no-inheritance-diagram: :skip: ValidationError + :skip: Stream .. 
automodapi:: asdf.search From cc862db6dc59383109d6185716ff0701291fca44 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 16 May 2023 13:03:44 -0400 Subject: [PATCH 098/154] add docs for subclass conversion config setting --- asdf/yamlutil.py | 2 +- docs/asdf/config.rst | 58 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/asdf/yamlutil.py b/asdf/yamlutil.py index 02644faa2..e7beb0e33 100644 --- a/asdf/yamlutil.py +++ b/asdf/yamlutil.py @@ -277,7 +277,7 @@ def _walker(obj): warnings.warn( f"A ndarray subclass ({type(obj)}) was converted as a ndarray. " "This behavior will be removed from a future version of ASDF. " - "See TODO some link", + "See https://asdf.readthedocs.io/en/latest/asdf/config.html#convert-unknown-ndarray-subclasses", AsdfConversionWarning, ) return _convert_obj(obj, subtype=True) diff --git a/docs/asdf/config.rst b/docs/asdf/config.rst index ab3ed3f96..d3b85092a 100644 --- a/docs/asdf/config.rst +++ b/docs/asdf/config.rst @@ -104,6 +104,64 @@ type is not managed automatically. Defaults to ``None``. +all_array_storage +----------------- + +Use this storage type for all arrays within an ASDF file. Must be one of + +- ``"internal"`` +- ``"external"`` +- ``"inline"`` +- ``None`` + +If ``None`` a different storage type can be used for each array. +See ``AsdfFile.set_array_storage`` for more details. + +Defaults to ``None``. + +all_array_compression +--------------------- + +Use this compression type for all arrays within an ASDF file. +If ``"input"`` a different compression type can be used for each +array. See ``AsdfFile.set_array_compression`` for more details. + +Defaults to ``"input"``. + +all_array_compression_kwargs +---------------------------- + +Use these additional compression keyword arguments for all arrays +within an ASDF file. If ``None`` diffeerent keyword arguments +can be set for each array. See ``AsdfFile.set_array_compression`` for more details. + +Defaults to ``None``. + +.. _convert_unknown_ndarray_subclasses: + +convert_unknown_ndarray_subclasses +---------------------------------- + +Convert otherwise unhandled instances of subclasses of ndarray into +ndarrays prior to serialization. + +Previous extension code allowed AsdfTypes to convert instances of subclasses +of supported types. Internally, the handling of ndarrays has been moved +from an AsdfType to a Converter which does not support converting +instances of subclasses unless they are explicitly listed. This means +that code that previously relied on asdf converting instances of subclasses +of ndarray into an ndarray will need to be updated to define a Converter +for the ndarray subclass or to request that support be added directly +in asdf (for subclasses in existing asdf dependencies). + +With this setting enabled, asdf will continue to convert instances +of subclasses of ndarray but will issue a warning when an instance is +converted. In a future version of asdf this default will change +to ``False``, a deprecation warning will be issued and finally +the conversion of instances of subclasses will be removed. + +Defaults to ``True``. 
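For example, the conversion (and the associated warning) can be disabled for a block of code with the public configuration API:

    import asdf

    with asdf.config_context() as cfg:
        cfg.convert_unknown_ndarray_subclasses = False
        # any tree serialized inside this context will not silently convert
        # unhandled ndarray subclasses to plain ndarrays
        print(asdf.get_config().convert_unknown_ndarray_subclasses)  # False
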
+ default_version --------------- From e0ed2cbabd285f3055b3b07be68e59b0538e65a2 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 16 May 2023 13:33:02 -0400 Subject: [PATCH 099/154] update changelog --- CHANGES.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index d1c4525b7..469c71cfc 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -20,6 +20,9 @@ The ASDF Standard is at v1.6.0 AsdfFile.write_to and AsdfFile.update kwargs [#1592] - Fix ``AsdfFile.info`` loading all array data [#1572] - Blank out AsdfFile.tree on close [#1575] +- Move ndarray to a converter, add ``convert_unknown_ndarray_subclasses`` + to ``asdf.config.AsdfConfig``, move ``asdf.Stream`` to + ``asdf.tags.core.Stream``, update internal block API [#1537] 2.15.1 (2023-08-07) ------------------- From a3bdd90bac0c3d638d899d5758b309c6032ab9bc Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 16 May 2023 13:46:17 -0400 Subject: [PATCH 100/154] update changes --- CHANGES.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 469c71cfc..3cc9dd86c 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -22,7 +22,8 @@ The ASDF Standard is at v1.6.0 - Blank out AsdfFile.tree on close [#1575] - Move ndarray to a converter, add ``convert_unknown_ndarray_subclasses`` to ``asdf.config.AsdfConfig``, move ``asdf.Stream`` to - ``asdf.tags.core.Stream``, update internal block API [#1537] + ``asdf.tags.core.Stream``, update block storage support for + Converter and update internal block API [#1537] 2.15.1 (2023-08-07) ------------------- From 0fcff140d853d48e5a18a9908e2d92b82eeceb6c Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 17 May 2023 10:36:31 -0400 Subject: [PATCH 101/154] index write blocks by data allowing for fast matching of data when looking for objects that share blocks --- asdf/_block/manager.py | 33 ++++++++++++++++++++++++--------- asdf/_block/store.py | 3 +++ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index be62e3b57..dd0f70c02 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -16,6 +16,22 @@ class ReadBlocks(collections.UserList): pass +class WriteBlocks(store.LinearStore): + def __init__(self, init=None): + super().__init__(init) + self._by_data = store.Store() + + def lookup_by_data(self, data): + return self._by_data.lookup_by_object(data) + + def add_block(self, blk, obj): + index = len(self._items) + self._items.append(blk) + self._by_data.assign_object(blk._data, index) + self.assign_object_by_index(obj, index) + return index + + class OptionsStore(store.Store): """ A Store of Options that can be accessed by Key @@ -184,7 +200,7 @@ def __init__(self, read_blocks=None, uri=None, lazy_load=False, memmap=False, va self._external_block_cache = external.ExternalBlockCache() self._data_callbacks = store.Store() - self._write_blocks = store.LinearStore() + self._write_blocks = WriteBlocks() self._external_write_blocks = [] self._streamed_write_block = None self._streamed_obj_keys = set() @@ -243,7 +259,7 @@ def _load_external(self, uri): return value def _clear_write(self): - self._write_blocks = store.LinearStore() + self._write_blocks = WriteBlocks() self._external_write_blocks = [] self._streamed_write_block = None self._streamed_obj_keys = set() @@ -314,15 +330,14 @@ def make_write_block(self, data, options, obj): self._external_write_blocks.append(blk) return blk._uri # first, look for an existing block - for index, blk in enumerate(self._write_blocks): - if blk._data is 
data: - self._write_blocks.assign_object(obj, blk) - return index + index = self._write_blocks.lookup_by_data(data) + if index is not None: + self._write_blocks.assign_object_by_index(obj, index) + return index # if no block is found, make a new block blk = writer.WriteBlock(data, options.compression, options.compression_kwargs) - self._write_blocks._items.append(blk) - self._write_blocks.assign_object(obj, blk) - return len(self._write_blocks) - 1 + index = self._write_blocks.add_block(blk, obj) + return index def set_streamed_write_block(self, data, obj): """ diff --git a/asdf/_block/store.py b/asdf/_block/store.py index 7b9aeac93..658e395bc 100644 --- a/asdf/_block/store.py +++ b/asdf/_block/store.py @@ -118,6 +118,9 @@ def assign_object(self, obj, value): index = self._items.index(value) super().assign_object(obj, index) + def assign_object_by_index(self, obj, index): + super().assign_object(obj, index) + def __getitem__(self, index): return self._items.__getitem__(index) From a991c4eda14514a4d467a0b031e9f04f3ec2ac3a Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 17 May 2023 11:29:52 -0400 Subject: [PATCH 102/154] remove _serialization_context._Operation context manager feature entering and exiting the context manager for each node in a large tree can introduce signficant overhead (the code is roughly 20% faster without this feature). --- asdf/_serialization_context.py | 56 ++++-------- asdf/_tests/test_serialization_context.py | 100 ++++++++++------------ asdf/yamlutil.py | 61 +++++++------ 3 files changed, 99 insertions(+), 118 deletions(-) diff --git a/asdf/_serialization_context.py b/asdf/_serialization_context.py index 0925d04be..b1d68ec2b 100644 --- a/asdf/_serialization_context.py +++ b/asdf/_serialization_context.py @@ -1,4 +1,3 @@ -import contextlib import weakref from ._block.key import Key as BlockKey @@ -138,37 +137,18 @@ def generate_block_key(self): """ raise NotImplementedError("abstract") - @contextlib.contextmanager - def _serialization(self, obj): - with _Serialization(self, obj) as op: - yield op - - @contextlib.contextmanager - def _deserialization(self): - with _Deserialization(self) as op: - yield op - class _Operation(SerializationContext): """ `SerializationContext` is used for multiple operations including serialization and deserialization. The `_Operation` class allows the SerializationContext to have different behavior during these - operations (for example allowing block reading during deserialization) - and allows the context to be used with a python ``with`` statement to - allow setup and teardown operations (such as associating a - deserialized object with the blocks accessed during deserialization). - - `_Operation` subclasses should not be instantiated directly but instead - should be accessible via private methods on a `SerializationContext`. - This allows the `SerializationContext` to provide itself to the `_Operation` - which can chose to implement abstract methods in `SerializationContext` - (such as `SerializationContext.find_available_block_index` during - `_Serialization` created via `SerializationContext._serialization`). + operations (for example allowing block reading during deserialization). 
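For orientation, the Converter-facing side of this machinery (find_available_block_index during serialization, get_block_data_callback during deserialization) is typically used as in the sketch below; MyThing, the tag, and the omitted Extension registration are made-up placeholders, not part of this change:

    import numpy as np

    from asdf.extension import Converter


    class MyThing:
        """Stand-in user type with a binary payload (made up for illustration)."""

        def __init__(self, payload):
            self.payload = np.asarray(payload, dtype="uint8")


    class MyThingConverter(Converter):
        tags = ["asdf://example.com/tags/my_thing-1.0.0"]  # made-up tag
        types = [MyThing]

        def to_yaml_tree(self, obj, tag, ctx):
            # reserve (or reuse) a block index for this object's payload
            index = ctx.find_available_block_index(lambda: obj.payload)
            return {"payload_block": index}

        def from_yaml_tree(self, node, tag, ctx):
            # the callback defers reading the block data until it is called
            data_callback = ctx.get_block_data_callback(node["payload_block"])
            return MyThing(data_callback())
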
""" def __init__(self, ctx): self._ctx = weakref.ref(ctx) + self._obj = None super().__init__(ctx.version, ctx.extension_manager, ctx.url, ctx._blocks) def _mark_extension_used(self, extension): @@ -180,10 +160,10 @@ def _extensions_used(self): ctx = self._ctx() return ctx._extensions_used - def __enter__(self): - return self + def assign_object(self, obj): + self._obj = obj - def __exit__(self, exc_type, exc_value, traceback): + def assign_blocks(self): pass @@ -195,24 +175,21 @@ class _Deserialization(_Operation): - `SerializationContext.generate_block_key` - `SerializationContext.get_block_data_callback` and tracks which blocks (and keys) are accessed, assigning them - to the deserialized object at the end of the - `SerializationContext._deserialization`. - - Code that uses `_Deserialization` and accesses any blocks - or generates keys must assign an object to - `_Deserialization._obj` prior to exiting the `_Deserialization` - context manager. + to the deserialized object after `assign_object` and + `assign_blocks` are called. """ def __init__(self, ctx): super().__init__(ctx) - self._obj = None - self._cb = None - self._keys_to_assign = {} + self.assign_object(None) + + def assign_object(self, obj): + super().assign_object(obj) + if obj is None: + self._cb = None + self._keys_to_assign = {} - def __exit__(self, exc_type, exc_value, traceback): - if exc_type is not None: - return + def assign_blocks(self): if self._cb is not None: self._blocks._data_callbacks.assign_object(self._obj, self._cb) for key, cb in self._keys_to_assign.items(): @@ -262,9 +239,8 @@ class _Serialization(_Operation): being serialized. """ - def __init__(self, ctx, obj): + def __init__(self, ctx): super().__init__(ctx) - self._obj = obj def find_available_block_index(self, data_callback, key=None): if key is None: diff --git a/asdf/_tests/test_serialization_context.py b/asdf/_tests/test_serialization_context.py index 941896522..2c9e8b20b 100644 --- a/asdf/_tests/test_serialization_context.py +++ b/asdf/_tests/test_serialization_context.py @@ -3,7 +3,7 @@ import asdf from asdf import get_config -from asdf._serialization_context import SerializationContext +from asdf._serialization_context import SerializationContext, _Deserialization, _Serialization from asdf.extension import ExtensionManager @@ -31,19 +31,16 @@ def test_serialization_context(): SerializationContext("0.5.4", extension_manager, None, None) -@pytest.mark.parametrize("operation", ["_deserialization", "_serialization"]) +@pytest.mark.parametrize("operation", ["_Deserialization", "_Serialization"]) def test_extension_used_in_operation(operation): extension_manager = ExtensionManager([]) context = SerializationContext("1.4.0", extension_manager, "file://test.asdf", None) - if operation == "_serialization": - args = [object()] - else: - args = [] + op_ctx = getattr(asdf._serialization_context, operation)(context) extension = get_config().extensions[0] - with getattr(context, operation)(*args) as op_ctx: - op_ctx._mark_extension_used(extension) - assert extension in op_ctx._extensions_used + op_ctx._mark_extension_used(extension) + op_ctx._mark_extension_used(extension) + assert extension in op_ctx._extensions_used # check this persists in the parent context assert extension in context._extensions_used @@ -61,40 +58,33 @@ def test_get_block_data_callback(tmp_path): with pytest.raises(NotImplementedError, match="abstract"): context.get_block_data_callback(0) - with context._deserialization() as op_ctx: - cb0 = op_ctx.get_block_data_callback(0) + op_ctx = 
_Deserialization(context) + cb0 = op_ctx.get_block_data_callback(0) - # getting the same callback should pass and return the same object - assert op_ctx.get_block_data_callback(0) is cb0 + # getting the same callback should pass and return the same object + assert op_ctx.get_block_data_callback(0) is cb0 - # since we accessed block 0 we shouldn't be allowed to access block 1 - with pytest.raises(OSError, match=r"Converters accessing >1.*"): - op_ctx.get_block_data_callback(1) + # since we accessed block 0 we shouldn't be allowed to access block 1 + with pytest.raises(OSError, match=r"Converters accessing >1.*"): + op_ctx.get_block_data_callback(1) - # unless we use a key - key = op_ctx.generate_block_key() - cb1 = op_ctx.get_block_data_callback(1, key) - assert op_ctx.get_block_data_callback(1, key) is cb1 - - # we don't know the order of blocks, so find which block - # was used for which array by looking at the size - d0 = cb0() - d1 = cb1() - if d0.size == arr1.size: - arr0, arr1 = arr1, arr0 - np.testing.assert_array_equal(d0, arr0) - np.testing.assert_array_equal(d1, arr1) - - class Foo: - pass - - # assign a deserialized object as we accessed blocks and the context - # will expect this object to be available - op_ctx._obj = Foo() - - with context._serialization(object()) as op_ctx: - with pytest.raises(NotImplementedError, match="abstract"): - op_ctx.get_block_data_callback(0) + # unless we use a key + key = op_ctx.generate_block_key() + cb1 = op_ctx.get_block_data_callback(1, key) + assert op_ctx.get_block_data_callback(1, key) is cb1 + + # we don't know the order of blocks, so find which block + # was used for which array by looking at the size + d0 = cb0() + d1 = cb1() + if d0.size == arr1.size: + arr0, arr1 = arr1, arr0 + np.testing.assert_array_equal(d0, arr0) + np.testing.assert_array_equal(d1, arr1) + + op_ctx = _Serialization(context) + with pytest.raises(NotImplementedError, match="abstract"): + op_ctx.get_block_data_callback(0) def test_find_available_block_index(): @@ -110,12 +100,13 @@ def cb(): class Foo: pass - with context._serialization(Foo()) as op_ctx: - assert op_ctx.find_available_block_index(cb) == 0 + op_ctx = _Serialization(context) + op_ctx.assign_object(Foo()) + assert op_ctx.find_available_block_index(cb) == 0 - with context._deserialization() as op_ctx: - with pytest.raises(NotImplementedError, match="abstract"): - op_ctx.find_available_block_index(cb) + op_ctx = _Deserialization(context) + with pytest.raises(NotImplementedError, match="abstract"): + op_ctx.find_available_block_index(cb) def test_generate_block_key(): @@ -129,17 +120,18 @@ class Foo: pass obj = Foo() - with context._serialization(obj) as op_ctx: - key = op_ctx.generate_block_key() - assert key._is_valid() - assert key._matches_object(obj) + op_ctx = _Serialization(context) + op_ctx.assign_object(obj) + key = op_ctx.generate_block_key() + assert key._is_valid() + assert key._matches_object(obj) obj = Foo() + op_ctx = _Deserialization(context) # because this test generates but does not assign a key # it should raise an exception with pytest.raises(OSError, match=r"Converter generated a key.*"): - with context._deserialization() as op_ctx: - key = op_ctx.generate_block_key() - # the key does not yet have an assigned object - assert not key._is_valid() - op_ctx._obj = obj + key = op_ctx.generate_block_key() + # the key does not yet have an assigned object + assert not key._is_valid() + op_ctx.assign_blocks() diff --git a/asdf/yamlutil.py b/asdf/yamlutil.py index e7beb0e33..42e932e26 100644 --- 
a/asdf/yamlutil.py +++ b/asdf/yamlutil.py @@ -6,6 +6,7 @@ import yaml from . import config, schema, tagged, treeutil, util +from ._serialization_context import _Deserialization, _Serialization from .constants import STSCI_SCHEMA_TAG_BASE, YAML_TAG_PREFIX from .exceptions import AsdfConversionWarning from .tags.core import AsdfObject @@ -218,21 +219,18 @@ def custom_tree_to_tagged_tree(tree, ctx, _serialization_context=None): if _serialization_context is None: _serialization_context = ctx._create_serialization_context() - extension_manager = _serialization_context.extension_manager + sctx = _Serialization(_serialization_context) - def _convert_obj(obj, subtype=False): - if subtype: - converter = extension_manager._get_converter_for_subtype(type(obj)) - else: - converter = extension_manager.get_converter_for_type(type(obj)) - tag = converter.select_tag(obj, _serialization_context) - converters = set() + extension_manager = sctx.extension_manager + version_string = ctx.version_string + + def _convert_obj(obj, converter): + tag = converter.select_tag(obj, sctx) # if select_tag returns None, converter.to_yaml_tree should return a new # object which will be handled by a different converter while tag is None: converters.add(converter) - with _serialization_context._serialization(obj) as sctx: - obj = converter.to_yaml_tree(obj, tag, sctx) + obj = converter.to_yaml_tree(obj, tag, sctx) try: converter = extension_manager.get_converter_for_type(type(obj)) except KeyError: @@ -242,7 +240,10 @@ def _convert_obj(obj, subtype=False): if converter in converters: msg = "Conversion cycle detected" raise TypeError(msg) - tag = converter.select_tag(obj, _serialization_context) + tag = converter.select_tag(obj, sctx) + sctx.assign_object(obj) + node = converter.to_yaml_tree(obj, tag, sctx) + sctx.assign_blocks() if isinstance(node, GeneratorType): generator = node @@ -261,7 +262,7 @@ def _convert_obj(obj, subtype=False): msg = f"Converter returned illegal node type: {util.get_class_name(node)}" raise TypeError(msg) - _serialization_context._mark_extension_used(converter.extension) + sctx._mark_extension_used(converter.extension) yield tagged_node if generator is not None: @@ -269,28 +270,38 @@ def _convert_obj(obj, subtype=False): cfg = config.get_config() convert_ndarray_subclasses = cfg.convert_unknown_ndarray_subclasses + converters = {} def _walker(obj): - if extension_manager.handles_type(type(obj)): - return _convert_obj(obj) - if convert_ndarray_subclasses and isinstance(obj, np.ndarray) and extension_manager._handles_subtype(type(obj)): + typ = type(obj) + if typ in converters: + return converters[typ](obj) + if extension_manager.handles_type(typ): + converter = extension_manager.get_converter_for_type(typ) + converters[typ] = lambda obj, _converter=converter: _convert_obj(obj, _converter) + return _convert_obj(obj, converter) + if convert_ndarray_subclasses and isinstance(obj, np.ndarray) and extension_manager._handles_subtype(typ): warnings.warn( f"A ndarray subclass ({type(obj)}) was converted as a ndarray. " "This behavior will be removed from a future version of ASDF. 
" "See https://asdf.readthedocs.io/en/latest/asdf/config.html#convert-unknown-ndarray-subclasses", AsdfConversionWarning, ) - return _convert_obj(obj, subtype=True) + converter = extension_manager._get_converter_for_subtype(typ) + converters[typ] = lambda obj, _converter=converter: _convert_obj(obj, _converter) + return _convert_obj(obj, converter) tag = ctx._type_index.from_custom_type( - type(obj), - ctx.version_string, - _serialization_context=_serialization_context, + typ, + version_string, + _serialization_context=sctx, ) if tag is not None: + converters[typ] = lambda obj, _tag=tag: _tag.to_tree_tagged(obj, ctx) return tag.to_tree_tagged(obj, ctx) + converters[typ] = lambda obj: obj return obj return treeutil.walk_and_modify( @@ -313,6 +324,7 @@ def tagged_tree_to_custom_tree(tree, ctx, force_raw_types=False, _serialization_ _serialization_context = ctx._create_serialization_context() extension_manager = _serialization_context.extension_manager + dctx = _Deserialization(_serialization_context) def _walker(node): if force_raw_types: @@ -324,13 +336,14 @@ def _walker(node): if extension_manager.handles_tag(tag): converter = extension_manager.get_converter_for_tag(tag) - with _serialization_context._deserialization() as sctx: - obj = converter.from_yaml_tree(node.data, tag, sctx) - sctx._obj = obj - _serialization_context._mark_extension_used(converter.extension) + dctx.assign_object(None) + obj = converter.from_yaml_tree(node.data, tag, dctx) + dctx.assign_object(obj) + dctx.assign_blocks() + dctx._mark_extension_used(converter.extension) return obj - tag_type = ctx._type_index.from_yaml_tag(ctx, tag, _serialization_context=_serialization_context) + tag_type = ctx._type_index.from_yaml_tag(ctx, tag, _serialization_context=dctx) # This means the tag did not correspond to any type in our type index. 
if tag_type is None: if not ctx._ignore_unrecognized_tag: From e07ea0d8e42d3e60299d536d2fbe66a4020b5283 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 30 May 2023 09:10:24 -0400 Subject: [PATCH 103/154] add external block UseInternalType --- asdf/_block/external.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/asdf/_block/external.py b/asdf/_block/external.py index c011f8064..6f76a489a 100644 --- a/asdf/_block/external.py +++ b/asdf/_block/external.py @@ -12,10 +12,13 @@ from asdf import generic_io, util -class UseInternal: +class UseInternalType: pass +UseInternal = UseInternalType() + + class ExternalBlockCache: def __init__(self): self._cache = {} From 4e3a7aa46c313084f832baca403b528e3045ed37 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 30 May 2023 09:23:48 -0400 Subject: [PATCH 104/154] remove unused config_context in block options --- asdf/_block/options.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/asdf/_block/options.py b/asdf/_block/options.py index 641ca6b26..2e8dcbf8e 100644 --- a/asdf/_block/options.py +++ b/asdf/_block/options.py @@ -1,5 +1,5 @@ from asdf import compression as mcompression -from asdf.config import config_context +from asdf.config import get_config class Options: @@ -9,8 +9,7 @@ class Options: def __init__(self, storage_type=None, compression_type=None, compression_kwargs=None): if storage_type is None: - with config_context() as cfg: - storage_type = cfg.all_array_storage or "internal" + storage_type = get_config().all_array_storage or "internal" self._storage_type = None self._compression = None self._compression_kwargs = None From d7032b197334e3381bc52b89fd84445807cfd2b1 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 30 May 2023 09:26:25 -0400 Subject: [PATCH 105/154] change key copy to match stype of options copy --- asdf/_block/key.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asdf/_block/key.py b/asdf/_block/key.py index e0af2f5a6..fce97d14e 100644 --- a/asdf/_block/key.py +++ b/asdf/_block/key.py @@ -79,4 +79,4 @@ def __eq__(self, other): def __copy__(self): obj = self._ref if self._ref is None else self._ref() - return self.__class__(obj, self._key) + return type(self)(obj, self._key) From b227f9f396ef4f10b76913d34f57eefa8c6af699 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 30 May 2023 09:27:36 -0400 Subject: [PATCH 106/154] remove out-of-date block options comments --- asdf/_block/options.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/asdf/_block/options.py b/asdf/_block/options.py index 2e8dcbf8e..1ced09478 100644 --- a/asdf/_block/options.py +++ b/asdf/_block/options.py @@ -15,11 +15,8 @@ def __init__(self, storage_type=None, compression_type=None, compression_kwargs= self._compression_kwargs = None # set via setters - # set kwargs first to avoid overwrite when compression type changes self.compression_kwargs = compression_kwargs self.compression = compression_type - - # set storage type last to possibly overwrite compression/compression_kwargs self.storage_type = storage_type @property From ca0b9f40a27e72cb39151ee1be367dfd8fd39d37 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 30 May 2023 09:31:39 -0400 Subject: [PATCH 107/154] remove leftover name and types from Stream --- asdf/tags/core/stream.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/asdf/tags/core/stream.py b/asdf/tags/core/stream.py index 1133b1c85..de22dbd65 100644 --- a/asdf/tags/core/stream.py +++ b/asdf/tags/core/stream.py @@ -21,9 +21,6 @@ class Stream(NDArrayType): ... 
np.array([i] * 1024, np.float64).tobytes()) """ - name = None - types = [] - def __init__(self, shape, dtype, strides=None): self._shape = shape self._datatype, self._byteorder = numpy_dtype_to_asdf_datatype(dtype) From 067c6509fcf373cd6dc5f31aee3e9d9b5c14674c Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 30 May 2023 09:47:27 -0400 Subject: [PATCH 108/154] check for valid key on Store.assign_object remove unneeded else clause --- asdf/_block/store.py | 8 +++----- asdf/_tests/_block/test_store.py | 7 +++++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/asdf/_block/store.py b/asdf/_block/store.py index 658e395bc..b84073f8b 100644 --- a/asdf/_block/store.py +++ b/asdf/_block/store.py @@ -50,6 +50,9 @@ def lookup_by_object(self, obj, default=None): def assign_object(self, obj, value): if isinstance(obj, Key): + if not obj._is_valid(): + msg = "Invalid key used for assign_object" + raise ValueError(msg) obj_id = id(obj._ref()) obj_key = obj else: @@ -74,11 +77,6 @@ def assign_object(self, obj, value): return # we didn't find a matching key, so make one obj_key = Key(obj) - else: - # we already have a key, check if it's already in the store - if obj_key in by_key: - by_key[obj_key] = value - return # if no match was found, add using the key self._by_id[obj_id][obj_key] = value diff --git a/asdf/_tests/_block/test_store.py b/asdf/_tests/_block/test_store.py index 05ede0faf..3dbebe13b 100644 --- a/asdf/_tests/_block/test_store.py +++ b/asdf/_tests/_block/test_store.py @@ -84,6 +84,13 @@ def test_set_same_object(): assert s.lookup_by_object(f) == v +def test_invalid_key_assign_object(): + s = Store() + k = Key() + with pytest.raises(ValueError, match="Invalid key used for assign_object"): + s.assign_object(k, 42) + + def test_set_same_key(): f = Foo() s = Store() From 47a17f4e4393df41c60b22a994da5cb8445a7da9 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 30 May 2023 09:54:53 -0400 Subject: [PATCH 109/154] avoid fast_forwarding using header values for streamed blocks --- asdf/_block/io.py | 11 ++++++++--- asdf/_block/writer.py | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/asdf/_block/io.py b/asdf/_block/io.py index 517c67fc3..dd6f6ae67 100644 --- a/asdf/_block/io.py +++ b/asdf/_block/io.py @@ -150,12 +150,14 @@ def read_block_data(fd, header, offset=None, memmap=False): else: if memmap and fd.can_memmap(): data = fd.memmap_array(offset, used_size) - fd.fast_forward(header["allocated_size"]) + ff_bytes = header["allocated_size"] else: data = fd.read_into_array(used_size) - fd.fast_forward(header["allocated_size"] - header["used_size"]) + ff_bytes = header["allocated_size"] - header["used_size"] if (header["flags"] & constants.BLOCK_FLAG_STREAMED) and fd.seekable(): fd.seek(0, os.SEEK_END) + else: + fd.fast_forward(ff_bytes) return data @@ -222,7 +224,10 @@ def callback(): return data data = callback - fd.fast_forward(header["allocated_size"]) + if header["flags"] & constants.BLOCK_FLAG_STREAMED: + fd.seek(0, os.SEEK_END) + else: + fd.fast_forward(header["allocated_size"]) else: data = read_block_data(fd, header, offset=None, memmap=memmap) return offset, header, data_offset, data diff --git a/asdf/_block/writer.py b/asdf/_block/writer.py index 265287d44..5af5e0bcd 100644 --- a/asdf/_block/writer.py +++ b/asdf/_block/writer.py @@ -7,7 +7,7 @@ class WriteBlock: """ - Data and compression options needed to write and ASDF block. + Data and compression options needed to write an ASDF block. 
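A streamed block's header records zero for its sizes and its data simply runs to the end of the file, which is why the io change above seeks to EOF instead of fast-forwarding by ``allocated_size``. The branch in isolation, as a sketch using the same header dict and generic_io file interface as the patch (the helper name is hypothetical):

    import os

    from asdf import constants

    def skip_block_data(fd, header):
        # position fd after a block's data, given its parsed header dict
        if header["flags"] & constants.BLOCK_FLAG_STREAMED:
            # streamed blocks record no sizes; their data extends to EOF
            fd.seek(0, os.SEEK_END)
        else:
            # regular blocks may be padded, so skip the full allocation
            fd.fast_forward(header["allocated_size"])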
""" def __init__(self, data, compression=None, compression_kwargs=None): From 31568c077d8eddffa5a04d8d716b767544896ab4 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 30 May 2023 12:19:54 -0400 Subject: [PATCH 110/154] remove NDArrayType subclassing from Stream --- asdf/tags/core/stream.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asdf/tags/core/stream.py b/asdf/tags/core/stream.py index de22dbd65..fed1c84c8 100644 --- a/asdf/tags/core/stream.py +++ b/asdf/tags/core/stream.py @@ -1,7 +1,7 @@ -from .ndarray import NDArrayType, numpy_dtype_to_asdf_datatype +from .ndarray import numpy_dtype_to_asdf_datatype -class Stream(NDArrayType): +class Stream: """ Used to put a streamed array into the tree. From 254c1657fdd991e45dd2d6d2b6ced5674b565810 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 30 May 2023 13:22:52 -0400 Subject: [PATCH 111/154] change generate_write_block_header to return dict instead of packed header --- asdf/_block/io.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/asdf/_block/io.py b/asdf/_block/io.py index dd6f6ae67..06b14ed73 100644 --- a/asdf/_block/io.py +++ b/asdf/_block/io.py @@ -235,7 +235,7 @@ def callback(): def generate_write_header(data, stream=False, compression_kwargs=None, padding=False, fs_block_size=1, **header_kwargs): """ - Generate a binary representation of a ASDF block header that can be + Generate a dict representation of a ASDF block header that can be used for writing a block. Note that if a compression key is provided in ``header_kwargs`` this @@ -273,8 +273,8 @@ def generate_write_header(data, stream=False, compression_kwargs=None, padding=F Returns ------- - header : bytes - Packed binary representation of the ASDF block header. + header : dict + Dictionary representation of an ASDF block header. buff : bytes or None If this block is compressed buff will contained the compressed @@ -292,6 +292,7 @@ def generate_write_header(data, stream=False, compression_kwargs=None, padding=F header_kwargs["data_size"] = 0 header_kwargs["checksum"] = b"\0" * 16 else: + header_kwargs["flags"] = 0 header_kwargs["data_size"] = data.nbytes header_kwargs["checksum"] = calculate_block_checksum(data) @@ -318,9 +319,8 @@ def generate_write_header(data, stream=False, compression_kwargs=None, padding=F f"allocated size {header_kwargs['allocated_size']}", ) raise RuntimeError(msg) - header = BLOCK_HEADER.pack(**header_kwargs) padding_bytes = header_kwargs["allocated_size"] - header_kwargs["used_size"] - return header, buff, padding_bytes + return header_kwargs, buff, padding_bytes def write_block(fd, data, offset=None, stream=False, compression_kwargs=None, padding=False, **header_kwargs): @@ -355,9 +355,10 @@ def write_block(fd, data, offset=None, stream=False, compression_kwargs=None, pa The ASDF block header as unpacked from the `BLOCK_HEADER` used for writing. 
""" - header, buff, padding_bytes = generate_write_header( + header_dict, buff, padding_bytes = generate_write_header( data, stream, compression_kwargs, padding, fd.block_size, **header_kwargs ) + header_bytes = BLOCK_HEADER.pack(**header_dict) if offset is not None: if fd.seekable(): @@ -365,14 +366,14 @@ def write_block(fd, data, offset=None, stream=False, compression_kwargs=None, pa else: msg = "write_block received offset for non-seekable file" raise ValueError(msg) - fd.write(struct.pack(b">H", len(header))) - fd.write(header) + fd.write(struct.pack(b">H", len(header_bytes))) + fd.write(header_bytes) if buff is None: # data is uncompressed fd.write_array(data) else: fd.write(buff.getvalue()) fd.fast_forward(padding_bytes) - return BLOCK_HEADER.unpack(header) + return header_dict def _candidate_offsets(min_offset, max_offset, block_size): From 4caae1ad584a15758cebc327e206fdb998117860 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 30 May 2023 14:01:38 -0400 Subject: [PATCH 112/154] rename and clarify _block.external.relative_uri_to_index --- asdf/_block/external.py | 6 +++--- asdf/_block/manager.py | 2 +- asdf/_tests/_block/test_external.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/asdf/_block/external.py b/asdf/_block/external.py index 6f76a489a..229a361f7 100644 --- a/asdf/_block/external.py +++ b/asdf/_block/external.py @@ -37,9 +37,9 @@ def load(self, base_uri, uri): return self._cache[key] -def uri_for_index(uri, index): - parts = list(util.patched_urllib_parse.urlparse(uri)) - path = parts[2] +def relative_uri_for_index(uri, index): + # get the os-native separated path for this uri + path = util.patched_urllib_parse.urlparse(uri).path dirname, filename = os.path.split(path) filename = os.path.splitext(filename)[0] + f"{index:04d}.asdf" return filename diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index dd0f70c02..16d82c168 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -326,7 +326,7 @@ def make_write_block(self, data, options, obj): if base_uri is None: msg = "Can't write external blocks, since URI of main file is unknown." 
raise ValueError(msg) - blk._uri = external.uri_for_index(base_uri, index) + blk._uri = external.relative_uri_for_index(base_uri, index) self._external_write_blocks.append(blk) return blk._uri # first, look for an existing block diff --git a/asdf/_tests/_block/test_external.py b/asdf/_tests/_block/test_external.py index d5600044c..a7cb6f9c5 100644 --- a/asdf/_tests/_block/test_external.py +++ b/asdf/_tests/_block/test_external.py @@ -21,6 +21,6 @@ def test_cache(tmp_path): @pytest.mark.parametrize("uri", ["test.asdf", "foo/test.asdf"]) @pytest.mark.parametrize("index", [0, 1, 100]) -def test_uri_for_index(uri, index): +def test_relative_uri_for_index(uri, index): match = f"test{index:04d}.asdf" - assert external.uri_for_index(uri, index) == match + assert external.relative_uri_for_index(uri, index) == match From b98f428302e837521d746446626af17dc2cc71ae Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 30 May 2023 15:46:08 -0400 Subject: [PATCH 113/154] test for and handle files will odd 'tell' results --- asdf/_block/writer.py | 38 ++++++++++++++++------- asdf/_tests/_block/test_writer.py | 51 ++++++++++++++++++++++++++++++- asdf/generic_io.py | 1 - 3 files changed, 77 insertions(+), 13 deletions(-) diff --git a/asdf/_block/writer.py b/asdf/_block/writer.py index 5af5e0bcd..f8c36cf7d 100644 --- a/asdf/_block/writer.py +++ b/asdf/_block/writer.py @@ -74,14 +74,25 @@ def write_blocks(fd, blocks, padding=False, streamed_block=None, write_index=Tru Headers written for each block (including the streamed_block if it was provided). """ + # some non-seekable files return a valid `tell` result + # others can raise an exception, others might always + # return 0. See relevant issues: + # https://github.com/asdf-format/asdf/issues/1545 + # https://github.com/asdf-format/asdf/issues/1552 + # https://github.com/asdf-format/asdf/issues/1542 + # to enable writing a block index for all valid files + # we will wrap tell to return None on an error + + def tell(): + try: + return fd.tell() + except OSError: + return None + offsets = [] headers = [] for blk in blocks: - if fd.seekable(): - offset = fd.tell() - else: - offset = None - offsets.append(offset) + offsets.append(tell()) fd.write(constants.BLOCK_MAGIC) headers.append( bio.write_block( @@ -93,13 +104,18 @@ def write_blocks(fd, blocks, padding=False, streamed_block=None, write_index=Tru ) ) if streamed_block is not None: - if fd.seekable(): - offset = fd.tell() - else: - offset = None - offsets.append(offset) + offsets.append(tell()) fd.write(constants.BLOCK_MAGIC) headers.append(bio.write_block(fd, streamed_block.data_bytes, stream=True)) - elif len(offsets) and write_index and fd.seekable(): + + # os.pipe on windows returns a file-like object + # that reports as seekable but tell always returns 0 + # https://github.com/asdf-format/asdf/issues/1545 + # when all offsets are 0 replace them with all Nones + if all(o == 0 for o in offsets): + offsets = [None for _ in offsets] + + # only write a block index if all conditions are met + if streamed_block is None and write_index and len(offsets) and all(o is not None for o in offsets): bio.write_block_index(fd, offsets) return offsets, headers diff --git a/asdf/_tests/_block/test_writer.py b/asdf/_tests/_block/test_writer.py index 3434782ee..970a28e9d 100644 --- a/asdf/_tests/_block/test_writer.py +++ b/asdf/_tests/_block/test_writer.py @@ -28,7 +28,7 @@ def test_write_blocks(tmp_path, lazy, index, padding, compression, stream, seeka fd.seekable = lambda: False writer.write_blocks(fd, blocks, padding=padding, 
streamed_block=streamed_block, write_index=index) with generic_io.get_file(fn, mode="r") as fd: - if index and not stream and seekable: + if index and not stream: assert bio.find_block_index(fd) is not None else: assert bio.find_block_index(fd) is None @@ -51,3 +51,52 @@ def test_write_blocks(tmp_path, lazy, index, padding, compression, stream, seeka read_stream_block = read_blocks[-1] np.testing.assert_array_equal(read_stream_block.data, streamed_block.data) assert read_stream_block.header["flags"] & constants.BLOCK_FLAG_STREAMED + + +def _raise_illegal_seek(): + raise OSError("Illegal seek") + + +@pytest.mark.parametrize("stream", [True, False]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize("tell", [0, None, _raise_illegal_seek]) +def test_non_seekable_files_with_odd_tells(tmp_path, stream, index, tell): + """ + Some non-seekable files have odd 'tell' results. See: + https://github.com/asdf-format/asdf/issues/1545 + https://github.com/asdf-format/asdf/issues/1542 + + These can produce invalid block indices which should not be written + to the ASDF file. + """ + data = [np.ones(10, dtype=np.uint8), np.zeros(5, dtype=np.uint8), None] + blocks = [writer.WriteBlock(d) for d in data] + if stream: + streamed_block = writer.WriteBlock(np.ones(15, dtype=np.uint8)) + else: + streamed_block = None + fn = tmp_path / "test.bin" + with generic_io.get_file(fn, mode="w") as fd: + fd.seekable = lambda: False + if callable(tell): + fd.tell = tell + else: + fd.tell = lambda: tell + writer.write_blocks(fd, blocks, streamed_block=streamed_block, write_index=index) + with generic_io.get_file(fn, mode="r") as fd: + assert bio.find_block_index(fd) is None + fd.seek(0) + read_blocks = reader.read_blocks(fd) + if stream: + assert len(read_blocks) == (len(data) + 1) + else: + assert len(read_blocks) == len(data) + for r, d in zip(read_blocks, data): + if d is None: + assert r.data.size == 0 + else: + np.testing.assert_array_equal(r.data, d) + if stream: + read_stream_block = read_blocks[-1] + np.testing.assert_array_equal(read_stream_block.data, streamed_block.data) + assert read_stream_block.header["flags"] & constants.BLOCK_FLAG_STREAMED diff --git a/asdf/generic_io.py b/asdf/generic_io.py index af6bd7253..128a4af73 100644 --- a/asdf/generic_io.py +++ b/asdf/generic_io.py @@ -380,7 +380,6 @@ def seek(self, offset, whence=0): file`s end). 
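The test above drives the writer change earlier in this patch: pipes and other odd file objects can report ``tell()`` as 0 for every write or raise ``OSError``, and a block index built from such offsets would be wrong. The guard reduces to a small predicate (hypothetical helper name, mirroring the writer logic):

    def offsets_for_block_index(offsets):
        # offsets gathered via the wrapped tell(); None marks an unknown offset
        if not offsets:
            return None
        if all(offset == 0 for offset in offsets):
            # e.g. os.pipe on Windows reports tell() == 0 for every write
            return None
        if any(offset is None for offset in offsets):
            return None
        return offsets

    assert offsets_for_block_index([0, 0, 0]) is None
    assert offsets_for_block_index([None, 512]) is None
    assert offsets_for_block_index([420, 1024]) == [420, 1024]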
""" result = self._fd.seek(offset, whence) - self.tell() return result def tell(self): From 361f69ded19b18d7adef80f1b874446071db1fc4 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 31 May 2023 11:26:01 -0400 Subject: [PATCH 114/154] remove LinearStore this was only used for WriteBlocks so it was integrated with that class --- asdf/_block/manager.py | 71 +++++++++++++++++++++-------- asdf/_block/store.py | 40 +++------------- asdf/_tests/_block/test_callback.py | 12 ++--- asdf/_tests/_block/test_store.py | 24 +--------- 4 files changed, 64 insertions(+), 83 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 16d82c168..6e5d81e76 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -16,19 +16,57 @@ class ReadBlocks(collections.UserList): pass -class WriteBlocks(store.LinearStore): - def __init__(self, init=None): - super().__init__(init) - self._by_data = store.Store() +class WriteBlocks(collections.abc.Sequence): + """ + A collection of WriteBlock objects that can be accessed by: + - numerical index + - the object or objects in the tree that created or + are associated with this object + - the block data + Access by object and data is via a Store which generates + Keys to allow use of non-hashable objects (and to not hold + a reference to the block data). + """ + + def __init__(self, blocks=None): + super().__init__() + if blocks is None: + blocks = [] + self._blocks = blocks + + # both stores contain values that are indices of + # WriteBlock instances in _blocks + self._data_store = store.Store() + self._object_store = store.Store() + + # -- access by index -- + + def __getitem__(self, index): + return self._blocks.__getitem__(index) + + def __len__(self): + return self._blocks.__len__() def lookup_by_data(self, data): - return self._by_data.lookup_by_object(data) + return self._data_store.lookup_by_object(data) + + def assign_data(self, data, index): + self._data_store.assign_object(data, index) + + def lookup_by_object(self, obj): + return self._object_store.lookup_by_object(obj) + + def assign_object(self, obj, index): + self._object_store.assign_object(obj, index) + + def object_keys_for_index(self, index): + yield from self._object_store.keys_for_value(index) def add_block(self, blk, obj): - index = len(self._items) - self._items.append(blk) - self._by_data.assign_object(blk._data, index) - self.assign_object_by_index(obj, index) + index = len(self._blocks) + self._blocks.append(blk) + self._data_store.assign_object(blk._data, index) + self._object_store.assign_object(obj, index) return index @@ -332,7 +370,7 @@ def make_write_block(self, data, options, obj): # first, look for an existing block index = self._write_blocks.lookup_by_data(data) if index is not None: - self._write_blocks.assign_object_by_index(obj, index) + self._write_blocks.assign_object(obj, index) return index # if no block is found, make a new block blk = writer.WriteBlock(data, options.compression, options.compression_kwargs) @@ -554,19 +592,12 @@ def update(self, new_tree_size, pad_blocks, include_block_index): # map new blocks to old blocks new_read_blocks = ReadBlocks() for i, (offset, header) in enumerate(zip(offsets, headers)): - # find all objects that assigned themselves to - # the write block (wblk) at index i + # find all objects that assigned themselves to the write block at index i if i == len(self._write_blocks): # this is a streamed block obj_keys = self._streamed_obj_keys - wblk = self._streamed_write_block else: - wblk = self._write_blocks[i] - # find object 
associated with wblk - obj_keys = set() - for oid, by_key in self._write_blocks._by_id.items(): - for key, index in by_key.items(): - if self._write_blocks[index] is wblk: - obj_keys.add(key) + # find object associated with this write block + obj_keys = set(self._write_blocks.object_keys_for_index(i)) # we have to be lazy here as any current memmap is invalid new_read_block = reader.ReadBlock(offset + 4, self._write_fd, self._memmap, True, False, header=header) diff --git a/asdf/_block/store.py b/asdf/_block/store.py index b84073f8b..6dffcdfbc 100644 --- a/asdf/_block/store.py +++ b/asdf/_block/store.py @@ -1,5 +1,3 @@ -import collections.abc - from .key import Key @@ -81,6 +79,12 @@ def assign_object(self, obj, value): # if no match was found, add using the key self._by_id[obj_id][obj_key] = value + def keys_for_value(self, value): + for oid, by_key in self._by_id.items(): + for key, stored_value in by_key.items(): + if stored_value == value and key._is_valid(): + yield key + def _cleanup(self, object_id=None): if object_id is None: for oid in set(self._by_id): @@ -92,35 +96,3 @@ def _cleanup(self, object_id=None): del by_key[key] if not len(by_key): del self._by_id[object_id] - - -class LinearStore(Store, collections.abc.Sequence): - """ - A collections.abc.Sequence that can also be accessed - like a Store (by using any object as a key). - """ - - def __init__(self, init=None): - super().__init__() - if init is None: - init = [] - self._items = init - - def lookup_by_object(self, obj): - index = super().lookup_by_object(obj) - if index is None: - return None - return self[index] - - def assign_object(self, obj, value): - index = self._items.index(value) - super().assign_object(obj, index) - - def assign_object_by_index(self, obj, index): - super().assign_object(obj, index) - - def __getitem__(self, index): - return self._items.__getitem__(index) - - def __len__(self): - return self._items.__len__() diff --git a/asdf/_tests/_block/test_callback.py b/asdf/_tests/_block/test_callback.py index f0f59d63d..d173ded44 100644 --- a/asdf/_tests/_block/test_callback.py +++ b/asdf/_tests/_block/test_callback.py @@ -1,7 +1,7 @@ import pytest from asdf._block.callback import DataCallback -from asdf._block.store import LinearStore +from asdf._block.manager import ReadBlocks def test_default_attribute(): @@ -9,7 +9,7 @@ class Data: def __init__(self, value): self.data = value - blks = LinearStore([Data("a"), Data("b")]) + blks = ReadBlocks([Data("a"), Data("b")]) cbs = [DataCallback(0, blks), DataCallback(1, blks)] assert cbs[0]() == "a" @@ -21,7 +21,7 @@ class Foo: def __init__(self, attr, value): setattr(self, attr, value) - blks = LinearStore([Foo("a", "foo"), Foo("a", "bar")]) + blks = ReadBlocks([Foo("a", "foo"), Foo("a", "bar")]) cb = DataCallback(0, blks) assert cb(_attr="a") == "foo" @@ -32,7 +32,7 @@ class Data: def __init__(self, value): self.data = value - blks = LinearStore([Data("a"), Data("b")]) + blks = ReadBlocks([Data("a"), Data("b")]) cb = DataCallback(0, blks) del blks @@ -45,12 +45,12 @@ class Data: def __init__(self, value): self.data = value - blks = LinearStore([Data("a"), Data("b")]) + blks = ReadBlocks([Data("a"), Data("b")]) cb = DataCallback(0, blks) assert cb() == "a" - blks2 = LinearStore([Data("c"), Data("d")]) + blks2 = ReadBlocks([Data("c"), Data("d")]) cb._reassign(1, blks2) assert cb() == "d" diff --git a/asdf/_tests/_block/test_store.py b/asdf/_tests/_block/test_store.py index 3dbebe13b..11fae13b1 100644 --- a/asdf/_tests/_block/test_store.py +++ 
b/asdf/_tests/_block/test_store.py @@ -3,7 +3,7 @@ import pytest from asdf._block.key import Key -from asdf._block.store import LinearStore, Store +from asdf._block.store import Store # a blank class for testing @@ -148,25 +148,3 @@ def test_cleanup(): del f s._cleanup() assert s.lookup_by_object(k, None) is None - - -def test_linear_store(): - foos = [Foo(), Foo(), Foo()] - values = ["a", "b", "c"] - s = LinearStore(values) - assert len(s) == len(values) - for f, v in zip(foos, values): - s.assign_object(f, v) - for f, v in zip(foos, values): - assert s.lookup_by_object(f) == v - - -def test_linear_store_missing_value(): - s = LinearStore() - with pytest.raises(ValueError, match=".*is not in list.*"): - s.assign_object(Foo(), "missing") - - -def test_linear_store_lookup_unknown_object(): - s = LinearStore() - assert s.lookup_by_object(Foo()) is None From c1e7975fe96aa66eaad3921d95a8adf8e4099e42 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 31 May 2023 11:36:15 -0400 Subject: [PATCH 115/154] add unit test for Store.keys_for_value --- asdf/_tests/_block/test_store.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/asdf/_tests/_block/test_store.py b/asdf/_tests/_block/test_store.py index 11fae13b1..dd48b4013 100644 --- a/asdf/_tests/_block/test_store.py +++ b/asdf/_tests/_block/test_store.py @@ -148,3 +148,27 @@ def test_cleanup(): del f s._cleanup() assert s.lookup_by_object(k, None) is None + + +def test_keys_for_value(): + s = Store() + data = { + Foo(): 42, + Foo(): 26, + Foo(): 42, + Foo(): 11, + } + data_by_value = {} + for o, v in data.items(): + s.assign_object(o, v) + data_by_value[v] = [*data_by_value.get(v, []), o] + + for v, objs in data_by_value.items(): + objs = set(objs) + returned_objects = set() + for k in s.keys_for_value(v): + assert k._is_valid() + obj = k._ref() + returned_objects.add(obj) + assert objs == returned_objects + del returned_objects, objs From c780a059517476157e6e3651826cb6113a1e0117 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 31 May 2023 14:27:47 -0400 Subject: [PATCH 116/154] remove unused WriteBlocks functions add docs --- asdf/_block/__init__.py | 61 ++++++++++++++++++++++++++ asdf/_block/manager.py | 97 ++++++++++++++++++++++++++++++++--------- 2 files changed, 137 insertions(+), 21 deletions(-) diff --git a/asdf/_block/__init__.py b/asdf/_block/__init__.py index e69de29bb..65f66acb9 100644 --- a/asdf/_block/__init__.py +++ b/asdf/_block/__init__.py @@ -0,0 +1,61 @@ +""" +Submodule for reading and writing ASDF blocks. + +The primary interface to this submodule is ``_block.manager.Manager`` +that in some ways mimics the older ``BlockManager``. An instance +of ``Manager`` will be created by each `asdf.AsdfFile` instance. + +Internally, this submodule is broken up into: + - low-level: + - ``io``: functions for reading and writing blocks + - ``key``: ``Key`` used to implement ``Store`` (see below) + - ``store``: ``Store`` special key-value store for indexing blocks + - medium-level: + - ``reader``: ``ReadBlock`` and ``read_blocks`` + - ``writer``: ``WriteBlock`` and ``write_blocks`` + - ``callback``: ``DataCallback`` for reading block data + - ``external``: ``ExternalBlockCache`` for reading external blocks + - ``options``: ``Options`` controlling block storage + - high-level: + - ``manager``: ``Manager`` and associated classes + + +The low-level ``io`` functions are responsible for reading and writing +bytes compatible with the block format defined in the ASDF standard. 
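``test_keys_for_value`` above pins down the ``Store`` contract: values are keyed by object identity through weak references, and ``keys_for_value`` only yields keys whose object is still alive. A short usage sketch against the private ``asdf._block.store`` API added in this series (``Marker`` is a made-up class):

    from asdf._block.store import Store

    class Marker:
        pass

    store = Store()
    a, b = Marker(), Marker()
    store.assign_object(a, "block-0")
    store.assign_object(b, "block-0")

    assert store.lookup_by_object(a) == "block-0"

    # both live objects are reachable from the stored value
    live = [key._ref() for key in store.keys_for_value("block-0")]
    assert a in live and b in live

    # once an object is garbage collected its key is skipped
    del b, live
    assert [key._ref() for key in store.keys_for_value("block-0")] == [a]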
+These should be compatible with as wide a variety of file formats as possible +including files that are: + - seekable and non-seekable + - memory mappable + - accessed from a remote server + - stored in memory + - etc + +To help organize ASDF block data the ``key`` and ``store`` submodules +provide a special key-value store, ``Store``. ``Store`` uses ``Key`` +instances to tie the lifetime of values to the lifetime of objects +in the ASDF tree (without keeping references to the objects) and +allows non-hashable objects to be used as keys. See the ``key`` +submodule docstring for more details. One usage of ``Store`` is +for managing ASDF block ``Options``. ``Options`` determine where +and how array data will be written and a single ``Options`` instance +might be associated with several arrays within the ASDF tree +(if the arrays share the same base array). By using a ``Key`` generated +with the base array the block ``Options`` can be stored in a ``Store`` +without keeping a reference to the base array and these ``Options`` +will be made unavailable if the base array is garbage collected (so +they are not inapproriately assigned to a new array). + +The medium-level submodules ``reader`` and ``writer`` each define +a helper class and function for reading or writing blocks: + - ``ReadBlock`` and ``WriteBlock`` + - ``read_blocks`` and ``write_blocks`` +These abstract some of the complexity of reading and writing blocks +using the low-level API and are the primary means by which the ``Manager`` +reads and writes ASDF blocks. Reading of external blocks by the ``Manager`` +requires some special handling which is contained in the ``external`` +submodule. + +To allow for lazy-loading of ASDF block data, ``callback`` defines +``DataCallback`` which allows reading block data even after the blocks +have been rearranged following an update-in-place. +""" diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 6e5d81e76..8bf0b0124 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -12,16 +12,23 @@ class ReadBlocks(collections.UserList): - # workaround inability to weakref a list + """ + A list of ReadBlock instances. + + A simple list can't be used as other code will need + to genearate a weakref to instances of this class + (and it is not possible to generate a weakref to a list). 
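The restriction mentioned above is a plain CPython fact and easy to check: built-in ``list`` instances reject weak references, while instances of a normal class such as a ``collections.UserList`` subclass accept them.

    import collections
    import weakref

    class ReadBlocks(collections.UserList):
        pass

    weakref.ref(ReadBlocks([1, 2, 3]))  # works

    try:
        weakref.ref([1, 2, 3])
    except TypeError as err:
        print(err)  # cannot create weak reference to 'list' object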
+ """ + pass class WriteBlocks(collections.abc.Sequence): """ - A collection of WriteBlock objects that can be accessed by: - - numerical index + A collection of ``WriteBlock`` instances that can be accessed by: + - numerical index (see ``collections.abc.Sequence``) - the object or objects in the tree that created or - are associated with this object + are associated with this block - the block data Access by object and data is via a Store which generates Keys to allow use of non-hashable objects (and to not hold @@ -39,45 +46,54 @@ def __init__(self, blocks=None): self._data_store = store.Store() self._object_store = store.Store() - # -- access by index -- - def __getitem__(self, index): return self._blocks.__getitem__(index) def __len__(self): return self._blocks.__len__() - def lookup_by_data(self, data): + def index_for_data(self, data): return self._data_store.lookup_by_object(data) - def assign_data(self, data, index): - self._data_store.assign_object(data, index) - - def lookup_by_object(self, obj): - return self._object_store.lookup_by_object(obj) - - def assign_object(self, obj, index): + def assign_object_to_index(self, obj, index): self._object_store.assign_object(obj, index) def object_keys_for_index(self, index): yield from self._object_store.keys_for_value(index) - def add_block(self, blk, obj): + def append_block(self, blk, obj): + """ + Append a ``WriteBlock`` instance to this collection + assign an object, obj, to the block and return + the index of the block within the collection. + """ index = len(self._blocks) self._blocks.append(blk) + + # assign the block data to this block to allow + # fast lookup of blocks based on data self._data_store.assign_object(blk._data, index) + + # assign the object that created/uses this block self._object_store.assign_object(obj, index) return index class OptionsStore(store.Store): """ - A Store of Options that can be accessed by Key + A ``Store`` of ``Options`` that can be accessed by the base + array that corresponds to a block. A ``Store`` is used + to avoid holding references to the array data (see ``asdf._block.store.Store``). + + When ``Options`` are not found within the ``Store``, the + ``OptionsStore`` will look for any available matching + ``ReadBlock`` to determine default Options. """ - def __init__(self, read_blocks=None): + def __init__(self, read_blocks): super().__init__() + # ReadBlocks are needed to look up default options self._read_blocks = read_blocks def has_options(self, array): @@ -226,7 +242,43 @@ def get_output_compressions(self): class Manager: """ - Manager for reading, writing and storing options for ASDF blocks. + ``Manager`` for reading, writing and storing options for ASDF blocks. + + This class does the heavy lifting of allowing ``asdf.AsdfFile`` instances + to control ASDF blocks. It is responsible for reading and writing blocks + primarily to maintain some consistency with the previous BlockManager. + + Block ``Options`` control the compression and type of storage for an + ASDF block (see `asdf.AsdfFile.set_array_storage`, + `asdf.AsdfFile.set_array_compression` + `asdf.AsdfFile.set_array_compression` for relevant usage and information). + These ``Options`` instances are stored and retrieved using the base + of the array containing the data for an ASDF block. This allows arrays + that share the same base array (ie views of the same array) to use + the same ASDF block. + + Reading blocks occurs through use of ``Manager.read`` which will + create ``ReadBlock`` instances for each read ASDF block. 
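For orientation, the public calls this docstring points at look roughly like the sketch below; ``set_array_storage``, ``set_array_compression``, ``write_to`` and ``update`` are existing ``asdf.AsdfFile`` methods, and the file name and arrays are arbitrary:

    import asdf
    import numpy as np

    arr = np.arange(100)
    af = asdf.AsdfFile({"data": arr, "view": arr[10:20]})

    # the view shares arr as its base, so both tree entries map to one block
    # and therefore to a single Options instance
    af.set_array_compression(arr, "zlib")
    af.set_array_storage(arr, "internal")
    af.write_to("example.asdf")

    with asdf.open("example.asdf", mode="rw") as af2:
        af2.tree["extra"] = np.zeros(10)
        af2.update()  # rewrites the block section in place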
These ``ReadBlock`` + will be used as the source for default ``Options`` for each block + and ASDF block data can be read using ``DataCallback`` instances. + These callbacks are used (instead of just accessing blocks by index) + to allow block reorganization during ``update``.(Note that reading + of external blocks is special as these are not stored within the + block section of the ASDF file. These must be explicitly loaded + using ``Manager._load_external``). + + Writing ASDF blocks occurs through use of ``Manager.write`` which will + take any queued ``WriteBlocks`` (created via ``Manager.make_write_block`` + and ``Manager.set_streamed_write_block``) and write them out to a file. + This writing must occur within a ``Manager.write_context`` to allow the + ``Manager`` to reset any ``Options`` changes that occur during write + and to clean up the write queue. + + Update-in-place occurs through use of ``Manager.update`` which, like + ``Manager.write`` must occur within a ``Manager.write_context``. Following + a ``Manager.update`` the ``ReadBlock`` instances will be replaced with + the newly written ASDF blocks and any ``DataCallbacks`` will be updated + to reference the appropriate new ``ReadBlock``. """ def __init__(self, read_blocks=None, uri=None, lazy_load=False, memmap=False, validate_checksums=False): @@ -244,6 +296,9 @@ def __init__(self, read_blocks=None, uri=None, lazy_load=False, memmap=False, va self._streamed_obj_keys = set() self._write_fd = None + # store the uri of the ASDF file here so that the Manager can + # resolve and load external blocks without requiring a reference + # to the AsdfFile instance self._uri = uri # general block settings @@ -368,13 +423,13 @@ def make_write_block(self, data, options, obj): self._external_write_blocks.append(blk) return blk._uri # first, look for an existing block - index = self._write_blocks.lookup_by_data(data) + index = self._write_blocks.index_for_data(data) if index is not None: - self._write_blocks.assign_object(obj, index) + self._write_blocks.assign_object_to_index(obj, index) return index # if no block is found, make a new block blk = writer.WriteBlock(data, options.compression, options.compression_kwargs) - index = self._write_blocks.add_block(blk, obj) + index = self._write_blocks.append_block(blk, obj) return index def set_streamed_write_block(self, data, obj): From df8e9614d52fbfa20ca3704e88dafc5e633ec972 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 31 May 2023 16:51:48 -0400 Subject: [PATCH 117/154] add custom exception BlockIndexError for block index reading/parsing --- asdf/_block/exceptions.py | 4 ++++ asdf/_block/io.py | 10 ++++++---- asdf/_block/reader.py | 3 ++- asdf/_tests/_block/test_io.py | 7 ++++--- 4 files changed, 16 insertions(+), 8 deletions(-) create mode 100644 asdf/_block/exceptions.py diff --git a/asdf/_block/exceptions.py b/asdf/_block/exceptions.py new file mode 100644 index 000000000..f36ef5527 --- /dev/null +++ b/asdf/_block/exceptions.py @@ -0,0 +1,4 @@ +class BlockIndexError(Exception): + """ + An error occurred while reading or parsing an ASDF block index + """ diff --git a/asdf/_block/io.py b/asdf/_block/io.py index 06b14ed73..e0afa7d23 100644 --- a/asdf/_block/io.py +++ b/asdf/_block/io.py @@ -13,6 +13,8 @@ from asdf import compression as mcompression from asdf import constants, util +from .exceptions import BlockIndexError + BLOCK_HEADER = util.BinaryStruct( [ ("flags", "I"), @@ -465,7 +467,7 @@ def read_block_index(fd, offset=None): Raises ------ - OSError + BlockIndexError The data read from 
the file did not contain a valid block index. """ @@ -474,17 +476,17 @@ def read_block_index(fd, offset=None): buff = fd.read(len(constants.INDEX_HEADER)) if buff != constants.INDEX_HEADER: msg = "Failed to read block index header at offset {offset}" - raise OSError(msg) + raise BlockIndexError(msg) try: block_index = yaml.load(fd.read(-1), yaml.SafeLoader) except yaml.error.YAMLError: - raise OSError("Failed to parse block index as yaml") + raise BlockIndexError("Failed to parse block index as yaml") if ( not isinstance(block_index, list) or any(not isinstance(v, int) for v in block_index) or block_index != sorted(block_index) ): - raise OSError("Invalid block index") + raise BlockIndexError("Invalid block index") return block_index diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py index 69def2128..91c5be7fb 100644 --- a/asdf/_block/reader.py +++ b/asdf/_block/reader.py @@ -3,6 +3,7 @@ from asdf import constants from . import io as bio +from .exceptions import BlockIndexError class ReadBlock: @@ -220,7 +221,7 @@ def read_blocks(fd, memmap=False, lazy_load=False, validate_checksums=False, aft # setup empty blocks try: block_index = bio.read_block_index(fd, index_offset) - except OSError: + except BlockIndexError: # failed to read block index, fall back to serial reading fd.seek(starting_offset) return _read_blocks_serially(fd, memmap, lazy_load, validate_checksums, after_magic) diff --git a/asdf/_tests/_block/test_io.py b/asdf/_tests/_block/test_io.py index 0ba772df6..15c0f22ef 100644 --- a/asdf/_tests/_block/test_io.py +++ b/asdf/_tests/_block/test_io.py @@ -6,6 +6,7 @@ from asdf import constants, generic_io from asdf._block import io as bio +from asdf._block.exceptions import BlockIndexError def test_checksum(tmp_path): @@ -336,21 +337,21 @@ def test_read_block_index_no_header(tmp_path): generate_block_index_file(fn, values=values, offset=0) with generic_io.get_file(fn, "r") as fd: fd.seek(len(constants.INDEX_HEADER)) - with pytest.raises(OSError, match="Failed to read block index.*"): + with pytest.raises(BlockIndexError, match="Failed to read block index.*"): assert bio.read_block_index(fd) == values def test_read_block_index_invalid_yaml(): bs = io.BytesIO(constants.INDEX_HEADER + b"][") with generic_io.get_file(bs, "r") as fd: - with pytest.raises(OSError, match="Failed to parse block index as yaml"): + with pytest.raises(BlockIndexError, match="Failed to parse block index as yaml"): bio.read_block_index(fd) def test_read_block_index_valid_yaml_invalid_contents(): bs = io.BytesIO(constants.INDEX_HEADER + b"['a', 'b']") with generic_io.get_file(bs, "r") as fd: - with pytest.raises(OSError, match="Invalid block index"): + with pytest.raises(BlockIndexError, match="Invalid block index"): bio.read_block_index(fd) From 843fb8c4c302b7c7abf664286f4964783ca171d1 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 5 Jun 2023 13:10:11 -0400 Subject: [PATCH 118/154] get version string from serialization context --- asdf/yamlutil.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asdf/yamlutil.py b/asdf/yamlutil.py index 42e932e26..73b17c4b0 100644 --- a/asdf/yamlutil.py +++ b/asdf/yamlutil.py @@ -222,7 +222,7 @@ def custom_tree_to_tagged_tree(tree, ctx, _serialization_context=None): sctx = _Serialization(_serialization_context) extension_manager = sctx.extension_manager - version_string = ctx.version_string + version_string = str(sctx.version) def _convert_obj(obj, converter): tag = converter.select_tag(obj, sctx) From 89974b1d7ac05f8ac0a3cb0c038dffab6826384b Mon Sep 17 
00:00:00 2001 From: Brett Date: Mon, 5 Jun 2023 15:49:15 -0400 Subject: [PATCH 119/154] select SerializationContext operation at creation Instead of wrapping SerializationContext instances in _Operations like _Serialization for writing blocks and _Deserialization for reading blocks, choose the BlockAccess mode at creation by creating subclasses of SerializationContext: ReadBlocksContext WriteBlocksContext Add a ``block_access`` argument to ``AsdfFile._create_serialization_context`` to control which of these subclasses (or a generic no block access ``SerializationContext``) is created. --- asdf/_serialization_context.py | 65 +++++++++++------------ asdf/_tests/test_serialization_context.py | 38 +++++-------- asdf/asdf.py | 9 ++-- asdf/yamlutil.py | 41 +++++++------- 4 files changed, 68 insertions(+), 85 deletions(-) diff --git a/asdf/_serialization_context.py b/asdf/_serialization_context.py index b1d68ec2b..18820f402 100644 --- a/asdf/_serialization_context.py +++ b/asdf/_serialization_context.py @@ -1,4 +1,4 @@ -import weakref +import enum from ._block.key import Key as BlockKey from ._block.options import Options as BlockOptions @@ -16,6 +16,7 @@ def __init__(self, version, extension_manager, url, blocks): self._extension_manager = extension_manager self._url = url self._blocks = blocks + self._obj = None self.__extensions_used = set() @@ -137,29 +138,6 @@ def generate_block_key(self): """ raise NotImplementedError("abstract") - -class _Operation(SerializationContext): - """ - `SerializationContext` is used for multiple operations - including serialization and deserialization. The `_Operation` class - allows the SerializationContext to have different behavior during these - operations (for example allowing block reading during deserialization). - """ - - def __init__(self, ctx): - self._ctx = weakref.ref(ctx) - self._obj = None - super().__init__(ctx.version, ctx.extension_manager, ctx.url, ctx._blocks) - - def _mark_extension_used(self, extension): - ctx = self._ctx() - ctx._mark_extension_used(extension) - - @property - def _extensions_used(self): - ctx = self._ctx() - return ctx._extensions_used - def assign_object(self, obj): self._obj = obj @@ -167,11 +145,11 @@ def assign_blocks(self): pass -class _Deserialization(_Operation): +class ReadBlocksContext(SerializationContext): """ Perform deserialization (reading) with a `SerializationContext`. - To allow for block access, `_Deserialization` implements: + To allow for block access, `ReadBlocksContext` implements: - `SerializationContext.generate_block_key` - `SerializationContext.get_block_data_callback` and tracks which blocks (and keys) are accessed, assigning them @@ -179,8 +157,8 @@ class _Deserialization(_Operation): `assign_blocks` are called. """ - def __init__(self, ctx): - super().__init__(ctx) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) self.assign_object(None) def assign_object(self, obj): @@ -190,6 +168,7 @@ def assign_object(self, obj): self._keys_to_assign = {} def assign_blocks(self): + super().assign_blocks() if self._cb is not None: self._blocks._data_callbacks.assign_object(self._obj, self._cb) for key, cb in self._keys_to_assign.items(): @@ -228,20 +207,17 @@ def generate_block_key(self): return key -class _Serialization(_Operation): +class WriteBlocksContext(SerializationContext): """ Perform serialization (writing) with a `SerializationContext`. 
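The commit message above describes the new shape: instead of wrapping a context per operation, the subclass is chosen when the context is created, and the ``BlockAccess`` enum added later in this commit simply maps each access mode to the class to instantiate. The pattern in isolation, sketched with hypothetical names:

    import enum

    class Context:
        def __init__(self, url):
            self.url = url

    class ReadContext(Context):
        pass

    class WriteContext(Context):
        pass

    class Access(enum.Enum):
        # classes are legal enum values; .value is then the class itself
        NONE = Context
        READ = ReadContext
        WRITE = WriteContext

    def create(url, access=Access.NONE):
        return access.value(url)

    assert type(create("file://x.asdf", Access.READ)) is ReadContext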
- To allow for block access, `_Serialization` implements: + To allow for block access, `WriteBlocksContext` implements: - `SerializationContext.generate_block_key` - `SerializationContext.find_available_block_index` and assigns any accessed blocks (and keys) to the object being serialized. """ - def __init__(self, ctx): - super().__init__(ctx) - def find_available_block_index(self, data_callback, key=None): if key is None: key = self._obj @@ -249,3 +225,26 @@ def find_available_block_index(self, data_callback, key=None): def generate_block_key(self): return BlockKey(self._obj) + + +class BlockAccess(enum.Enum): + """ """ + + NONE = SerializationContext + WRITE = WriteBlocksContext + READ = ReadBlocksContext + + +def create(asdf_file, block_access=BlockAccess.NONE): + """ + Create a SerializationContext instance (or subclass) using + an AsdfFile instance, asdf_file. + + Parameters + ---------- + asdf_file : asdf.AsdfFile + + block_access : BlockAccess, optional + Defaults to BlockAccess.NONE + """ + return block_access.value(asdf_file.version_string, asdf_file.extension_manager, asdf_file.uri, asdf_file._blocks) diff --git a/asdf/_tests/test_serialization_context.py b/asdf/_tests/test_serialization_context.py index 2c9e8b20b..ef9116555 100644 --- a/asdf/_tests/test_serialization_context.py +++ b/asdf/_tests/test_serialization_context.py @@ -3,7 +3,7 @@ import asdf from asdf import get_config -from asdf._serialization_context import SerializationContext, _Deserialization, _Serialization +from asdf._serialization_context import BlockAccess, SerializationContext from asdf.extension import ExtensionManager @@ -31,20 +31,6 @@ def test_serialization_context(): SerializationContext("0.5.4", extension_manager, None, None) -@pytest.mark.parametrize("operation", ["_Deserialization", "_Serialization"]) -def test_extension_used_in_operation(operation): - extension_manager = ExtensionManager([]) - context = SerializationContext("1.4.0", extension_manager, "file://test.asdf", None) - - op_ctx = getattr(asdf._serialization_context, operation)(context) - extension = get_config().extensions[0] - op_ctx._mark_extension_used(extension) - op_ctx._mark_extension_used(extension) - assert extension in op_ctx._extensions_used - # check this persists in the parent context - assert extension in context._extensions_used - - def test_get_block_data_callback(tmp_path): fn = tmp_path / "test.asdf" @@ -58,7 +44,7 @@ def test_get_block_data_callback(tmp_path): with pytest.raises(NotImplementedError, match="abstract"): context.get_block_data_callback(0) - op_ctx = _Deserialization(context) + op_ctx = af._create_serialization_context(BlockAccess.READ) cb0 = op_ctx.get_block_data_callback(0) # getting the same callback should pass and return the same object @@ -82,9 +68,10 @@ def test_get_block_data_callback(tmp_path): np.testing.assert_array_equal(d0, arr0) np.testing.assert_array_equal(d1, arr1) - op_ctx = _Serialization(context) - with pytest.raises(NotImplementedError, match="abstract"): - op_ctx.get_block_data_callback(0) + for access in (BlockAccess.NONE, BlockAccess.WRITE): + op_ctx = af._create_serialization_context(access) + with pytest.raises(NotImplementedError, match="abstract"): + op_ctx.get_block_data_callback(0) def test_find_available_block_index(): @@ -100,13 +87,14 @@ def cb(): class Foo: pass - op_ctx = _Serialization(context) + op_ctx = af._create_serialization_context(BlockAccess.WRITE) op_ctx.assign_object(Foo()) assert op_ctx.find_available_block_index(cb) == 0 - op_ctx = _Deserialization(context) - 
with pytest.raises(NotImplementedError, match="abstract"): - op_ctx.find_available_block_index(cb) + for access in (BlockAccess.NONE, BlockAccess.READ): + op_ctx = af._create_serialization_context(access) + with pytest.raises(NotImplementedError, match="abstract"): + op_ctx.find_available_block_index(cb) def test_generate_block_key(): @@ -120,14 +108,14 @@ class Foo: pass obj = Foo() - op_ctx = _Serialization(context) + op_ctx = af._create_serialization_context(BlockAccess.WRITE) op_ctx.assign_object(obj) key = op_ctx.generate_block_key() assert key._is_valid() assert key._matches_object(obj) obj = Foo() - op_ctx = _Deserialization(context) + op_ctx = af._create_serialization_context(BlockAccess.READ) # because this test generates but does not assign a key # it should raise an exception with pytest.raises(OSError, match=r"Converter generated a key.*"): diff --git a/asdf/asdf.py b/asdf/asdf.py index 51b60d7a1..9355aed7a 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -10,12 +10,11 @@ from . import _display as display from . import _node_info as node_info +from . import _serialization_context, constants, generic_io, reference, schema, treeutil, util, versioning, yamlutil from . import _version as version from . import compression as mcompression -from . import constants, generic_io, reference, schema, treeutil, util, versioning, yamlutil from ._block.manager import Manager as BlockManager from ._helpers import validate_version -from ._serialization_context import SerializationContext from .config import config_context, get_config from .exceptions import ( AsdfConversionWarning, @@ -925,7 +924,7 @@ def _write_tree(self, tree, fd, pad_blocks): fd.write(b"\n") if len(tree): - serialization_context = self._create_serialization_context() + serialization_context = self._create_serialization_context(_serialization_context.BlockAccess.WRITE) for compression in self._blocks.get_output_compressions(): # lookup extension @@ -1519,8 +1518,8 @@ def _warn_tag_mismatch(self, tag, best_tag): # This function is called from within yamlutil methods to create # a context when one isn't explicitly passed in. - def _create_serialization_context(self): - return SerializationContext(self.version_string, self.extension_manager, self.uri, self._blocks) + def _create_serialization_context(self, operation=_serialization_context.BlockAccess.NONE): + return _serialization_context.create(self, operation) def open_asdf( diff --git a/asdf/yamlutil.py b/asdf/yamlutil.py index 73b17c4b0..eb88d5520 100644 --- a/asdf/yamlutil.py +++ b/asdf/yamlutil.py @@ -6,7 +6,7 @@ import yaml from . import config, schema, tagged, treeutil, util -from ._serialization_context import _Deserialization, _Serialization +from ._serialization_context import BlockAccess from .constants import STSCI_SCHEMA_TAG_BASE, YAML_TAG_PREFIX from .exceptions import AsdfConversionWarning from .tags.core import AsdfObject @@ -217,20 +217,18 @@ def custom_tree_to_tagged_tree(tree, ctx, _serialization_context=None): annotated with tags. 
""" if _serialization_context is None: - _serialization_context = ctx._create_serialization_context() + _serialization_context = ctx._create_serialization_context(BlockAccess.WRITE) - sctx = _Serialization(_serialization_context) - - extension_manager = sctx.extension_manager - version_string = str(sctx.version) + extension_manager = _serialization_context.extension_manager + version_string = str(_serialization_context.version) def _convert_obj(obj, converter): - tag = converter.select_tag(obj, sctx) + tag = converter.select_tag(obj, _serialization_context) # if select_tag returns None, converter.to_yaml_tree should return a new # object which will be handled by a different converter while tag is None: converters.add(converter) - obj = converter.to_yaml_tree(obj, tag, sctx) + obj = converter.to_yaml_tree(obj, tag, _serialization_context) try: converter = extension_manager.get_converter_for_type(type(obj)) except KeyError: @@ -240,10 +238,10 @@ def _convert_obj(obj, converter): if converter in converters: msg = "Conversion cycle detected" raise TypeError(msg) - tag = converter.select_tag(obj, sctx) - sctx.assign_object(obj) - node = converter.to_yaml_tree(obj, tag, sctx) - sctx.assign_blocks() + tag = converter.select_tag(obj, _serialization_context) + _serialization_context.assign_object(obj) + node = converter.to_yaml_tree(obj, tag, _serialization_context) + _serialization_context.assign_blocks() if isinstance(node, GeneratorType): generator = node @@ -262,7 +260,7 @@ def _convert_obj(obj, converter): msg = f"Converter returned illegal node type: {util.get_class_name(node)}" raise TypeError(msg) - sctx._mark_extension_used(converter.extension) + _serialization_context._mark_extension_used(converter.extension) yield tagged_node if generator is not None: @@ -294,7 +292,7 @@ def _walker(obj): tag = ctx._type_index.from_custom_type( typ, version_string, - _serialization_context=sctx, + _serialization_context=_serialization_context, ) if tag is not None: @@ -321,10 +319,9 @@ def tagged_tree_to_custom_tree(tree, ctx, force_raw_types=False, _serialization_ tags, to a tree containing custom data types. """ if _serialization_context is None: - _serialization_context = ctx._create_serialization_context() + _serialization_context = ctx._create_serialization_context(BlockAccess.READ) extension_manager = _serialization_context.extension_manager - dctx = _Deserialization(_serialization_context) def _walker(node): if force_raw_types: @@ -336,14 +333,14 @@ def _walker(node): if extension_manager.handles_tag(tag): converter = extension_manager.get_converter_for_tag(tag) - dctx.assign_object(None) - obj = converter.from_yaml_tree(node.data, tag, dctx) - dctx.assign_object(obj) - dctx.assign_blocks() - dctx._mark_extension_used(converter.extension) + _serialization_context.assign_object(None) + obj = converter.from_yaml_tree(node.data, tag, _serialization_context) + _serialization_context.assign_object(obj) + _serialization_context.assign_blocks() + _serialization_context._mark_extension_used(converter.extension) return obj - tag_type = ctx._type_index.from_yaml_tag(ctx, tag, _serialization_context=dctx) + tag_type = ctx._type_index.from_yaml_tag(ctx, tag, _serialization_context=_serialization_context) # This means the tag did not correspond to any type in our type index. 
if tag_type is None: if not ctx._ignore_unrecognized_tag: From 930123a2d8ac1d6dee3b9eb770d498812410703f Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 5 Jun 2023 16:02:19 -0400 Subject: [PATCH 120/154] remove unneeded code --- asdf/_block/manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 8bf0b0124..25706644b 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -36,7 +36,6 @@ class WriteBlocks(collections.abc.Sequence): """ def __init__(self, blocks=None): - super().__init__() if blocks is None: blocks = [] self._blocks = blocks From 77368372eff13f34d416a9d8c72b8e1ff5e33432 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 6 Jun 2023 09:36:27 -0400 Subject: [PATCH 121/154] add warning when <4 non-null bytes after blocks this mimics the old block management behavior of allowing reading of files with these extra bytes but adds a new warning message as the file is possibly corrupt --- asdf/_block/reader.py | 17 +++++++++++++++-- asdf/_tests/_block/test_reader.py | 15 +++++++++++++-- asdf/_tests/test_array_blocks.py | 2 +- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py index 91c5be7fb..520136cd2 100644 --- a/asdf/_block/reader.py +++ b/asdf/_block/reader.py @@ -1,6 +1,8 @@ +import warnings import weakref from asdf import constants +from asdf.exceptions import AsdfWarning from . import io as bio from .exceptions import BlockIndexError @@ -124,9 +126,20 @@ def _read_blocks_serially(fd, memmap=False, lazy_load=False, validate_checksums= # read 4 bytes if not after_magic: buff += fd.read(magic_len - len(buff)) - if len(buff) < magic_len: + if len(buff) == 0: # we are done, there are no more blocks and no index - # TODO error? 
we shouldn't have extra bytes, the old code allows this + break + elif len(buff) < magic_len: + # we have less than magic_len bytes, this is likely an error + # in the input file/bytes + if all([b == 0 for b in buff]): + # if these are all 0, assume this was a 'truncated' file + # so don't issue a warning + break + # if these are non-0 bytes issue a warning that the file + # is likely corrupt + msg = f"Read invalid bytes {buff!r} after blocks, your file might be corrupt" + warnings.warn(msg, AsdfWarning) break if buff == constants.INDEX_HEADER[:magic_len]: diff --git a/asdf/_tests/_block/test_reader.py b/asdf/_tests/_block/test_reader.py index 9698af1c4..c82896dc4 100644 --- a/asdf/_tests/_block/test_reader.py +++ b/asdf/_tests/_block/test_reader.py @@ -9,6 +9,7 @@ from asdf import constants, generic_io, util from asdf._block import io as bio from asdf._block.reader import read_blocks +from asdf.exceptions import AsdfWarning @contextlib.contextmanager @@ -83,15 +84,25 @@ def test_read_invalid_padding(): check(read_blocks(fd)) -def test_read_post_padding(): +def test_read_post_padding_null_bytes(): with gen_blocks(padding=1) as (fd, check): fd.seek(0, os.SEEK_END) # acceptable to have <4 bytes after the last block - fd.write(b"\0" * 3) + fd.write(b"\x00" * 3) fd.seek(0) check(read_blocks(fd)) +def test_read_post_padding_non_null_bytes(): + with gen_blocks(padding=1) as (fd, check): + fd.seek(0, os.SEEK_END) + # acceptable to have <4 bytes after the last block + fd.write(b"\x01" * 3) + fd.seek(0) + with pytest.warns(AsdfWarning, match=r"Read invalid bytes.*"): + check(read_blocks(fd)) + + @pytest.mark.parametrize("invalid_block_index", [0, 1, -1, "junk"]) def test_invalid_block_index(tmp_path, invalid_block_index): fn = tmp_path / "test.bin" diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py index baeba9f2d..f601c16ee 100644 --- a/asdf/_tests/test_array_blocks.py +++ b/asdf/_tests/test_array_blocks.py @@ -481,7 +481,7 @@ def test_seek_until_on_block_boundary(): foo : bar ... """ - content += b"\0" * (io.DEFAULT_BUFFER_SIZE - 2) + constants.BLOCK_MAGIC + b"\0\x30" + b"\0" * 50 + content += b"\0" * (io.DEFAULT_BUFFER_SIZE - 2) + constants.BLOCK_MAGIC + b"\0\x30" + b"\0" * 48 buff = io.BytesIO(content) ff = asdf.open(buff) From 550ae6dcb3f691a850e30b995f3b283bb07438fd Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 6 Jun 2023 10:52:45 -0400 Subject: [PATCH 122/154] preserve block index during asdftool edit attempt to update the block index during an asdftool edit where the tree expands into the previous ASDF block bytes. 
This is done without parsing and rewriting block data (as was done in the older code) to avoid: - removing block padding - memory mapping blocks - decompression and compression (which might require custom compressors and keyword arguments) --- asdf/_tests/commands/tests/test_edit.py | 19 +++++++++++ asdf/commands/edit.py | 43 ++++++++++++++++++++----- 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/asdf/_tests/commands/tests/test_edit.py b/asdf/_tests/commands/tests/test_edit.py index f1181547c..18060b985 100644 --- a/asdf/_tests/commands/tests/test_edit.py +++ b/asdf/_tests/commands/tests/test_edit.py @@ -7,6 +7,8 @@ from numpy.testing import assert_array_equal import asdf +from asdf import constants +from asdf._block import io as bio from asdf.commands import main RNG = np.random.default_rng(42) @@ -154,6 +156,17 @@ def test_no_blocks_decrease_size(tmp_path, create_editor, version): assert af["foo"] == "bar" +def confirm_valid_block_index(file_path): + # make sure the block index is valid + with asdf.generic_io.get_file(file_path, "r") as f: + block_index_offset = bio.find_block_index(f) + assert block_index_offset is not None + block_index = bio.read_block_index(f, block_index_offset) + for block_offset in block_index: + f.seek(block_offset) + assert f.read(len(constants.BLOCK_MAGIC)) == constants.BLOCK_MAGIC + + def test_with_blocks(tmp_path, create_editor, version): file_path = str(tmp_path / "test.asdf") @@ -174,6 +187,8 @@ def test_with_blocks(tmp_path, create_editor, version): assert_array_equal(af["array1"], array1) assert_array_equal(af["array2"], array2) + confirm_valid_block_index(file_path) + def test_with_blocks_increase_size(tmp_path, create_editor, version, mock_input): file_path = str(tmp_path / "test.asdf") @@ -202,6 +217,8 @@ def test_with_blocks_increase_size(tmp_path, create_editor, version, mock_input) assert_array_equal(af["array1"], array1) assert_array_equal(af["array2"], array2) + confirm_valid_block_index(file_path) + def test_with_blocks_decrease_size(tmp_path, create_editor, version): file_path = str(tmp_path / "test.asdf") @@ -225,6 +242,8 @@ def test_with_blocks_decrease_size(tmp_path, create_editor, version): assert_array_equal(af["array1"], array1) assert_array_equal(af["array2"], array2) + confirm_valid_block_index(file_path) + def test_no_changes(tmp_path, create_editor, version): file_path = str(tmp_path / "test.asdf") diff --git a/asdf/commands/edit.py b/asdf/commands/edit.py index 08b7a254d..9cf22e640 100644 --- a/asdf/commands/edit.py +++ b/asdf/commands/edit.py @@ -16,6 +16,8 @@ import yaml from asdf import constants, generic_io, schema, util +from asdf._block import io as bio +from asdf._block.exceptions import BlockIndexError from asdf.asdf import AsdfFile, open_asdf from .main import Command @@ -129,17 +131,42 @@ def write_edited_yaml_larger(path, new_content, version): pad_length = util.calculate_padding(len(new_content), True, fd.block_size) fd.fast_forward(pad_length) - # copy blocks from original_fd to fd - fd.tell() + # now copy over ASDF block contents + with generic_io.get_file(path) as original_fd: original_fd.seek_until(constants.BLOCK_MAGIC, len(constants.BLOCK_MAGIC)) - fd.write(constants.BLOCK_MAGIC) + old_first_block_offset = original_fd.tell() - len(constants.BLOCK_MAGIC) + new_first_block_offset = fd.tell() + + # check if the original file has a block index which we will need to update + # as we're moving the blocks + block_index_offset = bio.find_block_index(original_fd) + if block_index_offset is None: + block_index 
= None + original_fd.seek(0, generic_io.SEEK_END) + blocks_end = original_fd.tell() + else: + blocks_end = block_index_offset + try: + block_index = bio.read_block_index(original_fd, block_index_offset) + except BlockIndexError: + # the original index was invalid + block_index = None + + # copy over blocks byte-for-byte from old_first_block_offset to block_index_offset + original_fd.seek(old_first_block_offset) + block_index_offset block_size = min(fd.block_size, original_fd.block_size) - while bs := original_fd.read(block_size): - fd.write(bs) - - # the file needs to be closed here to release all memmaps - original_fd.close() + n_bytes = blocks_end - old_first_block_offset + for offset in range(0, n_bytes, block_size): + this_size = min(block_size, n_bytes - offset) + fd.write(original_fd.read(this_size)) + + # update index + if block_index is not None: + offset = new_first_block_offset - old_first_block_offset + updated_block_index = [i + offset for i in block_index] + bio.write_block_index(fd, updated_block_index) # Swap in the new version of the file atomically: shutil.copy(temp_file.name, path) From 67044aacd838e50ed18de33fbc1db1c9c2e16716 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 6 Jun 2023 11:12:23 -0400 Subject: [PATCH 123/154] re-add SerializationContext to asdf.asdf for weldx --- asdf/asdf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asdf/asdf.py b/asdf/asdf.py index 9355aed7a..822394d4d 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -15,6 +15,7 @@ from . import compression as mcompression from ._block.manager import Manager as BlockManager from ._helpers import validate_version +from ._serialization_context import SerializationContext # noqa: F401 from .config import config_context, get_config from .exceptions import ( AsdfConversionWarning, From c114ad223ab4e31c45ca5cc1d9aea6d6cbc08114 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 6 Jun 2023 13:57:08 -0400 Subject: [PATCH 124/154] allow external block memory mapping when requested (and supported) memory map external blocks by returning a numpy.memmap instance of the bytes for the block in the external file fixes #1525 --- asdf/_block/external.py | 21 +++++++++++++++++---- asdf/_block/manager.py | 5 ++++- asdf/_tests/_issues/test_1525.py | 29 +++++++++++++++++++++++++++++ asdf/asdf.py | 1 + 4 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 asdf/_tests/_issues/test_1525.py diff --git a/asdf/_block/external.py b/asdf/_block/external.py index 229a361f7..d93ef6902 100644 --- a/asdf/_block/external.py +++ b/asdf/_block/external.py @@ -9,6 +9,8 @@ """ import os +import numpy as np + from asdf import generic_io, util @@ -21,9 +23,9 @@ class UseInternalType: class ExternalBlockCache: def __init__(self): - self._cache = {} + self.clear() - def load(self, base_uri, uri): + def load(self, base_uri, uri, memmap=False, validate_checksums=False): key = util.get_base_uri(uri) if key not in self._cache: resolved_uri = generic_io.resolve_uri(base_uri, uri) @@ -32,10 +34,21 @@ def load(self, base_uri, uri): from asdf import open as asdf_open - with asdf_open(resolved_uri, lazy_load=False, copy_arrays=True) as af: - self._cache[key] = af._blocks.blocks[0].cached_data + with asdf_open( + resolved_uri, "r", lazy_load=False, copy_arrays=True, validate_checksums=validate_checksums + ) as af: + blk = af._blocks.blocks[0] + if memmap and blk.header["compression"] == b"\0\0\0\0": + file_path = util.patched_urllib_parse.urlparse(resolved_uri).path + arr = np.memmap(file_path, np.uint8, "r", blk.data_offset, 
blk.cached_data.nbytes) + else: + arr = blk.cached_data + self._cache[key] = arr return self._cache[key] + def clear(self): + self._cache = {} + def relative_uri_for_index(uri, index): # get the os-native separated path for this uri diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 25706644b..f7ee0b113 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -305,6 +305,9 @@ def __init__(self, read_blocks=None, uri=None, lazy_load=False, memmap=False, va self._memmap = memmap self._validate_checksums = validate_checksums + def clear_external_cache(self): + self._external_block_cache.clear() + @property def blocks(self): """ @@ -345,7 +348,7 @@ def read(self, fd, after_magic=False): ) def _load_external(self, uri): - value = self._external_block_cache.load(self._uri, uri) + value = self._external_block_cache.load(self._uri, uri, self._memmap, self._validate_checksums) if value is external.UseInternal: return self.blocks[0].data return value diff --git a/asdf/_tests/_issues/test_1525.py b/asdf/_tests/_issues/test_1525.py new file mode 100644 index 000000000..5a5880bde --- /dev/null +++ b/asdf/_tests/_issues/test_1525.py @@ -0,0 +1,29 @@ +import numpy as np + +import asdf + + +def test_1525(tmp_path): + """ + External blocks are always lazy loaded and memmapped + + https://github.com/asdf-format/asdf/issues/1525 + """ + + fn = tmp_path / "test.asdf" + arr = np.arange(10) + af = asdf.AsdfFile({"arr": arr}) + af.set_array_storage(arr, "external") + af.write_to(fn) + + for copy_arrays in (True, False): + with asdf.open(fn, copy_arrays=copy_arrays) as af: + # check that block is external + source = af["arr"]._source + assert isinstance(source, str) + + # check if block is memmapped + if copy_arrays: + assert not isinstance(af["arr"].base, np.memmap) + else: + assert isinstance(af["arr"].base, np.memmap) diff --git a/asdf/asdf.py b/asdf/asdf.py index 822394d4d..b0c9e111b 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -461,6 +461,7 @@ def close(self): for external in self._external_asdf_by_uri.values(): external.close() self._external_asdf_by_uri.clear() + self._blocks.clear_external_cache() def copy(self): return self.__class__( From 2c739db2cc75f058fe250042b251dead15ffbe6c Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 6 Jun 2023 14:34:21 -0400 Subject: [PATCH 125/154] re-add block manager close clean up cached block data for internal (as well as external) blocks and clear any potentially queued write blocks --- asdf/_block/manager.py | 6 +++++- asdf/_block/reader.py | 3 +++ asdf/asdf.py | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index f7ee0b113..80034d4e7 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -305,8 +305,12 @@ def __init__(self, read_blocks=None, uri=None, lazy_load=False, memmap=False, va self._memmap = memmap self._validate_checksums = validate_checksums - def clear_external_cache(self): + def close(self): self._external_block_cache.clear() + self._clear_write() + for blk in self.blocks: + blk.close() + self.options = OptionsStore(self.blocks) @property def blocks(self): diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py index 520136cd2..659c8cb21 100644 --- a/asdf/_block/reader.py +++ b/asdf/_block/reader.py @@ -26,6 +26,9 @@ def __init__(self, offset, fd, memmap, lazy_load, validate_checksum, header=None if not lazy_load: self.load() + def close(self): + self._cached_data = None + @property def loaded(self): return self._data is not None diff --git 
a/asdf/asdf.py b/asdf/asdf.py index b0c9e111b..1ccc85cf3 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -461,7 +461,7 @@ def close(self): for external in self._external_asdf_by_uri.values(): external.close() self._external_asdf_by_uri.clear() - self._blocks.clear_external_cache() + self._blocks.close() def copy(self): return self.__class__( From 92a0ddbe5986f335d418316285c99993f3dd92b9 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 6 Jun 2023 15:18:36 -0400 Subject: [PATCH 126/154] clear SerializationContext object after block assignment this breaks a potentially circular reference where objects that lazily load blocks might hold only a reference to the serialization context (which without this commit might hold onto the object if it's the last one converted). --- asdf/_serialization_context.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/asdf/_serialization_context.py b/asdf/_serialization_context.py index 18820f402..46d94b1af 100644 --- a/asdf/_serialization_context.py +++ b/asdf/_serialization_context.py @@ -180,6 +180,9 @@ def assign_blocks(self): # assign the key to the callback self._blocks._data_callbacks.assign_object(key, cb) + # now that we've assigned blocks, remove the reference to the + # assigned object + self.assign_object(None) def get_block_data_callback(self, index, key=None): if key is None: From 7722296337ca0c15d217c17df03d3a530438499a Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 7 Jun 2023 10:29:26 -0400 Subject: [PATCH 127/154] fix external block loading over http --- asdf/_block/external.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/asdf/_block/external.py b/asdf/_block/external.py index d93ef6902..fb31ced73 100644 --- a/asdf/_block/external.py +++ b/asdf/_block/external.py @@ -39,8 +39,11 @@ def load(self, base_uri, uri, memmap=False, validate_checksums=False): ) as af: blk = af._blocks.blocks[0] if memmap and blk.header["compression"] == b"\0\0\0\0": - file_path = util.patched_urllib_parse.urlparse(resolved_uri).path - arr = np.memmap(file_path, np.uint8, "r", blk.data_offset, blk.cached_data.nbytes) + parsed_url = util.patched_urllib_parse.urlparse(resolved_uri) + if parsed_url.scheme == "file": + arr = np.memmap(parsed_url.path, np.uint8, "r", blk.data_offset, blk.cached_data.nbytes) + else: + arr = blk.cached_data else: arr = blk.cached_data self._cache[key] = arr From b9c7ac81823682ded46e224ecb358e0b2736c30a Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 7 Jun 2023 10:58:15 -0400 Subject: [PATCH 128/154] fix file url parsing for external blocks on windows --- asdf/_block/external.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/asdf/_block/external.py b/asdf/_block/external.py index fb31ced73..6e5f41620 100644 --- a/asdf/_block/external.py +++ b/asdf/_block/external.py @@ -8,6 +8,7 @@ (that references the block manager). 
""" import os +import urllib import numpy as np @@ -41,7 +42,9 @@ def load(self, base_uri, uri, memmap=False, validate_checksums=False): if memmap and blk.header["compression"] == b"\0\0\0\0": parsed_url = util.patched_urllib_parse.urlparse(resolved_uri) if parsed_url.scheme == "file": - arr = np.memmap(parsed_url.path, np.uint8, "r", blk.data_offset, blk.cached_data.nbytes) + # deal with leading slash for windows file:// + filename = urllib.request.url2pathname(parsed_url.path) + arr = np.memmap(filename, np.uint8, "r", blk.data_offset, blk.cached_data.nbytes) else: arr = blk.cached_data else: From 34f74faccf0c9f5fb2200adc32e0e4fe7075b89f Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 8 Jun 2023 15:48:04 -0400 Subject: [PATCH 129/154] remove use of assert_roundtrip_tree in ndarray tests It appears that asdf._tests._helpers.assert_roundtrip_tree works best with old style AsdfType objects. As ndarray is now handled by a Converter the tests were updated to no longer use assert_roundtrip_tree. --- asdf/_tests/tags/core/tests/data/__init__.py | 0 .../tags/core/tests/data/datatype-1.0.0.yaml | 27 - .../tags/core/tests/data/ndim-1.0.0.yaml | 11 - asdf/_tests/tags/core/tests/test_ndarray.py | 554 ++++++++++-------- 4 files changed, 301 insertions(+), 291 deletions(-) delete mode 100644 asdf/_tests/tags/core/tests/data/__init__.py delete mode 100644 asdf/_tests/tags/core/tests/data/datatype-1.0.0.yaml delete mode 100644 asdf/_tests/tags/core/tests/data/ndim-1.0.0.yaml diff --git a/asdf/_tests/tags/core/tests/data/__init__.py b/asdf/_tests/tags/core/tests/data/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/asdf/_tests/tags/core/tests/data/datatype-1.0.0.yaml b/asdf/_tests/tags/core/tests/data/datatype-1.0.0.yaml deleted file mode 100644 index a7cdc03d1..000000000 --- a/asdf/_tests/tags/core/tests/data/datatype-1.0.0.yaml +++ /dev/null @@ -1,27 +0,0 @@ -%YAML 1.1 ---- -$schema: "http://stsci.edu/schemas/asdf/asdf-schema-1.0.0" -id: "http://nowhere.org/schemas/custom/datatype-1.0.0" -type: object -properties: - a: - datatype: float32 - - b: - datatype: float32 - exact_datatype: true - - c: - datatype: - - name: a - datatype: int16 - - name: b - datatype: ['ascii', 16] - - d: - datatype: - - name: a - datatype: int16 - - name: b - datatype: ['ascii', 16] - exact_datatype: true diff --git a/asdf/_tests/tags/core/tests/data/ndim-1.0.0.yaml b/asdf/_tests/tags/core/tests/data/ndim-1.0.0.yaml deleted file mode 100644 index ace803e2a..000000000 --- a/asdf/_tests/tags/core/tests/data/ndim-1.0.0.yaml +++ /dev/null @@ -1,11 +0,0 @@ -%YAML 1.1 ---- -$schema: "http://stsci.edu/schemas/asdf/asdf-schema-1.0.0" -id: "http://nowhere.org/schemas/custom/ndim-1.0.0" -type: object -properties: - a: - ndim: 2 - - b: - max_ndim: 2 diff --git a/asdf/_tests/tags/core/tests/test_ndarray.py b/asdf/_tests/tags/core/tests/test_ndarray.py index 2aa6052fb..d75ddabfd 100644 --- a/asdf/_tests/tags/core/tests/test_ndarray.py +++ b/asdf/_tests/tags/core/tests/test_ndarray.py @@ -1,3 +1,4 @@ +import contextlib import io import os import re @@ -10,72 +11,145 @@ from numpy.testing import assert_array_equal import asdf -from asdf import util -from asdf._tests import _helpers as helpers -from asdf._tests.objects import CustomTestType from asdf.exceptions import ValidationError +from asdf.extension import Converter, Extension, TagDefinition from asdf.tags.core import ndarray - -from . 
import data as test_data - -TEST_DATA_PATH = helpers.get_test_data_path("", module=test_data) +from asdf.testing import helpers # These custom types and the custom extension are here purely for the purpose # of testing NDArray objects and making sure that they can be validated as part # of a nested hierarchy, and not just top-level objects. -class CustomNdim(CustomTestType): - name = "ndim" - organization = "nowhere.org" - standard = "custom" - version = "1.0.0" - - -class CustomDatatype(CustomTestType): - name = "datatype" - organization = "nowhere.org" - standard = "custom" - version = "1.0.0" - - -class CustomExtension: - @property - def types(self): - return [CustomNdim, CustomDatatype] - - @property - def tag_mapping(self): - return [("tag:nowhere.org:custom", "http://nowhere.org/schemas/custom{tag_suffix}")] - - @property - def url_mapping(self): - return [("http://nowhere.org/schemas/custom/", util.filepath_to_url(TEST_DATA_PATH) + "/{url_suffix}.yaml")] - - -def test_sharing(tmpdir): +class CustomData: + def __init__(self, value): + self.value = value + + +class CustomNDim: + def __init__(self, value): + self.value = value + + +class CustomNDimConverter(Converter): + tags = ["tag:nowhere.org:custom/ndim-1.0.0"] + types = [CustomNDim] + + def to_yaml_tree(self, obj, tag, ctx): + return obj.value + + def from_yaml_tree(self, node, tag, ctx): + return CustomNDim(node) + + +class CustomDataConverter(Converter): + tags = ["tag:nowhere.org:custom/datatype-1.0.0"] + types = [CustomData] + + def to_yaml_tree(self, obj, tag, ctx): + return obj.value + + def from_yaml_tree(self, node, tag, ctx): + return CustomData(node) + + +class CustomExtension(Extension): + tags = [ + TagDefinition( + tag_uri="tag:nowhere.org:custom/datatype-1.0.0", + schema_uris=["http://nowhere.org/schemas/custom/datatype-1.0.0"], + ), + TagDefinition( + tag_uri="tag:nowhere.org:custom/ndim-1.0.0", + schema_uris=["http://nowhere.org/schemas/custom/ndim-1.0.0"], + ), + ] + extension_uri = "asdf://nowhere.org/extensions/custom-1.0.0" + converters = [CustomDataConverter(), CustomNDimConverter()] + + +@contextlib.contextmanager +def with_custom_extension(): + with asdf.config_context() as cfg: + cfg.add_extension(CustomExtension()) + cfg.add_resource_mapping( + { + "http://nowhere.org/schemas/custom/datatype-1.0.0": """%YAML 1.1 +--- +$schema: "http://stsci.edu/schemas/asdf/asdf-schema-1.0.0" +id: "http://nowhere.org/schemas/custom/datatype-1.0.0" +type: object +properties: + a: + datatype: float32 + + b: + datatype: float32 + exact_datatype: true + + c: + datatype: + - name: a + datatype: int16 + - name: b + datatype: ['ascii', 16] + + d: + datatype: + - name: a + datatype: int16 + - name: b + datatype: ['ascii', 16] + exact_datatype: true""", + "http://nowhere.org/schemas/custom/ndim-1.0.0": """%YAML 1.1 +--- +$schema: "http://stsci.edu/schemas/asdf/asdf-schema-1.0.0" +id: "http://nowhere.org/schemas/custom/ndim-1.0.0" +type: object +properties: + a: + ndim: 2 + + b: + max_ndim: 2""", + } + ) + yield + + +@contextlib.contextmanager +def roundtrip(af, raw=False): + if not isinstance(af, asdf.AsdfFile): + af = asdf.AsdfFile(af) + b = io.BytesIO() + af.write_to(b) + b.seek(0) + if raw: + bs = b.read() + if asdf.constants.BLOCK_MAGIC in bs: + bs, *_ = bs.split(asdf.constants.BLOCK_MAGIC) + yield bs + else: + with asdf.open(b) as af: + yield af + + +def test_sharing(): x = np.arange(0, 10, dtype=float) tree = {"science_data": x, "subset": x[3:-3], "skipping": x[::2]} - def check_asdf(asdf): - tree = asdf.tree - + with 
roundtrip(tree) as af: + tree = af.tree assert_array_equal(tree["science_data"], x) assert_array_equal(tree["subset"], x[3:-3]) assert_array_equal(tree["skipping"], x[::2]) assert tree["science_data"].ctypes.data == tree["skipping"].ctypes.data - assert len(asdf._blocks.blocks) == 1 - assert asdf._blocks.blocks[0].header["data_size"] == 80 + assert len(af._blocks.blocks) == 1 + assert af._blocks.blocks[0].header["data_size"] == 80 - if "w" in asdf._mode: - tree["science_data"][0] = 42 - assert tree["skipping"][0] == 42 - - def check_raw_yaml(content): - assert b"!core/ndarray" in content - - helpers.assert_roundtrip_tree(tree, tmpdir, asdf_check_func=check_asdf, raw_yaml_check_func=check_raw_yaml) + tree["science_data"][0] = 42 + assert tree["skipping"][0] == 42 def test_byteorder(tmpdir): @@ -84,8 +158,8 @@ def test_byteorder(tmpdir): "little": np.arange(0, 10, dtype=" 5) tree = {"masked_array": m, "unmasked_array": x} - def check_asdf(asdf): - tree = asdf.tree + with roundtrip(tree) as af: + tree = af.tree m = tree["masked_array"] assert np.all(m.mask[6:]) - assert len(asdf._blocks.blocks) == 2 - - helpers.assert_roundtrip_tree(tree, tmpdir, asdf_check_func=check_asdf) + assert len(af._blocks.blocks) == 2 def test_len_roundtrip(tmpdir): sequence = np.arange(0, 10, dtype=int) tree = {"sequence": sequence} - def check_len(asdf): - s = asdf.tree["sequence"] + with roundtrip(tree) as af: + s = af.tree["sequence"] assert len(s) == 10 - helpers.assert_roundtrip_tree(tree, tmpdir, asdf_check_func=check_len) - def test_mask_arbitrary(): content = """ - arr: !core/ndarray-1.0.0 - data: [[1, 2, 3, 1234], [5, 6, 7, 8]] - mask: 1234 +arr: !core/ndarray-1.0.0 + data: [[1, 2, 3, 1234], [5, 6, 7, 8]] + mask: 1234 """ buff = helpers.yaml_to_asdf(content) @@ -320,9 +393,9 @@ def test_mask_arbitrary(): def test_mask_nan(): content = """ - arr: !core/ndarray-1.0.0 - data: [[1, 2, 3, .NaN], [5, 6, 7, 8]] - mask: .NaN +arr: !core/ndarray-1.0.0 + data: [[1, 2, 3, .NaN], [5, 6, 7, 8]] + mask: .NaN """ buff = helpers.yaml_to_asdf(content) @@ -336,13 +409,17 @@ def test_string(tmpdir): "unicode": np.array(["სამეცნიერო", "данные", "வடிவம்"]), } - helpers.assert_roundtrip_tree(tree, tmpdir) + with roundtrip(tree) as af: + for k in tree: + assert_array_equal(tree[k], af[k]) def test_string_table(tmpdir): tree = {"table": np.array([(b"foo", "სამეცნიერო", "42", "53.0")])} - helpers.assert_roundtrip_tree(tree, tmpdir) + with roundtrip(tree) as af: + for k in tree: + assert_array_equal(tree[k], af[k]) def test_inline_string(): @@ -355,12 +432,12 @@ def test_inline_string(): def test_inline_structured(): content = """ - arr: !core/ndarray-1.0.0 - datatype: [['ascii', 4], uint16, uint16, ['ascii', 4]] - data: [[M110, 110, 205, And], - [ M31, 31, 224, And], - [ M32, 32, 221, And], - [M103, 103, 581, Cas]]""" +arr: !core/ndarray-1.0.0 + datatype: [['ascii', 4], uint16, uint16, ['ascii', 4]] + data: [[M110, 110, 205, And], + [ M31, 31, 224, And], + [ M32, 32, 221, And], + [M103, 103, 581, Cas]]""" buff = helpers.yaml_to_asdf(content) @@ -512,11 +589,11 @@ def test_operations_on_ndarray_proxies(tmpdir): def test_mask_datatype(tmpdir): content = """ - arr: !core/ndarray-1.0.0 - data: [1, 2, 3] - dtype: int32 - mask: !core/ndarray-1.0.0 - data: [true, true, false] +arr: !core/ndarray-1.0.0 + data: [1, 2, 3] + dtype: int32 + mask: !core/ndarray-1.0.0 + data: [true, true, false] """ buff = helpers.yaml_to_asdf(content) @@ -526,11 +603,11 @@ def test_mask_datatype(tmpdir): def test_invalid_mask_datatype(tmpdir): content = """ - 
arr: !core/ndarray-1.0.0 - data: [1, 2, 3] - dtype: int32 - mask: !core/ndarray-1.0.0 - data: ['a', 'b', 'c'] +arr: !core/ndarray-1.0.0 + data: [1, 2, 3] + dtype: int32 + mask: !core/ndarray-1.0.0 + data: ['a', 'b', 'c'] """ buff = helpers.yaml_to_asdf(content) @@ -540,243 +617,237 @@ def test_invalid_mask_datatype(tmpdir): pass +@with_custom_extension() def test_ndim_validation(tmpdir): content = """ - obj: ! - a: !core/ndarray-1.0.0 - data: [1, 2, 3] +obj: ! + a: !core/ndarray-1.0.0 + data: [1, 2, 3] """ buff = helpers.yaml_to_asdf(content) with pytest.raises(ValidationError, match=r"Wrong number of dimensions:.*"), asdf.open( buff, - extensions=CustomExtension(), ): pass content = """ - obj: ! - a: !core/ndarray-1.0.0 - data: [[1, 2, 3]] +obj: ! + a: !core/ndarray-1.0.0 + data: [[1, 2, 3]] """ buff = helpers.yaml_to_asdf(content) - with asdf.open(buff, extensions=CustomExtension()): + with asdf.open(buff): pass content = """ - obj: ! - a: !core/ndarray-1.0.0 - shape: [1, 3] - data: [[1, 2, 3]] +obj: ! + a: !core/ndarray-1.0.0 + shape: [1, 3] + data: [[1, 2, 3]] """ buff = helpers.yaml_to_asdf(content) - with asdf.open(buff, extensions=CustomExtension()): + with asdf.open(buff): pass content = """ - obj: ! - b: !core/ndarray-1.0.0 - data: [1, 2, 3] +obj: ! + b: !core/ndarray-1.0.0 + data: [1, 2, 3] """ buff = helpers.yaml_to_asdf(content) - with asdf.open(buff, extensions=CustomExtension()): + with asdf.open(buff): pass content = """ - obj: ! - b: !core/ndarray-1.0.0 - data: [[1, 2, 3]] +obj: ! + b: !core/ndarray-1.0.0 + data: [[1, 2, 3]] """ buff = helpers.yaml_to_asdf(content) - with asdf.open(buff, extensions=CustomExtension()): + with asdf.open(buff): pass content = """ - obj: ! - b: !core/ndarray-1.0.0 - data: [[[1, 2, 3]]] +obj: ! + b: !core/ndarray-1.0.0 + data: [[[1, 2, 3]]] """ buff = helpers.yaml_to_asdf(content) with pytest.raises(ValidationError, match=r"Wrong number of dimensions:.*"), asdf.open( buff, - extensions=CustomExtension(), ): pass +@with_custom_extension() def test_datatype_validation(tmpdir): content = """ - obj: ! - a: !core/ndarray-1.0.0 - data: [1, 2, 3] - datatype: float32 +obj: ! + a: !core/ndarray-1.0.0 + data: [1, 2, 3] + datatype: float32 """ buff = helpers.yaml_to_asdf(content) - with asdf.open(buff, extensions=CustomExtension()): + with asdf.open(buff): pass content = """ - obj: ! - a: !core/ndarray-1.0.0 - data: [1, 2, 3] - datatype: float64 +obj: ! + a: !core/ndarray-1.0.0 + data: [1, 2, 3] + datatype: float64 """ buff = helpers.yaml_to_asdf(content) with pytest.raises(ValidationError, match=r"Can not safely cast from .* to .*"), asdf.open( buff, - extensions=CustomExtension(), ): pass content = """ - obj: ! - a: !core/ndarray-1.0.0 - data: [1, 2, 3] - datatype: int16 +obj: ! + a: !core/ndarray-1.0.0 + data: [1, 2, 3] + datatype: int16 """ buff = helpers.yaml_to_asdf(content) - with asdf.open(buff, extensions=CustomExtension()): + with asdf.open(buff): pass content = """ - obj: ! - b: !core/ndarray-1.0.0 - data: [1, 2, 3] - datatype: int16 +obj: ! + b: !core/ndarray-1.0.0 + data: [1, 2, 3] + datatype: int16 """ buff = helpers.yaml_to_asdf(content) with pytest.raises(ValidationError, match=r"Expected datatype .*, got .*"), asdf.open( buff, - extensions=CustomExtension(), ): pass content = """ - obj: ! - a: !core/ndarray-1.0.0 - data: [[1, 'a'], [2, 'b'], [3, 'c']] - datatype: - - name: a - datatype: int8 - - name: b - datatype: ['ascii', 8] +obj: ! 
+ a: !core/ndarray-1.0.0 + data: [[1, 'a'], [2, 'b'], [3, 'c']] + datatype: + - name: a + datatype: int8 + - name: b + datatype: ['ascii', 8] """ buff = helpers.yaml_to_asdf(content) with pytest.raises(ValidationError, match=r"Expected scalar datatype .*, got .*"), asdf.open( buff, - extensions=CustomExtension(), ): pass +@with_custom_extension() def test_structured_datatype_validation(tmpdir): content = """ - obj: ! - c: !core/ndarray-1.0.0 - data: [[1, 'a'], [2, 'b'], [3, 'c']] - datatype: - - name: a - datatype: int8 - - name: b - datatype: ['ascii', 8] +obj: ! + c: !core/ndarray-1.0.0 + data: [[1, 'a'], [2, 'b'], [3, 'c']] + datatype: + - name: a + datatype: int8 + - name: b + datatype: ['ascii', 8] """ buff = helpers.yaml_to_asdf(content) - with asdf.open(buff, extensions=CustomExtension()): + with asdf.open(buff): pass content = """ - obj: ! - c: !core/ndarray-1.0.0 - data: [[1, 'a'], [2, 'b'], [3, 'c']] - datatype: - - name: a - datatype: int64 - - name: b - datatype: ['ascii', 8] +obj: ! + c: !core/ndarray-1.0.0 + data: [[1, 'a'], [2, 'b'], [3, 'c']] + datatype: + - name: a + datatype: int64 + - name: b + datatype: ['ascii', 8] """ buff = helpers.yaml_to_asdf(content) with pytest.raises(ValidationError, match=r"Can not safely cast to expected datatype.*"), asdf.open( buff, - extensions=CustomExtension(), ): pass content = """ - obj: ! - c: !core/ndarray-1.0.0 - data: [[1, 'a', 0], [2, 'b', 1], [3, 'c', 2]] - datatype: - - name: a - datatype: int8 - - name: b - datatype: ['ascii', 8] - - name: c - datatype: float64 +obj: ! + c: !core/ndarray-1.0.0 + data: [[1, 'a', 0], [2, 'b', 1], [3, 'c', 2]] + datatype: + - name: a + datatype: int8 + - name: b + datatype: ['ascii', 8] + - name: c + datatype: float64 """ buff = helpers.yaml_to_asdf(content) with pytest.raises(ValidationError, match=r"Mismatch in number of columns:.*"), asdf.open( buff, - extensions=CustomExtension(), ): pass content = """ - obj: ! - c: !core/ndarray-1.0.0 - data: [1, 2, 3] +obj: ! + c: !core/ndarray-1.0.0 + data: [1, 2, 3] """ buff = helpers.yaml_to_asdf(content) with pytest.raises(ValidationError, match=r"Expected structured datatype.*"), asdf.open( buff, - extensions=CustomExtension(), ): pass content = """ - obj: ! - d: !core/ndarray-1.0.0 - data: [[1, 'a'], [2, 'b'], [3, 'c']] - datatype: - - name: a - datatype: int8 - - name: b - datatype: ['ascii', 8] +obj: ! + d: !core/ndarray-1.0.0 + data: [[1, 'a'], [2, 'b'], [3, 'c']] + datatype: + - name: a + datatype: int8 + - name: b + datatype: ['ascii', 8] """ buff = helpers.yaml_to_asdf(content) with pytest.raises(ValidationError, match=r"Expected datatype .*, got .*"), asdf.open( buff, - extensions=CustomExtension(), ): pass content = """ - obj: ! - d: !core/ndarray-1.0.0 - data: [[1, 'a'], [2, 'b'], [3, 'c']] - datatype: - - name: a - datatype: int16 - - name: b - datatype: ['ascii', 16] +obj: ! 
+ d: !core/ndarray-1.0.0 + data: [[1, 'a'], [2, 'b'], [3, 'c']] + datatype: + - name: a + datatype: int16 + - name: b + datatype: ['ascii', 16] """ buff = helpers.yaml_to_asdf(content) - with asdf.open(buff, extensions=CustomExtension()): + with asdf.open(buff): pass @@ -790,9 +861,9 @@ def test_string_inline(): def test_inline_shape_mismatch(): content = """ - arr: !core/ndarray-1.0.0 - data: [1, 2, 3] - shape: [2] +arr: !core/ndarray-1.0.0 + data: [1, 2, 3] + shape: [2] """ buff = helpers.yaml_to_asdf(content) @@ -800,34 +871,11 @@ def test_inline_shape_mismatch(): pass -@pytest.mark.xfail(reason="NDArrays with dtype=object are not currently supported") -def test_simple_object_array(tmpdir): - # See https://github.com/asdf-format/asdf/issues/383 for feature - # request - dictdata = np.empty((3, 3), dtype=object) - for i, _ in enumerate(dictdata.flat): - dictdata.flat[i] = {"foo": i * 42, "bar": i**2} - - helpers.assert_roundtrip_tree({"bizbaz": dictdata}, tmpdir) - - -@pytest.mark.xfail(reason="NDArrays with dtype=object are not currently supported") -def test_tagged_object_array(tmpdir): - # See https://github.com/asdf-format/asdf/issues/383 for feature - # request - quantity = pytest.importorskip("astropy.units.quantity") - - objdata = np.empty((3, 3), dtype=object) - for i, _ in enumerate(objdata.flat): - objdata.flat[i] = quantity.Quantity(i, "angstrom") - - helpers.assert_roundtrip_tree({"bizbaz": objdata}, tmpdir) - - def test_broadcasted_array(tmpdir): attrs = np.broadcast_arrays(np.array([10, 20]), np.array(10), np.array(10)) tree = {"one": attrs[1]} # , 'two': attrs[1], 'three': attrs[2]} - helpers.assert_roundtrip_tree(tree, tmpdir) + with roundtrip(tree) as af: + assert_array_equal(tree["one"], af["one"]) def test_broadcasted_offset_array(tmpdir): @@ -835,30 +883,30 @@ def test_broadcasted_offset_array(tmpdir): offset = base[5:] broadcasted = np.broadcast_to(offset, (4, 5)) tree = {"broadcasted": broadcasted} - helpers.assert_roundtrip_tree(tree, tmpdir) + with roundtrip(tree) as af: + assert_array_equal(tree["broadcasted"], af["broadcasted"]) def test_non_contiguous_base_array(tmpdir): base = np.arange(60).reshape(5, 4, 3).transpose(2, 0, 1) * 1 contiguous = base.transpose(1, 2, 0) tree = {"contiguous": contiguous} - helpers.assert_roundtrip_tree(tree, tmpdir) + with roundtrip(tree) as af: + assert_array_equal(tree["contiguous"], af["contiguous"]) def test_fortran_order(tmpdir): array = np.array([[11, 12, 13], [21, 22, 23]], order="F", dtype=np.int64) tree = {"data": array} - def check_f_order(t): - assert t["data"].flags.fortran - assert np.all(np.isclose(array, t["data"])) + with roundtrip(tree) as af: + assert af["data"].flags.fortran + assert np.all(np.isclose(array, af["data"])) - def check_raw_yaml(content): + with roundtrip(tree, raw=True) as content: tree = yaml.safe_load(re.sub(rb"!core/\S+", b"", content)) assert tree["data"]["strides"] == [8, 16] - helpers.assert_roundtrip_tree(tree, tmpdir, asdf_check_func=check_f_order, raw_yaml_check_func=check_raw_yaml) - def test_memmap_write(tmpdir): tmpfile = str(tmpdir.join("data.asdf")) From eb944b946bbe35733b17ae6959f92458ca45be73 Mon Sep 17 00:00:00 2001 From: Brett Date: Thu, 8 Jun 2023 16:18:50 -0400 Subject: [PATCH 130/154] attempt to fix FutureWarning for array comparsion with older numpy --- asdf/_tests/tags/core/tests/test_ndarray.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/asdf/_tests/tags/core/tests/test_ndarray.py b/asdf/_tests/tags/core/tests/test_ndarray.py index 
d75ddabfd..45373326f 100644 --- a/asdf/_tests/tags/core/tests/test_ndarray.py +++ b/asdf/_tests/tags/core/tests/test_ndarray.py @@ -222,7 +222,9 @@ def test_table_inline(tmpdir): config.array_inline_threshold = 100 with roundtrip(tree) as af: - assert_array_equal(table, af["table_data"]) + assert table.dtype.names == af["table_data"].dtype.names + for n in table.dtype.names: + assert_array_equal(table[n], af["table_data"][n]) with roundtrip(tree, raw=True) as content: tree = yaml.safe_load(re.sub(rb"!core/\S+", b"", content)) From cc2fd1f490a8c2c76c5083f2d5cbb9a631f226f2 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 9 Aug 2023 10:24:34 -0400 Subject: [PATCH 131/154] rename some local variables to avoid confusion --- asdf/yamlutil.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/asdf/yamlutil.py b/asdf/yamlutil.py index eb88d5520..074fa6960 100644 --- a/asdf/yamlutil.py +++ b/asdf/yamlutil.py @@ -226,8 +226,9 @@ def _convert_obj(obj, converter): tag = converter.select_tag(obj, _serialization_context) # if select_tag returns None, converter.to_yaml_tree should return a new # object which will be handled by a different converter + converters_used = set() while tag is None: - converters.add(converter) + converters_used.add(converter) obj = converter.to_yaml_tree(obj, tag, _serialization_context) try: converter = extension_manager.get_converter_for_type(type(obj)) @@ -235,7 +236,7 @@ def _convert_obj(obj, converter): # no converter supports this type, return it as-is yield obj return - if converter in converters: + if converter in converters_used: msg = "Conversion cycle detected" raise TypeError(msg) tag = converter.select_tag(obj, _serialization_context) @@ -268,15 +269,15 @@ def _convert_obj(obj, converter): cfg = config.get_config() convert_ndarray_subclasses = cfg.convert_unknown_ndarray_subclasses - converters = {} + converters_cache = {} def _walker(obj): typ = type(obj) - if typ in converters: - return converters[typ](obj) + if typ in converters_cache: + return converters_cache[typ](obj) if extension_manager.handles_type(typ): converter = extension_manager.get_converter_for_type(typ) - converters[typ] = lambda obj, _converter=converter: _convert_obj(obj, _converter) + converters_cache[typ] = lambda obj, _converter=converter: _convert_obj(obj, _converter) return _convert_obj(obj, converter) if convert_ndarray_subclasses and isinstance(obj, np.ndarray) and extension_manager._handles_subtype(typ): warnings.warn( @@ -286,7 +287,7 @@ def _walker(obj): AsdfConversionWarning, ) converter = extension_manager._get_converter_for_subtype(typ) - converters[typ] = lambda obj, _converter=converter: _convert_obj(obj, _converter) + converters_cache[typ] = lambda obj, _converter=converter: _convert_obj(obj, _converter) return _convert_obj(obj, converter) tag = ctx._type_index.from_custom_type( @@ -296,10 +297,10 @@ def _walker(obj): ) if tag is not None: - converters[typ] = lambda obj, _tag=tag: _tag.to_tree_tagged(obj, ctx) + converters_cache[typ] = lambda obj, _tag=tag: _tag.to_tree_tagged(obj, ctx) return tag.to_tree_tagged(obj, ctx) - converters[typ] = lambda obj: obj + converters_cache[typ] = lambda obj: obj return obj return treeutil.walk_and_modify( From c7fb962c86de8883ac090e7880abc93a9acc3d94 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 9 Aug 2023 10:36:57 -0400 Subject: [PATCH 132/154] fix stream deprecation test --- asdf/_tests/test_deprecated.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/asdf/_tests/test_deprecated.py 
b/asdf/_tests/test_deprecated.py index 45df227f9..9402c3b15 100644 --- a/asdf/_tests/test_deprecated.py +++ b/asdf/_tests/test_deprecated.py @@ -1,3 +1,5 @@ +import sys + import pytest import asdf From 335b6780da313851a636a0801f9f842263f7c9f3 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 9 Aug 2023 11:30:22 -0400 Subject: [PATCH 133/154] fix reference uri resolution --- asdf/core/_converters/reference.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/asdf/core/_converters/reference.py b/asdf/core/_converters/reference.py index 316b24ffd..eeedaec25 100644 --- a/asdf/core/_converters/reference.py +++ b/asdf/core/_converters/reference.py @@ -8,7 +8,12 @@ class ReferenceConverter(Converter): def to_yaml_tree(self, obj, tag, ctx): from asdf.generic_io import relative_uri - uri = relative_uri(ctx.url, obj._uri) if ctx.url is not None else obj._uri + base_uri = None + if ctx._blocks._write_fd is not None and ctx._blocks._write_fd.uri is not None: + base_uri = ctx._blocks._write_fd.uri + elif ctx.url is not None: + base_uri = ctx.url + uri = relative_uri(base_uri, obj._uri) if base_uri is not None else obj._uri return {"$ref": uri} def from_yaml_tree(self, node, tag, ctx): From b6adf6118d083cae29d2f4a64cca7117a4cdc160 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 9 Aug 2023 11:40:58 -0400 Subject: [PATCH 134/154] run pre-commit locally --- asdf/_tests/test_asdf.py | 2 +- asdf/asdf.py | 5 +---- asdf/core/_extensions.py | 2 +- asdf/tags/core/ndarray.py | 2 -- 4 files changed, 3 insertions(+), 8 deletions(-) diff --git a/asdf/_tests/test_asdf.py b/asdf/_tests/test_asdf.py index dfa0b86a9..88366642b 100644 --- a/asdf/_tests/test_asdf.py +++ b/asdf/_tests/test_asdf.py @@ -8,7 +8,7 @@ from asdf.asdf import AsdfFile, open_asdf from asdf.entry_points import get_extensions from asdf.exceptions import AsdfWarning -from asdf.extension import ExtensionProxy, SerializationContext +from asdf.extension import ExtensionProxy from asdf.extension._legacy import AsdfExtensionList from asdf.versioning import AsdfVersion diff --git a/asdf/asdf.py b/asdf/asdf.py index 1ccc85cf3..a48a6dd85 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -15,16 +15,14 @@ from . 
import compression as mcompression from ._block.manager import Manager as BlockManager from ._helpers import validate_version -from ._serialization_context import SerializationContext # noqa: F401 from .config import config_context, get_config from .exceptions import ( AsdfConversionWarning, - AsdfDeprecationWarning, AsdfWarning, DelimiterNotFoundError, ValidationError, ) -from .extension import Extension, ExtensionProxy, SerializationContext, _legacy, get_cached_extension_manager +from .extension import Extension, ExtensionProxy, _legacy, get_cached_extension_manager from .search import AsdfSearchResult from .tags.core import AsdfObject, ExtensionMetadata, HistoryEntry, Software from .util import NotSet @@ -792,7 +790,6 @@ def _open_asdf( raise ValueError(msg) with config_context(): - # validate_checksums (unlike memmap and lazy_load) is provided # here instead of in __init__ self._blocks._validate_checksums = validate_checksums diff --git a/asdf/core/_extensions.py b/asdf/core/_extensions.py index f7493af7c..fb2a9d982 100644 --- a/asdf/core/_extensions.py +++ b/asdf/core/_extensions.py @@ -3,8 +3,8 @@ from ._converters.complex import ComplexConverter from ._converters.constant import ConstantConverter from ._converters.external_reference import ExternalArrayReferenceConverter -from ._converters.reference import ReferenceConverter from ._converters.ndarray import NDArrayConverter +from ._converters.reference import ReferenceConverter from ._converters.tree import ( AsdfObjectConverter, ExtensionMetadataConverter, diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index 53ab5535f..5567b909f 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -6,8 +6,6 @@ from asdf import util from asdf._jsonschema import ValidationError -from asdf._block.options import Options -from asdf.config import config_context _datatype_names = { "int8": "i1", From d36b929d1f920ceeb1180f09eef75a1e6e847517 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 9 Aug 2023 11:41:20 -0400 Subject: [PATCH 135/154] ignore hypothesis files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index c47a23cdd..b59a5cded 100644 --- a/.gitignore +++ b/.gitignore @@ -59,3 +59,6 @@ asdf/_version.py # airspeed velocity files .asv + +# hypothesis files +.hypothesis From bb9a98c38f286d586a92bcbf86115c0e8aa78a3c Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 9 Aug 2023 13:13:10 -0400 Subject: [PATCH 136/154] move SerializationContext out of asdf.extension --- asdf/_serialization_context.py | 8 +- asdf/extension/__init__.py | 2 - asdf/extension/_serialization_context.py | 159 ----------------------- 3 files changed, 6 insertions(+), 163 deletions(-) delete mode 100644 asdf/extension/_serialization_context.py diff --git a/asdf/_serialization_context.py b/asdf/_serialization_context.py index 46d94b1af..dfd1d253b 100644 --- a/asdf/_serialization_context.py +++ b/asdf/_serialization_context.py @@ -3,12 +3,16 @@ from ._block.key import Key as BlockKey from ._block.options import Options as BlockOptions from ._helpers import validate_version -from .extension import ExtensionProxy +from .extension._extension import ExtensionProxy class SerializationContext: """ Container for parameters of the current (de)serialization. + + This class should not be instantiated directly and instead + will be created by the AsdfFile object and provided to extension + classes (like Converters) via method arguments. 
""" def __init__(self, version, extension_manager, url, blocks): @@ -30,7 +34,7 @@ def url(self): written to an `io.BytesIO`). Returns - -------- + ------- str or None """ return self._url diff --git a/asdf/extension/__init__.py b/asdf/extension/__init__.py index 909094f8d..7e4f96e2a 100644 --- a/asdf/extension/__init__.py +++ b/asdf/extension/__init__.py @@ -9,7 +9,6 @@ from ._extension import Extension, ExtensionProxy from ._manager import ExtensionManager, get_cached_extension_manager from ._manifest import ManifestExtension -from ._serialization_context import SerializationContext from ._tag import TagDefinition from ._validator import Validator @@ -25,5 +24,4 @@ "ConverterProxy", "Compressor", "Validator", - "SerializationContext", ] diff --git a/asdf/extension/_serialization_context.py b/asdf/extension/_serialization_context.py deleted file mode 100644 index 0697925e2..000000000 --- a/asdf/extension/_serialization_context.py +++ /dev/null @@ -1,159 +0,0 @@ -import contextlib - -from asdf._helpers import validate_version -from asdf.extension import ExtensionProxy - - -class SerializationContext: - """ - Container for parameters of the current (de)serialization. - - This class should not be instantiated directly and instead - will be created by the AsdfFile object and provided to extension - classes (like Converters) via method arguments. - """ - - def __init__(self, version, extension_manager, url, blocks): - self._version = validate_version(version) - self._extension_manager = extension_manager - self._url = url - self._blocks = blocks - - self.__extensions_used = set() - - @property - def url(self): - """ - The URL (if any) of the file being read or written. - - Used to compute relative locations of external files referenced by this - ASDF file. The URL will not exist in some cases (e.g. when the file is - written to an `io.BytesIO`). - - Returns - ------- - str or None - """ - return self._url - - @property - def version(self): - """ - Get the ASDF Standard version. - - Returns - ------- - str - """ - return self._version - - @property - def extension_manager(self): - """ - Get the ExtensionManager for enabled extensions. - - Returns - ------- - asdf.extension.ExtensionManager - """ - return self._extension_manager - - def _mark_extension_used(self, extension): - """ - Note that an extension was used when reading or writing the file. - - Parameters - ---------- - extension : asdf.extension.Extension - """ - self.__extensions_used.add(ExtensionProxy.maybe_wrap(extension)) - - @property - def _extensions_used(self): - """ - Get the set of extensions that were used when reading or writing the file. 
- - Returns - ------- - set of asdf.extension.Extension - """ - return self.__extensions_used - - @contextlib.contextmanager - def _deserialization(self): - self._obj = None - self._blk = None - self._cb = None - yield self - if self._blk is not None: - self._blocks.blocks.assign_object(self._obj, self._blk) - self._blocks._data_callbacks.assign_object(self._obj, self._cb) - - @contextlib.contextmanager - def _serialization(self, obj): - self._obj = obj - yield self - - def get_block_data_callback(self, index, key=None): - """ - Generate a callable that when called will read data - from a block at the provided index - - Parameters - ---------- - index : int - Block index - - key : BlockKey - TODO - - Returns - ------- - callback : callable - A callable that when called (with no arguments) returns - the block data as a one dimensional array of uint8 - """ - blk = self._blocks.blocks[index] - cb = self._blocks._get_data_callback(index) - - if key is None: - if self._blk is not None: - msg = "Converters accessing >1 block must provide a key for each block" - raise OSError(msg) - self._blk = blk - self._cb = cb - else: - self._blocks.blocks.assign_object(key, blk) - self._blocks._data_callbacks.assign_object(key, cb) - - return cb - - def find_available_block_index(self, data_callback, lookup_key=None): - """ - Find the index of an available block to write data. - - This is typically used inside asdf.extension.Converter.to_yaml_tree - - Parameters - ---------- - data_callback: callable - Callable that when called will return data (ndarray) that will - be written to a block. - At the moment, this is only assigned if a new block - is created to avoid circular references during AsdfFile.update. - - lookup_key : hashable, optional - Unique key used to retrieve the index of a block that was - previously allocated or reserved. For ndarrays this is - typically the id of the base ndarray. - - Returns - ------- - block_index: int - Index of the block where data returned from data_callback - will be written. - """ - - if lookup_key is None: - lookup_key = self._obj - return self._blocks.make_write_block(data_callback, BlockOptions(), lookup_key) From 7cc553c422946453889cfdbc8409195c45c82331 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 9 Aug 2023 13:15:05 -0400 Subject: [PATCH 137/154] keep SerializationContext exposed at asdf.asdf --- asdf/asdf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asdf/asdf.py b/asdf/asdf.py index a48a6dd85..9bae46623 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -15,6 +15,7 @@ from . 
import compression as mcompression from ._block.manager import Manager as BlockManager from ._helpers import validate_version +from ._serialization_context import SerializationContext # noqa: F401 from .config import config_context, get_config from .exceptions import ( AsdfConversionWarning, From c224590e7def812ed587b61a9aadcc41216fa9e6 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 9 Aug 2023 13:30:07 -0400 Subject: [PATCH 138/154] temporarily use dev sphinx-asdf --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4c59c81f8..b528dedbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ all = [ "lz4>=0.10", ] docs = [ - "sphinx-asdf>=0.1.4", + "sphinx-asdf @ git+https://github.com/braingram/sphinx_asdf.git@immutable_block_manager", "graphviz", "sphinx-inline-tabs", 'tomli; python_version < "3.11"', From 4148331fe4194c7d6da04277cf61b0cf35d28dc0 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 9 Aug 2023 13:45:33 -0400 Subject: [PATCH 139/154] move SerializationContext back into asdf.extension --- asdf/_block/manager.py | 7 +++++-- asdf/_tests/test_serialization_context.py | 2 +- asdf/asdf.py | 6 +++--- asdf/extension/__init__.py | 12 +++++++----- asdf/{ => extension}/_serialization_context.py | 9 ++++----- asdf/yamlutil.py | 2 +- 6 files changed, 21 insertions(+), 17 deletions(-) rename asdf/{ => extension}/_serialization_context.py (96%) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 80034d4e7..300e5904f 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -389,9 +389,10 @@ def make_write_block(self, data, options, obj): Data to be written to an ASDF block. Can be provided as a callable function that when evaluated will return the data. - options : Options + options : Options or None Options instance used to define the ASDF block compression - and storage type. + and storage type. If None, a new Options instance will + be created. obj : object An object in the ASDF tree that will be associated with the new WriteBlock so that `AsdfFile.update` can @@ -410,6 +411,8 @@ def make_write_block(self, data, options, obj): If a external block was created without a URI for the main file. """ + if options is None: + options = Options() if options.storage_type == "external": for index, blk in enumerate(self._external_write_blocks): if blk._data is data: diff --git a/asdf/_tests/test_serialization_context.py b/asdf/_tests/test_serialization_context.py index ef9116555..85d455afb 100644 --- a/asdf/_tests/test_serialization_context.py +++ b/asdf/_tests/test_serialization_context.py @@ -3,8 +3,8 @@ import asdf from asdf import get_config -from asdf._serialization_context import BlockAccess, SerializationContext from asdf.extension import ExtensionManager +from asdf.extension._serialization_context import BlockAccess, SerializationContext def test_serialization_context(): diff --git a/asdf/asdf.py b/asdf/asdf.py index 9bae46623..823b6019f 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -10,12 +10,11 @@ from . import _display as display from . import _node_info as node_info -from . import _serialization_context, constants, generic_io, reference, schema, treeutil, util, versioning, yamlutil from . import _version as version from . import compression as mcompression +from . 
import constants, generic_io, reference, schema, treeutil, util, versioning, yamlutil from ._block.manager import Manager as BlockManager from ._helpers import validate_version -from ._serialization_context import SerializationContext # noqa: F401 from .config import config_context, get_config from .exceptions import ( AsdfConversionWarning, @@ -23,7 +22,8 @@ DelimiterNotFoundError, ValidationError, ) -from .extension import Extension, ExtensionProxy, _legacy, get_cached_extension_manager +from .extension import Extension, ExtensionProxy, _legacy, _serialization_context, get_cached_extension_manager +from .extension._serialization_context import SerializationContext # noqa: F401 from .search import AsdfSearchResult from .tags.core import AsdfObject, ExtensionMetadata, HistoryEntry, Software from .util import NotSet diff --git a/asdf/extension/__init__.py b/asdf/extension/__init__.py index 7e4f96e2a..b51911caf 100644 --- a/asdf/extension/__init__.py +++ b/asdf/extension/__init__.py @@ -9,19 +9,21 @@ from ._extension import Extension, ExtensionProxy from ._manager import ExtensionManager, get_cached_extension_manager from ._manifest import ManifestExtension +from ._serialization_context import SerializationContext from ._tag import TagDefinition from ._validator import Validator __all__ = [ # New API + "Compressor", + "Converter", + "ConverterProxy", "Extension", + "ExtensionManager", "ExtensionProxy", "ManifestExtension", - "ExtensionManager", - "get_cached_extension_manager", + "SerializationContext", "TagDefinition", - "Converter", - "ConverterProxy", - "Compressor", "Validator", + "get_cached_extension_manager", ] diff --git a/asdf/_serialization_context.py b/asdf/extension/_serialization_context.py similarity index 96% rename from asdf/_serialization_context.py rename to asdf/extension/_serialization_context.py index dfd1d253b..66714676d 100644 --- a/asdf/_serialization_context.py +++ b/asdf/extension/_serialization_context.py @@ -1,9 +1,8 @@ import enum -from ._block.key import Key as BlockKey -from ._block.options import Options as BlockOptions -from ._helpers import validate_version -from .extension._extension import ExtensionProxy +from asdf._block.key import Key as BlockKey +from asdf._helpers import validate_version +from asdf.extension._extension import ExtensionProxy class SerializationContext: @@ -228,7 +227,7 @@ class WriteBlocksContext(SerializationContext): def find_available_block_index(self, data_callback, key=None): if key is None: key = self._obj - return self._blocks.make_write_block(data_callback, BlockOptions(), key) + return self._blocks.make_write_block(data_callback, None, key) def generate_block_key(self): return BlockKey(self._obj) diff --git a/asdf/yamlutil.py b/asdf/yamlutil.py index 074fa6960..00b0deb63 100644 --- a/asdf/yamlutil.py +++ b/asdf/yamlutil.py @@ -6,9 +6,9 @@ import yaml from . 
import config, schema, tagged, treeutil, util -from ._serialization_context import BlockAccess from .constants import STSCI_SCHEMA_TAG_BASE, YAML_TAG_PREFIX from .exceptions import AsdfConversionWarning +from .extension._serialization_context import BlockAccess from .tags.core import AsdfObject from .versioning import _yaml_base_loader, split_tag_version From ab67c4893d424b63ce301305f8e34c1237424fad Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 9 Aug 2023 13:54:53 -0400 Subject: [PATCH 140/154] deprecate import of asdf.asdf.SerializationContext --- asdf/_tests/test_deprecated.py | 5 +++++ asdf/asdf.py | 17 ++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/asdf/_tests/test_deprecated.py b/asdf/_tests/test_deprecated.py index 9402c3b15..2e0623894 100644 --- a/asdf/_tests/test_deprecated.py +++ b/asdf/_tests/test_deprecated.py @@ -36,3 +36,8 @@ def test_asdf_stream_deprecation(): if "asdf.stream" in sys.modules: del sys.modules["asdf.stream"] import asdf.stream # noqa: F401 + + +def test_asdf_asdf_SerializationContext_import_deprecation(): + with pytest.warns(AsdfDeprecationWarning, match="importing SerializationContext from asdf.asdf"): + from asdf.asdf import SerializationContext # noqa: F401 diff --git a/asdf/asdf.py b/asdf/asdf.py index 823b6019f..1bc558044 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -18,17 +18,32 @@ from .config import config_context, get_config from .exceptions import ( AsdfConversionWarning, + AsdfDeprecationWarning, AsdfWarning, DelimiterNotFoundError, ValidationError, ) from .extension import Extension, ExtensionProxy, _legacy, _serialization_context, get_cached_extension_manager -from .extension._serialization_context import SerializationContext # noqa: F401 from .search import AsdfSearchResult from .tags.core import AsdfObject, ExtensionMetadata, HistoryEntry, Software from .util import NotSet +def __getattr__(name): + if name == "SerializationContext": + warnings.warn( + "importing SerializationContext from asdf.asdf is deprecated. 
" + "Please import SerializationContext from asdf.extension", + AsdfDeprecationWarning, + ) + from .extension._serialization_context import SerializationContext + + return SerializationContext + + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) + + def get_asdf_library_info(): """ Get information about asdf to include in the asdf_library entry From 180259737322263b39fc0cd505cbc655e87fca9b Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 9 Aug 2023 14:15:25 -0400 Subject: [PATCH 141/154] move _issues tests to _regtests and rename tests --- asdf/_tests/_issues/__init__.py | 0 asdf/_tests/{_issues => _regtests}/test_1013.py | 8 +++++++- asdf/_tests/{_issues => _regtests}/test_1505.py | 2 +- asdf/_tests/{_issues => _regtests}/test_1520.py | 2 +- asdf/_tests/{_issues => _regtests}/test_1523.py | 2 +- asdf/_tests/{_issues => _regtests}/test_1525.py | 2 +- asdf/_tests/{_issues => _regtests}/test_1526.py | 2 +- asdf/_tests/{_issues => _regtests}/test_1530.py | 2 +- asdf/_tests/{_issues => _regtests}/test_1538.py | 2 +- asdf/_tests/{_issues => _regtests}/test_1539.py | 2 +- asdf/_tests/{_issues => _regtests}/test_1540.py | 2 +- asdf/_tests/{_issues => _regtests}/test_1541.py | 2 +- asdf/_tests/{_issues => _regtests}/test_1542.py | 2 +- 13 files changed, 18 insertions(+), 12 deletions(-) delete mode 100644 asdf/_tests/_issues/__init__.py rename asdf/_tests/{_issues => _regtests}/test_1013.py (87%) rename asdf/_tests/{_issues => _regtests}/test_1505.py (89%) rename asdf/_tests/{_issues => _regtests}/test_1520.py (94%) rename asdf/_tests/{_issues => _regtests}/test_1523.py (93%) rename asdf/_tests/{_issues => _regtests}/test_1525.py (91%) rename asdf/_tests/{_issues => _regtests}/test_1526.py (91%) rename asdf/_tests/{_issues => _regtests}/test_1530.py (93%) rename asdf/_tests/{_issues => _regtests}/test_1538.py (87%) rename asdf/_tests/{_issues => _regtests}/test_1539.py (89%) rename asdf/_tests/{_issues => _regtests}/test_1540.py (86%) rename asdf/_tests/{_issues => _regtests}/test_1541.py (90%) rename asdf/_tests/{_issues => _regtests}/test_1542.py (92%) diff --git a/asdf/_tests/_issues/__init__.py b/asdf/_tests/_issues/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/asdf/_tests/_issues/test_1013.py b/asdf/_tests/_regtests/test_1013.py similarity index 87% rename from asdf/_tests/_issues/test_1013.py rename to asdf/_tests/_regtests/test_1013.py index b10e43eaf..b95f02d4f 100644 --- a/asdf/_tests/_issues/test_1013.py +++ b/asdf/_tests/_regtests/test_1013.py @@ -3,7 +3,13 @@ import asdf -def test_1013(tmp_path): +def test_control_array_storage_in_to_yaml_tree_methods(tmp_path): + """ + controlling array storage in to_yaml_tree methods + + https://github.com/asdf-format/asdf/issues/1013 + """ + class FooType: def __init__(self, data): self.data = data diff --git a/asdf/_tests/_issues/test_1505.py b/asdf/_tests/_regtests/test_1505.py similarity index 89% rename from asdf/_tests/_issues/test_1505.py rename to asdf/_tests/_regtests/test_1505.py index 7a0c8796d..92cd3da63 100644 --- a/asdf/_tests/_issues/test_1505.py +++ b/asdf/_tests/_regtests/test_1505.py @@ -3,7 +3,7 @@ import asdf -def test_1505(tmp_path): +def test_update_fails_after_write_to(tmp_path): """ Calling update after write_to fails diff --git a/asdf/_tests/_issues/test_1520.py b/asdf/_tests/_regtests/test_1520.py similarity index 94% rename from asdf/_tests/_issues/test_1520.py rename to asdf/_tests/_regtests/test_1520.py index 8b99b0c26..587ebf6d7 100644 --- 
a/asdf/_tests/_issues/test_1520.py +++ b/asdf/_tests/_regtests/test_1520.py @@ -3,7 +3,7 @@ import asdf -def test_1520(tmp_path): +def test_failed_update_corrupts_file(tmp_path): """ A failed update can corrupt the file diff --git a/asdf/_tests/_issues/test_1523.py b/asdf/_tests/_regtests/test_1523.py similarity index 93% rename from asdf/_tests/_issues/test_1523.py rename to asdf/_tests/_regtests/test_1523.py index 65451df51..1777f769d 100644 --- a/asdf/_tests/_issues/test_1523.py +++ b/asdf/_tests/_regtests/test_1523.py @@ -3,7 +3,7 @@ import asdf -def test_1523(tmp_path): +def test_update_corrupts_stream_data(tmp_path): """ update corrupts stream data https://github.com/asdf-format/asdf/issues/1523 diff --git a/asdf/_tests/_issues/test_1525.py b/asdf/_tests/_regtests/test_1525.py similarity index 91% rename from asdf/_tests/_issues/test_1525.py rename to asdf/_tests/_regtests/test_1525.py index 5a5880bde..82d56bea8 100644 --- a/asdf/_tests/_issues/test_1525.py +++ b/asdf/_tests/_regtests/test_1525.py @@ -3,7 +3,7 @@ import asdf -def test_1525(tmp_path): +def test_external_blocks_always_lazy_loaded_and_memmapped(tmp_path): """ External blocks are always lazy loaded and memmapped diff --git a/asdf/_tests/_issues/test_1526.py b/asdf/_tests/_regtests/test_1526.py similarity index 91% rename from asdf/_tests/_issues/test_1526.py rename to asdf/_tests/_regtests/test_1526.py index 6ff9b22cc..2552e3e6d 100644 --- a/asdf/_tests/_issues/test_1526.py +++ b/asdf/_tests/_regtests/test_1526.py @@ -5,7 +5,7 @@ import asdf -def test_1526(tmp_path): +def test_rewrite_file_with_unaccessed_external_blocks_fails(tmp_path): """ Rewriting a file with external blocks fails if arrays are not first accessed diff --git a/asdf/_tests/_issues/test_1530.py b/asdf/_tests/_regtests/test_1530.py similarity index 93% rename from asdf/_tests/_issues/test_1530.py rename to asdf/_tests/_regtests/test_1530.py index 0d7b65778..353bca11d 100644 --- a/asdf/_tests/_issues/test_1530.py +++ b/asdf/_tests/_regtests/test_1530.py @@ -5,7 +5,7 @@ @pytest.mark.xfail(reason="fixing this may require subclassing ndarray") -def test_1530(tmp_path): +def test_update_with_memmapped_data_can_make_view_data_invalid(tmp_path): """ Calling update with memmapped data can create invalid data in memmap views diff --git a/asdf/_tests/_issues/test_1538.py b/asdf/_tests/_regtests/test_1538.py similarity index 87% rename from asdf/_tests/_issues/test_1538.py rename to asdf/_tests/_regtests/test_1538.py index b79418722..d71b1d8df 100644 --- a/asdf/_tests/_issues/test_1538.py +++ b/asdf/_tests/_regtests/test_1538.py @@ -3,7 +3,7 @@ import asdf -def test_1538(tmp_path): +def test_unable_to_read_empty_inline_array(tmp_path): """ ASDF unable to read empty inline array diff --git a/asdf/_tests/_issues/test_1539.py b/asdf/_tests/_regtests/test_1539.py similarity index 89% rename from asdf/_tests/_issues/test_1539.py rename to asdf/_tests/_regtests/test_1539.py index bcea762c1..e1c042a04 100644 --- a/asdf/_tests/_issues/test_1539.py +++ b/asdf/_tests/_regtests/test_1539.py @@ -6,7 +6,7 @@ @pytest.mark.xfail(reason="Fix will require more major changes to generic_io") -def test_1539(): +def test_invalid_seek_and_read_from_closed_memoryio(): """ Seek and read from closed MemoryIO diff --git a/asdf/_tests/_issues/test_1540.py b/asdf/_tests/_regtests/test_1540.py similarity index 86% rename from asdf/_tests/_issues/test_1540.py rename to asdf/_tests/_regtests/test_1540.py index f739db7f4..62c83852d 100644 --- a/asdf/_tests/_issues/test_1540.py +++ 
b/asdf/_tests/_regtests/test_1540.py @@ -3,7 +3,7 @@ import asdf -def test_1540(tmp_path): +def test_writes_but_fails_to_read_inline_structured_array(tmp_path): """ ASDF writes but fails to read inline structured array diff --git a/asdf/_tests/_issues/test_1541.py b/asdf/_tests/_regtests/test_1541.py similarity index 90% rename from asdf/_tests/_issues/test_1541.py rename to asdf/_tests/_regtests/test_1541.py index fc16c78aa..ac3f6bed1 100644 --- a/asdf/_tests/_issues/test_1541.py +++ b/asdf/_tests/_regtests/test_1541.py @@ -7,7 +7,7 @@ @pytest.mark.parametrize("lazy_load", [True, False]) @pytest.mark.parametrize("include_block_index", [True, False]) @pytest.mark.parametrize("index", [0, 1, 2]) -def test_1541(tmp_path, index, include_block_index, lazy_load): +def test_block_checksums_only_checked_for_first_block_if_index_exists(tmp_path, index, include_block_index, lazy_load): """ Block checksums are only checked for first block if a block index is present diff --git a/asdf/_tests/_issues/test_1542.py b/asdf/_tests/_regtests/test_1542.py similarity index 92% rename from asdf/_tests/_issues/test_1542.py rename to asdf/_tests/_regtests/test_1542.py index 96247a27f..e7daec6c0 100644 --- a/asdf/_tests/_issues/test_1542.py +++ b/asdf/_tests/_regtests/test_1542.py @@ -10,7 +10,7 @@ @pytest.mark.skipif( sys.platform.startswith("win"), reason="os.pipe.seek noop on windows: https://bugs.python.org/issue42602" ) -def test_1542(): +def test_failure_to_write_blocks_to_non_seekable_file(): """ ASDF fails to write blocks to non-seekable file From e6e3c4406c47f4bb3031606ef9ddf7ada9d1c5dd Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 9 Aug 2023 14:21:18 -0400 Subject: [PATCH 142/154] typo prevented dev sphinx_asdf install --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b528dedbf..2ce006669 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ all = [ "lz4>=0.10", ] docs = [ - "sphinx-asdf @ git+https://github.com/braingram/sphinx_asdf.git@immutable_block_manager", + "sphinx-asdf @ git+https://github.com/braingram/sphinx-asdf.git@immutable_block_manager", "graphviz", "sphinx-inline-tabs", 'tomli; python_version < "3.11"', From 428dc9e91ef74f2581380b9cc5afdd68e75081dc Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 9 Aug 2023 14:52:01 -0400 Subject: [PATCH 143/154] move write_to version reset into finally and add test to make sure a failed write doesn't modify the tree or the version --- asdf/_tests/test_api.py | 32 ++++++++++++++++++++++++++------ asdf/asdf.py | 11 ++++++----- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/asdf/_tests/test_api.py b/asdf/_tests/test_api.py index 3a52fe6ab..9cf562870 100644 --- a/asdf/_tests/test_api.py +++ b/asdf/_tests/test_api.py @@ -510,13 +510,23 @@ def test_update_asdf_standard_version_tag_selection(): assert b"!core/asdf-1.0.0" not in content -def test_write_to_no_tree_modification(tmp_path): - fn = tmp_path / "test.asdf" +@pytest.mark.parametrize("valid_filename", [True, False], ids=["valid_filename", "invalid_filename"]) +def test_write_to_no_tree_modification(tmp_path, valid_filename): + if valid_filename: + fn = tmp_path / "test.asdf" + else: + fn = "invalid/missing.asdf" fn2 = tmp_path / "test2.asdf" tree = {"foo": None} af = asdf.AsdfFile(tree.copy()) - af.write_to(fn) + try: + af.write_to(fn) + except Exception: + if valid_filename: + raise assert tree == af.tree + if not valid_filename: + return with asdf.open(fn) as af: 
af["history"]["extensions"][0]["software"]["version"] = "0.0.0.dev+abcdefg" af["asdf_library"]["author"] = "foo" @@ -525,11 +535,21 @@ def test_write_to_no_tree_modification(tmp_path): assert af.tree == tree -def test_write_to_no_version_modification(tmp_path): - fn = tmp_path / "test.asdf" +@pytest.mark.parametrize("valid_filename", [True, False], ids=["valid_filename", "invalid_filename"]) +def test_write_to_no_version_modification(tmp_path, valid_filename): + if valid_filename: + fn = tmp_path / "test.asdf" + else: + fn = "invalid/missing.asdf" tree = {"foo": None} af = asdf.AsdfFile(tree.copy(), version="1.0.0") - af.write_to(fn, version="1.1.0") + try: + af.write_to(fn, version="1.1.0") + except Exception: + if valid_filename: + raise assert af.version_string == "1.0.0" + if not valid_filename: + return with asdf.open(fn) as af: assert af.version_string == "1.1.0" diff --git a/asdf/asdf.py b/asdf/asdf.py index 1bc558044..0ea3b45ef 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -1226,11 +1226,12 @@ def write_to( previous_version = self.version self.version = version - with generic_io.get_file(fd, mode="w") as fd: - self._serial_write(fd, pad_blocks, include_block_index) - - if version is not None: - self.version = previous_version + try: + with generic_io.get_file(fd, mode="w") as fd: + self._serial_write(fd, pad_blocks, include_block_index) + finally: + if version is not None: + self.version = previous_version def find_references(self): """ From 75e005d38e973c9afccb162bba30b5ad2c447954 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 9 Aug 2023 15:47:50 -0400 Subject: [PATCH 144/154] add missing BlockAccess docstring --- asdf/extension/_serialization_context.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/asdf/extension/_serialization_context.py b/asdf/extension/_serialization_context.py index 66714676d..80ed8cdc8 100644 --- a/asdf/extension/_serialization_context.py +++ b/asdf/extension/_serialization_context.py @@ -234,7 +234,10 @@ def generate_block_key(self): class BlockAccess(enum.Enum): - """ """ + """ + Block access enumerated values that define + how a SerializationContext can access ASDF blocks. + """ NONE = SerializationContext WRITE = WriteBlocksContext From f74029246b7d57e68e2345b4791b3c76e9c10745 Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 9 Aug 2023 15:54:46 -0400 Subject: [PATCH 145/154] add asdf.asdf.SerializationContext import deprecation to docs --- docs/asdf/deprecations.rst | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/docs/asdf/deprecations.rst b/docs/asdf/deprecations.rst index 6a82c38b5..3b952870f 100644 --- a/docs/asdf/deprecations.rst +++ b/docs/asdf/deprecations.rst @@ -6,6 +6,16 @@ Deprecations ************ +Version 3.0 +=========== + +SerializationContext was previously importable from ``asdf.asdf.SerializationContext``. +Although not part of the public API, this import path has been deprecated and users +should instead import ``SerializationContext`` from `asdf.extension`. + +Version 2.15 +============ + ASDF 2.15 introduced many new `asdf.exceptions.AsdfDeprecationWarning` messages. These warnings are subclasses of the built-in python `DeprecationWarning` and will by default be ignored except in `__main__` and with testing tools such as @@ -18,7 +28,7 @@ versioning, compatibility and support policy). .. 
_legacy_extension_api_deprecation: Legacy Extension API Deprecation -================================ +-------------------------------- A large number of `asdf.exceptions.AsdfDeprecationWarning` messages appear related to use of the ``legacy extension api``. Some examples include: @@ -55,7 +65,7 @@ package that uses these new-style extension api. .. _asdf_in_fits_deprecation: ASDF-in-FITS Deprecation -======================== +------------------------ Support for ``AsdfInFits`` (including the ``asdf.fits_embed`` module) is deprecated. Code using this format can migrate to using `stdatamodels` which @@ -69,7 +79,7 @@ deprecated. .. _tests_helpers_deprecation: asdf.tests.helpers Deprecation -============================== +------------------------------ Use of ``asdf.tests.helpers`` is deprecated. Please see `asdf.testing.helpers` for alternative functions to aid in testing. From 5a143c0f8e0e5cf7c99a90ad94bf7c05ea3af56d Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 14 Aug 2023 10:23:53 -0400 Subject: [PATCH 146/154] add parametrization to 1525 regression test --- asdf/_tests/_regtests/test_1525.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/asdf/_tests/_regtests/test_1525.py b/asdf/_tests/_regtests/test_1525.py index 82d56bea8..a682fc66e 100644 --- a/asdf/_tests/_regtests/test_1525.py +++ b/asdf/_tests/_regtests/test_1525.py @@ -1,9 +1,11 @@ import numpy as np +import pytest import asdf -def test_external_blocks_always_lazy_loaded_and_memmapped(tmp_path): +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_external_blocks_always_lazy_loaded_and_memmapped(tmp_path, copy_arrays): """ External blocks are always lazy loaded and memmapped @@ -16,14 +18,13 @@ def test_external_blocks_always_lazy_loaded_and_memmapped(tmp_path): af.set_array_storage(arr, "external") af.write_to(fn) - for copy_arrays in (True, False): - with asdf.open(fn, copy_arrays=copy_arrays) as af: - # check that block is external - source = af["arr"]._source - assert isinstance(source, str) + with asdf.open(fn, copy_arrays=copy_arrays) as af: + # check that block is external + source = af["arr"]._source + assert isinstance(source, str) - # check if block is memmapped - if copy_arrays: - assert not isinstance(af["arr"].base, np.memmap) - else: - assert isinstance(af["arr"].base, np.memmap) + # check if block is memmapped + if copy_arrays: + assert not isinstance(af["arr"].base, np.memmap) + else: + assert isinstance(af["arr"].base, np.memmap) From adf9a99522c23e50dd393f356244e9eb74fd719e Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 14 Aug 2023 11:40:17 -0400 Subject: [PATCH 147/154] remove unneeded line --- asdf/commands/edit.py | 1 - 1 file changed, 1 deletion(-) diff --git a/asdf/commands/edit.py b/asdf/commands/edit.py index 9cf22e640..9e09d93c9 100644 --- a/asdf/commands/edit.py +++ b/asdf/commands/edit.py @@ -155,7 +155,6 @@ def write_edited_yaml_larger(path, new_content, version): # copy over blocks byte-for-byte from old_first_block_offset to block_index_offset original_fd.seek(old_first_block_offset) - block_index_offset block_size = min(fd.block_size, original_fd.block_size) n_bytes = blocks_end - old_first_block_offset for offset in range(0, n_bytes, block_size): From 66516725d0398206e4b0545d29624f052fd5e669 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 14 Aug 2023 11:54:32 -0400 Subject: [PATCH 148/154] simplify ndarray subclass handling --- asdf/_tests/test_extension.py | 2 -- asdf/extension/_manager.py | 20 -------------------- asdf/yamlutil.py | 4 
++-- 3 files changed, 2 insertions(+), 24 deletions(-) diff --git a/asdf/_tests/test_extension.py b/asdf/_tests/test_extension.py index 813a2cfce..53cb4711d 100644 --- a/asdf/_tests/test_extension.py +++ b/asdf/_tests/test_extension.py @@ -417,7 +417,6 @@ def test_extension_manager(): assert manager.handles_type(FooType) is True assert manager.handles_type(SubFooType) is False - assert manager._handles_subtype(SubFooType) is True # This should return True even though BarType was listed # as string class name: assert manager.handles_type(BarType) is True @@ -440,7 +439,6 @@ def test_extension_manager(): manager.get_converter_for_tag("asdf://somewhere.org/extensions/full/tags/bar-1.0") assert manager.get_converter_for_type(FooType).delegate is converter1 - assert manager._get_converter_for_subtype(SubFooType).delegate is converter1 assert manager.get_converter_for_type(BarType).delegate is converter1 assert manager.get_converter_for_type(BazType).delegate is converter2 with pytest.raises(KeyError, match=r"\"No support available for Python type .*\""): diff --git a/asdf/extension/_manager.py b/asdf/extension/_manager.py index a8f0691ce..8094ef701 100644 --- a/asdf/extension/_manager.py +++ b/asdf/extension/_manager.py @@ -92,14 +92,6 @@ def handles_type(self, typ): """ return typ in self._converters_by_type or get_class_name(typ, instance=False) in self._converters_by_type - def _handles_subtype(self, typ): - for ctyp in self._converters_by_type: - if isinstance(ctyp, str): - continue - if issubclass(typ, ctyp): - return True - return False - def handles_tag_definition(self, tag): """ Return `True` if the specified tag has a definition. @@ -193,18 +185,6 @@ def get_converter_for_type(self, typ): ) raise KeyError(msg) from None - def _get_converter_for_subtype(self, typ): - for ctyp in self._converters_by_type: - if isinstance(ctyp, str): - continue - if issubclass(typ, ctyp): - return self._converters_by_type[ctyp] - msg = ( - f"No support available for Python type '{get_class_name(typ, instance=False)}'. " - "You may need to install or enable an extension." - ) - raise KeyError(msg) from None - @property def validator_manager(self): return self._validator_manager diff --git a/asdf/yamlutil.py b/asdf/yamlutil.py index 00b0deb63..56a854400 100644 --- a/asdf/yamlutil.py +++ b/asdf/yamlutil.py @@ -279,14 +279,14 @@ def _walker(obj): converter = extension_manager.get_converter_for_type(typ) converters_cache[typ] = lambda obj, _converter=converter: _convert_obj(obj, _converter) return _convert_obj(obj, converter) - if convert_ndarray_subclasses and isinstance(obj, np.ndarray) and extension_manager._handles_subtype(typ): + if convert_ndarray_subclasses and isinstance(obj, np.ndarray): warnings.warn( f"A ndarray subclass ({type(obj)}) was converted as a ndarray. " "This behavior will be removed from a future version of ASDF. 
" "See https://asdf.readthedocs.io/en/latest/asdf/config.html#convert-unknown-ndarray-subclasses", AsdfConversionWarning, ) - converter = extension_manager._get_converter_for_subtype(typ) + converter = extension_manager.get_converter_for_type(np.ndarray) converters_cache[typ] = lambda obj, _converter=converter: _convert_obj(obj, _converter) return _convert_obj(obj, converter) From 3a59907b10c8fb56fc3b67859e8fc7f70bbc7518 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 14 Aug 2023 12:01:27 -0400 Subject: [PATCH 149/154] remove unnecessary assign_object(None) --- asdf/yamlutil.py | 1 - 1 file changed, 1 deletion(-) diff --git a/asdf/yamlutil.py b/asdf/yamlutil.py index 56a854400..0fd20b986 100644 --- a/asdf/yamlutil.py +++ b/asdf/yamlutil.py @@ -334,7 +334,6 @@ def _walker(node): if extension_manager.handles_tag(tag): converter = extension_manager.get_converter_for_tag(tag) - _serialization_context.assign_object(None) obj = converter.from_yaml_tree(node.data, tag, _serialization_context) _serialization_context.assign_object(obj) _serialization_context.assign_blocks() From 2caf2c9bd167a0247daf8057e91926ef6135cfba Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 14 Aug 2023 12:26:25 -0400 Subject: [PATCH 150/154] add warnings to failed block index reading --- asdf/_block/reader.py | 8 +++-- asdf/_tests/_block/test_reader.py | 9 +++-- asdf/_tests/commands/tests/data/block0.asdf | Bin 80469 -> 80427 bytes asdf/_tests/commands/tests/data/block1.asdf | Bin 80469 -> 80427 bytes asdf/_tests/test_array_blocks.py | 38 +++++++++++--------- 5 files changed, 34 insertions(+), 21 deletions(-) diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py index 659c8cb21..87c451836 100644 --- a/asdf/_block/reader.py +++ b/asdf/_block/reader.py @@ -237,8 +237,10 @@ def read_blocks(fd, memmap=False, lazy_load=False, validate_checksums=False, aft # setup empty blocks try: block_index = bio.read_block_index(fd, index_offset) - except BlockIndexError: + except BlockIndexError as e: # failed to read block index, fall back to serial reading + msg = f"Failed to read block index, falling back to serial reading: {e!s}" + warnings.warn(msg, AsdfWarning) fd.seek(starting_offset) return _read_blocks_serially(fd, memmap, lazy_load, validate_checksums, after_magic) # skip magic for each block @@ -253,7 +255,9 @@ def read_blocks(fd, memmap=False, lazy_load=False, validate_checksums=False, aft msg = "Invalid block magic" raise OSError(msg) blocks[index].load() - except (OSError, ValueError): + except (OSError, ValueError) as e: + msg = f"Invalid block index contents for block {index}, falling back to serial reading: {e!s}" + warnings.warn(msg, AsdfWarning) fd.seek(starting_offset) return _read_blocks_serially(fd, memmap, lazy_load, after_magic) return blocks diff --git a/asdf/_tests/_block/test_reader.py b/asdf/_tests/_block/test_reader.py index c82896dc4..15a005997 100644 --- a/asdf/_tests/_block/test_reader.py +++ b/asdf/_tests/_block/test_reader.py @@ -124,10 +124,12 @@ def test_invalid_block_index(tmp_path, invalid_block_index): # when the block index is read, only the first and last blocks # are check, so any other invalid entry should result in failure if invalid_block_index in (0, -1): - check(read_blocks(fd, lazy_load=True)) + with pytest.warns(AsdfWarning, match="Invalid block index contents"): + check(read_blocks(fd, lazy_load=True)) elif invalid_block_index == "junk": # read_blocks should fall back to reading serially - check(read_blocks(fd, lazy_load=True)) + with pytest.warns(AsdfWarning, match="Failed to read 
block index"): + check(read_blocks(fd, lazy_load=True)) else: with pytest.raises(ValueError, match="Header size.*"): check(read_blocks(fd, lazy_load=True)) @@ -151,7 +153,8 @@ def test_invalid_block_in_index_with_valid_magic(tmp_path): bio.write_block_index(fd, block_index) fd.seek(0) - check(read_blocks(fd, lazy_load=True)) + with pytest.warns(AsdfWarning, match="Invalid block index contents"): + check(read_blocks(fd, lazy_load=True)) def test_closed_file(tmp_path): diff --git a/asdf/_tests/commands/tests/data/block0.asdf b/asdf/_tests/commands/tests/data/block0.asdf index 46f11afb5740fc5922a050d99eb579f7df54f122..1d5e7d4940b16d52dda6beed1e9f9348611f7a2c 100644 GIT binary patch delta 13 Ucmccmg=O^@mWC~iT2_o)051Cl6#xJL delta 55 zcmZ4eh2`oOmWC~iT2|Wj%8tP diff --git a/asdf/_tests/commands/tests/data/block1.asdf b/asdf/_tests/commands/tests/data/block1.asdf index 97fe038ccf5a2dc3776ad91178d911d2f1018b53..9bf44e186698bcd5476fa7198b4daf4b6b3ffcc8 100644 GIT binary patch delta 13 Ucmccmg=O^@mWC~iT2_o)051Cl6#xJL delta 55 zcmZ4eh2`oOmWC~iT2|Wj%8tP diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py index f601c16ee..de956777e 100644 --- a/asdf/_tests/test_array_blocks.py +++ b/asdf/_tests/test_array_blocks.py @@ -10,6 +10,7 @@ import asdf from asdf import constants, generic_io from asdf._block import io as bio +from asdf.exceptions import AsdfWarning RNG = np.random.default_rng(6) @@ -625,8 +626,9 @@ def test_junk_after_index(): # This has junk after the block index, so it # should fall back to reading serially - with asdf.open(buff) as ff: - assert ff._blocks.blocks[1].loaded + with pytest.warns(AsdfWarning, match="Failed to read block index"): + with asdf.open(buff) as ff: + assert ff._blocks.blocks[1].loaded def test_short_file_find_block_index(): @@ -646,9 +648,10 @@ def test_short_file_find_block_index(): buff.write(b"0" * (io.DEFAULT_BUFFER_SIZE * 4)) buff.seek(0) - with asdf.open(buff) as ff: - assert len(ff._blocks.blocks) == 2 - assert ff._blocks.blocks[1].loaded + with pytest.warns(AsdfWarning, match="Failed to read block index"): + with asdf.open(buff) as ff: + assert len(ff._blocks.blocks) == 2 + assert ff._blocks.blocks[1].loaded def test_invalid_block_index_values(): @@ -675,9 +678,10 @@ def test_invalid_block_index_values(): bio.write_block_index(buff, block_index) buff.seek(0) - with asdf.open(buff) as ff: - assert len(ff._blocks.blocks) == 10 - assert ff._blocks.blocks[1].loaded + with pytest.warns(AsdfWarning, match="Invalid block index contents"): + with asdf.open(buff) as ff: + assert len(ff._blocks.blocks) == 10 + assert ff._blocks.blocks[1].loaded @pytest.mark.parametrize("block_index_index", [0, -1]) @@ -719,11 +723,12 @@ def test_invalid_block_index_offset(block_index_index): ) buff.seek(0) - with asdf.open(buff) as ff: - assert len(ff._blocks.blocks) == 10 - for i, a in enumerate(arrays): - assert ff._blocks.blocks[i].loaded - assert_array_equal(ff["arrays"][i], a) + with pytest.warns(AsdfWarning, match="Invalid block index contents"): + with asdf.open(buff) as ff: + assert len(ff._blocks.blocks) == 10 + for i, a in enumerate(arrays): + assert ff._blocks.blocks[i].loaded + assert_array_equal(ff["arrays"][i], a) def test_unordered_block_index(): @@ -750,9 +755,10 @@ def test_unordered_block_index(): buff.seek(0) buff.seek(0) - with asdf.open(buff) as ff: - assert len(ff._blocks.blocks) == 10 - assert ff._blocks.blocks[1].loaded + with pytest.warns(AsdfWarning, match="Failed to read block index"): + with asdf.open(buff) as ff: + assert 
len(ff._blocks.blocks) == 10 + assert ff._blocks.blocks[1].loaded def test_open_no_memmap(tmp_path): From 9bf92fe3f63f72a08dcf10dac7b6620baf6d6b3f Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 14 Aug 2023 12:31:11 -0400 Subject: [PATCH 151/154] remove unnecessary config context --- asdf/core/_converters/ndarray.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/asdf/core/_converters/ndarray.py b/asdf/core/_converters/ndarray.py index 808a0eaa3..b448a5e32 100644 --- a/asdf/core/_converters/ndarray.py +++ b/asdf/core/_converters/ndarray.py @@ -19,9 +19,8 @@ def to_yaml_tree(self, obj, tag, ctx): import numpy as np from numpy import ma - from asdf import util + from asdf import config, util from asdf._block.options import Options - from asdf.config import config_context from asdf.tags.core.ndarray import NDArrayType, numpy_array_to_list, numpy_dtype_to_asdf_datatype from asdf.tags.core.stream import Stream @@ -66,13 +65,13 @@ def to_yaml_tree(self, obj, tag, ctx): options = ctx._blocks.options.get_options(data) # possibly override options based on config settings - with config_context() as cfg: - if cfg.all_array_storage is not None: - options.storage_type = cfg.all_array_storage - if cfg.all_array_compression != "input": - options.compression = cfg.all_array_compression - options.compression_kwargs = cfg.all_array_compression_kwargs - inline_threshold = cfg.array_inline_threshold + cfg = config.get_config() + if cfg.all_array_storage is not None: + options.storage_type = cfg.all_array_storage + if cfg.all_array_compression != "input": + options.compression = cfg.all_array_compression + options.compression_kwargs = cfg.all_array_compression_kwargs + inline_threshold = cfg.array_inline_threshold if inline_threshold is not None and options.storage_type in ("inline", "internal"): if data.size < inline_threshold: From 688617ada622f7687013c95bec7bdc3d560ea4c8 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 14 Aug 2023 13:15:55 -0400 Subject: [PATCH 152/154] add test_update_compressed_blocks tests that updated compressed block data does not cause a failed update --- asdf/_tests/_regtests/test_1520.py | 34 -------------------------- asdf/_tests/test_array_blocks.py | 39 ++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 34 deletions(-) delete mode 100644 asdf/_tests/_regtests/test_1520.py diff --git a/asdf/_tests/_regtests/test_1520.py b/asdf/_tests/_regtests/test_1520.py deleted file mode 100644 index 587ebf6d7..000000000 --- a/asdf/_tests/_regtests/test_1520.py +++ /dev/null @@ -1,34 +0,0 @@ -import numpy as np - -import asdf - - -def test_failed_update_corrupts_file(tmp_path): - """ - A failed update can corrupt the file - - https://github.com/asdf-format/asdf/issues/1520 - """ - fn = tmp_path / "test.asdf" - n_arrays = 10 - array_size = 10000 - - # make a tree with many arrays that will compress well - af = asdf.AsdfFile() - for i in range(n_arrays): - af[i] = np.zeros(array_size, dtype="uint8") + i - af.set_array_compression(af[i], "zlib") - af.write_to(fn) - - with asdf.open(fn, mode="rw") as af: - # now make the data difficult to compress - for i in range(n_arrays): - assert np.all(af[i] == i) - af[i][:] = np.random.randint(255, size=array_size) - af[i][0] = i + 1 - # this no longer causes update to fail - af.update() - - with asdf.open(fn, mode="r") as af: - for i in range(n_arrays): - assert af[i][0] == i + 1 diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py index de956777e..807dfb294 100644 --- 
a/asdf/_tests/test_array_blocks.py +++ b/asdf/_tests/test_array_blocks.py @@ -447,6 +447,45 @@ def test_update_array_in_place(tmp_path, lazy_load, copy_arrays): assert_array_equal(ff.tree["my_array"], np.ones((64, 64)) * 2) +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_compressed_blocks(tmp_path, lazy_load, copy_arrays): + """ + This test was originally constructed to test an issue where + a failed update left a corrupt file. The issue that resulted in + the failed update (a compressed block growing in size) was fixed + so this is no longer a good test for a failed update. + + See: https://github.com/asdf-format/asdf/issues/1520 + + However, the test does serve to make sure that updating the + contents of compressed blocks in a way that causes them to grow + in size on disk does not result in a failed update. + """ + fn = tmp_path / "test.asdf" + n_arrays = 10 + array_size = 10000 + + # make a tree with many arrays that will compress well + af = asdf.AsdfFile() + for i in range(n_arrays): + af[i] = np.zeros(array_size, dtype="uint8") + i + af.set_array_compression(af[i], "zlib") + af.write_to(fn) + + with asdf.open(fn, lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as af: + # now make the data are difficult to compress + for i in range(n_arrays): + assert np.all(af[i] == i) + af[i][:] = np.random.randint(255, size=array_size) + af[i][0] = i + 1 + af.update() + + with asdf.open(fn, mode="r") as af: + for i in range(n_arrays): + assert af[i][0] == i + 1 + + def test_init_from_asdffile(tmp_path): tmp_path = str(tmp_path) From 1534bd955b3d6783da049b3cbb5e18fed27ddc60 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 14 Aug 2023 14:34:43 -0400 Subject: [PATCH 153/154] add AsdfBlockIndexWarning --- asdf/_block/reader.py | 6 +++--- asdf/_tests/_block/test_reader.py | 8 ++++---- asdf/_tests/test_array_blocks.py | 12 ++++++------ asdf/exceptions.py | 6 ++++++ 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py index 87c451836..60abd9d34 100644 --- a/asdf/_block/reader.py +++ b/asdf/_block/reader.py @@ -2,7 +2,7 @@ import weakref from asdf import constants -from asdf.exceptions import AsdfWarning +from asdf.exceptions import AsdfBlockIndexWarning, AsdfWarning from . 
import io as bio from .exceptions import BlockIndexError @@ -240,7 +240,7 @@ def read_blocks(fd, memmap=False, lazy_load=False, validate_checksums=False, aft except BlockIndexError as e: # failed to read block index, fall back to serial reading msg = f"Failed to read block index, falling back to serial reading: {e!s}" - warnings.warn(msg, AsdfWarning) + warnings.warn(msg, AsdfBlockIndexWarning) fd.seek(starting_offset) return _read_blocks_serially(fd, memmap, lazy_load, validate_checksums, after_magic) # skip magic for each block @@ -257,7 +257,7 @@ def read_blocks(fd, memmap=False, lazy_load=False, validate_checksums=False, aft blocks[index].load() except (OSError, ValueError) as e: msg = f"Invalid block index contents for block {index}, falling back to serial reading: {e!s}" - warnings.warn(msg, AsdfWarning) + warnings.warn(msg, AsdfBlockIndexWarning) fd.seek(starting_offset) return _read_blocks_serially(fd, memmap, lazy_load, after_magic) return blocks diff --git a/asdf/_tests/_block/test_reader.py b/asdf/_tests/_block/test_reader.py index 15a005997..800cc9d98 100644 --- a/asdf/_tests/_block/test_reader.py +++ b/asdf/_tests/_block/test_reader.py @@ -9,7 +9,7 @@ from asdf import constants, generic_io, util from asdf._block import io as bio from asdf._block.reader import read_blocks -from asdf.exceptions import AsdfWarning +from asdf.exceptions import AsdfBlockIndexWarning, AsdfWarning @contextlib.contextmanager @@ -124,11 +124,11 @@ def test_invalid_block_index(tmp_path, invalid_block_index): # when the block index is read, only the first and last blocks # are check, so any other invalid entry should result in failure if invalid_block_index in (0, -1): - with pytest.warns(AsdfWarning, match="Invalid block index contents"): + with pytest.warns(AsdfBlockIndexWarning, match="Invalid block index contents"): check(read_blocks(fd, lazy_load=True)) elif invalid_block_index == "junk": # read_blocks should fall back to reading serially - with pytest.warns(AsdfWarning, match="Failed to read block index"): + with pytest.warns(AsdfBlockIndexWarning, match="Failed to read block index"): check(read_blocks(fd, lazy_load=True)) else: with pytest.raises(ValueError, match="Header size.*"): @@ -153,7 +153,7 @@ def test_invalid_block_in_index_with_valid_magic(tmp_path): bio.write_block_index(fd, block_index) fd.seek(0) - with pytest.warns(AsdfWarning, match="Invalid block index contents"): + with pytest.warns(AsdfBlockIndexWarning, match="Invalid block index contents"): check(read_blocks(fd, lazy_load=True)) diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py index 807dfb294..f2c2b626b 100644 --- a/asdf/_tests/test_array_blocks.py +++ b/asdf/_tests/test_array_blocks.py @@ -10,7 +10,7 @@ import asdf from asdf import constants, generic_io from asdf._block import io as bio -from asdf.exceptions import AsdfWarning +from asdf.exceptions import AsdfBlockIndexWarning RNG = np.random.default_rng(6) @@ -665,7 +665,7 @@ def test_junk_after_index(): # This has junk after the block index, so it # should fall back to reading serially - with pytest.warns(AsdfWarning, match="Failed to read block index"): + with pytest.warns(AsdfBlockIndexWarning, match="Failed to read block index"): with asdf.open(buff) as ff: assert ff._blocks.blocks[1].loaded @@ -687,7 +687,7 @@ def test_short_file_find_block_index(): buff.write(b"0" * (io.DEFAULT_BUFFER_SIZE * 4)) buff.seek(0) - with pytest.warns(AsdfWarning, match="Failed to read block index"): + with pytest.warns(AsdfBlockIndexWarning, match="Failed to 
read block index"): with asdf.open(buff) as ff: assert len(ff._blocks.blocks) == 2 assert ff._blocks.blocks[1].loaded @@ -717,7 +717,7 @@ def test_invalid_block_index_values(): bio.write_block_index(buff, block_index) buff.seek(0) - with pytest.warns(AsdfWarning, match="Invalid block index contents"): + with pytest.warns(AsdfBlockIndexWarning, match="Invalid block index contents"): with asdf.open(buff) as ff: assert len(ff._blocks.blocks) == 10 assert ff._blocks.blocks[1].loaded @@ -762,7 +762,7 @@ def test_invalid_block_index_offset(block_index_index): ) buff.seek(0) - with pytest.warns(AsdfWarning, match="Invalid block index contents"): + with pytest.warns(AsdfBlockIndexWarning, match="Invalid block index contents"): with asdf.open(buff) as ff: assert len(ff._blocks.blocks) == 10 for i, a in enumerate(arrays): @@ -794,7 +794,7 @@ def test_unordered_block_index(): buff.seek(0) buff.seek(0) - with pytest.warns(AsdfWarning, match="Failed to read block index"): + with pytest.warns(AsdfBlockIndexWarning, match="Failed to read block index"): with asdf.open(buff) as ff: assert len(ff._blocks.blocks) == 10 assert ff._blocks.blocks[1].loaded diff --git a/asdf/exceptions.py b/asdf/exceptions.py index cdccc9b1d..752a99b15 100644 --- a/asdf/exceptions.py +++ b/asdf/exceptions.py @@ -28,6 +28,12 @@ class AsdfConversionWarning(AsdfWarning): """ +class AsdfBlockIndexWarning(AsdfWarning): + """ + Warning class to indicate that a file was read with an invalid block index + """ + + class DelimiterNotFoundError(ValueError): """ Indicates that a delimiter was not found when reading or From 8672815906b7185e642b08806b85505f8ce559ea Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 5 Sep 2023 13:52:06 -0400 Subject: [PATCH 154/154] remove sphinx-asdf dev version requirement --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2ce006669..8c06773f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ all = [ "lz4>=0.10", ] docs = [ - "sphinx-asdf @ git+https://github.com/braingram/sphinx-asdf.git@immutable_block_manager", + "sphinx-asdf>=0.2.2", "graphviz", "sphinx-inline-tabs", 'tomli; python_version < "3.11"',