diff --git a/.gitignore b/.gitignore index c47a23cdd..b59a5cded 100644 --- a/.gitignore +++ b/.gitignore @@ -59,3 +59,6 @@ asdf/_version.py # airspeed velocity files .asv + +# hypothesis files +.hypothesis diff --git a/CHANGES.rst b/CHANGES.rst index d1c4525b7..3cc9dd86c 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -20,6 +20,10 @@ The ASDF Standard is at v1.6.0 AsdfFile.write_to and AsdfFile.update kwargs [#1592] - Fix ``AsdfFile.info`` loading all array data [#1572] - Blank out AsdfFile.tree on close [#1575] +- Move ndarray to a converter, add ``convert_unknown_ndarray_subclasses`` + to ``asdf.config.AsdfConfig``, move ``asdf.Stream`` to + ``asdf.tags.core.Stream``, update block storage support for + Converter and update internal block API [#1537] 2.15.1 (2023-08-07) ------------------- diff --git a/asdf/__init__.py b/asdf/__init__.py index 92bbcf0fc..6950a4bdf 100644 --- a/asdf/__init__.py +++ b/asdf/__init__.py @@ -23,6 +23,5 @@ from .asdf import open_asdf as open from .config import config_context, get_config from .exceptions import ValidationError -from .stream import Stream -from .tags.core import IntegerType +from .tags.core import IntegerType, Stream from .tags.core.external_reference import ExternalArrayReference diff --git a/asdf/_block/__init__.py b/asdf/_block/__init__.py new file mode 100644 index 000000000..65f66acb9 --- /dev/null +++ b/asdf/_block/__init__.py @@ -0,0 +1,61 @@ +""" +Submodule for reading and writing ASDF blocks. + +The primary interface to this submodule is ``_block.manager.Manager`` +that in some ways mimics the older ``BlockManager``. An instance +of ``Manager`` will be created by each `asdf.AsdfFile` instance. + +Internally, this submodule is broken up into: + - low-level: + - ``io``: functions for reading and writing blocks + - ``key``: ``Key`` used to implement ``Store`` (see below) + - ``store``: ``Store`` special key-value store for indexing blocks + - medium-level: + - ``reader``: ``ReadBlock`` and ``read_blocks`` + - ``writer``: ``WriteBlock`` and ``write_blocks`` + - ``callback``: ``DataCallback`` for reading block data + - ``external``: ``ExternalBlockCache`` for reading external blocks + - ``options``: ``Options`` controlling block storage + - high-level: + - ``manager``: ``Manager`` and associated classes + + +The low-level ``io`` functions are responsible for reading and writing +bytes compatible with the block format defined in the ASDF standard. +These should be compatible with as wide a variety of file formats as possible +including files that are: + - seekable and non-seekable + - memory mappable + - accessed from a remote server + - stored in memory + - etc + +To help organize ASDF block data the ``key`` and ``store`` submodules +provide a special key-value store, ``Store``. ``Store`` uses ``Key`` +instances to tie the lifetime of values to the lifetime of objects +in the ASDF tree (without keeping references to the objects) and +allows non-hashable objects to be used as keys. See the ``key`` +submodule docstring for more details. One usage of ``Store`` is +for managing ASDF block ``Options``. ``Options`` determine where +and how array data will be written and a single ``Options`` instance +might be associated with several arrays within the ASDF tree +(if the arrays share the same base array). 
By using a ``Key`` generated +with the base array the block ``Options`` can be stored in a ``Store`` +without keeping a reference to the base array and these ``Options`` +will be made unavailable if the base array is garbage collected (so +they are not inapproriately assigned to a new array). + +The medium-level submodules ``reader`` and ``writer`` each define +a helper class and function for reading or writing blocks: + - ``ReadBlock`` and ``WriteBlock`` + - ``read_blocks`` and ``write_blocks`` +These abstract some of the complexity of reading and writing blocks +using the low-level API and are the primary means by which the ``Manager`` +reads and writes ASDF blocks. Reading of external blocks by the ``Manager`` +requires some special handling which is contained in the ``external`` +submodule. + +To allow for lazy-loading of ASDF block data, ``callback`` defines +``DataCallback`` which allows reading block data even after the blocks +have been rearranged following an update-in-place. +""" diff --git a/asdf/_block/callback.py b/asdf/_block/callback.py new file mode 100644 index 000000000..946de0b03 --- /dev/null +++ b/asdf/_block/callback.py @@ -0,0 +1,43 @@ +""" +A `DataCallback` class is implemented here to allow +for reassignment of the index of an ASDF block corresponding +to a callback. + +This is needed so that extension code can generate a callback +during deserialization of an ASDF file that will continue +to be valid even after an `AsdfFile.update` which might +reorder blocks. + +To allow for 'low-level' block access needed for ndarray +`DataCallback` can be called with an optional ``_attr`` +argument to cache data, access the block header and other +operations that we generally do not want to expose to +extension code. +""" +import weakref + + +class DataCallback: + """ + A callable object used to read data from an ASDF block + read from an ASDF file. + """ + + def __init__(self, index, read_blocks): + self._reassign(index, read_blocks) + + def __call__(self, _attr=None): + read_blocks = self._read_blocks_ref() + if read_blocks is None: + msg = "Attempt to read block data from missing block" + raise OSError(msg) + if _attr is None: + return read_blocks[self._index].data + else: + # _attr allows NDArrayType to have low level block access for things + # like reading the header and cached_data + return getattr(read_blocks[self._index], _attr) + + def _reassign(self, index, read_blocks): + self._index = index + self._read_blocks_ref = weakref.ref(read_blocks) diff --git a/asdf/_block/exceptions.py b/asdf/_block/exceptions.py new file mode 100644 index 000000000..f36ef5527 --- /dev/null +++ b/asdf/_block/exceptions.py @@ -0,0 +1,4 @@ +class BlockIndexError(Exception): + """ + An error occurred while reading or parsing an ASDF block index + """ diff --git a/asdf/_block/external.py b/asdf/_block/external.py new file mode 100644 index 000000000..6e5f41620 --- /dev/null +++ b/asdf/_block/external.py @@ -0,0 +1,64 @@ +""" +For external blocks, the previous block management +would cache data opened from external files (to return the +same underlying ndarray if the same external block +was referenced more than once). `ExternalBlockCache` is +used here to allow for the same behavior without requiring +the block manager to have a reference to the `AsdfFile` +(that references the block manager). 
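A minimal sketch of the behaviour described above, using only public API (file names are illustrative; the external file name follows the ``{index:04d}.asdf`` pattern produced by ``relative_uri_for_index`` below)::

    import numpy as np
    import asdf

    arr = np.arange(1000)
    af = asdf.AsdfFile({"data": arr})
    af.set_array_storage(arr, "external")
    af.write_to("main.asdf")          # also writes the sibling file main0000.asdf

    with asdf.open("main.asdf") as af2:
        # accessing the array resolves the external block through ExternalBlockCache,
        # so repeated references to the same external file share one ndarray
        print(af2["data"][:3])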
+""" +import os +import urllib + +import numpy as np + +from asdf import generic_io, util + + +class UseInternalType: + pass + + +UseInternal = UseInternalType() + + +class ExternalBlockCache: + def __init__(self): + self.clear() + + def load(self, base_uri, uri, memmap=False, validate_checksums=False): + key = util.get_base_uri(uri) + if key not in self._cache: + resolved_uri = generic_io.resolve_uri(base_uri, uri) + if resolved_uri == "" or resolved_uri == base_uri: + return UseInternal + + from asdf import open as asdf_open + + with asdf_open( + resolved_uri, "r", lazy_load=False, copy_arrays=True, validate_checksums=validate_checksums + ) as af: + blk = af._blocks.blocks[0] + if memmap and blk.header["compression"] == b"\0\0\0\0": + parsed_url = util.patched_urllib_parse.urlparse(resolved_uri) + if parsed_url.scheme == "file": + # deal with leading slash for windows file:// + filename = urllib.request.url2pathname(parsed_url.path) + arr = np.memmap(filename, np.uint8, "r", blk.data_offset, blk.cached_data.nbytes) + else: + arr = blk.cached_data + else: + arr = blk.cached_data + self._cache[key] = arr + return self._cache[key] + + def clear(self): + self._cache = {} + + +def relative_uri_for_index(uri, index): + # get the os-native separated path for this uri + path = util.patched_urllib_parse.urlparse(uri).path + dirname, filename = os.path.split(path) + filename = os.path.splitext(filename)[0] + f"{index:04d}.asdf" + return filename diff --git a/asdf/_block/io.py b/asdf/_block/io.py new file mode 100644 index 000000000..e0afa7d23 --- /dev/null +++ b/asdf/_block/io.py @@ -0,0 +1,529 @@ +""" +Low-level functions for reading and writing ASDF blocks +and other block related file contents (like the block index). +""" +import hashlib +import io +import os +import struct +import weakref + +import yaml + +from asdf import compression as mcompression +from asdf import constants, util + +from .exceptions import BlockIndexError + +BLOCK_HEADER = util.BinaryStruct( + [ + ("flags", "I"), + ("compression", "4s"), + ("allocated_size", "Q"), + ("used_size", "Q"), + ("data_size", "Q"), + ("checksum", "16s"), + ], +) + + +def calculate_block_checksum(data): + if data.ndim > 1: + data = data.ravel(order="K") + # The following line is safe because we're only using + # the MD5 as a checksum. + m = hashlib.new("md5") # noqa: S324 + m.update(data) + return m.digest() + + +def validate_block_header(header): + """ + Check that they key value pairs in header contain consistent + information about the ASDF block ``compression``, ``flags``, + ``used_size`` and ``data_size`` (otherwise raise an exception). + + Parameters + ---------- + header : dict + ASDF block header information. + + Raises + ------ + ValueError + If the key value pairs in header contain inconsistent information + """ + compression = mcompression.validate(header["compression"]) + if header["flags"] & constants.BLOCK_FLAG_STREAMED: + if compression is not None: + msg = "Compression set on a streamed block." + raise ValueError(msg) + else: + if compression is None and header["used_size"] != header["data_size"]: + msg = "used_size and data_size must be equal when no compression is used." + raise ValueError(msg) + return header + + +def read_block_header(fd, offset=None): + """ + Read an ASDF block header + + Parameters + ---------- + fd : file or generic_io.GenericIO + File to read. + + offset : int, optional + Offset within the file where the start of the ASDF block + header is located. If provided, the file will be seeked prior + to reading. 
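As a rough illustration of the header layout handled here, ``BLOCK_HEADER`` (defined above) packs and unpacks the fixed-size portion of a block header (values below are illustrative)::

    from asdf._block.io import BLOCK_HEADER

    packed = BLOCK_HEADER.pack(
        flags=0,
        compression=b"\0\0\0\0",
        allocated_size=64,
        used_size=64,
        data_size=64,
        checksum=b"\0" * 16,
    )
    header = BLOCK_HEADER.unpack(packed)
    assert header["data_size"] == 64   # round-trips through the binary struct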
+ + Returns + ------- + header : dict + Dictionary containing the read ASDF header as parsed by the + `BLOCK_HEADER` `asdf.util.BinaryStruct`. + + Raises + ------ + ValueError + If the read header is inconsistent (see `validate_block_header`). + """ + if offset is not None: + fd.seek(offset) + + # read the header size + buff = fd.read(2) + header_size = struct.unpack(b">H", buff)[0] + if header_size < BLOCK_HEADER.size: + msg = f"Header size must be >= {BLOCK_HEADER.size}" + raise ValueError(msg) + + header = BLOCK_HEADER.unpack(fd.read(header_size)) + return validate_block_header(header) + + +def read_block_data(fd, header, offset=None, memmap=False): + """ + Read (or memory map) data for an ASDF block. + + Parameters + ---------- + fd : file or generic_io.GenericIO + File to read. + + header : dict + ASDF block header dictionary (as read from `read_block_header`). + + offset : int, optional + Offset within the file where the start of the ASDF block data + is located. If provided, the file will be seeked prior to reading. + + memmap : bool, optional, default False + Memory map the block data using `generic_io.GenericIO.memmap_array`. + A compressed block will never be memmapped and if the file ``fd`` + does not support memmapping the data will not be memmapped (and + no error will be raised). + + Returns + ------- + data : ndarray or memmap + A one-dimensional ndarray of dtype uint8 + """ + + if fd.seekable(): + if offset is not None: + fd.seek(offset) + else: + offset = fd.tell() + + if header["flags"] & constants.BLOCK_FLAG_STREAMED: + used_size = -1 + else: + used_size = header["used_size"] + + # if no compression, just read data + compression = mcompression.validate(header["compression"]) + if compression: + # compressed data will not be memmapped + data = mcompression.decompress(fd, used_size, header["data_size"], compression) + fd.fast_forward(header["allocated_size"] - header["used_size"]) + else: + if memmap and fd.can_memmap(): + data = fd.memmap_array(offset, used_size) + ff_bytes = header["allocated_size"] + else: + data = fd.read_into_array(used_size) + ff_bytes = header["allocated_size"] - header["used_size"] + if (header["flags"] & constants.BLOCK_FLAG_STREAMED) and fd.seekable(): + fd.seek(0, os.SEEK_END) + else: + fd.fast_forward(ff_bytes) + return data + + +def read_block(fd, offset=None, memmap=False, lazy_load=False): + """ + Read a block (header and data) from an ASDF file. + + Parameters + ---------- + fd : file or generic_io.GenericIO + File to read. + + offset : int, optional + Offset within the file where the start of the ASDF block header + is located. If provided, the file will be seeked prior to reading. + Note this is the start of the block header not the start of the + block magic. + + memmap : bool, optional, default False + Memory map the block data see `read_block_data` for more + details. + + lazy_load : bool, optional, default False + Return a callable that when called will read the block data. This + option is ignored for a non-seekable file. + + Returns + ------- + offset : int + The offset within the file where the block was read (equal to offset + argument if it was provided). + + header : dict + ASDF block header as read with `read_block_header`. + + data_offset : int + The offset within the file where the block data begins. + + data : ndarray, memmap or callable + ASDF block data (one-dimensional ndarray of dtype uint8). If lazy_load + (and the file is seekable) data will be a callable that when executed + will seek the file and read the block data. 
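A rough round-trip sketch of these low-level helpers. ``write_block`` (defined later in this module) writes everything except the leading block magic, which matches ``read_block``'s expectation that the file position is already past the magic; this assumes a ``generic_io``-wrapped ``BytesIO`` provides the file interface used here::

    import io

    import numpy as np

    from asdf import generic_io
    from asdf._block import io as bio

    data = np.arange(16, dtype="uint8")
    with generic_io.get_file(io.BytesIO(), mode="rw") as fd:
        bio.write_block(fd, data)               # header + data, no block magic
        fd.seek(0)
        offset, header, data_offset, out = bio.read_block(fd)
        assert bytes(out) == bytes(data)
        assert header["data_size"] == data.nbytes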
+ """ + # expects the fd or offset is past the block magic + if offset is None and fd.seekable(): + offset = fd.tell() + header = read_block_header(fd, offset) + if fd.seekable(): + data_offset = fd.tell() + else: + data_offset = None + if lazy_load and fd.seekable(): + # setup a callback to later load the data + fd_ref = weakref.ref(fd) + + def callback(): + fd = fd_ref() + if fd is None or fd.is_closed(): + msg = "ASDF file has already been closed. Can not get the data." + raise OSError(msg) + position = fd.tell() + data = read_block_data(fd, header, offset=data_offset, memmap=memmap) + fd.seek(position) + return data + + data = callback + if header["flags"] & constants.BLOCK_FLAG_STREAMED: + fd.seek(0, os.SEEK_END) + else: + fd.fast_forward(header["allocated_size"]) + else: + data = read_block_data(fd, header, offset=None, memmap=memmap) + return offset, header, data_offset, data + + +def generate_write_header(data, stream=False, compression_kwargs=None, padding=False, fs_block_size=1, **header_kwargs): + """ + Generate a dict representation of a ASDF block header that can be + used for writing a block. + + Note that if a compression key is provided in ``header_kwargs`` this + function will compress ``data`` to determine the used_size (the + compressed data will be returned via the ``buff`` result to avoid + needing to re-compress the data before writing). + + Parameters + ---------- + + data : ndarray + A one-dimensional ndarray of dtype uint8. + + stream : bool, optional, default False + If True, generate a header for a streamed block. + + compression_kwargs : dict, optional + If provided, these will be passed on to `asdf.compression.compress` + if the data is compressed (see header_kwargs). + + padding : bool or float, optional, default False + If the block should contain additional padding bytes. See the + `asdf.util.calculate_padding` argument ``pad_blocks`` for more + details. + + fs_block_size : int, optional, default 1 + The filesystem block size. See the `asdf.util.calculate_padding` + ``block_size`` argument for more details. + + **header_kwargs : dict, optional + Block header settings that will be read, updated, and used + to generate the binary block header representation by packing + with `BLOCK_HEADER`. + + Returns + ------- + + header : dict + Dictionary representation of an ASDF block header. + + buff : bytes or None + If this block is compressed buff will contained the compressed + representation of data or None if the data is uncompressed. + + padding_bytes: int + The number of padding bytes that must be written after + the block data. 
+ """ + if data.ndim != 1 or data.dtype != "uint8": + msg = "Data must be of ndim==1 and dtype==uint8" + raise ValueError(msg) + if stream: + header_kwargs["flags"] = header_kwargs.get("flags", 0) | constants.BLOCK_FLAG_STREAMED + header_kwargs["data_size"] = 0 + header_kwargs["checksum"] = b"\0" * 16 + else: + header_kwargs["flags"] = 0 + header_kwargs["data_size"] = data.nbytes + header_kwargs["checksum"] = calculate_block_checksum(data) + + header_kwargs["compression"] = mcompression.to_compression_header(header_kwargs.get("compression", None)) + + if header_kwargs["compression"] == b"\0\0\0\0": + used_size = header_kwargs["data_size"] + buff = None + else: + buff = io.BytesIO() + mcompression.compress(buff, data, header_kwargs["compression"], config=compression_kwargs) + used_size = buff.tell() + if stream: + header_kwargs["used_size"] = 0 + header_kwargs["allocated_size"] = 0 + else: + header_kwargs["used_size"] = used_size + padding = util.calculate_padding(used_size, padding, fs_block_size) + header_kwargs["allocated_size"] = header_kwargs.get("allocated_size", used_size + padding) + + if header_kwargs["allocated_size"] < header_kwargs["used_size"]: + msg = ( + f"Block used size {header_kwargs['used_size']} larger than " + f"allocated size {header_kwargs['allocated_size']}", + ) + raise RuntimeError(msg) + padding_bytes = header_kwargs["allocated_size"] - header_kwargs["used_size"] + return header_kwargs, buff, padding_bytes + + +def write_block(fd, data, offset=None, stream=False, compression_kwargs=None, padding=False, **header_kwargs): + """ + Write an ASDF block. + + Parameters + ---------- + fd : file or generic_io.GenericIO + File to write to. + + offset : int, optional + If provided, seek to this offset before writing. + + stream : bool, optional, default False + If True, write this as a streamed block. + + compression_kwargs : dict, optional + If block is compressed, use these additional arguments during + compression. See `generate_write_header`. + + padding : bool, optional, default False + Optionally pad the block data. See `generate_write_header`. + + **header_kwargs : dict + Block header settings. See `generate_write_header`. + + Returns + ------- + + header : dict + The ASDF block header as unpacked from the `BLOCK_HEADER` used + for writing. + """ + header_dict, buff, padding_bytes = generate_write_header( + data, stream, compression_kwargs, padding, fd.block_size, **header_kwargs + ) + header_bytes = BLOCK_HEADER.pack(**header_dict) + + if offset is not None: + if fd.seekable(): + fd.seek(offset) + else: + msg = "write_block received offset for non-seekable file" + raise ValueError(msg) + fd.write(struct.pack(b">H", len(header_bytes))) + fd.write(header_bytes) + if buff is None: # data is uncompressed + fd.write_array(data) + else: + fd.write(buff.getvalue()) + fd.fast_forward(padding_bytes) + return header_dict + + +def _candidate_offsets(min_offset, max_offset, block_size): + offset = (max_offset // block_size) * block_size + if offset == max_offset: + offset -= block_size + while offset > min_offset: + yield offset + offset -= block_size + if offset <= min_offset: + yield min_offset + + +def find_block_index(fd, min_offset=None, max_offset=None): + """ + Find the location of an ASDF block index within a seekable file. + + Searching will begin at the end of the file (or max_offset + if it is provided). 
+ + Parameters + ---------- + + fd : file or generic_io.GenericIO + A seekable file that will be searched to try and find + the start of an ASDF block index within the file. + + min_offset : int, optional + The minimum search offset. A block index will not be + found before this point. + + max_offset : int, optional + The maximum search offset. A block index will not be + found after this point. + + Returns + ------- + + offset : int or None + Index of start of ASDF block index. This is the location of the + ASDF block index header. + + """ + if min_offset is None: + min_offset = fd.tell() + if max_offset is None: + fd.seek(0, os.SEEK_END) + max_offset = fd.tell() + block_size = fd.block_size + block_index_offset = None + buff = b"" + pattern = constants.INDEX_HEADER + for offset in _candidate_offsets(min_offset, max_offset, block_size): + fd.seek(offset) + buff = fd.read(block_size) + buff + index = buff.find(pattern) + if index != -1: + block_index_offset = offset + index + if block_index_offset >= max_offset: + return None + break + buff = buff[: len(pattern)] + return block_index_offset + + +def read_block_index(fd, offset=None): + """ + Read an ASDF block index from a file. + + Parameters + ---------- + + fd : file or generic_io.GenericIO + File to read the block index from. + + offset : int, optional + Offset within the file where the block index starts + (the start of the ASDF block index header). If not provided + reading will start at the current position of the file + pointer. See `find_block_index` to locate the block + index prior to calling this function. + + Returns + ------- + + block_index : list of ints + A list of ASDF block offsets read and parsed from the + block index. + + Raises + ------ + BlockIndexError + The data read from the file did not contain a valid + block index. + """ + if offset is not None: + fd.seek(offset) + buff = fd.read(len(constants.INDEX_HEADER)) + if buff != constants.INDEX_HEADER: + msg = "Failed to read block index header at offset {offset}" + raise BlockIndexError(msg) + try: + block_index = yaml.load(fd.read(-1), yaml.SafeLoader) + except yaml.error.YAMLError: + raise BlockIndexError("Failed to parse block index as yaml") + if ( + not isinstance(block_index, list) + or any(not isinstance(v, int) for v in block_index) + or block_index != sorted(block_index) + ): + raise BlockIndexError("Invalid block index") + return block_index + + +def write_block_index(fd, offsets, offset=None, yaml_version=None): + """ + Write a list of ASDF block offsets to a file in the form + of an ASDF block index. + + Parameters + ---------- + fd : file or generic_io.GenericIO + File to write to. + + offsets : list of ints + List of byte offsets (from the start of the file) where + ASDF blocks are located. + + offset : int, optional + If provided, seek to this offset before writing. + + yaml_version : tuple, optional, default (1, 1) + YAML version to use when writing the block index. This + will be passed to ``yaml.dump`` as the version argument. 
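A small sketch tying the three index helpers together (again assuming a ``generic_io``-wrapped ``BytesIO``; the offsets are illustrative)::

    import io

    from asdf import generic_io
    from asdf._block import io as bio

    offsets = [512, 2048, 4096]
    with generic_io.get_file(io.BytesIO(), mode="rw") as fd:
        bio.write_block_index(fd, offsets)
        end = fd.tell()
        index_offset = bio.find_block_index(fd, 0, end)
        assert bio.read_block_index(fd, index_offset) == offsets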
+ """ + if yaml_version is None: + yaml_version = (1, 1) + if offset is not None: + fd.seek(offset) + fd.write(constants.INDEX_HEADER) + fd.write(b"\n") + yaml.dump( + offsets, + stream=fd, + Dumper=yaml.SafeDumper, + explicit_start=True, + explicit_end=True, + allow_unicode=True, + encoding="utf-8", + version=yaml_version, + ) diff --git a/asdf/_block/key.py b/asdf/_block/key.py new file mode 100644 index 000000000..fce97d14e --- /dev/null +++ b/asdf/_block/key.py @@ -0,0 +1,82 @@ +""" +A hashable Key class that provides a means for tracking +the lifetime of objects to associate objects with +blocks, options and other parts of an asdf file. + +This Key is meant to replace uses of id(obj) which in +previous code was used to store settings (like block +array storage). The use of id was problematic as +an object might be deallocated (if it is removed from +the tree and all other references freed) and a new +object of the same type might occupy the same location +in memory and result in the same id. This could result +in options originally associated with the first object +being incorrectly assigned to the new object. + +At it's core, Key, uses a weak reference to the object +which can be checked to see if the object is still +in memory. + +Instances of this class will be provided to extension +code (see ``SerializationContext.generate_block_key``) +as Converters will need to resupply these keys +on rewrites (updates) to allow asdf to reassociate +objects and blocks. To discourage modifications +of these Key instances all methods and attributes +are private. +""" + +import weakref + + +class Key: + _next = 0 + + @classmethod + def _next_key(cls): + key = cls._next + cls._next += 1 + return key + + def __init__(self, obj=None, _key=None): + if _key is None: + _key = Key._next_key() + self._key = _key + self._ref = None + if obj is not None: + self._assign_object(obj) + + def _is_valid(self): + if self._ref is None: + return False + r = self._ref() + if r is None: + return False + return True + + def __hash__(self): + return self._key + + def _assign_object(self, obj): + self._ref = weakref.ref(obj) + + def _matches_object(self, obj): + if self._ref is None: + return False + r = self._ref() + if r is None: + return False + return r is obj + + def __eq__(self, other): + if not isinstance(other, Key): + return NotImplemented + if self._key != other._key: + return False + if not self._is_valid(): + return False + return other._matches_object(self._ref()) + + def __copy__(self): + obj = self._ref if self._ref is None else self._ref() + return type(self)(obj, self._key) diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py new file mode 100644 index 000000000..300e5904f --- /dev/null +++ b/asdf/_block/manager.py @@ -0,0 +1,684 @@ +import collections +import contextlib +import copy + +from asdf import config, constants, generic_io, util + +from . import external, reader, store, writer +from . import io as bio +from .callback import DataCallback +from .key import Key as BlockKey +from .options import Options + + +class ReadBlocks(collections.UserList): + """ + A list of ReadBlock instances. + + A simple list can't be used as other code will need + to genearate a weakref to instances of this class + (and it is not possible to generate a weakref to a list). 
+ """ + + pass + + +class WriteBlocks(collections.abc.Sequence): + """ + A collection of ``WriteBlock`` instances that can be accessed by: + - numerical index (see ``collections.abc.Sequence``) + - the object or objects in the tree that created or + are associated with this block + - the block data + Access by object and data is via a Store which generates + Keys to allow use of non-hashable objects (and to not hold + a reference to the block data). + """ + + def __init__(self, blocks=None): + if blocks is None: + blocks = [] + self._blocks = blocks + + # both stores contain values that are indices of + # WriteBlock instances in _blocks + self._data_store = store.Store() + self._object_store = store.Store() + + def __getitem__(self, index): + return self._blocks.__getitem__(index) + + def __len__(self): + return self._blocks.__len__() + + def index_for_data(self, data): + return self._data_store.lookup_by_object(data) + + def assign_object_to_index(self, obj, index): + self._object_store.assign_object(obj, index) + + def object_keys_for_index(self, index): + yield from self._object_store.keys_for_value(index) + + def append_block(self, blk, obj): + """ + Append a ``WriteBlock`` instance to this collection + assign an object, obj, to the block and return + the index of the block within the collection. + """ + index = len(self._blocks) + self._blocks.append(blk) + + # assign the block data to this block to allow + # fast lookup of blocks based on data + self._data_store.assign_object(blk._data, index) + + # assign the object that created/uses this block + self._object_store.assign_object(obj, index) + return index + + +class OptionsStore(store.Store): + """ + A ``Store`` of ``Options`` that can be accessed by the base + array that corresponds to a block. A ``Store`` is used + to avoid holding references to the array data + (see ``asdf._block.store.Store``). + + When ``Options`` are not found within the ``Store``, the + ``OptionsStore`` will look for any available matching + ``ReadBlock`` to determine default Options. + """ + + def __init__(self, read_blocks): + super().__init__() + # ReadBlocks are needed to look up default options + self._read_blocks = read_blocks + + def has_options(self, array): + """ + Check of Options have been defined for this array + without falling back to generating Options from + a ReadBlock. + + Parameters + ---------- + array : ndarray + The base of this array (see `asdf.util.get_array_base`) will + be used to lookup any Options in the Store. + + Returns + ------- + has_options : bool + True if Options were previously defined for this array. + """ + base = util.get_array_base(array) + return self.lookup_by_object(base) is not None + + def get_options_from_block(self, array): + """ + Get Options for some array using only settings read from a + corresponding ReadBlock (one that shares the same base array). + Any Options defined using previous calls to set_options will + be ignored (use ``get_options`` if you would like these previously + set options to be considered). + + Parameters + ---------- + array : ndarray + The base of this array (see `asdf.util.get_array_base`) will + be used to lookup a corresponding ReadBlock. + + Returns + ------- + options : Options or None + Options initialized from settings read from a ReadBlock + or None if no corresponding block was found. 
+ """ + base = util.get_array_base(array) + # look up by block with matching _data + for block in self._read_blocks: + if block._cached_data is base or block._data is base: + # init options + if block.header["flags"] & constants.BLOCK_FLAG_STREAMED: + storage_type = "streamed" + else: + storage_type = "internal" + options = Options(storage_type, block.header["compression"]) + return options + return None + + def get_options(self, array): + """ + Get Options for some array using either previously defined + options (as set by ``set_options``) or settings read from a + corresponding ReadBlock (one that shares the same base array). + + Note that if no options are found in the Store and options + are found from a ReadBlock the resulting Options will be added + to the Store. + + Parameters + ---------- + array : ndarray + The base of this array (see `asdf.util.get_array_base`) will + be used to lookup any Options in the Store. + + Returns + ------- + options : Options or None + Options read from the Store or ReadBlocks or None if + no options were found. + """ + base = util.get_array_base(array) + options = self.lookup_by_object(base) + if options is None: + options = self.get_options_from_block(base) + if options is not None: + self.set_options(base, options) + if options is None: + options = Options() + self.set_options(base, options) + return options + + def set_options(self, array, options): + """ + Set Options for an array. + + Parameters + ---------- + array : ndarray + The base of this array (see `asdf.util.get_array_base`) will + be used to add options to the Store. + + options : Options + The Options to add to the Store for this array. + + Raises + ------ + ValueError + If more than one block is set as a streamed block. + """ + if options.storage_type == "streamed": + for oid, by_key in self._by_id.items(): + for key, opt in by_key.items(): + if not key._is_valid(): + continue + if opt.storage_type == "streamed": + if opt is options: + continue + msg = "Can not add second streaming block" + raise ValueError(msg) + base = util.get_array_base(array) + self.assign_object(base, options) + + def get_output_compressions(self): + """ + Get all output compression types used for this Store of + Options. + + Returns + ------- + compressions : list of bytes + List of 4 byte compression labels used for this OptionsStore. + """ + compressions = set() + cfg = config.get_config() + if cfg.all_array_compression == "input": + for blk in self._read_blocks: + if blk.header["compression"]: + compressions.add(blk.header["compression"]) + else: + compressions.add(cfg.all_array_compression) + for _, by_key in self._by_id.items(): + for key, opts in by_key.items(): + if not key._is_valid(): + continue + if opts.compression: + compressions.add(opts.compression) + return compressions + + +class Manager: + """ + ``Manager`` for reading, writing and storing options for ASDF blocks. + + This class does the heavy lifting of allowing ``asdf.AsdfFile`` instances + to control ASDF blocks. It is responsible for reading and writing blocks + primarily to maintain some consistency with the previous BlockManager. + + Block ``Options`` control the compression and type of storage for an + ASDF block (see `asdf.AsdfFile.set_array_storage`, + `asdf.AsdfFile.set_array_compression` + `asdf.AsdfFile.set_array_compression` for relevant usage and information). + These ``Options`` instances are stored and retrieved using the base + of the array containing the data for an ASDF block. 
This allows arrays + that share the same base array (ie views of the same array) to use + the same ASDF block. + + Reading blocks occurs through use of ``Manager.read`` which will + create ``ReadBlock`` instances for each read ASDF block. These ``ReadBlock`` + will be used as the source for default ``Options`` for each block + and ASDF block data can be read using ``DataCallback`` instances. + These callbacks are used (instead of just accessing blocks by index) + to allow block reorganization during ``update``.(Note that reading + of external blocks is special as these are not stored within the + block section of the ASDF file. These must be explicitly loaded + using ``Manager._load_external``). + + Writing ASDF blocks occurs through use of ``Manager.write`` which will + take any queued ``WriteBlocks`` (created via ``Manager.make_write_block`` + and ``Manager.set_streamed_write_block``) and write them out to a file. + This writing must occur within a ``Manager.write_context`` to allow the + ``Manager`` to reset any ``Options`` changes that occur during write + and to clean up the write queue. + + Update-in-place occurs through use of ``Manager.update`` which, like + ``Manager.write`` must occur within a ``Manager.write_context``. Following + a ``Manager.update`` the ``ReadBlock`` instances will be replaced with + the newly written ASDF blocks and any ``DataCallbacks`` will be updated + to reference the appropriate new ``ReadBlock``. + """ + + def __init__(self, read_blocks=None, uri=None, lazy_load=False, memmap=False, validate_checksums=False): + if read_blocks is None: + read_blocks = ReadBlocks([]) + self.options = OptionsStore(read_blocks) + + self._blocks = read_blocks + self._external_block_cache = external.ExternalBlockCache() + self._data_callbacks = store.Store() + + self._write_blocks = WriteBlocks() + self._external_write_blocks = [] + self._streamed_write_block = None + self._streamed_obj_keys = set() + self._write_fd = None + + # store the uri of the ASDF file here so that the Manager can + # resolve and load external blocks without requiring a reference + # to the AsdfFile instance + self._uri = uri + + # general block settings + self._lazy_load = lazy_load + self._memmap = memmap + self._validate_checksums = validate_checksums + + def close(self): + self._external_block_cache.clear() + self._clear_write() + for blk in self.blocks: + blk.close() + self.options = OptionsStore(self.blocks) + + @property + def blocks(self): + """ + Get any ReadBlocks that were read from an ASDF file + + Returns + ------- + read_blocks : list of ReadBlock + List of ReadBlock instances created during a call to read + or update. + """ + return self._blocks + + @blocks.setter + def blocks(self, new_blocks): + if not isinstance(new_blocks, ReadBlocks): + new_blocks = ReadBlocks(new_blocks) + self._blocks = new_blocks + # we propagate these blocks to options so that + # options lookups can fallback to the new read blocks + self.options._read_blocks = new_blocks + + def read(self, fd, after_magic=False): + """ + Read blocks from an ASDF file and update the manager read_blocks. + + Parameters + ---------- + fd : file or generic_io.GenericIO + File to read from. Reading starts at the current file position. + + after_magic : bool, optional, default False + If True, the file pointer is past the block magic bytes of the + first block. 
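The flow described above is normally driven through the public API. A minimal sketch (file name illustrative) of options keyed by the base array and of an update-in-place::

    import numpy as np

    import asdf

    base = np.zeros(1000, dtype="float32")
    af = asdf.AsdfFile({"full": base, "head": base[:10]})   # two views of one base array
    af.set_array_compression(base, "zlib")                  # one Options entry, one shared block
    af.write_to("example.asdf")

    with asdf.open("example.asdf", mode="rw") as af2:
        af2["extra"] = np.arange(10)
        af2.update()   # blocks may move; DataCallback indices are reassigned afterwards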
+ """ + self.blocks = reader.read_blocks( + fd, self._memmap, self._lazy_load, self._validate_checksums, after_magic=after_magic + ) + + def _load_external(self, uri): + value = self._external_block_cache.load(self._uri, uri, self._memmap, self._validate_checksums) + if value is external.UseInternal: + return self.blocks[0].data + return value + + def _clear_write(self): + self._write_blocks = WriteBlocks() + self._external_write_blocks = [] + self._streamed_write_block = None + self._streamed_obj_keys = set() + self._write_fd = None + + def _write_external_blocks(self): + from asdf import AsdfFile + + if self._write_fd is None or self._write_fd.uri is None: + msg = "Can't write external blocks, since URI of main file is unknown." + raise ValueError(msg) + + for blk in self._external_write_blocks: + uri = generic_io.resolve_uri(self._write_fd.uri, blk._uri) + af = AsdfFile() + with generic_io.get_file(uri, mode="w") as f: + af.write_to(f, include_block_index=False) + writer.write_blocks(f, [blk]) + + def make_write_block(self, data, options, obj): + """ + Make a WriteBlock with data and options and + associate it with an object (obj). + + Parameters + ---------- + data : npdarray or callable + Data to be written to an ASDF block. Can be provided as + a callable function that when evaluated will return the + data. + options : Options or None + Options instance used to define the ASDF block compression + and storage type. If None, a new Options instance will + be created. + obj : object + An object in the ASDF tree that will be associated + with the new WriteBlock so that `AsdfFile.update` can + map newly created blocks to blocks read from the original + file. + + Returns + ------- + block_source : int or str + The relative uri (str) if an external block was created + or the index of the block (int) for an internal block. + + Raises + ------ + ValueError + If a external block was created without a URI for the main + file. + """ + if options is None: + options = Options() + if options.storage_type == "external": + for index, blk in enumerate(self._external_write_blocks): + if blk._data is data: + # this external uri is already ready to go + return blk._uri + # need to set up new external block + index = len(self._external_write_blocks) + blk = writer.WriteBlock(data, options.compression, options.compression_kwargs) + if self._write_fd is not None: + base_uri = self._write_fd.uri or self._uri + else: + base_uri = self._uri + if base_uri is None: + msg = "Can't write external blocks, since URI of main file is unknown." + raise ValueError(msg) + blk._uri = external.relative_uri_for_index(base_uri, index) + self._external_write_blocks.append(blk) + return blk._uri + # first, look for an existing block + index = self._write_blocks.index_for_data(data) + if index is not None: + self._write_blocks.assign_object_to_index(obj, index) + return index + # if no block is found, make a new block + blk = writer.WriteBlock(data, options.compression, options.compression_kwargs) + index = self._write_blocks.append_block(blk, obj) + return index + + def set_streamed_write_block(self, data, obj): + """ + Create a WriteBlock that will be written as an ASDF + streamed block. + + Parameters + ---------- + data : ndarray or callable + Data to be written to an ASDF block. Can be provided as + a callable function that when evaluated will return the + data. 
+ obj : object + An object in the ASDF tree that will be associated + with the new WriteBlock so that `AsdfFile.update` can + map newly created blocks to blocks read from the original + file. + """ + if self._streamed_write_block is not None and data is not self._streamed_write_block.data: + msg = "Can not add second streaming block" + raise ValueError(msg) + if self._streamed_write_block is None: + self._streamed_write_block = writer.WriteBlock(data) + self._streamed_obj_keys.add(BlockKey(obj)) + + def _get_data_callback(self, index): + return DataCallback(index, self.blocks) + + def _set_array_storage(self, data, storage): + options = self.options.get_options(data) + options.storage_type = storage + self.options.set_options(data, options) + + def _get_array_storage(self, data): + return self.options.get_options(data).storage_type + + def _set_array_compression(self, arr, compression, **compression_kwargs): + # if this is input compression but we already have defined options + # we need to re-lookup the options based off the block + if compression == "input" and self.options.has_options(arr): + from_block_options = self.options.get_options_from_block(arr) + if from_block_options is not None: + compression = from_block_options.compression + options = self.options.get_options(arr) + options.compression = compression + options.compression_kwargs = compression_kwargs + + def _get_array_compression(self, arr): + return self.options.get_options(arr).compression + + def _get_array_compression_kwargs(self, arr): + return self.options.get_options(arr).compression_kwargs + + def get_output_compressions(self): + return self.options.get_output_compressions() + + @contextlib.contextmanager + def options_context(self): + """ + Context manager that copies block options on + entrance and restores the options when exited. + """ + previous_options = copy.deepcopy(self.options) + yield + self.options = previous_options + self.options._read_blocks = self.blocks + + @contextlib.contextmanager + def write_context(self, fd, copy_options=True): + """ + Context manager that copies block options on + entrance and restores the options when exited. + + Parameters + ---------- + fd : file or generic_io.GenericIO + File to be written to. This is required on + entrance to this context so that any external + blocks can resolve relative uris. + + copy_options : bool, optional, default True + Copy options on entrance and restore them on + exit (See `options_context`). + """ + self._clear_write() + self._write_fd = fd + if copy_options: + with self.options_context(): + yield + else: + yield + self._clear_write() + + def write(self, pad_blocks, include_block_index): + """ + Write blocks that were set up during the current + `write_context`. + + Parameters + ---------- + pad_blocks : bool, None or float + If False, add no padding bytes between blocks. If True + add some default amount of padding. If a float, add + a number of padding bytes based off a ratio of the data + size. + + include_block_index : bool + If True, include a block index at the end of the file. + If a streamed_block is provided (or the file is not + seekable) no block index will be written. + + Raises + ------ + OSError + If called outside a `write_context`. 
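Streamed write blocks are normally created through ``asdf.tags.core.Stream`` (relocated from ``asdf.Stream`` in this change); a short public-API sketch (file name illustrative)::

    import numpy as np

    import asdf
    from asdf.tags.core import Stream

    af = asdf.AsdfFile()
    af["data"] = Stream([8], np.float64)     # row shape; the number of rows is open-ended

    with open("streamed.asdf", "wb") as fd:
        af.write_to(fd)
        for row in range(3):                 # append rows after the tree and fixed blocks
            fd.write(np.full(8, row, dtype=np.float64).tobytes())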
+ """ + if self._write_fd is None: + msg = "write called outside of valid write_context" + raise OSError(msg) + if len(self._write_blocks) or self._streamed_write_block: + writer.write_blocks( + self._write_fd, + self._write_blocks, + pad_blocks, + streamed_block=self._streamed_write_block, + write_index=include_block_index, + ) + if len(self._external_write_blocks): + self._write_external_blocks() + + def update(self, new_tree_size, pad_blocks, include_block_index): + """ + Perform an update-in-place of ASDF blocks set up during + a `write_context`. + + Parameters + ---------- + new_tree_size : int + Size (in bytes) of the serialized ASDF tree (and any + header bytes) that will be written at the start of the + file being updated. + + pad_blocks : bool, None or float + If False, add no padding bytes between blocks. If True + add some default amount of padding. If a float, add + a number of padding bytes based off a ratio of the data + size. + + include_block_index : bool + If True, include a block index at the end of the file. + If a streamed_block is provided (or the file is not + seekable) no block index will be written. + + + Raises + ------ + OSError + If called outside a `write_context`. + """ + if self._write_fd is None: + msg = "update called outside of valid write_context" + raise OSError(msg) + # find where to start writing blocks (either end of new tree or end of last 'free' block) + last_block = None + for blk in self.blocks[::-1]: + if not blk.memmap and (blk._cached_data is not None or not callable(blk._data)): + continue + last_block = blk + break + if last_block is None: + new_block_start = new_tree_size + else: + new_block_start = max( + last_block.data_offset + last_block.header["allocated_size"], + new_tree_size, + ) + + if len(self._external_write_blocks): + self._write_external_blocks() + + # do we have any blocks to write? 
+ if len(self._write_blocks) or self._streamed_write_block: + self._write_fd.seek(new_block_start) + offsets, headers = writer.write_blocks( + self._write_fd, + self._write_blocks, + pad_blocks, + streamed_block=self._streamed_write_block, + write_index=False, # don't write an index as we will modify the offsets + ) + new_block_end = self._write_fd.tell() + + # move blocks to start in increments of block_size + n_bytes = new_block_end - new_block_start + src, dst = new_block_start, new_tree_size + block_size = self._write_fd.block_size + while n_bytes > 0: + self._write_fd.seek(src) + bs = self._write_fd.read(min(n_bytes, block_size)) + self._write_fd.seek(dst) + self._write_fd.write(bs) + n = len(bs) + n_bytes -= n + src += n + dst += n + + # update offset to point at correct locations + offsets = [o - (new_block_start - new_tree_size) for o in offsets] + + # write index if no streamed block + if include_block_index and self._streamed_write_block is None: + bio.write_block_index(self._write_fd, offsets) + + # map new blocks to old blocks + new_read_blocks = ReadBlocks() + for i, (offset, header) in enumerate(zip(offsets, headers)): + # find all objects that assigned themselves to the write block at index i + if i == len(self._write_blocks): # this is a streamed block + obj_keys = self._streamed_obj_keys + else: + # find object associated with this write block + obj_keys = set(self._write_blocks.object_keys_for_index(i)) + + # we have to be lazy here as any current memmap is invalid + new_read_block = reader.ReadBlock(offset + 4, self._write_fd, self._memmap, True, False, header=header) + new_read_blocks.append(new_read_block) + new_index = len(new_read_blocks) - 1 + + # update all callbacks + for obj_key in obj_keys: + obj = obj_key._ref() + if obj is None: + # this object no longer exists so don't both assigning it + continue + + # update data callbacks to point to new block + cb = self._data_callbacks.lookup_by_object(obj) + if cb is not None: + cb._reassign(new_index, new_read_blocks) + + # update read blocks to reflect new state + self.blocks = new_read_blocks diff --git a/asdf/_block/options.py b/asdf/_block/options.py new file mode 100644 index 000000000..1ced09478 --- /dev/null +++ b/asdf/_block/options.py @@ -0,0 +1,65 @@ +from asdf import compression as mcompression +from asdf.config import get_config + + +class Options: + """ + Storage and compression options useful when reading or writing ASDF blocks. 
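A short sketch of how these options behave (internal API, shown for illustration only)::

    from asdf._block.options import Options

    opts = Options()                     # storage defaults from asdf.get_config().all_array_storage
    opts = Options("internal", "zlib")   # zlib-compressed internal block
    opts.storage_type = "external"

    try:
        opts.compression = "input"       # rejected here; only concrete codecs are accepted
    except ValueError:
        pass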
+ """ + + def __init__(self, storage_type=None, compression_type=None, compression_kwargs=None): + if storage_type is None: + storage_type = get_config().all_array_storage or "internal" + self._storage_type = None + self._compression = None + self._compression_kwargs = None + + # set via setters + self.compression_kwargs = compression_kwargs + self.compression = compression_type + self.storage_type = storage_type + + @property + def storage_type(self): + return self._storage_type + + @storage_type.setter + def storage_type(self, storage_type): + if storage_type not in ["internal", "external", "streamed", "inline"]: + msg = "array_storage must be one of 'internal', 'external', 'streamed' or 'inline'" + raise ValueError(msg) + self._storage_type = storage_type + + @property + def compression(self): + return self._compression + + @compression.setter + def compression(self, compression): + msg = f"Invalid compression type: {compression}" + if compression == "input": + # "input" compression will validate as the ASDF compression module made + # some assumptions about availability of information (that the input block + # is known). The Options here do not have the same assumption. + # For a block read from a file, it's options will be initialized with + # the compression value read from the block header so we should never + # see 'input' at this point. + raise ValueError(msg) + try: + compression = mcompression.validate(compression) + except ValueError: + raise ValueError(msg) + self._compression = compression + + @property + def compression_kwargs(self): + return self._compression_kwargs + + @compression_kwargs.setter + def compression_kwargs(self, kwargs): + if not kwargs: + kwargs = {} + self._compression_kwargs = kwargs + + def __copy__(self): + return type(self)(self._storage_type, self._compression, self._compression_kwargs) diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py new file mode 100644 index 000000000..60abd9d34 --- /dev/null +++ b/asdf/_block/reader.py @@ -0,0 +1,263 @@ +import warnings +import weakref + +from asdf import constants +from asdf.exceptions import AsdfBlockIndexWarning, AsdfWarning + +from . import io as bio +from .exceptions import BlockIndexError + + +class ReadBlock: + """ + Represents an ASDF block read from a file. + """ + + def __init__(self, offset, fd, memmap, lazy_load, validate_checksum, header=None, data_offset=None, data=None): + self.offset = offset # after block magic bytes + self._fd = weakref.ref(fd) + self._header = header + self.data_offset = data_offset + self._data = data + self._cached_data = None + self.memmap = memmap + self.lazy_load = lazy_load + self.validate_checksum = validate_checksum + if not lazy_load: + self.load() + + def close(self): + self._cached_data = None + + @property + def loaded(self): + return self._data is not None + + def load(self): + """ + Load the block data (if it is not already loaded). + + Raises + ------ + OSError + If attempting to load from a closed file. + """ + if self.loaded: + return + fd = self._fd() + if fd is None or fd.is_closed(): + msg = "Attempt to load block from closed file" + raise OSError(msg) + position = fd.tell() + _, self._header, self.data_offset, self._data = bio.read_block( + fd, offset=self.offset, memmap=self.memmap, lazy_load=self.lazy_load + ) + fd.seek(position) + + @property + def data(self): + """ + Read, parse and return data for an ASDF block. 
+ + Returns + ------- + data : ndarray + A one-dimensional ndarray of dypte uint8 read from an ASDF block + + Raises + ------ + ValueError + If the header checksum does not match the checksum of the data + and validate_checksums was set to True. + """ + if not self.loaded: + self.load() + if callable(self._data): + data = self._data() + else: + data = self._data + if self.validate_checksum: + checksum = bio.calculate_block_checksum(data) + if not self._header["flags"] & constants.BLOCK_FLAG_STREAMED and checksum != self._header["checksum"]: + msg = f"Block at {self.offset} does not match given checksum" + raise ValueError(msg) + # only validate data the first time it's read + self.validate_checksum = False + return data + + @property + def cached_data(self): + """ + Return cached data for an ASDF block. + + The first time this is called it may read data from the file + (if lazy loaded). Subsequent calls will return the same + ndarray. + """ + if self._cached_data is None: + self._cached_data = self.data + return self._cached_data + + @property + def header(self): + """ + Get the block header. For a lazy loaded block the first time + this is called the header will be read from the file and + cached. + + Returns + ------- + header : dict + Dictionary containing the read ASDF header. + """ + if not self.loaded: + self.load() + return self._header + + +def _read_blocks_serially(fd, memmap=False, lazy_load=False, validate_checksums=False, after_magic=False): + """ + Read blocks serially from a file without looking for a block index. + + For parameter and return value descriptions see `read_blocks`. + """ + blocks = [] + buff = b"" + magic_len = len(constants.BLOCK_MAGIC) + while True: + # the expectation is that this will begin PRIOR to the block magic + # read 4 bytes + if not after_magic: + buff += fd.read(magic_len - len(buff)) + if len(buff) == 0: + # we are done, there are no more blocks and no index + break + elif len(buff) < magic_len: + # we have less than magic_len bytes, this is likely an error + # in the input file/bytes + if all([b == 0 for b in buff]): + # if these are all 0, assume this was a 'truncated' file + # so don't issue a warning + break + # if these are non-0 bytes issue a warning that the file + # is likely corrupt + msg = f"Read invalid bytes {buff!r} after blocks, your file might be corrupt" + warnings.warn(msg, AsdfWarning) + break + + if buff == constants.INDEX_HEADER[:magic_len]: + # we hit the block index, which is not useful here + break + + if after_magic or buff == constants.BLOCK_MAGIC: + # this is another block + offset, header, data_offset, data = bio.read_block(fd, memmap=memmap, lazy_load=lazy_load) + blocks.append( + ReadBlock( + offset, fd, memmap, lazy_load, validate_checksums, header=header, data_offset=data_offset, data=data + ) + ) + if blocks[-1].header["flags"] & constants.BLOCK_FLAG_STREAMED: + # a file can only have 1 streamed block and it must be at the end so we + # can stop looking for more blocks + break + buff = b"" + after_magic = False + else: + if len(blocks) or buff[0] != 0: + # if this is not the first block or we haven't found any + # blocks and the first byte is non-zero + msg = f"Invalid bytes while reading blocks {buff}" + raise OSError(msg) + # this is the first block, allow empty bytes before block + buff = buff.strip(b"\0") + return blocks + + +def read_blocks(fd, memmap=False, lazy_load=False, validate_checksums=False, after_magic=False): + """ + Read a sequence of ASDF blocks from a file. 
+ + If the file is seekable (and lazy_load is False) an attempt will + made to find, read and parse a block index. If this fails, the + blocks will be read serially. If parsing the block index + succeeds, the first first and last blocks will be read (to + confirm that those portions of the index are correct). All + other blocks will not be read until they are accessed. + + Parameters + ---------- + fd : file or generic_io.GenericIO + File to read. Reading will start at the current position. + + memmap : bool, optional, default False + If true, memory map block data. + + lazy_load : bool, optional, default False + If true, block data will be a callable that when executed + will return the block data. See the ``lazy_load`` argument + to ``asdf._block.io.read_block`` for more details. + + validate_checksums : bool, optional, default False + When reading blocks compute the block data checksum and + compare it to the checksum read from the block header. + Note that this comparison will occur when the data is + accessed if ``lazy_load`` was set to True. + + after_magic : bool, optional, default False + If True don't expect block magic bytes for the first block + read from the file. + + Returns + ------- + + read_blocks : list of ReadBlock + A list of ReadBlock instances. + + Raises + ------ + OSError + Invalid bytes encountered while reading blocks. + + ValueError + A read block has an invalid checksum. + """ + if not lazy_load or not fd.seekable(): + # load all blocks serially + return _read_blocks_serially(fd, memmap, lazy_load, validate_checksums, after_magic) + + # try to find block index + starting_offset = fd.tell() + index_offset = bio.find_block_index(fd, starting_offset) + if index_offset is None: + # if failed, load all blocks serially + fd.seek(starting_offset) + return _read_blocks_serially(fd, memmap, lazy_load, validate_checksums, after_magic) + + # setup empty blocks + try: + block_index = bio.read_block_index(fd, index_offset) + except BlockIndexError as e: + # failed to read block index, fall back to serial reading + msg = f"Failed to read block index, falling back to serial reading: {e!s}" + warnings.warn(msg, AsdfBlockIndexWarning) + fd.seek(starting_offset) + return _read_blocks_serially(fd, memmap, lazy_load, validate_checksums, after_magic) + # skip magic for each block + magic_len = len(constants.BLOCK_MAGIC) + blocks = [ReadBlock(offset + magic_len, fd, memmap, lazy_load, validate_checksums) for offset in block_index] + try: + # load first and last blocks to check if the index looks correct + for index in (0, -1): + fd.seek(block_index[index]) + buff = fd.read(magic_len) + if buff != constants.BLOCK_MAGIC: + msg = "Invalid block magic" + raise OSError(msg) + blocks[index].load() + except (OSError, ValueError) as e: + msg = f"Invalid block index contents for block {index}, falling back to serial reading: {e!s}" + warnings.warn(msg, AsdfBlockIndexWarning) + fd.seek(starting_offset) + return _read_blocks_serially(fd, memmap, lazy_load, after_magic) + return blocks diff --git a/asdf/_block/store.py b/asdf/_block/store.py new file mode 100644 index 000000000..6dffcdfbc --- /dev/null +++ b/asdf/_block/store.py @@ -0,0 +1,98 @@ +from .key import Key + + +class Store: + """ + A key-value store that uses ``asdf._block.key.Key`` + to allow use of keys that: + - are not hashable (so any object can be used) + - when the key is garbage collected, the value + will be unretrievable + """ + + def __init__(self): + # store contains 2 layers of lookup: id(obj), Key + self._by_id = {} + + 
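A brief sketch of the semantics described in the class docstring above (``_cleanup`` is private and shown only for illustration; immediate invalidation after ``del`` assumes CPython reference counting)::

    from asdf._block.store import Store

    class Node:
        pass

    n = Node()
    s = Store()
    s.assign_object(n, "options-for-n")
    assert s.lookup_by_object(n) == "options-for-n"

    del n            # the Key's weak reference dies with the object
    s._cleanup()     # entries whose keys are no longer valid are dropped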
def lookup_by_object(self, obj, default=None): + if isinstance(obj, Key): + # if obj is a Key, look up the object + obj_id = id(obj._ref()) + # and use the Key + obj_key = obj + else: + obj_id = id(obj) + obj_key = None + + # if id is unknown, return default + if obj_id not in self._by_id: + return default + + # first, lookup by id: O(1) + by_key = self._by_id[obj_id] + + # if we have a key + if obj_key: + # use the key to get an existing value + # or default if this Key is unknown + return by_key.get(obj_key, default) + + # we have seen this id(obj) before + # look for a matching key: O(N) + for key, value in by_key.items(): + if key._matches_object(obj): + return value + + # no match, return default + return default + + def assign_object(self, obj, value): + if isinstance(obj, Key): + if not obj._is_valid(): + msg = "Invalid key used for assign_object" + raise ValueError(msg) + obj_id = id(obj._ref()) + obj_key = obj + else: + obj_id = id(obj) + obj_key = None + + # if the id is unknown, just set it + if obj_id not in self._by_id: + if obj_key is None: + obj_key = Key(obj) + self._by_id[obj_id] = {obj_key: value} + return + + # if id is known + by_key = self._by_id[obj_id] + + # look for a matching matching key + if obj_key is None: + for key in by_key: + if key._matches_object(obj): + by_key[key] = value + return + # we didn't find a matching key, so make one + obj_key = Key(obj) + + # if no match was found, add using the key + self._by_id[obj_id][obj_key] = value + + def keys_for_value(self, value): + for oid, by_key in self._by_id.items(): + for key, stored_value in by_key.items(): + if stored_value == value and key._is_valid(): + yield key + + def _cleanup(self, object_id=None): + if object_id is None: + for oid in set(self._by_id): + self._cleanup(oid) + return + by_key = self._by_id[object_id] + keys_to_remove = [k for k in by_key if not k._is_valid()] + for key in keys_to_remove: + del by_key[key] + if not len(by_key): + del self._by_id[object_id] diff --git a/asdf/_block/writer.py b/asdf/_block/writer.py new file mode 100644 index 000000000..f8c36cf7d --- /dev/null +++ b/asdf/_block/writer.py @@ -0,0 +1,121 @@ +import numpy as np + +from asdf import constants + +from . import io as bio + + +class WriteBlock: + """ + Data and compression options needed to write an ASDF block. + """ + + def __init__(self, data, compression=None, compression_kwargs=None): + self._data = data + self.compression = compression + self.compression_kwargs = compression_kwargs + + @property + def data(self): + if callable(self._data): + return self._data() + return self._data + + @property + def data_bytes(self): + data = self.data + if data is not None: + return np.ndarray(-1, np.uint8, data.ravel(order="K").data) + return np.ndarray(0, np.uint8) + + +def write_blocks(fd, blocks, padding=False, streamed_block=None, write_index=True): + """ + Write a list of WriteBlocks to a file + + Parameters + ---------- + fd : file or generic_io.GenericIO + File to write to. Writing will start at the current position. + + blocks : list of WriteBlock + List of WriteBlock instances used to get the data and options + to write to each ASDF block. + + padding : bool or float, optional, default False + If False, add no padding bytes between blocks. If True + add some default amount of padding. If a float, add + a number of padding bytes based off a ratio of the data + size. + See ``asdf._block.io.write_block`` ``padding`` argument for + more details. 
+ + streamed_block : WriteBlock, optional + If provided (not None) include this WriteBlock as + the final block in the file and mark it as a streamed + block. + + write_index : bool, optional, default True + If True, include a block index at the end of the file. + If a streamed_block is provided (or the file is not + seekable) no block index will be written. + + Returns + ------- + offsets : list of int + Byte offsets (from the start of the file) where each + block was written (this is the start of the block magic + bytes for each block). This list includes the offset of + the streamed_block if it was provided. + If the file written to is not seekable these offsets + will all be None. + + headers : list of dict + Headers written for each block (including the streamed_block + if it was provided). + """ + # some non-seekable files return a valid `tell` result + # others can raise an exception, others might always + # return 0. See relevant issues: + # https://github.com/asdf-format/asdf/issues/1545 + # https://github.com/asdf-format/asdf/issues/1552 + # https://github.com/asdf-format/asdf/issues/1542 + # to enable writing a block index for all valid files + # we will wrap tell to return None on an error + + def tell(): + try: + return fd.tell() + except OSError: + return None + + offsets = [] + headers = [] + for blk in blocks: + offsets.append(tell()) + fd.write(constants.BLOCK_MAGIC) + headers.append( + bio.write_block( + fd, + blk.data_bytes, + compression_kwargs=blk.compression_kwargs, + padding=padding, + compression=blk.compression, + ) + ) + if streamed_block is not None: + offsets.append(tell()) + fd.write(constants.BLOCK_MAGIC) + headers.append(bio.write_block(fd, streamed_block.data_bytes, stream=True)) + + # os.pipe on windows returns a file-like object + # that reports as seekable but tell always returns 0 + # https://github.com/asdf-format/asdf/issues/1545 + # when all offsets are 0 replace them with all Nones + if all(o == 0 for o in offsets): + offsets = [None for _ in offsets] + + # only write a block index if all conditions are met + if streamed_block is None and write_index and len(offsets) and all(o is not None for o in offsets): + bio.write_block_index(fd, offsets) + return offsets, headers diff --git a/asdf/_tests/tags/core/tests/data/__init__.py b/asdf/_tests/_block/__init__.py similarity index 100% rename from asdf/_tests/tags/core/tests/data/__init__.py rename to asdf/_tests/_block/__init__.py diff --git a/asdf/_tests/_block/test_callback.py b/asdf/_tests/_block/test_callback.py new file mode 100644 index 000000000..d173ded44 --- /dev/null +++ b/asdf/_tests/_block/test_callback.py @@ -0,0 +1,56 @@ +import pytest + +from asdf._block.callback import DataCallback +from asdf._block.manager import ReadBlocks + + +def test_default_attribute(): + class Data: + def __init__(self, value): + self.data = value + + blks = ReadBlocks([Data("a"), Data("b")]) + cbs = [DataCallback(0, blks), DataCallback(1, blks)] + + assert cbs[0]() == "a" + assert cbs[1]() == "b" + + +def test_attribute_access(): + class Foo: + def __init__(self, attr, value): + setattr(self, attr, value) + + blks = ReadBlocks([Foo("a", "foo"), Foo("a", "bar")]) + cb = DataCallback(0, blks) + + assert cb(_attr="a") == "foo" + + +def test_weakref(): + class Data: + def __init__(self, value): + self.data = value + + blks = ReadBlocks([Data("a"), Data("b")]) + cb = DataCallback(0, blks) + del blks + + with pytest.raises(OSError, match="Attempt to read block data from missing block"): + cb() + + +def test_reassign(): + 
class Data: + def __init__(self, value): + self.data = value + + blks = ReadBlocks([Data("a"), Data("b")]) + cb = DataCallback(0, blks) + + assert cb() == "a" + + blks2 = ReadBlocks([Data("c"), Data("d")]) + cb._reassign(1, blks2) + + assert cb() == "d" diff --git a/asdf/_tests/_block/test_external.py b/asdf/_tests/_block/test_external.py new file mode 100644 index 000000000..a7cb6f9c5 --- /dev/null +++ b/asdf/_tests/_block/test_external.py @@ -0,0 +1,26 @@ +import numpy as np +import pytest + +import asdf +from asdf._block import external + + +def test_cache(tmp_path): + efn = tmp_path / "test.asdf" + arr = np.arange(3, dtype="uint8") + asdf.AsdfFile({"data": arr}).write_to(efn) + + cache = external.ExternalBlockCache() + base_uri = asdf.util.filepath_to_url(f"{tmp_path}/") + data = cache.load(base_uri, "test.asdf") + np.testing.assert_array_equal(data, arr) + assert cache.load(base_uri, "test.asdf") is data + assert cache.load(base_uri, "#") is external.UseInternal + assert cache.load(base_uri, "") is external.UseInternal + + +@pytest.mark.parametrize("uri", ["test.asdf", "foo/test.asdf"]) +@pytest.mark.parametrize("index", [0, 1, 100]) +def test_relative_uri_for_index(uri, index): + match = f"test{index:04d}.asdf" + assert external.relative_uri_for_index(uri, index) == match diff --git a/asdf/_tests/_block/test_io.py b/asdf/_tests/_block/test_io.py new file mode 100644 index 000000000..15c0f22ef --- /dev/null +++ b/asdf/_tests/_block/test_io.py @@ -0,0 +1,366 @@ +import io +import mmap + +import numpy as np +import pytest + +from asdf import constants, generic_io +from asdf._block import io as bio +from asdf._block.exceptions import BlockIndexError + + +def test_checksum(tmp_path): + my_array = np.arange(0, 64, dtype=" allocated_size + data = np.ones(30, dtype="uint8") + raw_fd = io.BytesIO() + fd = generic_io.get_file(raw_fd, mode="rw") + with pytest.raises(RuntimeError, match="Block used size.*"): + bio.write_block(fd, data, allocated_size=0) + assert fd.tell() == 0 + + +def test_fd_not_seekable(): + data = np.ones(30, dtype="uint8") + raw_fd = io.BytesIO() + fd = generic_io.get_file(raw_fd, mode="rw") + bio.write_block(fd, data) + + raw_fd.seek(0) + fd = generic_io.get_file(raw_fd, mode="rw") + + seekable = lambda: False # noqa: E731 + fd.seekable = seekable + + _, _, _, d = bio.read_block(fd) + + np.testing.assert_array_equal(d, data) + + with pytest.raises(ValueError, match="write_block received offset.*"): + bio.write_block(fd, data, offset=0) + + +def test_compressed_block(): + data = np.ones(30, dtype="uint8") + fd = generic_io.get_file(io.BytesIO(), mode="rw") + write_header = bio.write_block(fd, data, compression="zlib") + assert write_header["compression"] == b"zlib" + _, _, _, rdata = bio.read_block(fd, offset=0) + np.testing.assert_array_equal(rdata, data) + + +def test_stream_block(): + data = np.ones(10, dtype="uint8") + fd = generic_io.get_file(io.BytesIO(), mode="rw") + write_header = bio.write_block(fd, data, stream=True) + assert write_header["flags"] & constants.BLOCK_FLAG_STREAMED + # now write extra data to file + extra_data = np.ones(10, dtype="uint8") + fd.write_array(extra_data) + _, _, _, rdata = bio.read_block(fd, offset=0) + assert rdata.size == 20 + assert np.all(rdata == 1) + + +def test_read_from_closed(tmp_path): + fn = tmp_path / "test.blk" + data = np.ones(10, dtype="uint8") + with generic_io.get_file(fn, mode="w") as fd: + bio.write_block(fd, data, stream=True) + with generic_io.get_file(fn, mode="rw") as fd: + _, _, _, callback = bio.read_block(fd, 
offset=0, lazy_load=True) + with pytest.raises(OSError, match="ASDF file has already been closed. Can not get the data."): + callback() + + +@pytest.mark.parametrize("data", [np.ones(10, dtype="f4"), np.ones((3, 3), dtype="uint8")]) +def test_invalid_data(data): + fd = generic_io.get_file(io.BytesIO(), mode="rw") + with pytest.raises(ValueError, match="Data must be of.*"): + bio.write_block(fd, data, stream=True) + + +@pytest.mark.parametrize( + "options", + [ + (0, 10, 5, [5, 0]), + (0, 10, 3, [9, 6, 3, 0]), + (0, 10, 10, [0]), + (0, 10, 6, [6, 0]), + (0, 10, 11, [0]), + (0, 10, 4096, [0]), + ], +) +def test_candidate_offsets(options): + min_offset, max_offset, size, targets = options + for offset, target in zip(bio._candidate_offsets(min_offset, max_offset, size), targets): + assert offset == target + + +def generate_block_index_file(fn, values=None, offset=0): + if values is None: + values = [1, 2, 3] + with generic_io.get_file(fn, "w") as f: + f.write(b"\0" * offset) + bio.write_block_index(f, values) + + +def test_find_block_index(tmp_path): + offset = 42 + fn = tmp_path / "test" + generate_block_index_file(fn, offset=offset) + with generic_io.get_file(fn, "r") as fd: + assert bio.find_block_index(fd) == offset + + +def test_find_block_index_on_boundry(tmp_path): + fn = tmp_path / "test" + with generic_io.get_file(fn, "w") as fd: + block_size = fd.block_size + # put pattern across a block boundary + offset = block_size - (len(constants.INDEX_HEADER) // 2) + generate_block_index_file(fn, offset=offset) + with generic_io.get_file(fn, "r") as fd: + assert bio.find_block_index(fd) == offset + + +def test_missing_block_index(tmp_path): + fn = tmp_path / "test" + with open(fn, "w") as f: + f.write("\0" * 4096) + with generic_io.get_file(fn, "r") as fd: + assert bio.find_block_index(fd) is None + + +def test_less_than_min_offset_block_index(tmp_path): + fn = tmp_path / "test" + offset = 26 + min_offset = 42 + generate_block_index_file(fn, offset=offset) + with generic_io.get_file(fn, "r") as fd: + assert bio.find_block_index(fd, min_offset) is None + + +def test_greater_than_max_offset_block_index(tmp_path): + fn = tmp_path / "test" + offset = 72 + max_offset = 42 + generate_block_index_file(fn, offset=offset) + with generic_io.get_file(fn, "r") as fd: + assert bio.find_block_index(fd, 0, max_offset) is None + + +def test_read_block_index(tmp_path): + fn = tmp_path / "test" + values = [1, 2, 3] + generate_block_index_file(fn, values=values, offset=0) + with generic_io.get_file(fn, "r") as fd: + assert bio.read_block_index(fd) == values + + +def test_read_block_index_with_offset(tmp_path): + fn = tmp_path / "test" + values = [1, 2, 3] + offset = 42 + generate_block_index_file(fn, values=values, offset=offset) + with generic_io.get_file(fn, "r") as fd: + assert bio.read_block_index(fd, offset) == values + + +def test_read_block_index_pre_seek(tmp_path): + fn = tmp_path / "test" + values = [1, 2, 3] + offset = 42 + generate_block_index_file(fn, values=values, offset=offset) + with generic_io.get_file(fn, "r") as fd: + fd.seek(offset) + assert bio.read_block_index(fd) == values + + +def test_read_block_index_no_header(tmp_path): + fn = tmp_path / "test" + values = [1, 2, 3] + generate_block_index_file(fn, values=values, offset=0) + with generic_io.get_file(fn, "r") as fd: + fd.seek(len(constants.INDEX_HEADER)) + with pytest.raises(BlockIndexError, match="Failed to read block index.*"): + assert bio.read_block_index(fd) == values + + +def test_read_block_index_invalid_yaml(): + bs = 
io.BytesIO(constants.INDEX_HEADER + b"][") + with generic_io.get_file(bs, "r") as fd: + with pytest.raises(BlockIndexError, match="Failed to parse block index as yaml"): + bio.read_block_index(fd) + + +def test_read_block_index_valid_yaml_invalid_contents(): + bs = io.BytesIO(constants.INDEX_HEADER + b"['a', 'b']") + with generic_io.get_file(bs, "r") as fd: + with pytest.raises(BlockIndexError, match="Invalid block index"): + bio.read_block_index(fd) + + +def test_write_block_index_with_offset(tmp_path): + fn = tmp_path / "test" + offset = 50 + with generic_io.get_file(fn, "w") as fd: + fd.write(b"\0" * 100) + fd.seek(0) + bio.write_block_index(fd, [1, 2, 3], offset=offset) + with generic_io.get_file(fn, "r") as fd: + assert bio.find_block_index(fd) == offset diff --git a/asdf/_tests/_block/test_key.py b/asdf/_tests/_block/test_key.py new file mode 100644 index 000000000..204a761c4 --- /dev/null +++ b/asdf/_tests/_block/test_key.py @@ -0,0 +1,116 @@ +import copy + +from asdf._block.key import Key + + +# a blank class for testing +class Foo: + pass + + +def test_unique_per_object(): + seen = set() + for _i in range(10): + bk = Key(Foo()) + assert bk not in seen + seen.add(bk) + + +def test_unique_same_object(): + seen = set() + f = Foo() + for _i in range(10): + bk = Key(f) + assert bk not in seen + seen.add(bk) + + +def test_matches_obj(): + f = Foo() + bk = Key(f) + assert bk._matches_object(f) + + +def test_undefined_no_match(): + bk = Key() + assert not bk._matches_object(Foo()) + + +def test_is_valid(): + f = Foo() + bk = Key(f) + assert bk._is_valid() + del f + assert not bk._is_valid() + + +def test_same_class(): + f = Foo() + bk = Key(f) + del f + f2 = Foo() + assert not bk._is_valid() + assert not bk._matches_object(f2) + + +def test_undefined(): + k = Key() + assert not k._is_valid() + + +def test_equal(): + key_value = 42 + f = Foo() + k1 = Key(f, key_value) + k2 = Key(f, key_value) + assert k1 == k2 + + +def test_key_mismatch_not_equal(): + f = Foo() + k1 = Key(f) + k2 = Key(f) + assert k1 != k2 + + +def test_obj_not_equal(): + f = Foo() + k = Key(f) + assert k != f + + +def test_undefined_not_equal(): + key_value = 42 + k1 = Key(_key=key_value) + k2 = Key(_key=key_value) + assert k1 != k2 + + +def test_deleted_object_not_equal(): + key_value = 42 + f = Foo() + k1 = Key(f, key_value) + k2 = Key(f, key_value) + del f + assert k1 != k2 + + +def test_copy(): + f = Foo() + k1 = Key(f) + k2 = copy.copy(k1) + assert k1 == k2 + + +def test_copy_undefined_not_equal(): + k1 = Key() + k2 = copy.copy(k1) + assert k1 != k2 + + +def test_copy_deleted_object_not_equal(): + f = Foo() + k1 = Key(f) + k2 = copy.copy(k1) + del f + assert k1 != k2 diff --git a/asdf/_tests/_block/test_manager.py b/asdf/_tests/_block/test_manager.py new file mode 100644 index 000000000..6c43d086f --- /dev/null +++ b/asdf/_tests/_block/test_manager.py @@ -0,0 +1,90 @@ +import numpy as np +import pytest + +import asdf +from asdf._block import manager +from asdf._block.options import Options + + +def test_set_streamed_block_via_options(): + options = manager.OptionsStore(manager.ReadBlocks()) + arr1 = np.arange(10, dtype="uint8") + arr2 = np.arange(5, dtype="uint8") + options.set_options(arr1, Options("streamed")) + with pytest.raises(ValueError, match=r"Can not add second streaming block"): + options.set_options(arr2, Options("streamed")) + del arr1 + options.set_options(arr2, Options("streamed")) + + +def test_set_streamed_block_via_manager(): + af = asdf.AsdfFile() + m = af._blocks + + class Foo: + pass + + arr = 
np.arange(10, dtype="uint8") + obj = Foo() + m.set_streamed_write_block(arr, obj) + + # setting again with the same data is ok + m.set_streamed_write_block(arr, obj) + + # using a different array is not allowed + arr2 = np.arange(3, dtype="uint8") + with pytest.raises(ValueError, match="Can not add second streaming block"): + m.set_streamed_write_block(arr2, obj) + + # a different object is ok as long as the array matches + obj2 = Foo() + m.set_streamed_write_block(arr, obj2) + + +def test_load_external_internal(tmp_path): + fn = tmp_path / "test.asdf" + asdf.AsdfFile({"arr": np.arange(10, dtype="uint8")}).write_to(fn) + with asdf.open(fn) as af: + m = af._blocks + np.testing.assert_array_equal(m._load_external("#"), m.blocks[0].data) + + +def test_write_no_uri(tmp_path): + fn = tmp_path / "test.asdf" + asdf.AsdfFile({"arr": np.arange(10, dtype="uint8")}).write_to(fn) + with asdf.open(fn) as af: + m = af._blocks + with pytest.raises(ValueError, match=r"Can't write external blocks.*"): + m._write_external_blocks() + + +def test_write_outside_context(tmp_path): + fn = tmp_path / "test.asdf" + asdf.AsdfFile({"arr": np.arange(10, dtype="uint8")}).write_to(fn) + with asdf.open(fn) as af: + m = af._blocks + with pytest.raises(OSError, match=r"write called outside of valid write_context"): + m.write(False, False) + + +def test_update_outside_context(tmp_path): + fn = tmp_path / "test.asdf" + asdf.AsdfFile({"arr": np.arange(10, dtype="uint8")}).write_to(fn) + with asdf.open(fn) as af: + m = af._blocks + with pytest.raises(OSError, match=r"update called outside of valid write_context"): + m.update(0, False, False) + + +def test_input_compression(tmp_path): + fn = tmp_path / "test.asdf" + af = asdf.AsdfFile({"arr": np.arange(10, dtype="uint8")}) + af.set_array_compression(af["arr"], "zlib") + af.write_to(fn) + + with asdf.open(fn) as af: + assert af.get_array_compression(af["arr"]) == "zlib" + af.set_array_compression(af["arr"], "bzp2") + assert af.get_array_compression(af["arr"]) == "bzp2" + af.set_array_compression(af["arr"], "input") + assert af.get_array_compression(af["arr"]) == "zlib" diff --git a/asdf/_tests/_block/test_options.py b/asdf/_tests/_block/test_options.py new file mode 100644 index 000000000..22bade26c --- /dev/null +++ b/asdf/_tests/_block/test_options.py @@ -0,0 +1,106 @@ +import copy + +import pytest + +from asdf._block.options import Options +from asdf.config import config_context + +valid_storage_types = ["internal", "external", "streamed", "inline"] +valid_default_storage_types = [st for st in valid_storage_types if st != "streamed"] +valid_compression_types = [None, "zlib", "bzp2", "lz4", ""] + +invalid_storage_types = ["foo", "bar"] +invalid_compression_types = ["input", "foo"] + + +@pytest.mark.parametrize("storage", valid_storage_types) +def test_set_storage_init(storage): + o = Options(storage) + assert o.storage_type == storage + + +@pytest.mark.parametrize("storage", valid_default_storage_types) +def test_default_storage_init(storage): + with config_context() as cfg: + cfg.all_array_storage = storage + o = Options() + assert o.storage_type == storage + + +@pytest.mark.parametrize("storage", valid_storage_types) +def test_set_storage_attr(storage): + # start with a different storage type + o = Options("internal" if storage == "external" else "external") + o.storage_type = storage + assert o.storage_type == storage + + +@pytest.mark.parametrize("compression", valid_compression_types) +def test_set_compression_attr(compression): + o = Options("internal") + o.compression 
= compression + # allow "" to become None, both are falsey + assert o.compression == compression if compression else not o.compression + + +@pytest.mark.parametrize("compression", valid_compression_types) +def test_set_compression_init(compression): + o = Options("internal", compression) + # allow "" to become None, both are falsey + assert o.compression == compression if compression else not o.compression + + +def test_set_compression_kwargs_attr(): + o = Options("internal") + o.compression_kwargs = {"foo": 1} + assert o.compression_kwargs == {"foo": 1} + + +def test_set_compression_kwargs_init(): + o = Options("internal", compression_kwargs={"foo": 1}) + assert o.compression_kwargs == {"foo": 1} + + +def test_default_compression(): + o = Options("internal") + assert o.compression is None + + +@pytest.mark.parametrize("invalid_storage", invalid_storage_types) +def test_invalid_storage_type_init(invalid_storage): + with pytest.raises(ValueError, match="array_storage must be one of.*"): + Options(invalid_storage) + + +@pytest.mark.parametrize("invalid_storage", invalid_storage_types) +def test_invalid_storage_attr(invalid_storage): + o = Options("internal") + with pytest.raises(ValueError, match="array_storage must be one of.*"): + o.storage_type = invalid_storage + + +@pytest.mark.parametrize("invalid_compression", invalid_compression_types) +def test_invalid_compression_attr(invalid_compression): + o = Options("internal") + with pytest.raises(ValueError, match="Invalid compression.*"): + o.compression = invalid_compression + + +@pytest.mark.parametrize("invalid_compression", invalid_compression_types) +def test_invalid_compression_init(invalid_compression): + with pytest.raises(ValueError, match="Invalid compression.*"): + Options("internal", invalid_compression) + + +@pytest.mark.parametrize("storage", valid_storage_types) +@pytest.mark.parametrize("compression", valid_compression_types) +@pytest.mark.parametrize("compression_kwargs", [None, {"foo": 1}]) +def test_copy(storage, compression, compression_kwargs): + o = Options(storage, compression, compression_kwargs) + o2 = copy.copy(o) + assert o2 is not o + assert o2.storage_type == storage + # allow "" to become None, both are falsey + assert o2.compression == compression if compression else not o2.compression + # allow None to become {}, both are falsey + assert o2.compression_kwargs == compression_kwargs if compression_kwargs else not o2.compression_kwargs diff --git a/asdf/_tests/_block/test_reader.py b/asdf/_tests/_block/test_reader.py new file mode 100644 index 000000000..800cc9d98 --- /dev/null +++ b/asdf/_tests/_block/test_reader.py @@ -0,0 +1,187 @@ +import contextlib +import io +import mmap +import os + +import numpy as np +import pytest + +from asdf import constants, generic_io, util +from asdf._block import io as bio +from asdf._block.reader import read_blocks +from asdf.exceptions import AsdfBlockIndexWarning, AsdfWarning + + +@contextlib.contextmanager +def gen_blocks( + fn=None, n=5, size=10, padding=0, padding_byte=b"\0", with_index=False, block_padding=False, streamed=False +): + offsets = [] + if fn is not None: + with generic_io.get_file(fn, mode="w") as fd: + pass + + def check(blocks): + assert len(blocks) == n + for i, blk in enumerate(blocks): + assert blk.data.size == size + assert np.all(blk.data == i) + + with generic_io.get_file(fn or io.BytesIO(), mode="rw") as fd: + fd.write(padding_byte * padding) + for i in range(n): + offsets.append(fd.tell()) + fd.write(constants.BLOCK_MAGIC) + data = np.ones(size, 
dtype="uint8") * i + bio.write_block(fd, data, stream=streamed and (i == n - 1), padding=block_padding) + if with_index and not streamed: + bio.write_block_index(fd, offsets) + fd.seek(0) + yield fd, check + + +# test a few paddings to test read_blocks checking 4 bytes while searching for the first block +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("memmap", [True, False]) +@pytest.mark.parametrize("with_index", [True, False]) +@pytest.mark.parametrize("validate_checksums", [True, False]) +@pytest.mark.parametrize("padding", [0, 3, 4, 5]) +@pytest.mark.parametrize("streamed", [True, False]) +def test_read(tmp_path, lazy_load, memmap, with_index, validate_checksums, padding, streamed): + fn = tmp_path / "test.bin" + n = 5 + size = 10 + with gen_blocks(fn=fn, n=n, size=size, padding=padding, with_index=with_index, streamed=streamed) as (fd, check): + r = read_blocks(fd, memmap=memmap, lazy_load=lazy_load, validate_checksums=validate_checksums) + if lazy_load and with_index and not streamed: + assert r[0].loaded + assert r[-1].loaded + for blk in r[1:-1]: + assert not blk.loaded + # getting the header should load the block + blk.header + assert blk.loaded + else: + for blk in r: + assert blk.loaded + if memmap: + for blk in r: + base = util.get_array_base(blk.data) + assert isinstance(base.base, mmap.mmap) + check(r) + if lazy_load: + # if lazy loaded, each call to data should re-read the data + assert r[0].data is not r[0].data + else: + assert r[0].data is r[0].data + # getting cached_data should always return the same array + assert r[0].cached_data is r[0].cached_data + + +def test_read_invalid_padding(): + with gen_blocks(padding=1, padding_byte=b"\1") as (fd, check): + with pytest.raises(OSError, match="Invalid bytes.*"): + check(read_blocks(fd)) + + +def test_read_post_padding_null_bytes(): + with gen_blocks(padding=1) as (fd, check): + fd.seek(0, os.SEEK_END) + # acceptable to have <4 bytes after the last block + fd.write(b"\x00" * 3) + fd.seek(0) + check(read_blocks(fd)) + + +def test_read_post_padding_non_null_bytes(): + with gen_blocks(padding=1) as (fd, check): + fd.seek(0, os.SEEK_END) + # acceptable to have <4 bytes after the last block + fd.write(b"\x01" * 3) + fd.seek(0) + with pytest.warns(AsdfWarning, match=r"Read invalid bytes.*"): + check(read_blocks(fd)) + + +@pytest.mark.parametrize("invalid_block_index", [0, 1, -1, "junk"]) +def test_invalid_block_index(tmp_path, invalid_block_index): + fn = tmp_path / "test.bin" + with gen_blocks(fn=fn, with_index=True) as (fd, check): + # trash the block index + offset = bio.find_block_index(fd) + assert offset is not None + if invalid_block_index == "junk": + # trash the whole index + fd.seek(-4, 2) + fd.write(b"junk") + else: # mess up one entry of the index + block_index = bio.read_block_index(fd, offset) + block_index[invalid_block_index] += 4 + fd.seek(offset) + bio.write_block_index(fd, block_index) + fd.seek(0) + + # when the block index is read, only the first and last blocks + # are check, so any other invalid entry should result in failure + if invalid_block_index in (0, -1): + with pytest.warns(AsdfBlockIndexWarning, match="Invalid block index contents"): + check(read_blocks(fd, lazy_load=True)) + elif invalid_block_index == "junk": + # read_blocks should fall back to reading serially + with pytest.warns(AsdfBlockIndexWarning, match="Failed to read block index"): + check(read_blocks(fd, lazy_load=True)) + else: + with pytest.raises(ValueError, match="Header size.*"): + 
check(read_blocks(fd, lazy_load=True)) + + +def test_invalid_block_in_index_with_valid_magic(tmp_path): + fn = tmp_path / "test.bin" + with gen_blocks(fn=fn, with_index=True, block_padding=1.0) as (fd, check): + offset = bio.find_block_index(fd) + assert offset is not None + block_index = bio.read_block_index(fd, offset) + # move the first block offset to the padding before + # the second block with enough space to write + # valid magic (but invalid header) + block_index[0] = block_index[1] - 6 + fd.seek(block_index[0]) + fd.write(constants.BLOCK_MAGIC) + fd.write(b"\0\0") + + fd.seek(offset) + bio.write_block_index(fd, block_index) + + fd.seek(0) + with pytest.warns(AsdfBlockIndexWarning, match="Invalid block index contents"): + check(read_blocks(fd, lazy_load=True)) + + +def test_closed_file(tmp_path): + fn = tmp_path / "test.bin" + with gen_blocks(fn=fn, with_index=True) as (fd, check): + blocks = read_blocks(fd, lazy_load=True) + blk = blocks[1] + with pytest.raises(OSError, match="Attempt to load block from closed file"): + blk.load() + + +@pytest.mark.parametrize("validate_checksums", [True, False]) +def test_bad_checksum(validate_checksums): + buff = io.BytesIO( + constants.BLOCK_MAGIC + + b"\x000" # header size = 2 + + b"\0\0\0\0" # flags = 4 + + b"\0\0\0\0" # compression = 4 + + b"\0\0\0\0\0\0\0\0" # allocated size = 8 + + b"\0\0\0\0\0\0\0\0" # used size = 8 + + b"\0\0\0\0\0\0\0\0" # data size = 8 + + b"\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" # invalid checksum = 16 + ) + + with generic_io.get_file(buff, mode="r") as fd: + if validate_checksums: + with pytest.raises(ValueError, match=".* does not match given checksum"): + read_blocks(fd, lazy_load=False, validate_checksums=validate_checksums)[0].data + else: + read_blocks(fd, lazy_load=False, validate_checksums=validate_checksums)[0].data diff --git a/asdf/_tests/_block/test_store.py b/asdf/_tests/_block/test_store.py new file mode 100644 index 000000000..dd48b4013 --- /dev/null +++ b/asdf/_tests/_block/test_store.py @@ -0,0 +1,174 @@ +from unittest.mock import patch + +import pytest + +from asdf._block.key import Key +from asdf._block.store import Store + + +# a blank class for testing +class Foo: + pass + + +def test_store_by_obj(): + f = Foo() + v = 42 + s = Store() + s.assign_object(f, v) + assert s.lookup_by_object(f) == v + + +def test_get_missing_by_obj(): + f = Foo() + s = Store() + assert s.lookup_by_object(f) is None + + +def test_store_by_key(): + f = Foo() + v = 42 + s = Store() + k = Key(f) + s.assign_object(k, v) + assert s.lookup_by_object(k) == v + + +def test_get_by_key(): + f = Foo() + v = 42 + s = Store() + k = Key(f) + s.assign_object(k, v) + assert s.lookup_by_object(f) == v + + +def test_get_missing_key(): + f = Foo() + s = Store() + k = Key(f) + assert s.lookup_by_object(k) is None + + +def test_get_missing_key_same_obj(): + f = Foo() + v = 42 + s = Store() + k = Key(f) + s.assign_object(k, v) + k2 = Key(f) + assert s.lookup_by_object(k2) is None + + +def test_get_existing_default(): + f = Foo() + v = 42 + s = Store() + s.assign_object(f, v) + assert s.lookup_by_object(f, 26) == v + + +def test_get_missing_default(): + f = Foo() + v = 42 + s = Store() + assert s.lookup_by_object(f, v) == v + + +def test_set_same_object(): + f = Foo() + v = 42 + s = Store() + s.assign_object(f, 26) + s.assign_object(f, v) + assert s.lookup_by_object(f) == v + + +def test_invalid_key_assign_object(): + s = Store() + k = Key() + with pytest.raises(ValueError, match="Invalid key used for assign_object"): + s.assign_object(k, 42) + + 
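# Editor's note: an illustrative sketch (not part of the patch) of the lifetime
# behavior the Store/Key tests around here exercise. It assumes the Foo, Store
# and Key names defined/imported at the top of this test module; the value
# strings are arbitrary placeholders.
def _sketch_store_lifetime():
    f = Foo()
    s = Store()
    s.assign_object(f, "options-for-f")
    assert s.lookup_by_object(f) == "options-for-f"
    del f  # the weakref inside the store's Key dies (immediately on CPython)
    s._cleanup()  # drops entries whose keys no longer reference a live object
    # the stored value is not handed to an unrelated new object
    assert s.lookup_by_object(Foo(), "missing") == "missing"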
+def test_set_same_key(): + f = Foo() + s = Store() + k = Key(f) + v = 42 + s.assign_object(k, 26) + s.assign_object(k, v) + assert s.lookup_by_object(k) == v + + +def test_get_memory_reused(): + f = Foo() + s = Store() + v = 42 + s.assign_object(f, v) + fid = id(f) + del f + f2 = Foo() + + def mock_id(obj): + if obj is f2: + return fid + return id(obj) + + with patch("asdf._block.store.id", mock_id): + assert s.lookup_by_object(f2) is None + + +def test_set_memory_reused(): + f = Foo() + s = Store() + v = 42 + s.assign_object(f, v) + fid = id(f) + del f + f2 = Foo() + + def mock_id(obj): + if obj is f2: + return fid + return id(obj) + + with patch("asdf._block.store.id", mock_id): + nv = 26 + s.assign_object(f2, nv) + assert s.lookup_by_object(f2) is nv + + +def test_cleanup(): + f = Foo() + s = Store() + k = Key(f) + s.assign_object(s, 42) + s.assign_object(k, 26) + del f + s._cleanup() + assert s.lookup_by_object(k, None) is None + + +def test_keys_for_value(): + s = Store() + data = { + Foo(): 42, + Foo(): 26, + Foo(): 42, + Foo(): 11, + } + data_by_value = {} + for o, v in data.items(): + s.assign_object(o, v) + data_by_value[v] = [*data_by_value.get(v, []), o] + + for v, objs in data_by_value.items(): + objs = set(objs) + returned_objects = set() + for k in s.keys_for_value(v): + assert k._is_valid() + obj = k._ref() + returned_objects.add(obj) + assert objs == returned_objects + del returned_objects, objs diff --git a/asdf/_tests/_block/test_writer.py b/asdf/_tests/_block/test_writer.py new file mode 100644 index 000000000..970a28e9d --- /dev/null +++ b/asdf/_tests/_block/test_writer.py @@ -0,0 +1,102 @@ +import numpy as np +import pytest + +import asdf._block.io as bio +from asdf import constants, generic_io +from asdf._block import reader, writer + + +@pytest.mark.parametrize("lazy", [True, False]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize("padding", [True, False, 0.1, 0.9]) +@pytest.mark.parametrize("compression", [None, b"zlib"]) +@pytest.mark.parametrize("stream", [True, False]) +@pytest.mark.parametrize("seekable", [True, False]) +def test_write_blocks(tmp_path, lazy, index, padding, compression, stream, seekable): + data = [np.ones(10, dtype=np.uint8), np.zeros(5, dtype=np.uint8), None] + if lazy: + blocks = [writer.WriteBlock(lambda bd=d: bd, compression=compression) for d in data] + else: + blocks = [writer.WriteBlock(d, compression=compression) for d in data] + if stream: + streamed_block = writer.WriteBlock(np.ones(15, dtype=np.uint8)) + else: + streamed_block = None + fn = tmp_path / "test.bin" + with generic_io.get_file(fn, mode="w") as fd: + if not seekable: + fd.seekable = lambda: False + writer.write_blocks(fd, blocks, padding=padding, streamed_block=streamed_block, write_index=index) + with generic_io.get_file(fn, mode="r") as fd: + if index and not stream: + assert bio.find_block_index(fd) is not None + else: + assert bio.find_block_index(fd) is None + fd.seek(0) + read_blocks = reader.read_blocks(fd) + if stream: + assert len(read_blocks) == (len(data) + 1) + else: + assert len(read_blocks) == len(data) + for r, d in zip(read_blocks, data): + if d is None: + assert r.data.size == 0 + else: + np.testing.assert_array_equal(r.data, d) + if compression is not None: + assert r.header["compression"] == compression + if padding: + assert r.header["allocated_size"] > r.header["used_size"] + if stream: + read_stream_block = read_blocks[-1] + np.testing.assert_array_equal(read_stream_block.data, streamed_block.data) + assert 
read_stream_block.header["flags"] & constants.BLOCK_FLAG_STREAMED + + +def _raise_illegal_seek(): + raise OSError("Illegal seek") + + +@pytest.mark.parametrize("stream", [True, False]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize("tell", [0, None, _raise_illegal_seek]) +def test_non_seekable_files_with_odd_tells(tmp_path, stream, index, tell): + """ + Some non-seekable files have odd 'tell' results. See: + https://github.com/asdf-format/asdf/issues/1545 + https://github.com/asdf-format/asdf/issues/1542 + + These can produce invalid block indices which should not be written + to the ASDF file. + """ + data = [np.ones(10, dtype=np.uint8), np.zeros(5, dtype=np.uint8), None] + blocks = [writer.WriteBlock(d) for d in data] + if stream: + streamed_block = writer.WriteBlock(np.ones(15, dtype=np.uint8)) + else: + streamed_block = None + fn = tmp_path / "test.bin" + with generic_io.get_file(fn, mode="w") as fd: + fd.seekable = lambda: False + if callable(tell): + fd.tell = tell + else: + fd.tell = lambda: tell + writer.write_blocks(fd, blocks, streamed_block=streamed_block, write_index=index) + with generic_io.get_file(fn, mode="r") as fd: + assert bio.find_block_index(fd) is None + fd.seek(0) + read_blocks = reader.read_blocks(fd) + if stream: + assert len(read_blocks) == (len(data) + 1) + else: + assert len(read_blocks) == len(data) + for r, d in zip(read_blocks, data): + if d is None: + assert r.data.size == 0 + else: + np.testing.assert_array_equal(r.data, d) + if stream: + read_stream_block = read_blocks[-1] + np.testing.assert_array_equal(read_stream_block.data, streamed_block.data) + assert read_stream_block.header["flags"] & constants.BLOCK_FLAG_STREAMED diff --git a/asdf/_tests/_helpers.py b/asdf/_tests/_helpers.py index cdf147155..a3fcd15a7 100644 --- a/asdf/_tests/_helpers.py +++ b/asdf/_tests/_helpers.py @@ -19,13 +19,13 @@ except ImportError: CartesianDifferential = None +import numpy as np import yaml import asdf from asdf import generic_io, versioning from asdf._resolver import Resolver, ResolverChain from asdf.asdf import AsdfFile, get_asdf_library_info -from asdf.block import Block from asdf.constants import YAML_TAG_PREFIX from asdf.exceptions import AsdfConversionWarning, AsdfDeprecationWarning from asdf.extension import _legacy @@ -149,6 +149,20 @@ def recurse(old, new): elif ICRS is not None and isinstance(old, ICRS): assert old.ra == new.ra assert old.dec == new.dec + elif all([isinstance(obj, (np.ndarray, asdf.tags.core.NDArrayType)) for obj in (old, new)]): + with warnings.catch_warnings(): + # The oldest deps job tests against versions of numpy where this + # testing function raised a FutureWarning but still functioned + # as expected + warnings.filterwarnings("ignore", category=FutureWarning) + if old.dtype.fields: + if not new.dtype.fields: + msg = "arrays not equal" + raise AssertionError(msg) + for f in old.dtype.fields: + np.testing.assert_array_equal(old[f], new[f]) + else: + np.testing.assert_array_equal(old.__array__(), new.__array__()) else: assert old == new @@ -263,9 +277,8 @@ def _assert_roundtrip_tree( buff.seek(0) ff = asdf.open(buff, extensions=extensions, copy_arrays=True, lazy_load=False) # Ensure that all the blocks are loaded - for block in ff._blocks._internal_blocks: - assert isinstance(block, Block) - assert block._data is not None + for block in ff._blocks.blocks: + assert block._data is not None and not callable(block._data) # The underlying file is closed at this time and everything should still work 
assert_tree_match(tree, ff.tree, ff, funcname=tree_match_func) if asdf_check_func: @@ -274,9 +287,8 @@ def _assert_roundtrip_tree( # Now repeat with copy_arrays=False and a real file to test mmap() AsdfFile(tree, extensions=extensions, **init_options).write_to(fname, **write_options) with asdf.open(fname, mode="rw", extensions=extensions, copy_arrays=False, lazy_load=False) as ff: - for block in ff._blocks._internal_blocks: - assert isinstance(block, Block) - assert block._data is not None + for block in ff._blocks.blocks: + assert block._data is not None and not callable(block._data) assert_tree_match(tree, ff.tree, ff, funcname=tree_match_func) if asdf_check_func: asdf_check_func(ff) @@ -441,7 +453,7 @@ def _assert_extension_type_correctness(extension, extension_type, resolver): if extension_type.yaml_tag is not None and extension_type.yaml_tag.startswith(YAML_TAG_PREFIX): return - if extension_type == asdf.stream.Stream: + if extension_type == asdf.Stream: # Stream is a special case. It was implemented as a subclass of NDArrayType, # but shares a tag with that class, so it isn't really a distinct type. return diff --git a/asdf/_tests/_regtests/test_1013.py b/asdf/_tests/_regtests/test_1013.py new file mode 100644 index 000000000..b95f02d4f --- /dev/null +++ b/asdf/_tests/_regtests/test_1013.py @@ -0,0 +1,51 @@ +import numpy as np + +import asdf + + +def test_control_array_storage_in_to_yaml_tree_methods(tmp_path): + """ + controlling array storage in to_yaml_tree methods + + https://github.com/asdf-format/asdf/issues/1013 + """ + + class FooType: + def __init__(self, data): + self.data = data + + class FooConverter: + tags = ["asdf://somewhere.org/tag/foo-1.0.0"] + types = [FooType] + + def to_yaml_tree(self, obj, tag, ctx): + if obj.data.ndim < 2: + ctx._blocks._set_array_storage(obj.data, "inline") + return {"data": obj.data} + + def from_yaml_tree(self, obj, tag, ctx): + return FooType(obj["data"]) + + class FooExtension: + converters = [FooConverter()] + tags = ["asdf://somewhere.org/tag/foo-1.0.0"] + extension_uri = "asdf://somewhere.org/extensions/foo-1.0.0" + + with asdf.config_context() as cfg: + cfg.add_extension(FooExtension()) + + fn = tmp_path / "test.asdf" + + for shape in [3, (3, 3)]: + arr = np.zeros(shape) + n_blocks = 0 if arr.ndim == 1 else 1 + af = asdf.AsdfFile({"foo": FooType(arr)}) + assert af.get_array_storage(arr) == "internal" + af.write_to(fn) + # make sure write_to doesn't change the settings outside of the + # writing context + assert af.get_array_storage(arr) == "internal" + + with asdf.open(fn) as af: + np.testing.assert_array_equal(af["foo"].data, arr) + assert len(af._blocks.blocks) == n_blocks diff --git a/asdf/_tests/_regtests/test_1505.py b/asdf/_tests/_regtests/test_1505.py new file mode 100644 index 000000000..92cd3da63 --- /dev/null +++ b/asdf/_tests/_regtests/test_1505.py @@ -0,0 +1,23 @@ +import numpy as np + +import asdf + + +def test_update_fails_after_write_to(tmp_path): + """ + Calling update after write_to fails + + https://github.com/asdf-format/asdf/issues/1505 + """ + fn1 = tmp_path / "test1.asdf" + fn2 = tmp_path / "test2.asdf" + + tree = {"a": np.zeros(3), "b": np.ones(3)} + af = asdf.AsdfFile(tree) + + af.write_to(fn1) + + with asdf.open(fn1, mode="rw") as af: + af["a"] = None + af.write_to(fn2) + af.update() diff --git a/asdf/_tests/_regtests/test_1523.py b/asdf/_tests/_regtests/test_1523.py new file mode 100644 index 000000000..1777f769d --- /dev/null +++ b/asdf/_tests/_regtests/test_1523.py @@ -0,0 +1,30 @@ +import numpy as np + 
+import asdf + + +def test_update_corrupts_stream_data(tmp_path): + """ + update corrupts stream data + https://github.com/asdf-format/asdf/issues/1523 + """ + fn = tmp_path / "stream.asdf" + + s = asdf.Stream([3], np.uint8) + asdf.AsdfFile({"s": s}).write_to(fn) + + with open(fn, "rb+") as f: + f.seek(0, 2) + f.write(b"\x01\x02\x03") + + with asdf.open(fn) as af: + np.testing.assert_array_equal(af["s"], [[1, 2, 3]]) + + with asdf.open(fn, mode="rw") as af: + af["a"] = np.arange(1000) + af.update() + # print(af['s']) # segmentation fault + + with asdf.open(fn) as af: + # fails as af['s'] == [[116, 101, 111]] + np.testing.assert_array_equal(af["s"], [[1, 2, 3]]) diff --git a/asdf/_tests/_regtests/test_1525.py b/asdf/_tests/_regtests/test_1525.py new file mode 100644 index 000000000..a682fc66e --- /dev/null +++ b/asdf/_tests/_regtests/test_1525.py @@ -0,0 +1,30 @@ +import numpy as np +import pytest + +import asdf + + +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_external_blocks_always_lazy_loaded_and_memmapped(tmp_path, copy_arrays): + """ + External blocks are always lazy loaded and memmapped + + https://github.com/asdf-format/asdf/issues/1525 + """ + + fn = tmp_path / "test.asdf" + arr = np.arange(10) + af = asdf.AsdfFile({"arr": arr}) + af.set_array_storage(arr, "external") + af.write_to(fn) + + with asdf.open(fn, copy_arrays=copy_arrays) as af: + # check that block is external + source = af["arr"]._source + assert isinstance(source, str) + + # check if block is memmapped + if copy_arrays: + assert not isinstance(af["arr"].base, np.memmap) + else: + assert isinstance(af["arr"].base, np.memmap) diff --git a/asdf/_tests/_regtests/test_1526.py b/asdf/_tests/_regtests/test_1526.py new file mode 100644 index 000000000..2552e3e6d --- /dev/null +++ b/asdf/_tests/_regtests/test_1526.py @@ -0,0 +1,35 @@ +import os + +import numpy as np + +import asdf + + +def test_rewrite_file_with_unaccessed_external_blocks_fails(tmp_path): + """ + Rewriting a file with external blocks fails if arrays are not first accessed + + https://github.com/asdf-format/asdf/issues/1526 + """ + arrs = [np.arange(3) + i for i in range(3)] + af = asdf.AsdfFile({"arrs": arrs}) + [af.set_array_storage(a, "external") for a in arrs] + + dns = [] + for i in range(2): + dn = tmp_path / f"d{i}" + if not os.path.exists(dn): + os.makedirs(dn) + dns.append(dn) + fns = [dn / "test.asdf" for dn in dns] + + # write to d0 + af.write_to(fns[0]) + + with asdf.open(fns[0]) as af2: + af2["arrs"][0] = 42 + # write to d1 + af2.write_to(fns[1]) + + assert len(os.listdir(dns[0])) == 4 + assert len(os.listdir(dns[1])) == 3 diff --git a/asdf/_tests/_regtests/test_1530.py b/asdf/_tests/_regtests/test_1530.py new file mode 100644 index 000000000..353bca11d --- /dev/null +++ b/asdf/_tests/_regtests/test_1530.py @@ -0,0 +1,35 @@ +import numpy as np +import pytest + +import asdf + + +@pytest.mark.xfail(reason="fixing this may require subclassing ndarray") +def test_update_with_memmapped_data_can_make_view_data_invalid(tmp_path): + """ + Calling update with memmapped data can create invalid data in memmap views + + https://github.com/asdf-format/asdf/issues/1530 + + A view of a memmapped array can return invalid data or segfault + after an update + """ + fn = tmp_path / "test.asdf" + a = np.zeros(10, dtype="uint8") + b = np.ones(10, dtype="uint8") + ov = a[:3] + + af = asdf.AsdfFile({"a": a, "b": b}) + af.write_to(fn) + + with asdf.open(fn, mode="rw", copy_arrays=False) as af: + va = af["a"][:3] + np.testing.assert_array_equal(a, 
af["a"]) + np.testing.assert_array_equal(b, af["b"]) + np.testing.assert_array_equal(va, ov) + af["c"] = "a" * 10000 + af.update() + np.testing.assert_array_equal(a, af["a"]) + np.testing.assert_array_equal(b, af["b"]) + assert False + # np.testing.assert_array_equal(va, ov) # segfault diff --git a/asdf/_tests/_regtests/test_1538.py b/asdf/_tests/_regtests/test_1538.py new file mode 100644 index 000000000..d71b1d8df --- /dev/null +++ b/asdf/_tests/_regtests/test_1538.py @@ -0,0 +1,18 @@ +import numpy as np + +import asdf + + +def test_unable_to_read_empty_inline_array(tmp_path): + """ + ASDF unable to read empty inline array + + https://github.com/asdf-format/asdf/issues/1538 + """ + fn = tmp_path / "test.asdf" + a = np.array([]) + af = asdf.AsdfFile({"a": a}) + af.set_array_storage(a, "inline") + af.write_to(fn) + with asdf.open(fn) as af: + np.testing.assert_array_equal(af["a"], a) diff --git a/asdf/_tests/_regtests/test_1539.py b/asdf/_tests/_regtests/test_1539.py new file mode 100644 index 000000000..e1c042a04 --- /dev/null +++ b/asdf/_tests/_regtests/test_1539.py @@ -0,0 +1,22 @@ +import io + +import pytest + +import asdf + + +@pytest.mark.xfail(reason="Fix will require more major changes to generic_io") +def test_invalid_seek_and_read_from_closed_memoryio(): + """ + Seek and read from closed MemoryIO + + https://github.com/asdf-format/asdf/issues/1539 + """ + b = io.BytesIO() + b.write(b"\0" * 10) + b.seek(0) + f = asdf.generic_io.get_file(b) + f.close() + with pytest.raises(IOError, match="I/O operation on closed file."): + f.read_into_array(10) + assert b.tell() == 0 diff --git a/asdf/_tests/_regtests/test_1540.py b/asdf/_tests/_regtests/test_1540.py new file mode 100644 index 000000000..62c83852d --- /dev/null +++ b/asdf/_tests/_regtests/test_1540.py @@ -0,0 +1,18 @@ +import numpy as np + +import asdf + + +def test_writes_but_fails_to_read_inline_structured_array(tmp_path): + """ + ASDF writes but fails to read inline structured array + + https://github.com/asdf-format/asdf/issues/1540 + """ + x = np.array((0, 1.0, [2, 3]), dtype=[("MINE", "i1"), ("f1", " 5) tree = {"masked_array": m, "unmasked_array": x} - def check_asdf(asdf): - tree = asdf.tree + with roundtrip(tree) as af: + tree = af.tree m = tree["masked_array"] assert np.all(m.mask[6:]) - assert len(asdf._blocks) == 2 - - helpers.assert_roundtrip_tree(tree, tmpdir, asdf_check_func=check_asdf) + assert len(af._blocks.blocks) == 2 def test_len_roundtrip(tmpdir): sequence = np.arange(0, 10, dtype=int) tree = {"sequence": sequence} - def check_len(asdf): - s = asdf.tree["sequence"] + with roundtrip(tree) as af: + s = af.tree["sequence"] assert len(s) == 10 - helpers.assert_roundtrip_tree(tree, tmpdir, asdf_check_func=check_len) - def test_mask_arbitrary(): content = """ - arr: !core/ndarray-1.0.0 - data: [[1, 2, 3, 1234], [5, 6, 7, 8]] - mask: 1234 +arr: !core/ndarray-1.0.0 + data: [[1, 2, 3, 1234], [5, 6, 7, 8]] + mask: 1234 """ buff = helpers.yaml_to_asdf(content) @@ -322,9 +395,9 @@ def test_mask_arbitrary(): def test_mask_nan(): content = """ - arr: !core/ndarray-1.0.0 - data: [[1, 2, 3, .NaN], [5, 6, 7, 8]] - mask: .NaN +arr: !core/ndarray-1.0.0 + data: [[1, 2, 3, .NaN], [5, 6, 7, 8]] + mask: .NaN """ buff = helpers.yaml_to_asdf(content) @@ -338,13 +411,17 @@ def test_string(tmpdir): "unicode": np.array(["სამეცნიერო", "данные", "வடிவம்"]), } - helpers.assert_roundtrip_tree(tree, tmpdir) + with roundtrip(tree) as af: + for k in tree: + assert_array_equal(tree[k], af[k]) def test_string_table(tmpdir): tree = {"table": 
np.array([(b"foo", "სამეცნიერო", "42", "53.0")])} - helpers.assert_roundtrip_tree(tree, tmpdir) + with roundtrip(tree) as af: + for k in tree: + assert_array_equal(tree[k], af[k]) def test_inline_string(): @@ -357,12 +434,12 @@ def test_inline_string(): def test_inline_structured(): content = """ - arr: !core/ndarray-1.0.0 - datatype: [['ascii', 4], uint16, uint16, ['ascii', 4]] - data: [[M110, 110, 205, And], - [ M31, 31, 224, And], - [ M32, 32, 221, And], - [M103, 103, 581, Cas]]""" +arr: !core/ndarray-1.0.0 + datatype: [['ascii', 4], uint16, uint16, ['ascii', 4]] + data: [[M110, 110, 205, And], + [ M31, 31, 224, And], + [ M32, 32, 221, And], + [M103, 103, 581, Cas]]""" buff = helpers.yaml_to_asdf(content) @@ -422,7 +499,7 @@ def test_inline_masked_array(tmpdir): f.write_to(testfile) with asdf.open(testfile) as f2: - assert len(list(f2._blocks.internal_blocks)) == 0 + assert len(list(f2._blocks.blocks)) == 0 assert_array_equal(f.tree["test"], f2.tree["test"]) with open(testfile, "rb") as fd: @@ -514,11 +591,11 @@ def test_operations_on_ndarray_proxies(tmpdir): def test_mask_datatype(tmpdir): content = """ - arr: !core/ndarray-1.0.0 - data: [1, 2, 3] - dtype: int32 - mask: !core/ndarray-1.0.0 - data: [true, true, false] +arr: !core/ndarray-1.0.0 + data: [1, 2, 3] + dtype: int32 + mask: !core/ndarray-1.0.0 + data: [true, true, false] """ buff = helpers.yaml_to_asdf(content) @@ -528,11 +605,11 @@ def test_mask_datatype(tmpdir): def test_invalid_mask_datatype(tmpdir): content = """ - arr: !core/ndarray-1.0.0 - data: [1, 2, 3] - dtype: int32 - mask: !core/ndarray-1.0.0 - data: ['a', 'b', 'c'] +arr: !core/ndarray-1.0.0 + data: [1, 2, 3] + dtype: int32 + mask: !core/ndarray-1.0.0 + data: ['a', 'b', 'c'] """ buff = helpers.yaml_to_asdf(content) @@ -542,243 +619,237 @@ def test_invalid_mask_datatype(tmpdir): pass +@with_custom_extension() def test_ndim_validation(tmpdir): content = """ - obj: ! - a: !core/ndarray-1.0.0 - data: [1, 2, 3] +obj: ! + a: !core/ndarray-1.0.0 + data: [1, 2, 3] """ buff = helpers.yaml_to_asdf(content) with pytest.raises(ValidationError, match=r"Wrong number of dimensions:.*"), asdf.open( buff, - extensions=CustomExtension(), ): pass content = """ - obj: ! - a: !core/ndarray-1.0.0 - data: [[1, 2, 3]] +obj: ! + a: !core/ndarray-1.0.0 + data: [[1, 2, 3]] """ buff = helpers.yaml_to_asdf(content) - with asdf.open(buff, extensions=CustomExtension()): + with asdf.open(buff): pass content = """ - obj: ! - a: !core/ndarray-1.0.0 - shape: [1, 3] - data: [[1, 2, 3]] +obj: ! + a: !core/ndarray-1.0.0 + shape: [1, 3] + data: [[1, 2, 3]] """ buff = helpers.yaml_to_asdf(content) - with asdf.open(buff, extensions=CustomExtension()): + with asdf.open(buff): pass content = """ - obj: ! - b: !core/ndarray-1.0.0 - data: [1, 2, 3] +obj: ! + b: !core/ndarray-1.0.0 + data: [1, 2, 3] """ buff = helpers.yaml_to_asdf(content) - with asdf.open(buff, extensions=CustomExtension()): + with asdf.open(buff): pass content = """ - obj: ! - b: !core/ndarray-1.0.0 - data: [[1, 2, 3]] +obj: ! + b: !core/ndarray-1.0.0 + data: [[1, 2, 3]] """ buff = helpers.yaml_to_asdf(content) - with asdf.open(buff, extensions=CustomExtension()): + with asdf.open(buff): pass content = """ - obj: ! - b: !core/ndarray-1.0.0 - data: [[[1, 2, 3]]] +obj: ! 
+ b: !core/ndarray-1.0.0 + data: [[[1, 2, 3]]] """ buff = helpers.yaml_to_asdf(content) with pytest.raises(ValidationError, match=r"Wrong number of dimensions:.*"), asdf.open( buff, - extensions=CustomExtension(), ): pass +@with_custom_extension() def test_datatype_validation(tmpdir): content = """ - obj: ! - a: !core/ndarray-1.0.0 - data: [1, 2, 3] - datatype: float32 +obj: ! + a: !core/ndarray-1.0.0 + data: [1, 2, 3] + datatype: float32 """ buff = helpers.yaml_to_asdf(content) - with asdf.open(buff, extensions=CustomExtension()): + with asdf.open(buff): pass content = """ - obj: ! - a: !core/ndarray-1.0.0 - data: [1, 2, 3] - datatype: float64 +obj: ! + a: !core/ndarray-1.0.0 + data: [1, 2, 3] + datatype: float64 """ buff = helpers.yaml_to_asdf(content) with pytest.raises(ValidationError, match=r"Can not safely cast from .* to .*"), asdf.open( buff, - extensions=CustomExtension(), ): pass content = """ - obj: ! - a: !core/ndarray-1.0.0 - data: [1, 2, 3] - datatype: int16 +obj: ! + a: !core/ndarray-1.0.0 + data: [1, 2, 3] + datatype: int16 """ buff = helpers.yaml_to_asdf(content) - with asdf.open(buff, extensions=CustomExtension()): + with asdf.open(buff): pass content = """ - obj: ! - b: !core/ndarray-1.0.0 - data: [1, 2, 3] - datatype: int16 +obj: ! + b: !core/ndarray-1.0.0 + data: [1, 2, 3] + datatype: int16 """ buff = helpers.yaml_to_asdf(content) with pytest.raises(ValidationError, match=r"Expected datatype .*, got .*"), asdf.open( buff, - extensions=CustomExtension(), ): pass content = """ - obj: ! - a: !core/ndarray-1.0.0 - data: [[1, 'a'], [2, 'b'], [3, 'c']] - datatype: - - name: a - datatype: int8 - - name: b - datatype: ['ascii', 8] +obj: ! + a: !core/ndarray-1.0.0 + data: [[1, 'a'], [2, 'b'], [3, 'c']] + datatype: + - name: a + datatype: int8 + - name: b + datatype: ['ascii', 8] """ buff = helpers.yaml_to_asdf(content) with pytest.raises(ValidationError, match=r"Expected scalar datatype .*, got .*"), asdf.open( buff, - extensions=CustomExtension(), ): pass +@with_custom_extension() def test_structured_datatype_validation(tmpdir): content = """ - obj: ! - c: !core/ndarray-1.0.0 - data: [[1, 'a'], [2, 'b'], [3, 'c']] - datatype: - - name: a - datatype: int8 - - name: b - datatype: ['ascii', 8] +obj: ! + c: !core/ndarray-1.0.0 + data: [[1, 'a'], [2, 'b'], [3, 'c']] + datatype: + - name: a + datatype: int8 + - name: b + datatype: ['ascii', 8] """ buff = helpers.yaml_to_asdf(content) - with asdf.open(buff, extensions=CustomExtension()): + with asdf.open(buff): pass content = """ - obj: ! - c: !core/ndarray-1.0.0 - data: [[1, 'a'], [2, 'b'], [3, 'c']] - datatype: - - name: a - datatype: int64 - - name: b - datatype: ['ascii', 8] +obj: ! + c: !core/ndarray-1.0.0 + data: [[1, 'a'], [2, 'b'], [3, 'c']] + datatype: + - name: a + datatype: int64 + - name: b + datatype: ['ascii', 8] """ buff = helpers.yaml_to_asdf(content) with pytest.raises(ValidationError, match=r"Can not safely cast to expected datatype.*"), asdf.open( buff, - extensions=CustomExtension(), ): pass content = """ - obj: ! - c: !core/ndarray-1.0.0 - data: [[1, 'a', 0], [2, 'b', 1], [3, 'c', 2]] - datatype: - - name: a - datatype: int8 - - name: b - datatype: ['ascii', 8] - - name: c - datatype: float64 +obj: ! 
+ c: !core/ndarray-1.0.0 + data: [[1, 'a', 0], [2, 'b', 1], [3, 'c', 2]] + datatype: + - name: a + datatype: int8 + - name: b + datatype: ['ascii', 8] + - name: c + datatype: float64 """ buff = helpers.yaml_to_asdf(content) with pytest.raises(ValidationError, match=r"Mismatch in number of columns:.*"), asdf.open( buff, - extensions=CustomExtension(), ): pass content = """ - obj: ! - c: !core/ndarray-1.0.0 - data: [1, 2, 3] +obj: ! + c: !core/ndarray-1.0.0 + data: [1, 2, 3] """ buff = helpers.yaml_to_asdf(content) with pytest.raises(ValidationError, match=r"Expected structured datatype.*"), asdf.open( buff, - extensions=CustomExtension(), ): pass content = """ - obj: ! - d: !core/ndarray-1.0.0 - data: [[1, 'a'], [2, 'b'], [3, 'c']] - datatype: - - name: a - datatype: int8 - - name: b - datatype: ['ascii', 8] +obj: ! + d: !core/ndarray-1.0.0 + data: [[1, 'a'], [2, 'b'], [3, 'c']] + datatype: + - name: a + datatype: int8 + - name: b + datatype: ['ascii', 8] """ buff = helpers.yaml_to_asdf(content) with pytest.raises(ValidationError, match=r"Expected datatype .*, got .*"), asdf.open( buff, - extensions=CustomExtension(), ): pass content = """ - obj: ! - d: !core/ndarray-1.0.0 - data: [[1, 'a'], [2, 'b'], [3, 'c']] - datatype: - - name: a - datatype: int16 - - name: b - datatype: ['ascii', 16] +obj: ! + d: !core/ndarray-1.0.0 + data: [[1, 'a'], [2, 'b'], [3, 'c']] + datatype: + - name: a + datatype: int16 + - name: b + datatype: ['ascii', 16] """ buff = helpers.yaml_to_asdf(content) - with asdf.open(buff, extensions=CustomExtension()): + with asdf.open(buff): pass @@ -792,9 +863,9 @@ def test_string_inline(): def test_inline_shape_mismatch(): content = """ - arr: !core/ndarray-1.0.0 - data: [1, 2, 3] - shape: [2] +arr: !core/ndarray-1.0.0 + data: [1, 2, 3] + shape: [2] """ buff = helpers.yaml_to_asdf(content) @@ -802,34 +873,11 @@ def test_inline_shape_mismatch(): pass -@pytest.mark.xfail(reason="NDArrays with dtype=object are not currently supported") -def test_simple_object_array(tmpdir): - # See https://github.com/asdf-format/asdf/issues/383 for feature - # request - dictdata = np.empty((3, 3), dtype=object) - for i, _ in enumerate(dictdata.flat): - dictdata.flat[i] = {"foo": i * 42, "bar": i**2} - - helpers.assert_roundtrip_tree({"bizbaz": dictdata}, tmpdir) - - -@pytest.mark.xfail(reason="NDArrays with dtype=object are not currently supported") -def test_tagged_object_array(tmpdir): - # See https://github.com/asdf-format/asdf/issues/383 for feature - # request - quantity = pytest.importorskip("astropy.units.quantity") - - objdata = np.empty((3, 3), dtype=object) - for i, _ in enumerate(objdata.flat): - objdata.flat[i] = quantity.Quantity(i, "angstrom") - - helpers.assert_roundtrip_tree({"bizbaz": objdata}, tmpdir) - - def test_broadcasted_array(tmpdir): attrs = np.broadcast_arrays(np.array([10, 20]), np.array(10), np.array(10)) tree = {"one": attrs[1]} # , 'two': attrs[1], 'three': attrs[2]} - helpers.assert_roundtrip_tree(tree, tmpdir) + with roundtrip(tree) as af: + assert_array_equal(tree["one"], af["one"]) def test_broadcasted_offset_array(tmpdir): @@ -837,30 +885,30 @@ def test_broadcasted_offset_array(tmpdir): offset = base[5:] broadcasted = np.broadcast_to(offset, (4, 5)) tree = {"broadcasted": broadcasted} - helpers.assert_roundtrip_tree(tree, tmpdir) + with roundtrip(tree) as af: + assert_array_equal(tree["broadcasted"], af["broadcasted"]) def test_non_contiguous_base_array(tmpdir): base = np.arange(60).reshape(5, 4, 3).transpose(2, 0, 1) * 1 contiguous = base.transpose(1, 2, 0) 
tree = {"contiguous": contiguous} - helpers.assert_roundtrip_tree(tree, tmpdir) + with roundtrip(tree) as af: + assert_array_equal(tree["contiguous"], af["contiguous"]) def test_fortran_order(tmpdir): array = np.array([[11, 12, 13], [21, 22, 23]], order="F", dtype=np.int64) tree = {"data": array} - def check_f_order(t): - assert t["data"].flags.fortran - assert np.all(np.isclose(array, t["data"])) + with roundtrip(tree) as af: + assert af["data"].flags.fortran + assert np.all(np.isclose(array, af["data"])) - def check_raw_yaml(content): + with roundtrip(tree, raw=True) as content: tree = yaml.safe_load(re.sub(rb"!core/\S+", b"", content)) assert tree["data"]["strides"] == [8, 16] - helpers.assert_roundtrip_tree(tree, tmpdir, asdf_check_func=check_f_order, raw_yaml_check_func=check_raw_yaml) - def test_memmap_write(tmpdir): tmpfile = str(tmpdir.join("data.asdf")) @@ -968,3 +1016,15 @@ def test_problematic_class_attributes(tmp_path): with pytest.raises(AttributeError, match=r".* object has no attribute 'version'"): af["arr"].version + + +def test_shape_does_not_load_array(tmp_path): + file_path = tmp_path / "test.asdf" + with asdf.AsdfFile() as af: + af["arr"] = np.arange(100) + af.write_to(file_path) + + with asdf.open(file_path, lazy_load=True) as af: + assert af["arr"]._array is None + assert af["arr"].shape == (100,) + assert af["arr"]._array is None diff --git a/asdf/_tests/test_api.py b/asdf/_tests/test_api.py index c8a49f8f9..9cf562870 100644 --- a/asdf/_tests/test_api.py +++ b/asdf/_tests/test_api.py @@ -364,8 +364,8 @@ def test_array_inline_threshold(array_inline_threshold, inline_blocks, internal_ with asdf.AsdfFile(tree) as af: af.write_to(file_path) - assert len(list(af._blocks.inline_blocks)) == inline_blocks - assert len(list(af._blocks.internal_blocks)) == internal_blocks + with asdf.open(file_path) as af: + assert len(af._blocks.blocks) == internal_blocks @pytest.mark.parametrize( @@ -388,8 +388,7 @@ def test_array_inline_threshold_masked_array(array_inline_threshold, inline_bloc with asdf.AsdfFile(tree) as af: af.write_to(file_path) with asdf.open(file_path) as af: - assert len(list(af._blocks.inline_blocks)) == inline_blocks - assert len(list(af._blocks.internal_blocks)) == internal_blocks + assert len(af._blocks.blocks) == internal_blocks @pytest.mark.parametrize( @@ -410,8 +409,8 @@ def test_array_inline_threshold_string_array(array_inline_threshold, inline_bloc with asdf.AsdfFile(tree) as af: af.write_to(file_path) - assert len(list(af._blocks.inline_blocks)) == inline_blocks - assert len(list(af._blocks.internal_blocks)) == internal_blocks + with asdf.open(file_path) as af: + assert len(af._blocks.blocks) == internal_blocks def test_resolver_deprecations(): @@ -458,7 +457,7 @@ def test_array_access_after_file_close(tmp_path): # the file has been closed: with asdf.open(path) as af: tree = af.tree - with pytest.raises(OSError, match=r"ASDF file has already been closed"): + with pytest.raises(OSError, match=r"ASDF file has already been closed. 
Can not get the data."): tree["data"][0] # With memory mapping disabled and copying arrays enabled, @@ -489,23 +488,68 @@ def test_asdf_standard_version_tag_selection(): assert b"!core/asdf-1.1.0" not in content buff.seek(0) - af.write_to(buff, version="1.2.0") + af.write_to(buff, version="1.2.0") # asdf-standard 1.2 uses asdf-object 1.1 tag buff.seek(0) content = buff.read() assert b"!core/asdf-1.0.0" not in content assert b"!core/asdf-1.1.0" in content -def test_write_to_no_tree_modification(tmp_path): - fn = tmp_path / "test.asdf" +def test_update_asdf_standard_version_tag_selection(): + buff = io.BytesIO() + + af = asdf.AsdfFile() + af.write_to(buff, version="1.0.0") + + buff.seek(0) + with asdf.open(buff, mode="rw") as af: + af.update(version="1.2.0") # asdf-standard 1.2 uses asdf-object 1.1 tag + buff.seek(0) + content = buff.read() + assert b"!core/asdf-1.1.0" in content + assert b"!core/asdf-1.0.0" not in content + + +@pytest.mark.parametrize("valid_filename", [True, False], ids=["valid_filename", "invalid_filename"]) +def test_write_to_no_tree_modification(tmp_path, valid_filename): + if valid_filename: + fn = tmp_path / "test.asdf" + else: + fn = "invalid/missing.asdf" fn2 = tmp_path / "test2.asdf" tree = {"foo": None} af = asdf.AsdfFile(tree.copy()) - af.write_to(fn) + try: + af.write_to(fn) + except Exception: + if valid_filename: + raise assert tree == af.tree + if not valid_filename: + return with asdf.open(fn) as af: af["history"]["extensions"][0]["software"]["version"] = "0.0.0.dev+abcdefg" af["asdf_library"]["author"] = "foo" tree = copy.deepcopy(af.tree) af.write_to(fn2) assert af.tree == tree + + +@pytest.mark.parametrize("valid_filename", [True, False], ids=["valid_filename", "invalid_filename"]) +def test_write_to_no_version_modification(tmp_path, valid_filename): + if valid_filename: + fn = tmp_path / "test.asdf" + else: + fn = "invalid/missing.asdf" + tree = {"foo": None} + af = asdf.AsdfFile(tree.copy(), version="1.0.0") + try: + af.write_to(fn, version="1.1.0") + except Exception: + if valid_filename: + raise + assert af.version_string == "1.0.0" + if not valid_filename: + return + with asdf.open(fn) as af: + assert af.version_string == "1.1.0" diff --git a/asdf/_tests/test_array_blocks.py b/asdf/_tests/test_array_blocks.py index b95be394b..f2c2b626b 100644 --- a/asdf/_tests/test_array_blocks.py +++ b/asdf/_tests/test_array_blocks.py @@ -8,7 +8,9 @@ from numpy.testing import assert_array_equal import asdf -from asdf import block, constants, generic_io +from asdf import constants, generic_io +from asdf._block import io as bio +from asdf.exceptions import AsdfBlockIndexWarning RNG = np.random.default_rng(6) @@ -55,15 +57,6 @@ def test_invalid_array_storage(): with pytest.raises(ValueError, match=r"array_storage must be one of.*"): ff.set_array_storage(my_array, "foo") - b = block.Block() - b._array_storage = "foo" - - with pytest.raises(ValueError, match=r"Unknown array storage type foo"): - ff._blocks.add(b) - - with pytest.raises(ValueError, match=r"Unknown array storage type foo"): - ff._blocks.remove(b) - def test_transfer_array_sources(tmp_path): tmp_path = str(tmp_path) @@ -115,7 +108,9 @@ def test_pad_blocks(tmp_path): assert_array_equal(ff.tree["my_array2"], my_array2) -def test_update_expand_tree(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_expand_tree(tmp_path, lazy_load, copy_arrays): tmp_path = str(tmp_path) testpath = os.path.join(tmp_path, "test.asdf") @@ 
-126,17 +121,15 @@ def test_update_expand_tree(tmp_path): ff = asdf.AsdfFile(tree) ff.set_array_storage(tree["arrays"][2], "inline") - assert len(list(ff._blocks.inline_blocks)) == 1 ff.write_to(testpath, pad_blocks=True) - with asdf.open(testpath, mode="rw") as ff: + with asdf.open(testpath, lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: + assert len(list(ff._blocks.blocks)) == 2 assert_array_equal(ff.tree["arrays"][0], my_array) - orig_offset = ff._blocks[ff.tree["arrays"][0]].offset ff.tree["extra"] = [0] * 6000 ff.update() with asdf.open(testpath) as ff: - assert orig_offset <= ff._blocks[ff.tree["arrays"][0]].offset - assert ff._blocks[ff.tree["arrays"][2]].array_storage == "inline" + assert ff.get_array_storage(ff.tree["arrays"][2]) == "inline" assert_array_equal(ff.tree["arrays"][0], my_array) assert_array_equal(ff.tree["arrays"][1], my_array2) @@ -144,19 +137,19 @@ def test_update_expand_tree(tmp_path): ff = asdf.AsdfFile(tree) ff.set_array_storage(tree["arrays"][2], "inline") ff.write_to(os.path.join(tmp_path, "test2.asdf"), pad_blocks=True) - with asdf.open(os.path.join(tmp_path, "test2.asdf"), mode="rw") as ff: - orig_offset = ff._blocks[ff.tree["arrays"][0]].offset + with asdf.open(os.path.join(tmp_path, "test2.asdf"), lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: ff.tree["extra"] = [0] * 2 ff.update() with asdf.open(os.path.join(tmp_path, "test2.asdf")) as ff: - assert orig_offset == ff._blocks[ff.tree["arrays"][0]].offset - assert ff._blocks[ff.tree["arrays"][2]].array_storage == "inline" + assert ff.get_array_storage(ff.tree["arrays"][2]) == "inline" assert_array_equal(ff.tree["arrays"][0], my_array) assert_array_equal(ff.tree["arrays"][1], my_array2) -def test_update_all_external(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_all_external(tmp_path, lazy_load, copy_arrays): fn = tmp_path / "test.asdf" my_array = np.arange(64) * 1 @@ -169,18 +162,40 @@ def test_update_all_external(tmp_path): with asdf.config.config_context() as cfg: cfg.array_inline_threshold = 10 cfg.all_array_storage = "external" - with asdf.open(fn, mode="rw") as af: + with asdf.open(fn, lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as af: af.update() assert "test0000.asdf" in os.listdir(tmp_path) assert "test0001.asdf" in os.listdir(tmp_path) +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_some_external(tmp_path, lazy_load, copy_arrays): + fn = tmp_path / "test.asdf" + + my_array = np.arange(64) * 1 + my_array2 = np.arange(64) * 2 + tree = {"arrays": [my_array, my_array2]} + + af = asdf.AsdfFile(tree) + af.write_to(fn) + + with asdf.open(fn, lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as af: + af.set_array_storage(af["arrays"][0], "external") + af.update() + + assert "test0000.asdf" in os.listdir(tmp_path) + assert "test0001.asdf" not in os.listdir(tmp_path) + + def _get_update_tree(): return {"arrays": [np.arange(64) * 1, np.arange(64) * 2, np.arange(64) * 3]} -def test_update_delete_first_array(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_delete_first_array(tmp_path, lazy_load, copy_arrays): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -192,7 +207,7 @@ def test_update_delete_first_array(tmp_path): original_size = os.stat(path).st_size - with 
asdf.open(os.path.join(tmp_path, "test.asdf"), mode="rw") as ff: + with asdf.open(os.path.join(tmp_path, "test.asdf"), lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: del ff.tree["arrays"][0] ff.update() @@ -203,7 +218,9 @@ def test_update_delete_first_array(tmp_path): assert_array_equal(ff.tree["arrays"][1], tree["arrays"][2]) -def test_update_delete_last_array(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_delete_last_array(tmp_path, lazy_load, copy_arrays): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -215,7 +232,7 @@ def test_update_delete_last_array(tmp_path): original_size = os.stat(path).st_size - with asdf.open(os.path.join(tmp_path, "test.asdf"), mode="rw") as ff: + with asdf.open(os.path.join(tmp_path, "test.asdf"), lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: del ff.tree["arrays"][-1] ff.update() @@ -226,7 +243,9 @@ def test_update_delete_last_array(tmp_path): assert_array_equal(ff.tree["arrays"][1], tree["arrays"][1]) -def test_update_delete_middle_array(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_delete_middle_array(tmp_path, lazy_load, copy_arrays): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -238,22 +257,22 @@ def test_update_delete_middle_array(tmp_path): original_size = os.stat(path).st_size - with asdf.open(os.path.join(tmp_path, "test.asdf"), mode="rw") as ff: + with asdf.open(os.path.join(tmp_path, "test.asdf"), lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: del ff.tree["arrays"][1] ff.update() - assert len(ff._blocks._internal_blocks) == 2 + assert len(ff._blocks.blocks) == 2 assert os.stat(path).st_size <= original_size with asdf.open(os.path.join(tmp_path, "test.asdf")) as ff: assert len(ff.tree["arrays"]) == 2 - assert ff.tree["arrays"][0]._source == 0 - assert ff.tree["arrays"][1]._source == 1 assert_array_equal(ff.tree["arrays"][0], tree["arrays"][0]) assert_array_equal(ff.tree["arrays"][1], tree["arrays"][2]) -def test_update_replace_first_array(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_replace_first_array(tmp_path, lazy_load, copy_arrays): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -265,7 +284,7 @@ def test_update_replace_first_array(tmp_path): original_size = os.stat(path).st_size - with asdf.open(os.path.join(tmp_path, "test.asdf"), mode="rw") as ff: + with asdf.open(os.path.join(tmp_path, "test.asdf"), lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: ff.tree["arrays"][0] = np.arange(32) ff.update() @@ -277,7 +296,9 @@ def test_update_replace_first_array(tmp_path): assert_array_equal(ff.tree["arrays"][2], tree["arrays"][2]) -def test_update_replace_last_array(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_replace_last_array(tmp_path, lazy_load, copy_arrays): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -289,7 +310,7 @@ def test_update_replace_last_array(tmp_path): original_size = os.stat(path).st_size - with asdf.open(os.path.join(tmp_path, "test.asdf"), mode="rw") as ff: + with asdf.open(os.path.join(tmp_path, "test.asdf"), lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: ff.tree["arrays"][2] = np.arange(32) 
ff.update() @@ -301,7 +322,9 @@ def test_update_replace_last_array(tmp_path): assert_array_equal(ff.tree["arrays"][2], np.arange(32)) -def test_update_replace_middle_array(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_replace_middle_array(tmp_path, lazy_load, copy_arrays): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -313,7 +336,7 @@ def test_update_replace_middle_array(tmp_path): original_size = os.stat(path).st_size - with asdf.open(os.path.join(tmp_path, "test.asdf"), mode="rw") as ff: + with asdf.open(os.path.join(tmp_path, "test.asdf"), lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: ff.tree["arrays"][1] = np.arange(32) ff.update() @@ -325,7 +348,9 @@ def test_update_replace_middle_array(tmp_path): assert_array_equal(ff.tree["arrays"][2], tree["arrays"][2]) -def test_update_add_array(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_add_array(tmp_path, lazy_load, copy_arrays): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -335,7 +360,7 @@ def test_update_add_array(tmp_path): ff = asdf.AsdfFile(tree) ff.write_to(path, pad_blocks=True) - with asdf.open(os.path.join(tmp_path, "test.asdf"), mode="rw") as ff: + with asdf.open(os.path.join(tmp_path, "test.asdf"), lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: ff.tree["arrays"].append(np.arange(32)) ff.update() @@ -346,7 +371,9 @@ def test_update_add_array(tmp_path): assert_array_equal(ff.tree["arrays"][3], np.arange(32)) -def test_update_add_array_at_end(tmp_path): +@pytest.mark.parametrize("lazy_load", [True, False]) +@pytest.mark.parametrize("copy_arrays", [True, False]) +def test_update_add_array_at_end(tmp_path, lazy_load, copy_arrays): tmp_path = str(tmp_path) path = os.path.join(tmp_path, "test.asdf") @@ -358,10 +385,10 @@ def test_update_add_array_at_end(tmp_path): original_size = os.stat(path).st_size - with asdf.open(os.path.join(tmp_path, "test.asdf"), mode="rw") as ff: + with asdf.open(os.path.join(tmp_path, "test.asdf"), lazy_load=lazy_load, copy_arrays=copy_arrays, mode="rw") as ff: ff.tree["arrays"].append(np.arange(65536, dtype="= original_size @@ -372,7 +399,9 @@ def test_update_add_array_at_end(tmp_path): assert_array_equal(ff.tree["arrays"][3], np.arange(65536, dtype="1.*"): + op_ctx.get_block_data_callback(1) + + # unless we use a key + key = op_ctx.generate_block_key() + cb1 = op_ctx.get_block_data_callback(1, key) + assert op_ctx.get_block_data_callback(1, key) is cb1 + + # we don't know the order of blocks, so find which block + # was used for which array by looking at the size + d0 = cb0() + d1 = cb1() + if d0.size == arr1.size: + arr0, arr1 = arr1, arr0 + np.testing.assert_array_equal(d0, arr0) + np.testing.assert_array_equal(d1, arr1) + + for access in (BlockAccess.NONE, BlockAccess.WRITE): + op_ctx = af._create_serialization_context(access) + with pytest.raises(NotImplementedError, match="abstract"): + op_ctx.get_block_data_callback(0) + + +def test_find_available_block_index(): + af = asdf.AsdfFile() + context = af._create_serialization_context() + + def cb(): + return np.arange(3, dtype="uint8") + + with pytest.raises(NotImplementedError, match="abstract"): + context.find_available_block_index(cb) + + class Foo: + pass + + op_ctx = af._create_serialization_context(BlockAccess.WRITE) + op_ctx.assign_object(Foo()) + assert 
op_ctx.find_available_block_index(cb) == 0 + + for access in (BlockAccess.NONE, BlockAccess.READ): + op_ctx = af._create_serialization_context(access) + with pytest.raises(NotImplementedError, match="abstract"): + op_ctx.find_available_block_index(cb) + + +def test_generate_block_key(): + af = asdf.AsdfFile() + context = af._create_serialization_context() + + with pytest.raises(NotImplementedError, match="abstract"): + context.generate_block_key() + + class Foo: + pass + + obj = Foo() + op_ctx = af._create_serialization_context(BlockAccess.WRITE) + op_ctx.assign_object(obj) + key = op_ctx.generate_block_key() + assert key._is_valid() + assert key._matches_object(obj) + + obj = Foo() + op_ctx = af._create_serialization_context(BlockAccess.READ) + # because this test generates but does not assign a key + # it should raise an exception + with pytest.raises(OSError, match=r"Converter generated a key.*"): + key = op_ctx.generate_block_key() + # the key does not yet have an assigned object + assert not key._is_valid() + op_ctx.assign_blocks() diff --git a/asdf/_tests/test_stream.py b/asdf/_tests/test_stream.py index 9b54373c3..3b66cec8c 100644 --- a/asdf/_tests/test_stream.py +++ b/asdf/_tests/test_stream.py @@ -6,13 +6,13 @@ from numpy.testing import assert_array_equal import asdf -from asdf import generic_io, stream +from asdf import Stream, generic_io def test_stream(): buff = io.BytesIO() - tree = {"stream": stream.Stream([6, 2], np.float64)} + tree = {"stream": Stream([6, 2], np.float64)} ff = asdf.AsdfFile(tree) ff.write_to(buff) @@ -22,7 +22,7 @@ def test_stream(): buff.seek(0) with asdf.open(buff) as ff: - assert len(ff._blocks) == 1 + assert len(ff._blocks.blocks) == 1 assert ff.tree["stream"].shape == (100, 6, 2) for i, row in enumerate(ff.tree["stream"]): assert np.all(row == i) @@ -35,7 +35,7 @@ def test_stream_write_nothing(): buff = io.BytesIO() - tree = {"stream": stream.Stream([6, 2], np.float64)} + tree = {"stream": Stream([6, 2], np.float64)} ff = asdf.AsdfFile(tree) ff.write_to(buff) @@ -43,7 +43,7 @@ def test_stream_write_nothing(): buff.seek(0) with asdf.open(buff) as ff: - assert len(ff._blocks) == 1 + assert len(ff._blocks.blocks) == 1 assert ff.tree["stream"].shape == (0, 6, 2) @@ -54,7 +54,7 @@ def test_stream_twice(): buff = io.BytesIO() - tree = {"stream": stream.Stream([6, 2], np.uint8), "stream2": stream.Stream([12, 2], np.uint8)} + tree = {"stream": Stream([6, 2], np.uint8), "stream2": Stream([12, 2], np.uint8)} ff = asdf.AsdfFile(tree) ff.write_to(buff) @@ -64,7 +64,7 @@ def test_stream_twice(): buff.seek(0) ff = asdf.open(buff) - assert len(ff._blocks) == 1 + assert len(ff._blocks.blocks) == 1 assert ff.tree["stream"].shape == (100, 6, 2) assert ff.tree["stream2"].shape == (50, 12, 2) @@ -72,7 +72,7 @@ def test_stream_twice(): def test_stream_with_nonstream(): buff = io.BytesIO() - tree = {"nonstream": np.array([1, 2, 3, 4], np.int64), "stream": stream.Stream([6, 2], np.float64)} + tree = {"nonstream": np.array([1, 2, 3, 4], np.int64), "stream": Stream([6, 2], np.float64)} ff = asdf.AsdfFile(tree) # Since we're testing with small arrays, force this array to be stored in @@ -85,10 +85,9 @@ def test_stream_with_nonstream(): buff.seek(0) with asdf.open(buff) as ff: - assert len(ff._blocks) == 1 + assert len(ff._blocks.blocks) == 2 assert_array_equal(ff.tree["nonstream"], np.array([1, 2, 3, 4], np.int64)) assert ff.tree["stream"].shape == (100, 6, 2) - assert len(ff._blocks) == 2 for i, row in enumerate(ff.tree["stream"]): assert np.all(row == i) @@ -96,7 +95,7 @@ def 
test_stream_with_nonstream(): def test_stream_real_file(tmp_path): path = os.path.join(str(tmp_path), "test.asdf") - tree = {"nonstream": np.array([1, 2, 3, 4], np.int64), "stream": stream.Stream([6, 2], np.float64)} + tree = {"nonstream": np.array([1, 2, 3, 4], np.int64), "stream": Stream([6, 2], np.float64)} with open(path, "wb") as fd: ff = asdf.AsdfFile(tree) @@ -109,16 +108,15 @@ def test_stream_real_file(tmp_path): fd.write(np.array([i] * 12, np.float64).tobytes()) with asdf.open(path) as ff: - assert len(ff._blocks) == 1 + assert len(ff._blocks.blocks) == 2 assert_array_equal(ff.tree["nonstream"], np.array([1, 2, 3, 4], np.int64)) assert ff.tree["stream"].shape == (100, 6, 2) - assert len(ff._blocks) == 2 for i, row in enumerate(ff.tree["stream"]): assert np.all(row == i) def test_stream_to_stream(): - tree = {"nonstream": np.array([1, 2, 3, 4], np.int64), "stream": stream.Stream([6, 2], np.float64)} + tree = {"nonstream": np.array([1, 2, 3, 4], np.int64), "stream": Stream([6, 2], np.float64)} buff = io.BytesIO() fd = generic_io.OutputStream(buff) @@ -131,7 +129,7 @@ def test_stream_to_stream(): buff.seek(0) with asdf.open(generic_io.InputStream(buff, "r")) as ff: - assert len(ff._blocks) == 2 + assert len(ff._blocks.blocks) == 2 assert_array_equal(ff.tree["nonstream"], np.array([1, 2, 3, 4], np.int64)) assert ff.tree["stream"].shape == (100, 6, 2) for i, row in enumerate(ff.tree["stream"]): @@ -154,6 +152,7 @@ def test_array_to_stream(tmp_path): assert_array_equal(ff.tree["stream"], [1, 2, 3, 4, 5, 6, 7, 8]) buff.seek(0) ff2 = asdf.AsdfFile(ff) + ff2.set_array_storage(ff2["stream"], "streamed") ff2.write_to(buff) assert b"shape: ['*']" in buff.getvalue() @@ -180,7 +179,7 @@ def test_too_many_streams(): def test_stream_repr_and_str(): - tree = {"stream": stream.Stream([16], np.int64)} + tree = {"stream": Stream([16], np.int64)} ff = asdf.AsdfFile(tree) repr(ff.tree["stream"]) diff --git a/asdf/_tests/test_util.py b/asdf/_tests/test_util.py index 944631941..112221390 100644 --- a/asdf/_tests/test_util.py +++ b/asdf/_tests/test_util.py @@ -1,4 +1,3 @@ -import copy import io import pytest @@ -118,17 +117,3 @@ def test_minversion(): assert util.minversion(yaml, "3.1") assert util.minversion("yaml", "3.1") - - -def test_block_key(): - bk = util.BlockKey() - # make sure block key is hashable and can serve as a dictionary key - hash(bk) - d = {bk: 1} - # a new key should produce a different hash than the first - bk2 = util.BlockKey() - d[bk2] = 2 - assert len(d) == 2 - # check that equality and copying a key works - assert copy.copy(bk) == bk - assert bk != hash(bk) diff --git a/asdf/_tests/test_yaml.py b/asdf/_tests/test_yaml.py index 1faa015ef..a4f9f7e31 100644 --- a/asdf/_tests/test_yaml.py +++ b/asdf/_tests/test_yaml.py @@ -8,7 +8,7 @@ import asdf from asdf import tagged, treeutil, yamlutil -from asdf.exceptions import AsdfWarning +from asdf.exceptions import AsdfConversionWarning, AsdfWarning from . import _helpers as helpers @@ -203,8 +203,7 @@ def test_explicit_tags(): %YAML 1.1 --- ! foo: ! [1, 2, 3] -... 
- """ +...""" # Check that fully qualified explicit tags work buff = helpers.yaml_to_asdf(yaml, yaml_headers=False) @@ -294,3 +293,19 @@ def test_numpy_scalar(numpy_value, expected_value): assert abs_diff < eps, abs_diff else: assert loaded_value == expected_value + + +def test_ndarray_subclass_conversion(tmp_path): + class MyNDArray(np.ndarray): + pass + + fn = tmp_path / "test.asdf" + af = asdf.AsdfFile() + af["a"] = MyNDArray([1, 2, 3]) + with pytest.warns(AsdfConversionWarning, match=r"A ndarray subclass .*"): + af.write_to(fn) + + with asdf.config.config_context() as cfg: + cfg.convert_unknown_ndarray_subclasses = False + with pytest.raises(yaml.representer.RepresenterError, match=r".*cannot represent.*"): + af.write_to(fn) diff --git a/asdf/asdf.py b/asdf/asdf.py index 4ff592034..0ea3b45ef 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -11,7 +11,9 @@ from . import _display as display from . import _node_info as node_info from . import _version as version -from . import block, constants, generic_io, reference, schema, treeutil, util, versioning, yamlutil +from . import compression as mcompression +from . import constants, generic_io, reference, schema, treeutil, util, versioning, yamlutil +from ._block.manager import Manager as BlockManager from ._helpers import validate_version from .config import config_context, get_config from .exceptions import ( @@ -21,12 +23,27 @@ DelimiterNotFoundError, ValidationError, ) -from .extension import Extension, ExtensionProxy, SerializationContext, _legacy, get_cached_extension_manager +from .extension import Extension, ExtensionProxy, _legacy, _serialization_context, get_cached_extension_manager from .search import AsdfSearchResult from .tags.core import AsdfObject, ExtensionMetadata, HistoryEntry, Software from .util import NotSet +def __getattr__(name): + if name == "SerializationContext": + warnings.warn( + "importing SerializationContext from asdf.asdf is deprecated. " + "Please import SerializationContext from asdf.extension", + AsdfDeprecationWarning, + ) + from .extension._serialization_context import SerializationContext + + return SerializationContext + + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) + + def get_asdf_library_info(): """ Get information about asdf to include in the asdf_library entry @@ -150,9 +167,7 @@ def __init__( self._fd = None self._closed = False self._external_asdf_by_uri = {} - self._blocks = block.BlockManager(self, copy_arrays=copy_arrays, lazy_load=lazy_load) - # set the uri here so validation can generate any required external blocks - self._uri = uri + self._blocks = BlockManager(uri=uri, lazy_load=lazy_load, memmap=not copy_arrays) if tree is None: # Bypassing the tree property here, to avoid validating # an empty tree. @@ -163,7 +178,7 @@ def __init__( # of copying the file? msg = "Can not copy AsdfFile and change active extensions" raise ValueError(msg) - self._uri = tree.uri + self._blocks._uri = tree.uri # Set directly to self._tree (bypassing property), since # we can assume the other AsdfFile is already valid. self._tree = tree.tree @@ -387,7 +402,7 @@ def _process_user_extensions(self, extensions): return result - def _update_extension_history(self, serialization_context): + def _update_extension_history(self, tree, serialization_context): """ Update the extension metadata on this file's tree to reflect extensions used during serialization. 
@@ -400,20 +415,20 @@ def _update_extension_history(self, serialization_context): if serialization_context.version < versioning.NEW_HISTORY_FORMAT_MIN_VERSION: return - if "history" not in self.tree: - self.tree["history"] = {"extensions": []} + if "history" not in tree: + tree["history"] = {"extensions": []} # Support clients who are still using the old history format - elif isinstance(self.tree["history"], list): - histlist = self.tree["history"] - self.tree["history"] = {"entries": histlist, "extensions": []} + elif isinstance(tree["history"], list): + histlist = tree["history"] + tree["history"] = {"entries": histlist, "extensions": []} warnings.warn( "The ASDF history format has changed in order to " "support metadata about extensions. History entries " "should now be stored under tree['history']['entries'].", AsdfWarning, ) - elif "extensions" not in self.tree["history"]: - self.tree["history"]["extensions"] = [] + elif "extensions" not in tree["history"]: + tree["history"]["extensions"] = [] for extension in serialization_context._extensions_used: ext_name = extension.class_name @@ -425,17 +440,17 @@ def _update_extension_history(self, serialization_context): if extension.compressors: ext_meta["supported_compression"] = [comp.label.decode("ascii") for comp in extension.compressors] - for i, entry in enumerate(self.tree["history"]["extensions"]): + for i, entry in enumerate(tree["history"]["extensions"]): # Update metadata about this extension if it already exists if ( entry.extension_uri is not None and entry.extension_uri == extension.extension_uri or entry.extension_class in extension.legacy_class_names ): - self.tree["history"]["extensions"][i] = ext_meta + tree["history"]["extensions"][i] = ext_meta break else: - self.tree["history"]["extensions"].append(ext_meta) + tree["history"]["extensions"].append(ext_meta) @property def file_format_version(self): @@ -465,7 +480,7 @@ def close(self): def copy(self): return self.__class__( copy.deepcopy(self._tree), - self._uri, + self._blocks._uri, self._user_extensions, ) @@ -479,11 +494,7 @@ def uri(self): In many cases, it is automatically determined from the file handle used to read or write the file. """ - if self._uri is not None: - return self._uri - if self._fd is not None: - return self._fd._uri - return None + return self._blocks._uri @property def _tag_to_schema_resolver(self): @@ -596,14 +607,15 @@ def comments(self): return self._comments def _validate(self, tree, custom=True, reading=False): - # If we're validating on read then the tree - # is already guaranteed to be in tagged form. - tagged_tree = tree if reading else yamlutil.custom_tree_to_tagged_tree(tree, self) + with self._blocks.options_context(): + # If we're validating on read then the tree + # is already guaranteed to be in tagged form. 
+ tagged_tree = tree if reading else yamlutil.custom_tree_to_tagged_tree(tree, self) - schema.validate(tagged_tree, self, reading=reading) - # Perform secondary validation pass if requested - if custom and self._custom_schema: - schema.validate(tagged_tree, self, self._custom_schema, reading=reading) + schema.validate(tagged_tree, self, reading=reading) + # Perform secondary validation pass if requested + if custom and self._custom_schema: + schema.validate(tagged_tree, self, self._custom_schema, reading=reading) def validate(self): """ @@ -638,19 +650,6 @@ def make_reference(self, path=None): """ return reference.make_reference(self, [] if path is None else path) - @property - def blocks(self): - """ - Get the block manager associated with the `AsdfFile`. - """ - warnings.warn( - "The property AsdfFile.blocks has been deprecated and will be removed " - "in asdf-3.0. Public use of the block manager is strongly discouraged " - "as there is no stable API", - AsdfDeprecationWarning, - ) - return self._blocks - def set_array_storage(self, arr, array_storage): """ Set the block type to use for the given array data. @@ -673,8 +672,7 @@ def set_array_storage(self, arr, array_storage): - ``inline``: Store the data as YAML inline in the tree. """ - block = self._blocks[arr] - self._blocks.set_array_storage(block, array_storage) + self._blocks._set_array_storage(arr, array_storage) def get_array_storage(self, arr): """ @@ -684,7 +682,7 @@ def get_array_storage(self, arr): ---------- arr : numpy.ndarray """ - return self._blocks[arr].array_storage + return self._blocks._get_array_storage(arr) def set_array_compression(self, arr, compression, **compression_kwargs): """ @@ -712,8 +710,7 @@ def set_array_compression(self, arr, compression, **compression_kwargs): If there is no prior file, acts as None. 
""" - self._blocks[arr].output_compression = compression - self._blocks[arr].output_compression_kwargs = compression_kwargs + self._blocks._set_array_compression(arr, compression, **compression_kwargs) def get_array_compression(self, arr): """ @@ -727,11 +724,11 @@ def get_array_compression(self, arr): ------- compression : str or None """ - return self._blocks[arr].output_compression + return self._blocks._get_array_compression(arr) def get_array_compression_kwargs(self, arr): """ """ - return self._blocks[arr].output_compression_kwargs + return self._blocks._get_array_compression_kwargs(arr) @classmethod def _parse_header_line(cls, line): @@ -809,8 +806,13 @@ def _open_asdf( raise ValueError(msg) with config_context(): + # validate_checksums (unlike memmap and lazy_load) is provided + # here instead of in __init__ + self._blocks._validate_checksums = validate_checksums self._mode = fd.mode self._fd = fd + if self._fd._uri: + self._blocks._uri = self._fd._uri # The filename is currently only used for tracing warning information self._fname = self._fd._uri if self._fd._uri else "" try: @@ -834,7 +836,6 @@ def _open_asdf( self.extensions = extensions yaml_token = fd.read(4) - has_blocks = False tree = None if yaml_token == b"%YAM": reader = fd.reader_until( @@ -855,9 +856,10 @@ def _open_asdf( # now, but we don't do anything special with it until # after the blocks have been read tree = yamlutil.load_tree(reader) - has_blocks = fd.seek_until(constants.BLOCK_MAGIC, 4, include=True, exception=False) + self._blocks.read(fd) elif yaml_token == constants.BLOCK_MAGIC: - has_blocks = True + # this file has only blocks and we're already read the first block magic + self._blocks.read(fd, after_magic=True) elif yaml_token != b"": msg = "ASDF file appears to contain garbage after header." raise OSError(msg) @@ -869,10 +871,6 @@ def _open_asdf( # to select the correct tag for us. tree = yamlutil.custom_tree_to_tagged_tree(AsdfObject(), self) - if has_blocks: - self._blocks.read_internal_blocks(fd, past_magic=True, validate_checksums=validate_checksums) - self._blocks.read_block_index(fd, self) - tree = reference.find_references(tree, self) if self.version <= versioning.FILL_DEFAULTS_MAX_VERSION and get_config().legacy_fill_schema_defaults: @@ -941,11 +939,14 @@ def _write_tree(self, tree, fd, pad_blocks): fd.write(b"\n") if len(tree): - serialization_context = self._create_serialization_context() + serialization_context = self._create_serialization_context(_serialization_context.BlockAccess.WRITE) - compression_extensions = self._blocks.get_output_compression_extensions() - for ext in compression_extensions: - serialization_context._mark_extension_used(ext) + for compression in self._blocks.get_output_compressions(): + # lookup extension + compressor = mcompression._get_compressor_from_extensions(compression, return_extension=True) + if compressor is not None: + # mark it as used + serialization_context._mark_extension_used(compressor[1]) def _tree_finalizer(tagged_tree): """ @@ -954,10 +955,10 @@ def _tree_finalizer(tagged_tree): yamlutil.dump_tree to update extension metadata after the tree has been converted to tagged objects. 
""" - self._update_extension_history(serialization_context) - if "history" in self.tree: + self._update_extension_history(tree, serialization_context) + if "history" in tree: tagged_tree["history"] = yamlutil.custom_tree_to_tagged_tree( - self.tree["history"], + tree["history"], self, _serialization_context=serialization_context, ) @@ -980,26 +981,20 @@ def _pre_write(self, fd): if len(self._tree): self._run_hook("pre_write") - # This is where we'd do some more sophisticated block - # reorganization, if necessary - self._blocks.finalize(self) - - self._tree["asdf_library"] = get_asdf_library_info() - def _serial_write(self, fd, pad_blocks, include_block_index): - self._write_tree(self._tree, fd, pad_blocks) - self._blocks.write_internal_blocks_serial(fd, pad_blocks) - self._blocks.write_external_blocks(fd.uri, pad_blocks) - if include_block_index: - self._blocks.write_block_index(fd, self) - - def _random_write(self, fd, pad_blocks, include_block_index): - self._write_tree(self._tree, fd, False) - self._blocks.write_internal_blocks_random_access(fd) - self._blocks.write_external_blocks(fd.uri, pad_blocks) - if include_block_index: - self._blocks.write_block_index(fd, self) - fd.truncate() + with self._blocks.write_context(fd): + self._pre_write(fd) + try: + # prep a tree for a writing + tree = copy.copy(self._tree) + tree["asdf_library"] = get_asdf_library_info() + if "history" in self._tree: + tree["history"] = copy.deepcopy(self._tree["history"]) + + self._write_tree(tree, fd, pad_blocks) + self._blocks.write(pad_blocks, include_block_index) + finally: + self._post_write(fd) def _post_write(self, fd): if len(self._tree): @@ -1089,69 +1084,63 @@ def update( ) raise OSError(msg) - if version is not None: - self.version = version - - if config.all_array_storage == "external": - # If the file is fully exploded, there's no benefit to - # update, so just use write_to() - self.write_to(fd) - fd.truncate() - return - if not fd.seekable(): msg = "Can not update, since associated file is not seekable" raise OSError(msg) - self._blocks.finish_reading_internal_blocks() + if version is not None: + self.version = version # flush all pending memmap writes if fd.can_memmap(): fd.flush_memmap() - self._pre_write(fd) + def rewrite(): + self._fd.seek(0) + self._serial_write(self._fd, pad_blocks, include_block_index) + if self._fd.can_memmap(): + self._fd.close_memmap() + self._fd.truncate() - try: - fd.seek(0) + # if we have no read blocks, we can just call write_to as no internal blocks are reused + if len(self._blocks.blocks) == 0: + rewrite() + return - if not self._blocks.has_blocks_with_offset(): - # If we don't have any blocks that are being reused, just - # write out in a serial fashion. - self._serial_write(fd, pad_blocks, include_block_index) - fd.truncate() - return - - # Estimate how big the tree will be on disk by writing the - # YAML out in memory. Since the block indices aren't yet - # known, we have to count the number of block references and - # add enough space to accommodate the largest block number - # possible there. - tree_serialized = io.BytesIO() - self._write_tree(self._tree, tree_serialized, pad_blocks=False) - n_internal_blocks = len(self._blocks._internal_blocks) - - serialized_tree_size = tree_serialized.tell() + constants.MAX_BLOCKS_DIGITS * n_internal_blocks - - if not block.calculate_updated_layout(self._blocks, serialized_tree_size, pad_blocks, fd.block_size): - # If we don't have any blocks that are being reused, just - # write out in a serial fashion. 
- self._serial_write(fd, pad_blocks, include_block_index) - fd.truncate() - return + # if we have all external blocks, we can just call write_to as no internal blocks are reused + if config.all_array_storage == "external": + rewrite() + return + + self._pre_write(fd) + try: + self._tree["asdf_library"] = get_asdf_library_info() + + # prepare block manager for writing + with self._blocks.write_context(self._fd, copy_options=False): + # write out tree to temporary buffer + tree_fd = generic_io.get_file(io.BytesIO(), mode="rw") + self._write_tree(self._tree, tree_fd, False) + new_tree_size = tree_fd.tell() + + # update blocks + self._blocks.update(new_tree_size, pad_blocks, include_block_index) + end_of_file = self._fd.tell() + + # now write the tree + self._fd.seek(0) + tree_fd.seek(0) + self._fd.write(tree_fd.read()) + self._fd.flush() + + # close memmap to trigger arrays to reload themselves + self._fd.seek(end_of_file) + if self._fd.can_memmap(): + self._fd.close_memmap() + self._fd.truncate() - fd.seek(0) - self._random_write(fd, pad_blocks, include_block_index) - fd.flush() finally: self._post_write(fd) - # close memmaps so they will regenerate - if fd.can_memmap(): - fd.close_memmap() - # also clean any memmapped blocks - for b in self._blocks._internal_blocks: - if b._memmapped: - b._memmapped = False - b._data = None def write_to( self, @@ -1225,77 +1214,6 @@ def write_to( Update the ASDF Standard version of this AsdfFile before writing. """ - with config_context() as config: - if all_array_storage is not NotSet: - config.all_array_storage = all_array_storage - if all_array_compression is not NotSet: - config.all_array_compression = all_array_compression - if compression_kwargs is not NotSet: - config.all_array_compression_kwargs = compression_kwargs - - used_blocks = self._blocks._find_used_blocks(self.tree, self, remove=False) - - naf = AsdfFile( - {}, - uri=self._uri, - extensions=self.extensions, - version=self.version, - ignore_version_mismatch=self._ignore_version_mismatch, - ignore_unrecognized_tag=self._ignore_unrecognized_tag, - ignore_implicit_conversion=self._ignore_implicit_conversion, - ) - naf._tree = copy.copy(self.tree) # avoid an extra validate - - # deep copy keys that will be modified during write - modified_keys = ["history", "asdf_library"] - for k in modified_keys: - if k in self.tree: - naf._tree[k] = copy.deepcopy(self.tree[k]) - - # copy over block storage and other settings - block_to_key_mapping = {v: k for k, v in self._blocks._key_to_block_mapping.items()} - # this creates blocks in the new block manager that correspond to blocks - # in the original file - for b in self._blocks.blocks: - if b not in used_blocks: - continue - if b in self._blocks._streamed_blocks and b._data is None: - # streamed blocks might not have data - # add a streamed block to naf - blk = naf._blocks.get_streamed_block() - # mark this block as used so it doesn't get removed - blk._used = True - elif b._data is not None or b._fd is not None: # this block has data - arr = b.data - blk = naf._blocks[arr] - blk._used = True - naf.set_array_storage(arr, b.array_storage) - naf.set_array_compression(arr, b.output_compression, **b.output_compression_kwargs) - else: # this block does not have data - key = block_to_key_mapping[b] - blk = naf._blocks.find_or_create_block(key) - blk._used = True - blk._data_callback = b._data_callback - naf._write_to( - fd, - all_array_storage=all_array_storage, - all_array_compression=all_array_compression, - compression_kwargs=compression_kwargs, - 
pad_blocks=pad_blocks, - include_block_index=include_block_index, - version=version, - ) - - def _write_to( - self, - fd, - all_array_storage=NotSet, - all_array_compression=NotSet, - compression_kwargs=NotSet, - pad_blocks=False, - include_block_index=True, - version=None, - ): with config_context() as config: if all_array_storage is not NotSet: config.all_array_storage = all_array_storage @@ -1305,21 +1223,15 @@ def _write_to( config.all_array_compression_kwargs = compression_kwargs if version is not None: + previous_version = self.version self.version = version - with generic_io.get_file(fd, mode="w") as fd: - # TODO: This is not ideal: we really should pass the URI through - # explicitly to wherever it is required instead of making it an - # attribute of the AsdfFile. - if self._uri is None: - self._uri = fd.uri - self._pre_write(fd) - - try: + try: + with generic_io.get_file(fd, mode="w") as fd: self._serial_write(fd, pad_blocks, include_block_index) - fd.flush() - finally: - self._post_write(fd) + finally: + if version is not None: + self.version = previous_version def find_references(self): """ @@ -1376,10 +1288,9 @@ def resolve_and_inline(self): produces something that, when saved, is a 100% valid YAML file. """ - self._blocks.finish_reading_internal_blocks() self.resolve_references() - for b in list(self._blocks.blocks): - self._blocks.set_array_storage(b, "inline") + for b in self._blocks.blocks: + self.set_array_storage(b.data, "inline") def fill_defaults(self): """ @@ -1623,8 +1534,8 @@ def _warn_tag_mismatch(self, tag, best_tag): # This function is called from within yamlutil methods to create # a context when one isn't explicitly passed in. - def _create_serialization_context(self): - return SerializationContext(self.version_string, self.extension_manager, self.uri, self._blocks) + def _create_serialization_context(self, operation=_serialization_context.BlockAccess.NONE): + return _serialization_context.create(self, operation) def open_asdf( diff --git a/asdf/block.py b/asdf/block.py deleted file mode 100644 index b9cce1dfa..000000000 --- a/asdf/block.py +++ /dev/null @@ -1,1446 +0,0 @@ -import copy -import hashlib -import io -import os -import re -import struct -import weakref -from collections import namedtuple - -import numpy as np -import yaml - -from . import compression as mcompression -from . import constants, generic_io, treeutil, util, yamlutil -from .config import get_config -from .util import patched_urllib_parse - - -class BlockManager: - """ - Manages the `Block`s associated with a ASDF file. - """ - - def __init__(self, asdffile, copy_arrays=False, lazy_load=True): - self._asdffile = weakref.ref(asdffile) - - self._internal_blocks = [] - self._external_blocks = [] - self._inline_blocks = [] - self._streamed_blocks = [] - - self._block_type_mapping = { - "internal": self._internal_blocks, - "external": self._external_blocks, - "inline": self._inline_blocks, - "streamed": self._streamed_blocks, - } - - self._data_to_block_mapping = {} - self._key_to_block_mapping = {} - self._validate_checksums = False - self._memmap = not copy_arrays - self._lazy_load = lazy_load - self._internal_blocks_mapped = False - - def __len__(self): - """ - Return the total number of blocks being managed. - - This may not include all of the blocks in an open file, since - their reading may have been deferred. Call - `finish_reading_internal_blocks` to find the positions and - header information of all blocks in the file. 
- """ - return sum(len(x) for x in self._block_type_mapping.values()) - - def add(self, block, key=None): - """ - Add an internal block to the manager. - """ - if not self._internal_blocks_mapped: - # If the block index is missing we need to locate the remaining - # blocks so that we don't accidentally add our new block - # in the middle of the list. - self.finish_reading_internal_blocks() - - self._add(block, key=key) - - def _add(self, block, key=None): - block_set = self._block_type_mapping.get(block.array_storage, None) - if block_set is not None: - if block not in block_set: - block_set.append(block) - else: - msg = f"Unknown array storage type {block.array_storage}" - raise ValueError(msg) - - if block.array_storage == "streamed" and len(self._streamed_blocks) > 1: - msg = "Can not add second streaming block" - raise ValueError(msg) - - if block._data is not None or key is not None: - if key is None: - key = id(block._data) - self._data_to_block_mapping[key] = block - else: - self._key_to_block_mapping[key] = block - - def remove(self, block): - """ - Remove a block from the manager. - """ - block_set = self._block_type_mapping.get(block.array_storage, None) - if block_set is not None: - if block in block_set: - block_set.remove(block) - for key, blk in list(self._data_to_block_mapping.items()): - if blk is block: - del self._data_to_block_mapping[key] - for key, blk in list(self._key_to_block_mapping.items()): - if blk is block: - del self._key_to_block_mapping[key] - else: - msg = f"Unknown array storage type {block.array_storage}" - raise ValueError(msg) - - def set_array_storage(self, block, array_storage): - """ - Set the array storage type of the given block. - - Parameters - ---------- - block : Block instance - - array_storage : str - Must be one of: - - - ``internal``: The default. The array data will be - stored in a binary block in the same ASDF file. - - - ``external``: Store the data in a binary block in a - separate ASDF file. - - - ``inline``: Store the data as YAML inline in the tree. - - - ``streamed``: The special streamed inline block that - appears at the end of the file. - """ - if array_storage not in ["internal", "external", "streamed", "inline"]: - msg = "array_storage must be one of 'internal', 'external', 'streamed' or 'inline'" - raise ValueError(msg) - - if block.array_storage != array_storage: - if block in self.blocks: - self.remove(block) - block._array_storage = array_storage - self.add(block) - if array_storage == "streamed": - block.output_compression = None - block.output_compression_kwargs = None - - @property - def blocks(self): - """ - An iterator over all blocks being managed. - - This may not include all of the blocks in an open file, - since their reading may have been deferred. Call - `finish_reading_internal_blocks` to find the positions and - header information of all blocks in the file. - """ - for block_set in self._block_type_mapping.values(): - yield from block_set - - @property - def internal_blocks(self): - """ - An iterator over all internal blocks being managed. - - This may not include all of the blocks in an open file, - since their reading may have been deferred. Call - `finish_reading_internal_blocks` to find the positions and - header information of all blocks in the file. - """ - for block_set in (self._internal_blocks, self._streamed_blocks): - yield from block_set - - @property - def streamed_block(self): - """ - The streamed block (always the last internal block in a file), - or `None` if a streamed block is not present. 
- """ - self.finish_reading_internal_blocks() - - if len(self._streamed_blocks): - return self._streamed_blocks[0] - - return None - - @property - def external_blocks(self): - """ - An iterator over all external blocks being managed. - """ - yield from self._external_blocks - - @property - def inline_blocks(self): - """ - An iterator over all inline blocks being managed. - """ - yield from self._inline_blocks - - @property - def memmap(self): - """ - The flag which indicates whether the arrays are memory mapped - to the underlying file. - """ - return self._memmap - - @property - def lazy_load(self): - """ - The flag which indicates whether the blocks are lazily read. - """ - return self._lazy_load - - def has_blocks_with_offset(self): - """ - Returns `True` if any of the internal blocks currently have an - offset assigned. - """ - return any(block.offset is not None for block in self.internal_blocks) - - def _new_block(self): - return Block(memmap=self.memmap, lazy_load=self.lazy_load) - - def _sort_blocks_by_offset(self): - def sorter(x): - if x.offset is None: - msg = "Block is missing offset" - raise ValueError(msg) - - return x.offset - - self._internal_blocks.sort(key=sorter) - - def _read_next_internal_block(self, fd, past_magic=False): - # This assumes the file pointer is at the beginning of the - # block, (or beginning + 4 if past_magic is True) - block = self._new_block().read(fd, past_magic=past_magic, validate_checksum=self._validate_checksums) - if block is not None: - self._add(block) - - return block - - def read_internal_blocks(self, fd, past_magic=False, validate_checksums=False): - """ - Read internal blocks present in the file. If the file is - seekable, only the first block will be read, and the reading - of all others will be lazily deferred until an the loading of - an array requests it. - - Parameters - ---------- - fd : GenericFile - The file to read from. - - past_magic : bool, optional - If `True`, the file position is immediately after the - block magic token. If `False` (default), the file - position is exactly at the beginning of the block magic - token. - - validate_checksums : bool, optional - If `True`, validate the blocks against their checksums. - - """ - self._validate_checksums = validate_checksums - - while True: - block = self._read_next_internal_block(fd, past_magic=past_magic) - if block is None: - break - past_magic = False - - # If the file handle is seekable, we only read the first - # block and defer reading the rest until later. - if fd.seekable(): - break - - def finish_reading_internal_blocks(self): - """ - Read all remaining internal blocks present in the file, if any. - This is called before updating a file, since updating requires - knowledge of all internal blocks in the file. - """ - if not self._internal_blocks: - return - for block in self._internal_blocks: - if isinstance(block, UnloadedBlock): - block.load() - - last_block = self._internal_blocks[-1] - - # Read all of the remaining blocks in the file, if any - if last_block._fd is not None and last_block._fd.seekable(): - last_block._fd.seek(last_block.end_offset) - while True: - last_block = self._read_next_internal_block(last_block._fd, False) - if last_block is None: - break - - self._internal_blocks_mapped = True - - def write_internal_blocks_serial(self, fd, pad_blocks=False): - """ - Write all blocks to disk serially. - - Parameters - ---------- - fd : generic_io.GenericFile - The file to write internal blocks to. The file position - should be after the tree. 
- """ - for block in self.internal_blocks: - if block.output_compression: - block.offset = fd.tell() - block.write(fd) - else: - if block.input_compression: - block.update_size() - padding = util.calculate_padding(block.size, pad_blocks, fd.block_size) - block.allocated = block._size + padding - block.offset = fd.tell() - block.write(fd) - fd.fast_forward(block.allocated - block._size) - - def write_internal_blocks_random_access(self, fd): - """ - Write all blocks to disk at their specified offsets. All - internal blocks must have an offset assigned at this point. - - Parameters - ---------- - fd : generic_io.GenericFile - The file to write internal blocks to. The file position - should be after the tree. - """ - self._sort_blocks_by_offset() - - iter_ = self.internal_blocks - last_block = next(iter_) - # We need to explicitly clear anything between the tree - # and the first block, otherwise there may be other block - # markers left over which will throw off block indexing. - # We don't need to do this between each block. - fd.clear(last_block.offset - fd.tell()) - - for block in iter_: - last_block.allocated = (block.offset - last_block.offset) - last_block.header_size - fd.seek(last_block.offset) - last_block.write(fd) - last_block = block - - last_block.allocated = last_block.size - fd.seek(last_block.offset) - last_block.write(fd) - - fd.truncate(last_block.end_offset) - - def write_external_blocks(self, uri, pad_blocks=False): - """ - Write all blocks to disk serially. - - Parameters - ---------- - uri : str - The base uri of the external blocks - """ - from . import asdf - - for i, block in enumerate(self.external_blocks): - if uri is None: - msg = "Can't write external blocks, since URI of main file is unknown." - raise ValueError(msg) - subfd = self.get_external_uri(uri, i) - asdffile = asdf.AsdfFile() - blk = copy.copy(block) - blk._array_storage = "internal" - asdffile._blocks.add(blk) - blk._used = True - # skip the new block manager here - asdffile._write_to(subfd, pad_blocks=pad_blocks, all_array_storage="internal") - - def write_block_index(self, fd, ctx): - """ - Write the block index. - - Parameters - ---------- - fd : GenericFile - The file to write to. The file pointer should be at the - end of the file. - """ - if len(self._internal_blocks) and not len(self._streamed_blocks): - fd.write(constants.INDEX_HEADER) - fd.write(b"\n") - offsets = [x.offset for x in self.internal_blocks] - - yaml_version = tuple(int(x) for x in ctx.version_map["YAML_VERSION"].split(".")) - - yaml.dump( - offsets, - Dumper=yamlutil._yaml_base_dumper, - stream=fd, - explicit_start=True, - explicit_end=True, - version=yaml_version, - allow_unicode=True, - encoding="utf-8", - ) - - _re_index_content = re.compile(rb"^" + constants.INDEX_HEADER + rb"\r?\n%YAML.*\.\.\.\r?\n?$") - _re_index_misc = re.compile(rb"^[\n\r\x20-\x7f]+$") - - def read_block_index(self, fd, ctx): - """ - Read the block index. - - Parameters - ---------- - fd : GenericFile - The file to read from. It must be seekable. - """ - # This reads the block index by reading backward from the end - # of the file. This tries to be as conservative as possible, - # since not reading an index isn't a deal breaker -- - # everything can still be read from the file, only slower. - # Importantly, it must remain "transactionally clean", and not - # create any blocks until we're sure the block index makes - # sense. 
- - if not fd.seekable(): - return - - if not len(self._internal_blocks): - return - - first_block = self._internal_blocks[0] - first_block_end = first_block.end_offset - - fd.seek(0, generic_io.SEEK_END) - file_size = block_end = fd.tell() - # We want to read on filesystem block boundaries. We use - # "block_end - 5" here because we need to read at least 5 - # bytes in the first block. - block_start = ((block_end - 5) // fd.block_size) * fd.block_size - buff_size = block_end - block_start - - content = b"" - - fd.seek(block_start, generic_io.SEEK_SET) - buff = fd.read(buff_size) - - # Extra '\0' bytes are allowed after the ..., mainly to - # workaround poor truncation support on Windows - buff = buff.rstrip(b"\0") - content = buff - - # We need an explicit YAML end marker, or there's no - # block index - for ending in (b"...", b"...\r\n", b"...\n"): - if content.endswith(ending): - break - else: - return - - # Read blocks in reverse order from the end of the file - while True: - # Look for the index header - idx = content.rfind(constants.INDEX_HEADER) - if idx != -1: - content = content[idx:] - index_start = block_start + idx - break - - # If the rest of it starts to look like binary - # values, bail... - if not self._re_index_misc.match(buff): - return - - if block_start <= first_block_end: - return - - block_end = block_start - block_start = max(block_end - fd.block_size, first_block_end) - - fd.seek(block_start, generic_io.SEEK_SET) - buff_size = block_end - block_start - buff = fd.read(buff_size) - content = buff + content - - yaml_content = content[content.find(b"\n") + 1 :] - - # The following call to yaml.load is safe because we're - # using pyyaml's SafeLoader. - offsets = yaml.load(yaml_content, Loader=yamlutil._yaml_base_loader) # noqa: S506 - - # Make sure the indices look sane - if not isinstance(offsets, list) or len(offsets) == 0: - return - - last_offset = 0 - for x in offsets: - if not isinstance(x, int) or x > file_size or x < 0 or x <= last_offset + Block._header.size: - return - last_offset = x - - # We always read the first block, so we can confirm that the - # first entry in the block index matches the first block - if offsets[0] != first_block.offset: - return - - if len(offsets) == 1: - # If there's only one block in the index, we've already - # loaded the first block, so just return: we have nothing - # left to do - return - - # One last sanity check: Read the last block in the index and - # make sure it makes sense. - fd.seek(offsets[-1], generic_io.SEEK_SET) - try: - block = self._new_block().read(fd) - except (ValueError, OSError): - return - - # Now see if the end of the last block leads right into the index - if block.end_offset != index_start: - return - - # It seems we're good to go, so instantiate the UnloadedBlock - # objects - for offset in offsets[1:-1]: - self._internal_blocks.append(UnloadedBlock(fd, offset, memmap=self.memmap, lazy_load=self.lazy_load)) - - # We already read the last block in the file -- no need to read it again - self._internal_blocks.append(block) - - # Record that all block locations have been mapped out (used to avoid - # unnecessary calls to finish_reading_internal_blocks later). - self._internal_blocks_mapped = True - - # Materialize the internal blocks if we are not lazy - if not self.lazy_load: - self.finish_reading_internal_blocks() - - def get_external_filename(self, filename, index): - """ - Given a main filename and an index number, return a new file - name for referencing an external block. 
- """ - filename = os.path.splitext(filename)[0] - return filename + f"{index:04d}.asdf" - - def get_external_uri(self, uri, index): - """ - Given a main URI and an index number, return a new URI for - saving an external block. - """ - if uri is None: - uri = "" - parts = list(patched_urllib_parse.urlparse(uri)) - path = parts[2] - dirname, filename = os.path.split(path) - filename = self.get_external_filename(filename, index) - path = os.path.join(dirname, filename) - parts[2] = path - return patched_urllib_parse.urlunparse(parts) - - def _find_used_blocks(self, tree, ctx, remove=True): - reserved_blocks = set() - - for node in treeutil.iter_tree(tree): - if ctx.extension_manager.handles_type(type(node)): - converter = ctx.extension_manager.get_converter_for_type(type(node)) - sctx = ctx._create_serialization_context() - tag = converter.select_tag(node, sctx) - for key in converter.reserve_blocks(node, tag): - reserved_blocks.add(self.find_or_create_block(key)) - else: - hook = ctx._type_index.get_hook_for_type("reserve_blocks", type(node), ctx.version_string) - if hook is not None: - for block in hook(node, ctx): - reserved_blocks.add(block) - - if remove: - for block in list(self.blocks): - if not getattr(block, "_used", False) and block not in reserved_blocks: - self.remove(block) - return None - for block in list(self.blocks): - if getattr(block, "_used", False): - reserved_blocks.add(block) - return reserved_blocks - - def _handle_global_block_settings(self, block): - cfg = get_config() - all_array_storage = cfg.all_array_storage - if all_array_storage: - self.set_array_storage(block, all_array_storage) - - all_array_compression = cfg.all_array_compression - all_array_compression_kwargs = cfg.all_array_compression_kwargs - # Only override block compression algorithm if it wasn't explicitly set - # by AsdfFile.set_array_compression. - if all_array_compression != "input": - block.output_compression = all_array_compression - block.output_compression_kwargs = all_array_compression_kwargs - - if all_array_storage is None: - threshold = get_config().array_inline_threshold - if threshold is not None and block.array_storage in ["internal", "inline"]: - if np.prod(block.data.shape) < threshold: - self.set_array_storage(block, "inline") - else: - self.set_array_storage(block, "internal") - - def finalize(self, ctx): - """ - At this point, we have a complete set of blocks for the file, - with no extras. - - Here, they are reindexed, and possibly reorganized. - """ - # TODO: Should this reset the state (what's external and what - # isn't) afterword? - - self._find_used_blocks(ctx.tree, ctx) - - for block in list(self.blocks): - self._handle_global_block_settings(block) - - def get_block_by_key(self, key): - if key not in self._key_to_block_mapping: - msg = f"Unknown block key {key}" - raise KeyError(msg) - return self._key_to_block_mapping[key] - - def get_block(self, source): - """ - Given a "source identifier", return a block. - - Parameters - ---------- - source : any - If an integer, refers to the index of an internal block. - If a string, is a uri to an external block. - - Returns - ------- - buffer : buffer - """ - # If an "int", it is the index of an internal block - if isinstance(source, int): - if source == -1: - if len(self._streamed_blocks): - return self._streamed_blocks[0] - # If we don't have a streamed block, fall through so - # we can read all of the blocks, ultimately arriving - # at the last one, which, if all goes well is a - # streamed block. 
- - # First, look in the blocks we've already read - elif source >= 0: - if source < len(self._internal_blocks): - return self._internal_blocks[source] - else: - msg = f"Invalid source id {source}" - raise ValueError(msg) - - # If we have a streamed block or we already know we have - # no blocks, reading any further isn't going to yield any - # new blocks. - if len(self._streamed_blocks) or len(self._internal_blocks) == 0: - msg = f"Block '{source}' not found." - raise ValueError(msg) - - # If the desired block hasn't already been read, and the - # file is seekable, and we have at least one internal - # block, then we can move the file pointer to the end of - # the last known internal block, and start looking for - # more internal blocks. This is "deferred block loading". - last_block = self._internal_blocks[-1] - - if last_block._fd is not None and last_block._fd.seekable(): - last_block._fd.seek(last_block.end_offset) - while True: - next_block = self._read_next_internal_block(last_block._fd, False) - if next_block is None: - break - if len(self._internal_blocks) - 1 == source: - return next_block - last_block = next_block - - if source == -1 and last_block.array_storage == "streamed": - return last_block - - msg = f"Block '{source}' not found." - raise ValueError(msg) - - if isinstance(source, str): - asdffile = self._asdffile().open_external(source) - block = asdffile._blocks._internal_blocks[0] - self.set_array_storage(block, "external") - - # Handle the case of inline data - elif isinstance(source, list): - block = Block(data=np.array(source), array_storage="inline") - - else: - msg = f"Unknown source '{source}'" - raise TypeError(msg) - - return block - - def get_source(self, block): - """ - Get a source identifier for a given block. - - Parameters - ---------- - block : Block - - Returns - ------- - source_id : str - May be an integer for an internal block, or a URI for an - external block. - """ - for i, internal_block in enumerate(self.internal_blocks): - if block == internal_block: - if internal_block.array_storage == "streamed": - return -1 - return i - - for i, external_block in enumerate(self.external_blocks): - if block == external_block: - if self._asdffile().uri is None: - msg = "Can't write external blocks, since URI of main file is unknown." - raise ValueError(msg) - - parts = list(patched_urllib_parse.urlparse(self._asdffile().uri)) - path = parts[2] - filename = os.path.basename(path) - return self.get_external_filename(filename, i) - - msg = "block not found." - raise ValueError(msg) - - def find_or_create_block_for_array(self, arr): - """ - For a given array, looks for an existing block containing its - underlying data. If not found, adds a new block to the block - list. Returns the index in the block list to the array. - - Parameters - ---------- - arr : numpy.ndarray - - Returns - ------- - block : Block - """ - from .tags.core import ndarray - - if isinstance(arr, ndarray.NDArrayType) and arr.block is not None and arr.block in self.blocks: - return arr.block - - base = util.get_array_base(arr) - block = self._data_to_block_mapping.get(id(base)) - if block is not None: - return block - - block = Block(base) - self.add(block) - self._handle_global_block_settings(block) - return block - - def find_or_create_block(self, key): - """ - For a given hashable key, looks for an existing block. If not - found, adds a new block to the block list. Returns the index - in the block list to the array. 
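The base-array lookup in ``find_or_create_block_for_array`` above is what lets multiple views share a single block on write. A small sketch of the behavior this preserves, using only public ``asdf`` API (the file name is arbitrary)::

    import asdf
    import numpy as np

    base = np.arange(10)
    af = asdf.AsdfFile({"full": base, "view": base[2:5]})
    # both entries are backed by the same base array, so only one block is written
    af.write_to("shared.asdf")

    with asdf.open("shared.asdf") as af2:
        assert af2["view"][0] == af2["full"][2]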
- - Parameters - ---------- - key : hashable - - Returns - ------- - block : Block - """ - block = self._key_to_block_mapping.get(key) - if block is not None: - return block - - block = Block() - self.add(block, key=key) - self._handle_global_block_settings(block) - - return block - - def get_streamed_block(self): - """ - Get the streamed block, which is always the last one. A - streamed block, on writing, does not manage data of its own, - but the user is expected to stream it to disk directly. - """ - block = self.streamed_block - if block is None: - block = Block(array_storage="streamed") - self.add(block) - return block - - def add_inline(self, array): - """ - Add an inline block for ``array`` to the block set. - """ - block = Block(array, array_storage="inline") - self.add(block) - return block - - def get_output_compressions(self): - """ - Get the list of unique compressions used on blocks. - """ - return list({b.output_compression for b in self.blocks}) - - def get_output_compression_extensions(self): - """ - Infer the compression extensions used on blocks. - Note that this is somewhat indirect and could be fooled if a new extension - for the same compression label is loaded after the compression of the block. - """ - ext = [] - for label in self.get_output_compressions(): - compressor = mcompression._get_compressor_from_extensions(label, return_extension=True) - if compressor is not None: - ext += [compressor[1]] # second item is the extension - return ext - - def __getitem__(self, arr): - return self.find_or_create_block_for_array(arr) - - def close(self): - for block in self.blocks: - block.close() - - -class Block: - """ - Represents a single block in a ASDF file. This is an - implementation detail and should not be instantiated directly. - Instead, should only be created through the `BlockManager`. - """ - - _header = util.BinaryStruct( - [ - ("flags", "I"), - ("compression", "4s"), - ("allocated_size", "Q"), - ("used_size", "Q"), - ("data_size", "Q"), - ("checksum", "16s"), - ], - ) - - def __init__(self, data=None, uri=None, array_storage="internal", memmap=True, lazy_load=True, data_callback=None): - self._data_callback = data_callback - if self._data_callback is not None and data is not None: - msg = "Block.__init__ cannot contain non-None data and a non-None data_callback" - raise ValueError(msg) - self._data = data - self._uri = uri - self._array_storage = array_storage - - self._fd = None - self._offset = None - self._input_compression = None - self._output_compression = "input" - self._output_compression_kwargs = {} - self._checksum = None - self._should_memmap = memmap - self._memmapped = False - self._lazy_load = lazy_load - - self.update_size() - self._allocated = self._size - - def __repr__(self): - return f"" - - def __len__(self): - return self._size - - @property - def offset(self): - return self._offset - - @offset.setter - def offset(self, offset): - self._offset = offset - - @property - def allocated(self): - return self._allocated - - @allocated.setter - def allocated(self, allocated): - self._allocated = allocated - - @property - def header_size(self): - return self._header.size + constants.BLOCK_HEADER_BOILERPLATE_SIZE - - @property - def data_offset(self): - return self._offset + self.header_size - - @property - def size(self): - return self._size + self.header_size - - @property - def end_offset(self): - """ - The offset of the end of the allocated space for the block, - and where the next block should begin. 
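For reference, the fixed-size header described by ``Block._header`` above works out to 48 bytes. A sketch of the equivalent ``struct`` layout, assuming the big-endian packing that ``util.BinaryStruct`` appears to use::

    import struct

    # flags (I), compression (4s), allocated_size (Q), used_size (Q),
    # data_size (Q), checksum (16s)
    HEADER_FORMAT = ">I4sQQQ16s"
    assert struct.calcsize(HEADER_FORMAT) == 48

    packed = struct.pack(HEADER_FORMAT, 0, b"zlib", 64, 64, 64, b"\0" * 16)
    assert len(packed) == 48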
- """ - return self.offset + self.header_size + self.allocated - - @property - def array_storage(self): - return self._array_storage - - @property - def input_compression(self): - """ - The compression codec used to read the block. - """ - return self._input_compression - - @input_compression.setter - def input_compression(self, compression): - self._input_compression = mcompression.validate(compression) - - @property - def output_compression(self): - """ - The compression codec used to write the block. - :return: - """ - if self._output_compression == "input": - return self._input_compression - return self._output_compression - - @output_compression.setter - def output_compression(self, compression): - self._output_compression = mcompression.validate(compression) - - @property - def output_compression_kwargs(self): - """ - The configuration options to the Compressor constructor - used to write the block. - :return: - """ - return self._output_compression_kwargs - - @output_compression_kwargs.setter - def output_compression_kwargs(self, config): - if config is None: - config = {} - self._output_compression_kwargs = config.copy() - - @property - def checksum(self): - return self._checksum - - def _set_checksum(self, checksum): - if checksum == b"\0" * 16: - self._checksum = None - else: - self._checksum = checksum - - def _calculate_checksum(self, array): - # The following line is safe because we're only using - # the MD5 as a checksum. - m = hashlib.new("md5") # noqa: S324 - m.update(array) - return m.digest() - - def validate_checksum(self): - """ - Validate the content of the block against the current checksum. - - Returns - ------- - valid : bool - `True` if the content is valid against the current - checksum or there is no current checksum. Otherwise, - `False`. - """ - if self._checksum: - checksum = self._calculate_checksum(self._flattened_data) - if checksum != self._checksum: - return False - return True - - def update_checksum(self): - """ - Update the checksum based on the current data contents. - """ - self._checksum = self._calculate_checksum(self._flattened_data) - - def update_size(self): - """ - Recalculate the on-disk size of the block. This causes any - compression steps to run. It should only be called when - updating the file in-place, otherwise the work is redundant. - """ - if self._data is not None: - data = self._flattened_data - self._data_size = data.nbytes - - if not self.output_compression: - self._size = self._data_size - else: - self._size = mcompression.get_compressed_size( - data, - self.output_compression, - config=self.output_compression_kwargs, - ) - else: - self._data_size = self._size = 0 - - def read(self, fd, past_magic=False, validate_checksum=False): - """ - Read a Block from the given Python file-like object. - - If the file is seekable and lazy_load is True, the reading - or memmapping of the actual data is postponed until an array - requests it. If the file is a stream or lazy_load is False, - the data will be read into memory immediately. - - As Block is used for reading, writing, configuring and - managing data there are circumstances where read should - not be used. For instance, if a data_callback is defined - a call to read would override the data corresponding to a - block and conflict with the use of the data_callback. To - signify this conflict, a RuntimeError is raised if read - is called on a block with a defined data_callback. 
- - Parameters - ---------- - fd : GenericFile - - past_magic : bool, optional - If `True`, the file position is immediately after the - block magic token. If `False` (default), the file - position is exactly at the beginning of the block magic - token. - - validate_checksum : bool, optional - If `True`, validate the data against the checksum, and - raise a `ValueError` if the data doesn't match. - - Raises - ------ - - RuntimeError - Read was called on a block with a defined data_callback. - - ValueError - The read file contains invalid data. - """ - if self._data_callback is not None: - msg = "read called on a Block with a data_callback" - raise RuntimeError(msg) - offset = None - if fd.seekable(): - offset = fd.tell() - - if not past_magic: - buff = fd.read(len(constants.BLOCK_MAGIC)) - if len(buff) < 4: - return None - - if buff not in (constants.BLOCK_MAGIC, constants.INDEX_HEADER[: len(buff)]): - msg = ( - "Bad magic number in block. " - "This may indicate an internal inconsistency about the " - "sizes of the blocks in the file." - ) - raise ValueError(msg) - - if buff == constants.INDEX_HEADER[: len(buff)]: - return None - - elif offset is not None: - offset -= 4 - - buff = fd.read(2) - (header_size,) = struct.unpack(b">H", buff) - if header_size < self._header.size: - msg = f"Header size must be >= {self._header.size}" - raise ValueError(msg) - - buff = fd.read(header_size) - header = self._header.unpack(buff) - - # This is used by the documentation system, but nowhere else. - self._flags = header["flags"] - self._set_checksum(header["checksum"]) - - try: - self.input_compression = header["compression"] - except ValueError: - raise # TODO: hint extension? - - if self.input_compression is None and header["used_size"] != header["data_size"]: - msg = "used_size and data_size must be equal when no compression is used." - raise ValueError(msg) - - if header["flags"] & constants.BLOCK_FLAG_STREAMED and self.input_compression is not None: - msg = "Compression set on a streamed block." - raise ValueError(msg) - - if fd.seekable(): - # If the file is seekable, we can delay reading the actual - # data until later. - self._fd = fd - self._offset = offset - self._header_size = header_size - if header["flags"] & constants.BLOCK_FLAG_STREAMED: - # Support streaming blocks - self._array_storage = "streamed" - if self._lazy_load: - fd.fast_forward(-1) - self._data_size = self._size = self._allocated = (fd.tell() - self.data_offset) + 1 - else: - self._data = fd.read_into_array(-1) - self._data_size = self._size = self._allocated = len(self._data) - else: - self._allocated = header["allocated_size"] - self._size = header["used_size"] - self._data_size = header["data_size"] - if self._lazy_load: - fd.fast_forward(self._allocated) - else: - curpos = fd.tell() - self._memmap_data() - fd.seek(curpos) - if not self._memmapped: - self._data = self._read_data(fd, self._size, self._data_size) - fd.fast_forward(self._allocated - self._size) - else: - fd.fast_forward(self._allocated) - else: - # If the file is a stream, we need to get the data now. 
- if header["flags"] & constants.BLOCK_FLAG_STREAMED: - # Support streaming blocks - self._array_storage = "streamed" - self._data = fd.read_into_array(-1) - self._data_size = self._size = self._allocated = len(self._data) - else: - self._allocated = header["allocated_size"] - self._size = header["used_size"] - self._data_size = header["data_size"] - self._data = self._read_data(fd, self._size, self._data_size) - fd.fast_forward(self._allocated - self._size) - fd.close() - - if validate_checksum and not self.validate_checksum(): - msg = f"Block at {self._offset} does not match given checksum" - raise ValueError(msg) - - return self - - def _read_data(self, fd, used_size, data_size): - """ - Read the block data from a file. - """ - if not self.input_compression: - return fd.read_into_array(used_size) - - return mcompression.decompress(fd, used_size, data_size, self.input_compression) - - def _memmap_data(self): - """ - Memory map the block data from the file. - """ - memmap = self._fd.can_memmap() and not self.input_compression - if self._should_memmap and memmap: - self._data = self._fd.memmap_array(self.data_offset, self._size) - self._memmapped = True - - @property - def _flattened_data(self): - """ - Retrieve flattened data suitable for writing. - - Returns - ------- - np.ndarray - 1D contiguous array. - """ - data = self.data - - # 'K' order flattens the array in the order that elements - # occur in memory, except axes with negative strides which - # are reversed. That is a problem for base arrays with - # negative strides and is an outstanding bug in this library. - return data.ravel(order="K") - - def write(self, fd): - """ - Write an internal block to the given Python file-like object. - """ - self._header_size = self._header.size - - if self._data_callback is not None: - self._data = self._data_callback() - data = self._flattened_data - self.update_size() - self._data = None - self._allocated = self._size - else: - data = self._flattened_data if self._data is not None else None - - flags = 0 - data_size = used_size = allocated_size = 0 - if self._array_storage == "streamed": - flags |= constants.BLOCK_FLAG_STREAMED - elif data is not None: - self._checksum = self._calculate_checksum(data) - data_size = data.nbytes - if not fd.seekable() and self.output_compression: - buff = io.BytesIO() - mcompression.compress(buff, data, self.output_compression, config=self.output_compression_kwargs) - self.allocated = self._size = buff.tell() - allocated_size = self.allocated - used_size = self._size - self.input_compression = self.output_compression - - if allocated_size < used_size: - msg = f"Block used size {used_size} larger than allocated size {allocated_size}" - raise RuntimeError(msg) - - checksum = self.checksum if self.checksum is not None else b"\x00" * 16 - - fd.write(constants.BLOCK_MAGIC) - fd.write(struct.pack(b">H", self._header_size)) - fd.write( - self._header.pack( - flags=flags, - compression=mcompression.to_compression_header(self.output_compression), - allocated_size=allocated_size, - used_size=used_size, - data_size=data_size, - checksum=checksum, - ), - ) - - if data is not None: - if self.output_compression: - if not fd.seekable(): - fd.write(buff.getvalue()) - else: - # If the file is seekable, we write the - # compressed data directly to it, then go back - # and write the resulting size in the block - # header. 
- start = fd.tell() - mcompression.compress(fd, data, self.output_compression, config=self.output_compression_kwargs) - end = fd.tell() - self.allocated = self._size = end - start - fd.seek(self.offset + 6) - self._header.update(fd, allocated_size=self.allocated, used_size=self._size) - fd.seek(end) - else: - if used_size != data_size: - msg = f"Block used size {used_size} is not equal to the data size {data_size}" - raise RuntimeError(msg) - fd.write_array(data) - - @property - def data(self): - """ - Get the data for the block, as a numpy array. - """ - if self._data is not None: - return self._data - if self._data_callback is not None: - return self._data_callback() - if self._fd.is_closed(): - msg = "ASDF file has already been closed. Can not get the data." - raise OSError(msg) - - # Be nice and reset the file position after we're done - curpos = self._fd.tell() - try: - self._memmap_data() - if not self._memmapped: - self._fd.seek(self.data_offset) - self._data = self._read_data(self._fd, self._size, self._data_size) - finally: - self._fd.seek(curpos) - return self._data - - def close(self): - self._data = None - - def generate_read_data_callback(self): - """Used in SerializationContext.get_block_data_callback""" - - def callback(): - return self.data - - return callback - - -class UnloadedBlock: - """ - Represents an indexed, but not yet loaded, internal block. All - that is known about it is its offset. It converts itself to a - full-fledged block whenever the underlying data or more detail is - requested. - """ - - def __init__(self, fd, offset, memmap=True, lazy_load=True): - self._fd = fd - self._offset = offset - self._data = None - self._uri = None - self._array_storage = "internal" - self._input_compression = None - self._output_compression = "input" - self._output_compression_kwargs = {} - self._checksum = None - self._should_memmap = memmap - self._memmapped = False - self._lazy_load = lazy_load - self._data_callback = None - - def __len__(self): - self.load() - return len(self) - - def close(self): - pass - - @property - def array_storage(self): - return "internal" - - @property - def offset(self): - return self._offset - - def __getattr__(self, attr): - self.load() - return getattr(self, attr) - - def load(self): - self._fd.seek(self._offset, generic_io.SEEK_SET) - self.__class__ = Block - self.read(self._fd) - - -def calculate_updated_layout(blocks, tree_size, pad_blocks, block_size): - """ - Calculates a block layout that will try to use as many blocks as - possible in their original locations, though at this point the - algorithm is fairly naive. The result will be stored in the - offsets of the blocks. - - Parameters - ---------- - blocks : Blocks instance - - tree_size : int - The amount of space to reserve for the tree at the beginning. - - Returns - ------- - Returns `False` if no good layout can be found and one is best off - rewriting the file serially, otherwise, returns `True`. - """ - - def unfix_block(i): - # If this algorithm gets more sophisticated we could carefully - # move memmapped blocks around without clobbering other ones. - - # TODO: Copy to a tmpfile on disk and memmap it from there. 
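``UnloadedBlock`` above relies on a self-replacing-class trick: any attribute access triggers ``load()``, which reads the real block and swaps ``__class__``. A stripped-down sketch of the pattern (the ``Lazy``/``Loaded`` names are invented for illustration)::

    class Loaded:
        def __init__(self):
            self.value = 42

    class Lazy:
        def load(self):
            # become a fully-fledged Loaded instance on first access
            self.__class__ = Loaded
            self.__init__()

        def __getattr__(self, attr):
            self.load()
            return getattr(self, attr)

    obj = Lazy()
    assert obj.value == 42
    assert isinstance(obj, Loaded)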
- entry = fixed[i] - copy = entry.block.data.copy() - entry.block.close() - entry.block._data = copy - del fixed[i] - free.append(entry.block) - - def fix_block(block, offset): - block.offset = offset - fixed.append(Entry(block.offset, block.offset + block.size, block)) - fixed.sort() - - Entry = namedtuple("Entry", ["start", "end", "block"]) - - fixed = [] - free = [] - for block in blocks._internal_blocks: - if block.offset is not None: - block.update_size() - fixed.append(Entry(block.offset, block.offset + block.size, block)) - else: - free.append(block) - - if not len(fixed): - return False - - fixed.sort() - - # Make enough room at the beginning for the tree, by popping off - # blocks at the beginning - while len(fixed) and fixed[0].start < tree_size: - unfix_block(0) - - if not len(fixed): - return False - - # This algorithm is pretty basic at this point -- it just looks - # for the first open spot big enough for the free block to fit. - while len(free): - block = free.pop() - last_end = tree_size - for entry in fixed: - if entry.start - last_end >= block.size: - fix_block(block, last_end) - break - last_end = entry.end - else: - padding = util.calculate_padding(entry.block.size, pad_blocks, block_size) - fix_block(block, last_end + padding) - - if blocks.streamed_block is not None: - padding = util.calculate_padding(fixed[-1].block.size, pad_blocks, block_size) - blocks.streamed_block.offset = fixed[-1].end + padding - - blocks._sort_blocks_by_offset() - - return True diff --git a/asdf/commands/diff.py b/asdf/commands/diff.py index 1b9d19e97..d1e2f70c1 100644 --- a/asdf/commands/diff.py +++ b/asdf/commands/diff.py @@ -31,7 +31,6 @@ import asdf from asdf.tagged import Tagged -from asdf.tags.core.ndarray import NDArrayType from asdf.util import human_list from .main import Command @@ -259,8 +258,13 @@ def compare_ndarrays(diff_ctx, array0, array1, keys): if array0.get(field) != array1.get(field): differences.append(field) - array0 = NDArrayType.from_tree(array0, diff_ctx.asdf0) - array1 = NDArrayType.from_tree(array1, diff_ctx.asdf1) + def get_flat(af, keys): + for k in keys: + af = af[k] + return af + + array0 = get_flat(diff_ctx.asdf0, keys) + array1 = get_flat(diff_ctx.asdf1, keys) if not array_equal(array0, array1): differences.append("contents") diff --git a/asdf/commands/edit.py b/asdf/commands/edit.py index 77c4c44f6..9e09d93c9 100644 --- a/asdf/commands/edit.py +++ b/asdf/commands/edit.py @@ -16,8 +16,9 @@ import yaml from asdf import constants, generic_io, schema, util +from asdf._block import io as bio +from asdf._block.exceptions import BlockIndexError from asdf.asdf import AsdfFile, open_asdf -from asdf.block import BlockManager from .main import Command @@ -130,23 +131,41 @@ def write_edited_yaml_larger(path, new_content, version): pad_length = util.calculate_padding(len(new_content), True, fd.block_size) fd.fast_forward(pad_length) + # now copy over ASDF block contents + with generic_io.get_file(path) as original_fd: - # Consume the file up to the first block, which must exist - # as a precondition to using this method. 
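The new ``get_flat`` helper added to ``asdf/commands/diff.py`` above simply walks a chain of keys and indices down a tree. For example::

    def get_flat(af, keys):
        for k in keys:
            af = af[k]
        return af

    tree = {"data": {"layers": [{"pixels": [1, 2, 3]}]}}
    assert get_flat(tree, ["data", "layers", 0, "pixels"]) == [1, 2, 3]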
- original_fd.seek_until( - constants.BLOCK_MAGIC, - len(constants.BLOCK_MAGIC), - ) - ctx = AsdfFile(version=version) - blocks = BlockManager(ctx, copy_arrays=False, lazy_load=False) - blocks.read_internal_blocks(original_fd, past_magic=True, validate_checksums=False) - blocks.finish_reading_internal_blocks() - blocks.write_internal_blocks_serial(fd) - blocks.write_block_index(fd, ctx) - blocks.close() - - # the file needs to be closed here to release all memmaps - original_fd.close() + original_fd.seek_until(constants.BLOCK_MAGIC, len(constants.BLOCK_MAGIC)) + old_first_block_offset = original_fd.tell() - len(constants.BLOCK_MAGIC) + new_first_block_offset = fd.tell() + + # check if the original file has a block index which we will need to update + # as we're moving the blocks + block_index_offset = bio.find_block_index(original_fd) + if block_index_offset is None: + block_index = None + original_fd.seek(0, generic_io.SEEK_END) + blocks_end = original_fd.tell() + else: + blocks_end = block_index_offset + try: + block_index = bio.read_block_index(original_fd, block_index_offset) + except BlockIndexError: + # the original index was invalid + block_index = None + + # copy over blocks byte-for-byte from old_first_block_offset to block_index_offset + original_fd.seek(old_first_block_offset) + block_size = min(fd.block_size, original_fd.block_size) + n_bytes = blocks_end - old_first_block_offset + for offset in range(0, n_bytes, block_size): + this_size = min(block_size, n_bytes - offset) + fd.write(original_fd.read(this_size)) + + # update index + if block_index is not None: + offset = new_first_block_offset - old_first_block_offset + updated_block_index = [i + offset for i in block_index] + bio.write_block_index(fd, updated_block_index) # Swap in the new version of the file atomically: shutil.copy(temp_file.name, path) diff --git a/asdf/compression.py b/asdf/compression.py index 8c0e2741d..89076e313 100644 --- a/asdf/compression.py +++ b/asdf/compression.py @@ -227,7 +227,7 @@ def to_compression_header(compression): header. """ if not compression: - return b"" + return b"\0\0\0\0" if isinstance(compression, str): return compression.encode("ascii") diff --git a/asdf/config.py b/asdf/config.py index cb78d0642..9789c3913 100644 --- a/asdf/config.py +++ b/asdf/config.py @@ -23,6 +23,7 @@ DEFAULT_ALL_ARRAY_STORAGE = None DEFAULT_ALL_ARRAY_COMPRESSION = "input" DEFAULT_ALL_ARRAY_COMPRESSION_KWARGS = None +DEFAULT_CONVERT_UNKNOWN_NDARRAY_SUBCLASSES = True class AsdfConfig: @@ -44,6 +45,7 @@ def __init__(self): self._all_array_storage = DEFAULT_ALL_ARRAY_STORAGE self._all_array_compression = DEFAULT_ALL_ARRAY_COMPRESSION self._all_array_compression_kwargs = DEFAULT_ALL_ARRAY_COMPRESSION_KWARGS + self._convert_unknown_ndarray_subclasses = DEFAULT_CONVERT_UNKNOWN_NDARRAY_SUBCLASSES self._lock = threading.RLock() @@ -413,6 +415,30 @@ def validate_on_read(self, value): """ self._validate_on_read = value + @property + def convert_unknown_ndarray_subclasses(self): + """ + Get configuration that controls if ndarray subclasses + (subclasses that aren't otherwise handled by a specific + converter) are serialized as ndarray. If `True`, instances + of these subclasses will appear in ASDF files as ndarrays + and when loaded, will load as ndarrays. + + Note that these conversions will result in an + AsdfConversionWarning being issued as this support for + converting subclasses will be removed in a future version + of ASDF. 
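A usage sketch for the new configuration flag described above (the attribute name comes straight from the diff; everything else is illustrative)::

    import asdf

    # disable the transitional ndarray-subclass conversion globally...
    asdf.get_config().convert_unknown_ndarray_subclasses = False

    # ...or only within a scope, using the existing config_context helper
    with asdf.config_context() as cfg:
        cfg.convert_unknown_ndarray_subclasses = True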
+ + Returns + ------- + bool + """ + return self._convert_unknown_ndarray_subclasses + + @convert_unknown_ndarray_subclasses.setter + def convert_unknown_ndarray_subclasses(self, value): + self._convert_unknown_ndarray_subclasses = value + def __repr__(self): return ( " 0: + result["offset"] = offset + + if strides is not None: + result["strides"] = list(strides) + + if isinstance(data, ma.MaskedArray) and np.any(data.mask): + if options.storage_type == "inline": + ctx._blocks._set_array_storage(data.mask, "inline") + + result["mask"] = data.mask + + return result + + def from_yaml_tree(self, node, tag, ctx): + import sys + import weakref + + from asdf.tags.core import NDArrayType + from asdf.tags.core.ndarray import asdf_datatype_to_numpy_dtype + + if isinstance(node, list): + instance = NDArrayType(node, None, None, None, None, None, None) + ctx._blocks._set_array_storage(instance, "inline") + return instance + + if isinstance(node, dict): + shape = node.get("shape", None) + if "source" in node and "data" in node: + msg = "Both source and data may not be provided at the same time" + raise ValueError(msg) + if "source" in node: + source = node["source"] + byteorder = node["byteorder"] + else: + source = node["data"] + byteorder = sys.byteorder + dtype = asdf_datatype_to_numpy_dtype(node["datatype"], byteorder) if "datatype" in node else None + offset = node.get("offset", 0) + strides = node.get("strides", None) + mask = node.get("mask", None) + + if isinstance(source, int): + # internal block + data_callback = ctx.get_block_data_callback(source) + instance = NDArrayType(source, shape, dtype, offset, strides, "A", mask, data_callback) + elif isinstance(source, str): + # external + def data_callback(_attr=None, _ref=weakref.ref(ctx._blocks)): + blks = _ref() + if blks is None: + msg = "Failed to resolve reference to AsdfFile to read external block" + raise OSError(msg) + array = blks._load_external(source) + blks._set_array_storage(array, "external") + return array + + instance = NDArrayType(source, shape, dtype, offset, strides, "A", mask, data_callback) + else: + # inline + instance = NDArrayType(source, shape, dtype, offset, strides, "A", mask) + ctx._blocks._set_array_storage(instance, "inline") + + if not ctx._blocks._lazy_load: + instance._make_array() + return instance + + msg = "Invalid ndarray description." 
+ raise TypeError(msg) diff --git a/asdf/core/_converters/reference.py b/asdf/core/_converters/reference.py index 316b24ffd..eeedaec25 100644 --- a/asdf/core/_converters/reference.py +++ b/asdf/core/_converters/reference.py @@ -8,7 +8,12 @@ class ReferenceConverter(Converter): def to_yaml_tree(self, obj, tag, ctx): from asdf.generic_io import relative_uri - uri = relative_uri(ctx.url, obj._uri) if ctx.url is not None else obj._uri + base_uri = None + if ctx._blocks._write_fd is not None and ctx._blocks._write_fd.uri is not None: + base_uri = ctx._blocks._write_fd.uri + elif ctx.url is not None: + base_uri = ctx.url + uri = relative_uri(base_uri, obj._uri) if base_uri is not None else obj._uri return {"$ref": uri} def from_yaml_tree(self, node, tag, ctx): diff --git a/asdf/core/_extensions.py b/asdf/core/_extensions.py index 685994e27..fb2a9d982 100644 --- a/asdf/core/_extensions.py +++ b/asdf/core/_extensions.py @@ -3,6 +3,7 @@ from ._converters.complex import ComplexConverter from ._converters.constant import ConstantConverter from ._converters.external_reference import ExternalArrayReferenceConverter +from ._converters.ndarray import NDArrayConverter from ._converters.reference import ReferenceConverter from ._converters.tree import ( AsdfObjectConverter, @@ -23,6 +24,7 @@ SoftwareConverter(), SubclassMetadataConverter(), ReferenceConverter(), + NDArrayConverter(), ] diff --git a/asdf/exceptions.py b/asdf/exceptions.py index cdccc9b1d..752a99b15 100644 --- a/asdf/exceptions.py +++ b/asdf/exceptions.py @@ -28,6 +28,12 @@ class AsdfConversionWarning(AsdfWarning): """ +class AsdfBlockIndexWarning(AsdfWarning): + """ + Warning class to indicate that a file was read with an invalid block index + """ + + class DelimiterNotFoundError(ValueError): """ Indicates that a delimiter was not found when reading or diff --git a/asdf/extension/__init__.py b/asdf/extension/__init__.py index 909094f8d..b51911caf 100644 --- a/asdf/extension/__init__.py +++ b/asdf/extension/__init__.py @@ -15,15 +15,15 @@ __all__ = [ # New API + "Compressor", + "Converter", + "ConverterProxy", "Extension", + "ExtensionManager", "ExtensionProxy", "ManifestExtension", - "ExtensionManager", - "get_cached_extension_manager", + "SerializationContext", "TagDefinition", - "Converter", - "ConverterProxy", - "Compressor", "Validator", - "SerializationContext", + "get_cached_extension_manager", ] diff --git a/asdf/extension/_converter.py b/asdf/extension/_converter.py index 1966df0a6..12546d87c 100644 --- a/asdf/extension/_converter.py +++ b/asdf/extension/_converter.py @@ -152,29 +152,6 @@ def from_yaml_tree(self, node, tag, ctx): or a generator that yields such an instance. """ - def reserve_blocks(self, obj, tag): - """ - Reserve any number of blocks in which data (ndarrays) can be - stored. - - Parameters - ---------- - obj : object - Instance of a custom type to be serialized. Guaranteed to - be an instance of one of the types listed in the `types` - property. - tag : str - The tag identifying the YAML type that ``obj`` should be - converted into. Selected by a call to this converter's - select_tag method. 
- - Returns - ------- - keys : list of unique hashable keys - These keys will be used to reserve blocks for later use - """ - return [] - class ConverterProxy(Converter): """ @@ -308,31 +285,6 @@ def from_yaml_tree(self, node, tag, ctx): """ return self._delegate.from_yaml_tree(node, tag, ctx) - def reserve_blocks(self, obj, tag): - """ - Reserve blocks to be used during conversion of this object - - Parameters - ---------- - obj : object - Instance of a custom type to be serialized. Guaranteed to - be an instance of one of the types listed in the `types` - property. - tag : str - The tag identifying the YAML type that ``obj`` should be - converted into. Selected by a call to this converter's - select_tag method. - - Returns - ------- - keys : list of unique hashable keys - These keys will be used to reserve blocks for later use - - """ - if hasattr(self._delegate, "reserve_blocks"): - return self._delegate.reserve_blocks(obj, tag) - return [] - @property def delegate(self): """ diff --git a/asdf/extension/_serialization_context.py b/asdf/extension/_serialization_context.py index 6a6723500..80ed8cdc8 100644 --- a/asdf/extension/_serialization_context.py +++ b/asdf/extension/_serialization_context.py @@ -1,5 +1,8 @@ +import enum + +from asdf._block.key import Key as BlockKey from asdf._helpers import validate_version -from asdf.extension import ExtensionProxy +from asdf.extension._extension import ExtensionProxy class SerializationContext: @@ -11,11 +14,12 @@ class SerializationContext: classes (like Converters) via method arguments. """ - def __init__(self, version, extension_manager, url, block_manager): + def __init__(self, version, extension_manager, url, blocks): self._version = validate_version(version) self._extension_manager = extension_manager self._url = url - self._block_manager = block_manager + self._blocks = blocks + self._obj = None self.__extensions_used = set() @@ -77,15 +81,19 @@ def _extensions_used(self): """ return self.__extensions_used - def get_block_data_callback(self, index): + def get_block_data_callback(self, index, key=None): """ Generate a callable that when called will read data - from a block at the provided index + from an ASDF block at the provided index. Parameters ---------- index : int - Block index + Index of ASDF block. + + key : BlockKey, optional + BlockKey generated using self.generate_block_key. Only + needed for a Converter that uses multiple blocks. Returns ------- @@ -93,69 +101,159 @@ def get_block_data_callback(self, index): A callable that when called (with no arguments) returns the block data as a one dimensional array of uint8 """ - blk = self._block_manager.get_block(index) - return blk.generate_read_data_callback() + raise NotImplementedError("abstract") - def assign_block_key(self, block_index, key): + def find_available_block_index(self, data_callback, key=None): """ - Associate a unique hashable key with a block. - - This is used during Converter.from_yaml_tree and allows - the AsdfFile to be aware of which blocks belong to the - object handled by the converter and allows load_block - to locate the block using the key instead of the index - (which might change if a file undergoes an AsdfFile.update). + Find the index of an available ASDF block to write data. - If the block index is later needed (like during to_yaml_tree) - the key can be used with find_block_index to lookup the - block index. + This is typically used inside asdf.extension.Converter.to_yaml_tree. 
Parameters ---------- + data_callback: callable + Callable that when called will return data (ndarray) that will + be written to a block. - block_index : int - The index of the block to associate with the key + key : BlockKey, optional + BlockKey generated using self.generate_block_key. Only + needed for a Converter that uses multiple blocks. - key : hashable - A unique hashable key to associate with a block + Returns + ------- + block_index: int + Index of the ASDF block where data returned from + data_callback will be written. """ - blk = self._block_manager.get_block(block_index) - if self._block_manager._key_to_block_mapping.get(key, blk) is not blk: - msg = f"key {key} is already assigned to a block" - raise ValueError(msg) - if blk in self._block_manager._key_to_block_mapping.values(): - msg = f"block {block_index} is already assigned to a key" - raise ValueError(msg) - self._block_manager._key_to_block_mapping[key] = blk + raise NotImplementedError("abstract") - def find_block_index(self, lookup_key, data_callback=None): + def generate_block_key(self): """ - Find the index of a previously allocated or reserved block. + Generate a BlockKey used for Converters that wish to use + multiple blocks - This is typically used inside asdf.extension.Converter.to_yaml_tree + Returns + ------- + key : BlockKey + A hashable object that will be associated with the + serialized/deserialized object and can be used to + access multiple blocks within a Converter + """ + raise NotImplementedError("abstract") - Parameters - ---------- - lookup_key : hashable - Unique key used to retrieve the index of a block that was - previously allocated or reserved. For ndarrays this is - typically the id of the base ndarray. + def assign_object(self, obj): + self._obj = obj - data_callback: callable, optional - Callable that when called will return data (ndarray) that will - be written to a block. - At the moment, this is only assigned if a new block - is created to avoid circular references during AsdfFile.update. + def assign_blocks(self): + pass - Returns - ------- - block_index: int - Index of the block where data returned from data_callback - will be written. - """ - new_block = lookup_key not in self._block_manager._key_to_block_mapping - blk = self._block_manager.find_or_create_block(lookup_key) - # if we're not creating a block, don't update the data callback - if data_callback is not None and (new_block or (blk._data_callback is None and blk._fd is None)): - blk._data_callback = data_callback - return self._block_manager.get_source(blk) + +class ReadBlocksContext(SerializationContext): + """ + Perform deserialization (reading) with a `SerializationContext`. + + To allow for block access, `ReadBlocksContext` implements: + - `SerializationContext.generate_block_key` + - `SerializationContext.get_block_data_callback` + and tracks which blocks (and keys) are accessed, assigning them + to the deserialized object after `assign_object` and + `assign_blocks` are called. 
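Putting the new SerializationContext block API together, a hedged sketch of a Converter that stores a single payload array in an ASDF block; the tag, type, and ``BlockData`` class are invented for illustration and are not part of the diff::

    from asdf.extension import Converter

    class BlockDataConverter(Converter):
        tags = ["asdf://example.org/tags/block_data-1.0.0"]
        types = ["example_package.BlockData"]

        def to_yaml_tree(self, obj, tag, ctx):
            # reserve (or reuse) a block for this object's payload
            index = ctx.find_available_block_index(lambda: obj.payload)
            return {"block_index": index}

        def from_yaml_tree(self, node, tag, ctx):
            from example_package import BlockData

            # the callback remains valid even if update() later reorders blocks
            callback = ctx.get_block_data_callback(node["block_index"])
            return BlockData(payload=callback())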
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.assign_object(None) + + def assign_object(self, obj): + super().assign_object(obj) + if obj is None: + self._cb = None + self._keys_to_assign = {} + + def assign_blocks(self): + super().assign_blocks() + if self._cb is not None: + self._blocks._data_callbacks.assign_object(self._obj, self._cb) + for key, cb in self._keys_to_assign.items(): + if cb is None: + msg = "Converter generated a key that was never used" + raise OSError(msg) + # now that we have an object, make the key valid + key._assign_object(self._obj) + + # assign the key to the callback + self._blocks._data_callbacks.assign_object(key, cb) + # now that we've assigned blocks, remove the reference to the + # assigned object + self.assign_object(None) + + def get_block_data_callback(self, index, key=None): + if key is None: + if self._cb is not None: + # this operation has already accessed a block without using + # a key so check if the same index was accessed + if self._cb._index == index: + return self._cb + msg = "Converters accessing >1 block must provide a key for each block" + raise OSError(msg) + self._cb = self._blocks._get_data_callback(index) + return self._cb + + if self._keys_to_assign.get(key, None) is not None: + return self._keys_to_assign[key] + + cb = self._blocks._get_data_callback(index) + # mark this as a key to later assign + self._keys_to_assign[key] = cb + return cb + + def generate_block_key(self): + key = BlockKey() + self._keys_to_assign[key] = None + return key + + +class WriteBlocksContext(SerializationContext): + """ + Perform serialization (writing) with a `SerializationContext`. + + To allow for block access, `WriteBlocksContext` implements: + - `SerializationContext.generate_block_key` + - `SerializationContext.find_available_block_index` + and assigns any accessed blocks (and keys) to the object + being serialized. + """ + + def find_available_block_index(self, data_callback, key=None): + if key is None: + key = self._obj + return self._blocks.make_write_block(data_callback, None, key) + + def generate_block_key(self): + return BlockKey(self._obj) + + +class BlockAccess(enum.Enum): + """ + Block access enumerated values that define + how a SerializationContext can access ASDF blocks. + """ + + NONE = SerializationContext + WRITE = WriteBlocksContext + READ = ReadBlocksContext + + +def create(asdf_file, block_access=BlockAccess.NONE): + """ + Create a SerializationContext instance (or subclass) using + an AsdfFile instance, asdf_file. + + Parameters + ---------- + asdf_file : asdf.AsdfFile + + block_access : BlockAccess, optional + Defaults to BlockAccess.NONE + """ + return block_access.value(asdf_file.version_string, asdf_file.extension_manager, asdf_file.uri, asdf_file._blocks) diff --git a/asdf/generic_io.py b/asdf/generic_io.py index 1907946a9..128a4af73 100644 --- a/asdf/generic_io.py +++ b/asdf/generic_io.py @@ -380,7 +380,6 @@ def seek(self, offset, whence=0): file`s end). """ result = self._fd.seek(offset, whence) - self.tell() return result def tell(self): @@ -738,41 +737,16 @@ def fast_forward(self, size): self.seek(0, SEEK_END) self.seek(size, SEEK_CUR) - if sys.platform.startswith("win"): # pragma: no cover - - def truncate(self, size=None): - # ftruncate doesn't work on an open file in Windows. The - # best we can do is clear the extra bytes or add extra - # bytes to the end. 
- if size is None: - size = self.tell() - - self.seek(0, SEEK_END) - file_size = self.tell() - if size < file_size: - self.seek(size, SEEK_SET) - nbytes = file_size - size - elif size > file_size: - nbytes = size - file_size - else: - nbytes = 0 - - block = b"\0" * self.block_size - while nbytes > 0: - self.write(block[: min(nbytes, self.block_size)]) - nbytes -= self.block_size - + def truncate(self, size=None): + # windows supports truncating as long as the file not opened + # more than once. So this must be called after closing all + # memmaps + if size is None: + self._fd.truncate() + else: + self._fd.truncate(size) self.seek(size, SEEK_SET) - else: - - def truncate(self, size=None): - if size is None: - self._fd.truncate() - else: - self._fd.truncate(size) - self.seek(size, SEEK_SET) - class RealFile(RandomAccessFile): """ @@ -805,8 +779,10 @@ def memmap_array(self, offset, size): acc = mmap.ACCESS_WRITE if "w" in self._mode else mmap.ACCESS_READ self._fd.seek(0, 2) nbytes = self._fd.tell() - self._fd.seek(loc, 0) self._mmap = mmap.mmap(self._fd.fileno(), nbytes, access=acc) + # on windows mmap seeks to the start of the file so return the file + # pointer to this previous location + self._fd.seek(loc, 0) return np.ndarray.__new__(np.memmap, shape=size, offset=offset, dtype="uint8", buffer=self._mmap) def close_memmap(self): diff --git a/asdf/stream.py b/asdf/stream.py index 05984ba97..b66952c65 100644 --- a/asdf/stream.py +++ b/asdf/stream.py @@ -1,62 +1,9 @@ -from .tags.core import ndarray +import warnings +from .exceptions import AsdfDeprecationWarning +from .tags.core.stream import Stream # noqa: F401 -class Stream(ndarray.NDArrayType): - """ - Used to put a streamed array into the tree. - - Examples - -------- - Save a double-precision array with 1024 columns, one row at a - time:: - - >>> from asdf import AsdfFile, Stream - >>> import numpy as np - >>> ff = AsdfFile() - >>> ff.tree['streamed'] = Stream([1024], np.float64) - >>> with open('test.asdf', 'wb') as fd: - ... ff.write_to(fd) - ... for i in range(200): - ... nbytes = fd.write( - ... np.array([i] * 1024, np.float64).tobytes()) - """ - - name = None - types = [] - - def __init__(self, shape, dtype, strides=None): - self._shape = shape - self._datatype, self._byteorder = ndarray.numpy_dtype_to_asdf_datatype(dtype) - self._strides = strides - self._array = None - - def _make_array(self): - self._array = None - - @classmethod - def reserve_blocks(cls, data, ctx): - if isinstance(data, Stream): - yield ctx._blocks.get_streamed_block() - - @classmethod - def from_tree(cls, data, ctx): - return ndarray.NDArrayType.from_tree(data, ctx) - - @classmethod - def to_tree(cls, data, ctx): - ctx._blocks.get_streamed_block() - - result = {} - result["source"] = -1 - result["shape"] = ["*", *data._shape] - result["datatype"] = data._datatype - result["byteorder"] = data._byteorder - if data._strides is not None: - result["strides"] = data._strides - return result - - def __repr__(self): - return f"Stream({self._shape}, {self._datatype}, strides={self._strides})" - - def __str__(self): - return str(self.__repr__()) +warnings.warn( + "asdf.stream is deprecated. 
Please use asdf.tags.core.stream", + AsdfDeprecationWarning, +) diff --git a/asdf/tags/core/__init__.py b/asdf/tags/core/__init__.py index bf9e6138f..f075e6fce 100644 --- a/asdf/tags/core/__init__.py +++ b/asdf/tags/core/__init__.py @@ -2,6 +2,7 @@ from .external_reference import ExternalArrayReference from .integer import IntegerType from .ndarray import NDArrayType +from .stream import Stream __all__ = [ "AsdfObject", @@ -13,6 +14,7 @@ "NDArrayType", "IntegerType", "ExternalArrayReference", + "Stream", ] diff --git a/asdf/tags/core/ndarray.py b/asdf/tags/core/ndarray.py index be8a8e466..5567b909f 100644 --- a/asdf/tags/core/ndarray.py +++ b/asdf/tags/core/ndarray.py @@ -4,7 +4,7 @@ import numpy as np from numpy import ma -from asdf import _types, util +from asdf import util from asdf._jsonschema import ValidationError _datatype_names = { @@ -226,25 +226,21 @@ def ascii_to_unicode(x): return ascii_to_unicode(tolist(array)) -class NDArrayType(_types._AsdfType): - name = "core/ndarray" - version = "1.0.0" - supported_versions = {"1.0.0", "1.1.0"} - types = [np.ndarray, ma.MaskedArray] - - def __init__(self, source, shape, dtype, offset, strides, order, mask, asdffile): - self._asdffile = asdffile +class NDArrayType: + def __init__(self, source, shape, dtype, offset, strides, order, mask, data_callback=None): self._source = source - self._block = None + self._data_callback = data_callback self._array = None self._mask = mask if isinstance(source, list): self._array = inline_data_asarray(source, dtype) self._array = self._apply_mask(self._array, self._mask) - self._block = asdffile._blocks.add_inline(self._array) + # single element structured arrays can have shape == () + # https://github.com/asdf-format/asdf/issues/1540 if shape is not None and ( - (shape[0] == "*" and self._array.shape[1:] != tuple(shape[1:])) or (self._array.shape != tuple(shape)) + self._array.shape != tuple(shape) + or (len(shape) and shape[0] == "*" and self._array.shape[1:] != tuple(shape[1:])) ): msg = "inline data doesn't match the given shape" raise ValueError(msg) @@ -254,8 +250,6 @@ def __init__(self, source, shape, dtype, offset, strides, order, mask, asdffile) self._offset = offset self._strides = strides self._order = order - if not asdffile._blocks.lazy_load: - self._make_array() def _make_array(self): # If the ASDF file has been updated in-place, then there's @@ -269,10 +263,27 @@ def _make_array(self): self._array = None if self._array is None: - block = self.block - shape = self.get_actual_shape(self._shape, self._strides, self._dtype, len(block)) - - self._array = np.ndarray(shape, self._dtype, block.data, self._offset, self._strides, self._order) + if isinstance(self._source, str): + # we need to keep _source as a str to allow stdatamodels to + # support AsdfInFits + data = self._data_callback() + else: + # cached data is used here so that multiple NDArrayTypes will all use + # the same base array + data = self._data_callback(_attr="cached_data") + + if hasattr(data, "base") and isinstance(data.base, mmap.mmap) and data.base.closed: + msg = "ASDF file has already been closed. Can not get the data." 
+ raise OSError(msg) + + # compute shape (streaming blocks have '0' data size in the block header) + shape = self.get_actual_shape( + self._shape, + self._strides, + self._dtype, + data.size, + ) + self._array = np.ndarray(shape, self._dtype, data, self._offset, self._strides, self._order) self._array = self._apply_mask(self._array, self._mask) return self._array @@ -335,18 +346,12 @@ def get_actual_shape(self, shape, strides, dtype, block_size): msg = f"Invalid shape '{shape}'" raise ValueError(msg) - @property - def block(self): - if self._block is None: - self._block = self._asdffile._blocks.get_block(self._source) - return self._block - @property def shape(self): - if self._shape is None: + if self._shape is None or self._array is not None or "*" in self._shape: + # streamed blocks have a '0' data_size in the header so we + # need to make the array to get the shape return self.__array__().shape - if "*" in self._shape: - return tuple(self.get_actual_shape(self._shape, self._strides, self._dtype, len(self.block))) return tuple(self._shape) @property @@ -388,164 +393,6 @@ def __setitem__(self, *args): raise - def __getattribute__(self, name): - # The presence of these attributes on an NDArrayType instance - # can cause problems when the array is passed to other - # libraries. - # See https://github.com/asdf-format/asdf/issues/1015 - if name in ("name", "version", "supported_versions"): - msg = f"'{self.__class__.name}' object has no attribute '{name}'" - raise AttributeError(msg) - - return _types._AsdfType.__getattribute__(self, name) - - @classmethod - def from_tree(cls, node, ctx): - if isinstance(node, list): - return cls(node, None, None, None, None, None, None, ctx) - - if isinstance(node, dict): - source = node.get("source") - data = node.get("data") - if source and data: - msg = "Both source and data may not be provided at the same time" - raise ValueError(msg) - if data: - source = data - shape = node.get("shape", None) - byteorder = sys.byteorder if data is not None else node["byteorder"] - dtype = asdf_datatype_to_numpy_dtype(node["datatype"], byteorder) if "datatype" in node else None - offset = node.get("offset", 0) - strides = node.get("strides", None) - mask = node.get("mask", None) - - return cls(source, shape, dtype, offset, strides, "A", mask, ctx) - - msg = "Invalid ndarray description." - raise TypeError(msg) - - @classmethod - def reserve_blocks(cls, data, ctx): - # Find all of the used data buffers so we can add or rearrange - # them if necessary - if isinstance(data, np.ndarray): - yield ctx._blocks.find_or_create_block_for_array(data) - elif isinstance(data, NDArrayType): - yield data.block - - @classmethod - def to_tree(cls, data, ctx): - # The ndarray-1.0.0 schema does not permit 0 valued strides. - # Perhaps we'll want to allow this someday, to efficiently - # represent an array of all the same value. - if any(stride == 0 for stride in data.strides): - data = np.ascontiguousarray(data) - - # The view computations that follow assume that the base array - # is contiguous. If not, we need to make a copy to avoid - # writing a nonsense view. - base = util.get_array_base(data) - if not base.flags.forc: - data = np.ascontiguousarray(data) - base = util.get_array_base(data) - - shape = data.shape - - block = ctx._blocks.find_or_create_block_for_array(data) - - # Compute the offset relative to the base array and not the - # block data, in case the block is compressed. 
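The ndarray ``offset`` field handled in this hunk is the byte distance from a view's first element to the start of its base array. A quick check of that arithmetic::

    import numpy as np

    base = np.arange(10, dtype=np.float64)
    view = base[3:]

    offset = view.ctypes.data - base.ctypes.data
    assert offset == 3 * base.itemsize  # 24 bytes for float64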
- offset = data.ctypes.data - base.ctypes.data - - strides = None if data.flags.c_contiguous else data.strides - dtype, byteorder = numpy_dtype_to_asdf_datatype( - data.dtype, - include_byteorder=(block.array_storage != "inline"), - ) - - result = {} - - result["shape"] = list(shape) - if block.array_storage == "streamed": - result["shape"][0] = "*" - - if block.array_storage == "inline": - listdata = numpy_array_to_list(data) - result["data"] = listdata - result["datatype"] = dtype - - else: - result["shape"] = list(shape) - if block.array_storage == "streamed": - result["shape"][0] = "*" - - result["source"] = ctx._blocks.get_source(block) - result["datatype"] = dtype - result["byteorder"] = byteorder - - if offset > 0: - result["offset"] = offset - - if strides is not None: - result["strides"] = list(strides) - - if isinstance(data, ma.MaskedArray) and np.any(data.mask): - if block.array_storage == "inline": - ctx._blocks.set_array_storage(ctx._blocks[data.mask], "inline") - - result["mask"] = data.mask - - return result - - @classmethod - def _assert_equality(cls, old, new, func): - if old.dtype.fields: - if not new.dtype.fields: - # This line is safe because this is actually a piece of test - # code, even though it lives in this file: - msg = "arrays not equal" - raise AssertionError(msg) - for a, b in zip(old, new): - cls._assert_equality(a, b, func) - else: - old = old.__array__() - new = new.__array__() - if old.dtype.char in "SU": - if old.dtype.char == "S": - old = old.astype("U") - if new.dtype.char == "S": - new = new.astype("U") - old = old.tolist() - new = new.tolist() - # This line is safe because this is actually a piece of test - # code, even though it lives in this file: - assert old == new # noqa: S101 - else: - func(old, new) - - @classmethod - def assert_equal(cls, old, new): - from numpy.testing import assert_array_equal - - cls._assert_equality(old, new, assert_array_equal) - - @classmethod - def assert_allclose(cls, old, new): - from numpy.testing import assert_allclose, assert_array_equal - - if old.dtype.kind in "iu" and new.dtype.kind in "iu": - cls._assert_equality(old, new, assert_array_equal) - else: - cls._assert_equality(old, new, assert_allclose) - - @classmethod - def copy_to_new_asdf(cls, node, asdffile): - if isinstance(node, NDArrayType): - array = node._make_array() - asdffile._blocks.set_array_storage(asdffile._blocks[array], node.block.array_storage) - return node._make_array() - return node - def _make_operation(name): def operation(self, *args): @@ -554,7 +401,6 @@ def operation(self, *args): return operation -classes_to_modify = [*NDArrayType.__versioned_siblings, NDArrayType] for op in [ "__neg__", "__pos__", @@ -619,8 +465,7 @@ def operation(self, *args): "__delitem__", "__contains__", ]: - [setattr(cls, op, _make_operation(op)) for cls in classes_to_modify] -del classes_to_modify + setattr(NDArrayType, op, _make_operation(op)) def _get_ndim(instance): diff --git a/asdf/tags/core/stream.py b/asdf/tags/core/stream.py new file mode 100644 index 000000000..fed1c84c8 --- /dev/null +++ b/asdf/tags/core/stream.py @@ -0,0 +1,37 @@ +from .ndarray import numpy_dtype_to_asdf_datatype + + +class Stream: + """ + Used to put a streamed array into the tree. + + Examples + -------- + Save a double-precision array with 1024 columns, one row at a + time:: + + >>> from asdf import AsdfFile, Stream + >>> import numpy as np + >>> ff = AsdfFile() + >>> ff.tree['streamed'] = Stream([1024], np.float64) + >>> with open('test.asdf', 'wb') as fd: + ... 
ff.write_to(fd) + ... for i in range(200): + ... nbytes = fd.write( + ... np.array([i] * 1024, np.float64).tobytes()) + """ + + def __init__(self, shape, dtype, strides=None): + self._shape = shape + self._datatype, self._byteorder = numpy_dtype_to_asdf_datatype(dtype) + self._strides = strides + self._array = None + + def _make_array(self): + self._array = None + + def __repr__(self): + return f"Stream({self._shape}, {self._datatype}, strides={self._strides})" + + def __str__(self): + return str(self.__repr__()) diff --git a/asdf/util.py b/asdf/util.py index 58623d84e..7d6dc1d69 100644 --- a/asdf/util.py +++ b/asdf/util.py @@ -529,25 +529,3 @@ class FileType(enum.Enum): ASDF = 1 FITS = 2 UNKNOWN = 3 - - -class BlockKey: - """ - Helper class that generates a unique hashable value for every instance - useful for associates blocks and objects during serialization and - deserialization - """ - - _next = 0 - - def __init__(self): - self._key = BlockKey._next - BlockKey._next += 1 - - def __hash__(self): - return self._key - - def __eq__(self, other): - if not isinstance(other, BlockKey): - return NotImplemented - return self._key == other._key diff --git a/asdf/yamlutil.py b/asdf/yamlutil.py index 8b2252dd6..0fd20b986 100644 --- a/asdf/yamlutil.py +++ b/asdf/yamlutil.py @@ -5,9 +5,10 @@ import numpy as np import yaml -from . import schema, tagged, treeutil, util +from . import config, schema, tagged, treeutil, util from .constants import STSCI_SCHEMA_TAG_BASE, YAML_TAG_PREFIX from .exceptions import AsdfConversionWarning +from .extension._serialization_context import BlockAccess from .tags.core import AsdfObject from .versioning import _yaml_base_loader, split_tag_version @@ -216,18 +217,18 @@ def custom_tree_to_tagged_tree(tree, ctx, _serialization_context=None): annotated with tags. 
""" if _serialization_context is None: - _serialization_context = ctx._create_serialization_context() + _serialization_context = ctx._create_serialization_context(BlockAccess.WRITE) extension_manager = _serialization_context.extension_manager + version_string = str(_serialization_context.version) - def _convert_obj(obj): - converter = extension_manager.get_converter_for_type(type(obj)) + def _convert_obj(obj, converter): tag = converter.select_tag(obj, _serialization_context) - converters = set() # if select_tag returns None, converter.to_yaml_tree should return a new # object which will be handled by a different converter + converters_used = set() while tag is None: - converters.add(converter) + converters_used.add(converter) obj = converter.to_yaml_tree(obj, tag, _serialization_context) try: converter = extension_manager.get_converter_for_type(type(obj)) @@ -235,11 +236,13 @@ def _convert_obj(obj): # no converter supports this type, return it as-is yield obj return - if converter in converters: + if converter in converters_used: msg = "Conversion cycle detected" raise TypeError(msg) tag = converter.select_tag(obj, _serialization_context) + _serialization_context.assign_object(obj) node = converter.to_yaml_tree(obj, tag, _serialization_context) + _serialization_context.assign_blocks() if isinstance(node, GeneratorType): generator = node @@ -264,19 +267,40 @@ def _convert_obj(obj): if generator is not None: yield from generator + cfg = config.get_config() + convert_ndarray_subclasses = cfg.convert_unknown_ndarray_subclasses + converters_cache = {} + def _walker(obj): - if extension_manager.handles_type(type(obj)): - return _convert_obj(obj) + typ = type(obj) + if typ in converters_cache: + return converters_cache[typ](obj) + if extension_manager.handles_type(typ): + converter = extension_manager.get_converter_for_type(typ) + converters_cache[typ] = lambda obj, _converter=converter: _convert_obj(obj, _converter) + return _convert_obj(obj, converter) + if convert_ndarray_subclasses and isinstance(obj, np.ndarray): + warnings.warn( + f"A ndarray subclass ({type(obj)}) was converted as a ndarray. " + "This behavior will be removed from a future version of ASDF. " + "See https://asdf.readthedocs.io/en/latest/asdf/config.html#convert-unknown-ndarray-subclasses", + AsdfConversionWarning, + ) + converter = extension_manager.get_converter_for_type(np.ndarray) + converters_cache[typ] = lambda obj, _converter=converter: _convert_obj(obj, _converter) + return _convert_obj(obj, converter) tag = ctx._type_index.from_custom_type( - type(obj), - ctx.version_string, + typ, + version_string, _serialization_context=_serialization_context, ) if tag is not None: + converters_cache[typ] = lambda obj, _tag=tag: _tag.to_tree_tagged(obj, ctx) return tag.to_tree_tagged(obj, ctx) + converters_cache[typ] = lambda obj: obj return obj return treeutil.walk_and_modify( @@ -296,7 +320,7 @@ def tagged_tree_to_custom_tree(tree, ctx, force_raw_types=False, _serialization_ tags, to a tree containing custom data types. 
""" if _serialization_context is None: - _serialization_context = ctx._create_serialization_context() + _serialization_context = ctx._create_serialization_context(BlockAccess.READ) extension_manager = _serialization_context.extension_manager @@ -311,6 +335,8 @@ def _walker(node): if extension_manager.handles_tag(tag): converter = extension_manager.get_converter_for_tag(tag) obj = converter.from_yaml_tree(node.data, tag, _serialization_context) + _serialization_context.assign_object(obj) + _serialization_context.assign_blocks() _serialization_context._mark_extension_used(converter.extension) return obj diff --git a/docs/asdf/arrays.rst b/docs/asdf/arrays.rst index ba9f79f5c..34c4cbf2e 100644 --- a/docs/asdf/arrays.rst +++ b/docs/asdf/arrays.rst @@ -158,14 +158,15 @@ implicitly determined to include all of the remaining contents of the file. By definition, it must be the last block in the file. To use streaming, rather than including a Numpy array object in the -tree, you include a `asdf.Stream` object which sets up the structure +tree, you include a `asdf.tags.core.Stream` object which sets up the structure of the streamed data, but will not write out the actual content. The file handle's ``write`` method is then used to manually write out the binary data. .. runcode:: - from asdf import AsdfFile, Stream + from asdf import AsdfFile + from asdf.tags.core import Stream import numpy as np tree = { @@ -194,7 +195,8 @@ to numpy arrays stored in ASDF: import csv import numpy as np - from asdf import AsdfFile, Stream + from asdf import AsdfFile + from asdf.tags.core import Stream tree = { # We happen to know in advance that each row in the CSV has 100 ints diff --git a/docs/asdf/config.rst b/docs/asdf/config.rst index 30c889fa0..d3b85092a 100644 --- a/docs/asdf/config.rst +++ b/docs/asdf/config.rst @@ -39,6 +39,7 @@ the currently active config: all_array_storage: None all_array_compression: input all_array_compression_kwargs: None + convert_unknown_ndarray_subclasses: True default_version: 1.5.0 io_block_size: -1 legacy_fill_schema_defaults: True @@ -62,6 +63,7 @@ This allows for short-lived configuration changes that do not impact other code: all_array_storage: None all_array_compression: input all_array_compression_kwargs: None + convert_unknown_ndarray_subclasses: True default_version: 1.5.0 io_block_size: -1 legacy_fill_schema_defaults: True @@ -73,6 +75,7 @@ This allows for short-lived configuration changes that do not impact other code: all_array_storage: None all_array_compression: input all_array_compression_kwargs: None + convert_unknown_ndarray_subclasses: True default_version: 1.5.0 io_block_size: -1 legacy_fill_schema_defaults: True @@ -101,6 +104,64 @@ type is not managed automatically. Defaults to ``None``. +all_array_storage +----------------- + +Use this storage type for all arrays within an ASDF file. Must be one of + +- ``"internal"`` +- ``"external"`` +- ``"inline"`` +- ``None`` + +If ``None`` a different storage type can be used for each array. +See ``AsdfFile.set_array_storage`` for more details. + +Defaults to ``None``. + +all_array_compression +--------------------- + +Use this compression type for all arrays within an ASDF file. +If ``"input"`` a different compression type can be used for each +array. See ``AsdfFile.set_array_compression`` for more details. + +Defaults to ``"input"``. + +all_array_compression_kwargs +---------------------------- + +Use these additional compression keyword arguments for all arrays +within an ASDF file. 
If ``None`` diffeerent keyword arguments +can be set for each array. See ``AsdfFile.set_array_compression`` for more details. + +Defaults to ``None``. + +.. _convert_unknown_ndarray_subclasses: + +convert_unknown_ndarray_subclasses +---------------------------------- + +Convert otherwise unhandled instances of subclasses of ndarray into +ndarrays prior to serialization. + +Previous extension code allowed AsdfTypes to convert instances of subclasses +of supported types. Internally, the handling of ndarrays has been moved +from an AsdfType to a Converter which does not support converting +instances of subclasses unless they are explicitly listed. This means +that code that previously relied on asdf converting instances of subclasses +of ndarray into an ndarray will need to be updated to define a Converter +for the ndarray subclass or to request that support be added directly +in asdf (for subclasses in existing asdf dependencies). + +With this setting enabled, asdf will continue to convert instances +of subclasses of ndarray but will issue a warning when an instance is +converted. In a future version of asdf this default will change +to ``False``, a deprecation warning will be issued and finally +the conversion of instances of subclasses will be removed. + +Defaults to ``True``. + default_version --------------- diff --git a/docs/asdf/deprecations.rst b/docs/asdf/deprecations.rst index 8428ef66c..3b952870f 100644 --- a/docs/asdf/deprecations.rst +++ b/docs/asdf/deprecations.rst @@ -6,6 +6,16 @@ Deprecations ************ +Version 3.0 +=========== + +SerializationContext was previously importable from ``asdf.asdf.SerializationContext``. +Although not part of the public API, this import path has been deprecated and users +should instead import ``SerializationContext`` from `asdf.extension`. + +Version 2.15 +============ + ASDF 2.15 introduced many new `asdf.exceptions.AsdfDeprecationWarning` messages. These warnings are subclasses of the built-in python `DeprecationWarning` and will by default be ignored except in `__main__` and with testing tools such as @@ -18,7 +28,7 @@ versioning, compatibility and support policy). .. _legacy_extension_api_deprecation: Legacy Extension API Deprecation -================================ +-------------------------------- A large number of `asdf.exceptions.AsdfDeprecationWarning` messages appear related to use of the ``legacy extension api``. Some examples include: @@ -55,7 +65,7 @@ package that uses these new-style extension api. .. _asdf_in_fits_deprecation: ASDF-in-FITS Deprecation -======================== +------------------------ Support for ``AsdfInFits`` (including the ``asdf.fits_embed`` module) is deprecated. Code using this format can migrate to using `stdatamodels` which @@ -66,20 +76,10 @@ Without support for ``fits_embed.AsdfInFits`` the ``extract`` and ``remove-hdu`` commands for :ref:`asdftool ` are no longer usable and are deprecated. -.. _asdffile_blocks_deprecation: - -AsdfFile.blocks Deprecation -=========================== - -Direct usage of the ASDF block manager through `asdf.AsdfFile.blocks` is deprecated. -The BlockManager api was not previously included in the documentation and -was unused by the legacy and new style extensions. Planned features for ASDF 3.0 -include adding block storage support to :ref:`converters `. - .. _tests_helpers_deprecation: asdf.tests.helpers Deprecation -============================== +------------------------------ Use of ``asdf.tests.helpers`` is deprecated. 
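Returning to the ``convert_unknown_ndarray_subclasses`` setting described above, a converter for an ndarray subclass can usually delegate block storage to asdf by returning a plain ``ndarray`` from ``to_yaml_tree``. A minimal sketch (the ``LabeledArray`` class and the tag and extension URIs are illustrative, not part of asdf)::

    import asdf
    import numpy as np
    from asdf.extension import Converter, Extension

    class LabeledArray(np.ndarray):
        """Hypothetical ndarray subclass used only for illustration."""

    class LabeledArrayConverter(Converter):
        tags = ["asdf://example.org/tags/labeled_array-1.0.0"]
        types = [LabeledArray]

        def to_yaml_tree(self, obj, tag, ctx):
            # returning a plain ndarray lets asdf store it in an ASDF block
            return {"data": np.asarray(obj)}

        def from_yaml_tree(self, node, tag, ctx):
            return np.asarray(node["data"]).view(LabeledArray)

    class LabeledArrayExtension(Extension):
        extension_uri = "asdf://example.org/extensions/labeled_array-1.0.0"
        tags = ["asdf://example.org/tags/labeled_array-1.0.0"]
        converters = [LabeledArrayConverter()]

    with asdf.config_context() as cfg:
        cfg.add_extension(LabeledArrayExtension())
        arr = np.arange(4, dtype="float64").view(LabeledArray)
        asdf.AsdfFile({"labeled": arr}).write_to("labeled.asdf")

With such an extension registered, instances of the subclass are handled explicitly and the conversion warning described above is not emitted.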
Please see `asdf.testing.helpers` for alternative functions to aid in testing. diff --git a/docs/asdf/extending/converters.rst b/docs/asdf/extending/converters.rst index 999b0b559..4e5d12c9c 100644 --- a/docs/asdf/extending/converters.rst +++ b/docs/asdf/extending/converters.rst @@ -351,7 +351,7 @@ Block storage ============= As described above :ref:`extending_converters` can return complex objects that will -be passed to other Converters. If a Converter returns a ndarray, ASDF will recognize this +be passed to other Converters. If a Converter returns a ndarray, asdf will recognize this array and store it in an ASDF block. This is the easiest and preferred means of storing data in ASDF blocks. @@ -359,19 +359,18 @@ For applications that require more flexibility, Converters can control block storage through use of the `asdf.extension.SerializationContext` provided as an argument to `Converter.to_yaml_tree` `Converter.from_yaml_tree` and `Converter.select_tag`. -It is helpful to first review some details of how ASDF +It is helpful to first review some details of how asdf :ref:`stores block `. Blocks are stored sequentially within a -ASDF file following the YAML tree. During reads and writes, ASDF will need to know +ASDF file following the YAML tree. During reads and writes, asdf will need to know the index of the block a Converter would like to use to read or write the correct block. However, the index used for reading might not be the same index for writing -if the tree was modified or the file is being written to a new location. To allow -ASDF to track the relationship between blocks and objects, Converters will need -to generate unique hashable keys for each block used and associate these keys with -block indices during read and write (more on this below). +if the tree was modified or the file is being written to a new location. During +serialization and deserialization, asdf will associate each object with the +accessed block during `Converter.from_yaml_tree` and `Converter.to_yaml_tree`. .. note:: - Use of ``id(obj)`` will not generate a unique key as it returns the memory address - which might be reused after the object is garbage collected. + Converters using multiple blocks are slightly more complicated. 
+ See: :ref:`extending_converter_multiple_block_storage` A simple example of a Converter using block storage to store the ``payload`` for ``BlockData`` object instances is as follows: @@ -385,7 +384,6 @@ A simple example of a Converter using block storage to store the ``payload`` for class BlockData: def __init__(self, payload): self.payload = payload - self._asdf_key = asdf.util.BlockKey() class BlockConverter(Converter): @@ -393,22 +391,17 @@ A simple example of a Converter using block storage to store the ``payload`` for types = [BlockData] def to_yaml_tree(self, obj, tag, ctx): - block_index = ctx.find_block_index( - obj._asdf_key, + block_index = ctx.find_available_block_index( lambda: np.ndarray(len(obj.payload), dtype="uint8", buffer=obj.payload), ) return {"block_index": block_index} def from_yaml_tree(self, node, tag, ctx): block_index = node["block_index"] - obj = BlockData(b"") - ctx.assign_block_key(block_index, obj._asdf_key) - obj.payload = ctx.get_block_data_callback(block_index)() + data_callback = ctx.get_block_data_callback(block_index) + obj = BlockData(data_callback()) return obj - def reserve_blocks(self, obj, tag): - return [obj._asdf_key] - class BlockExtension(Extension): tags = ["asdf://somewhere.org/tags/block_data-1.0.0"] converters = [BlockConverter()] @@ -422,29 +415,78 @@ A simple example of a Converter using block storage to store the ``payload`` for .. asdf:: block_converter_example.asdf During read, ``Converter.from_yaml_tree`` will be called. Within this method -the Converter should associate any used blocks with unique hashable keys by calling -`asdf.extension.SerializationContext.assign_block_key` and can generate (and use) a callable -function that will return block data using `asdf.extension.SerializationContext.get_block_data_callback`. -A callback for reading the data is provided to support lazy loading without -keeping a reference to the `asdf.extension.SerializationContext` (which is meant to be -a short lived and lightweight object). - -During write, ``Converter.to_yaml_tree`` will be called. The Converter should -use `asdf.extension.SerializationContext.find_block_index` to find the location of an -available block by providing a hashable key unique to this object (this should -be the same key used during reading to allow ASDF to associate blocks and objects -during in-place updates). The second argument to `asdf.extension.SerializationContext.find_block_index` -must be a callable function (returning a ndarray) that ASDF will call when it -is time to write data to the portion of the file corresponding to this block. -Note that it's possible this callback will be called multiple times during a -write and ASDF will not cache the result. If the data is coming from a non-repeatable -source (such as a non-seekable stream of bytes) the data should be cached prior -to providing it to ASDF to allow ASDF to call the callback multiple times. - -A Converter that uses block storage must also define ``Converter.reserve_blocks``. -``Converter.reserve_blocks`` will be called during memory management to free -resources for unused blocks. ``Converter.reserve_blocks`` must -return a list of keys associated with an object. +the Converter can prepare to access a block by calling +``SerializationContext.get_block_data_callback``. This will return a function +that when called will return the contents of the block (to support lazy +loading without keeping a reference to the ``SerializationContext`` (which is meant +to be a short lived and lightweight object). 
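For instance, a converter that wants to defer reading a block until its data is first accessed can keep the callback rather than calling it immediately (a minimal sketch; ``LazyBlockData`` and its tag are illustrative and not part of asdf)::

    import numpy as np
    from asdf.extension import Converter

    class LazyBlockData:
        """Holds a block data callback and reads the block on first access."""

        def __init__(self, data_callback):
            # for objects created in memory, the callback can simply return
            # an existing array (e.g. ``lambda: arr``)
            self._data_callback = data_callback

        @property
        def payload(self):
            # the ASDF block is only read when the payload is first requested
            return self._data_callback()

    class LazyBlockConverter(Converter):
        tags = ["asdf://example.org/tags/lazy_block_data-1.0.0"]
        types = [LazyBlockData]

        def from_yaml_tree(self, node, tag, ctx):
            # keep the callback instead of calling it, deferring the read
            return LazyBlockData(ctx.get_block_data_callback(node["block_index"]))

        def to_yaml_tree(self, obj, tag, ctx):
            index = ctx.find_available_block_index(
                lambda: np.asarray(obj.payload, dtype="uint8"),
            )
            return {"block_index": index}

Calling the callback eagerly, as in the example above, trades lazy loading for simpler object state; keeping the callback avoids reading the block until the data is actually needed.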
+ +During write, ``Converter.to_yaml_tree`` will be called. The Converter can +use ``SerializationContext.find_available_block_index`` to find the location of an +available block for writing. The data to be written to the block can be provided +as an ``ndarray`` or a callable function that will return a ``ndarray`` (note that +it is possible this callable function will be called multiple times and the +developer should cache results from any non-repeatable sources). + +.. _extending_converter_multiple_block_storage: + +Converters using multiple blocks +-------------------------------- + +As discussed above, while serializing and deserializing objects that use +one block, asdf will watch which block is accessed by ``find_available_block_index`` +and ``get_block_data_callback`` and associate the block with the converted object. +This association allows asdf to map read and write blocks during updates of ASDF +files. An object that uses multiple blocks must provide a unique key for each +block it uses. These keys are generated using ``SerializationContext.generate_block_key`` +and must be stored by the extension code. These keys must be resupplied to the converter +when writing an object that was read from an ASDF file. + +.. runcode:: + + import asdf + import numpy as np + from asdf.extension import Converter, Extension + + class MultiBlockData: + def __init__(self, data): + self.data = data + self.keys = [] + + + class MultiBlockConverter(Converter): + tags = ["asdf://somewhere.org/tags/multi_block_data-1.0.0"] + types = [MultiBlockData] + + def to_yaml_tree(self, obj, tag, ctx): + if not len(obj.keys): + obj.keys = [ctx.generate_block_key() for _ in obj.data] + indices = [ctx.find_available_block_index(d, k) for d, k in zip(obj.data, obj.keys)] + return { + "indices": indices, + } + + def from_yaml_tree(self, node, tag, ctx): + indices = node["indices"] + keys = [ctx.generate_block_key() for _ in indices] + cbs = [ctx.get_block_data_callback(i, k) for i, k in zip(indices, keys)] + obj = MultiBlockData([cb() for cb in cbs]) + obj.keys = keys + return obj + + + class MultiBlockExtension(Extension): + tags = ["asdf://somewhere.org/tags/multi_block_data-1.0.0"] + converters = [MultiBlockConverter()] + extension_uri = "asdf://somewhere.org/extensions/multi_block_data-1.0.0" + + with asdf.config_context() as cfg: + cfg.add_extension(MultiBlockExtension()) + obj = MultiBlockData([np.arange(3, dtype="uint8") + i for i in range(3)]) + ff = asdf.AsdfFile({"example": obj}) + ff.write_to("multi_block_converter_example.asdf") + +.. asdf:: multi_block_converter_example.asdf .. _extending_converters_performance: diff --git a/docs/asdf/user_api.rst b/docs/asdf/user_api.rst index 2b2647eb3..214d3de18 100644 --- a/docs/asdf/user_api.rst +++ b/docs/asdf/user_api.rst @@ -9,6 +9,7 @@ User API :inherited-members: :no-inheritance-diagram: :skip: ValidationError + :skip: Stream .. 
automodapi:: asdf.search diff --git a/pyproject.toml b/pyproject.toml index 4c59c81f8..8c06773f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ all = [ "lz4>=0.10", ] docs = [ - "sphinx-asdf>=0.1.4", + "sphinx-asdf>=0.2.2", "graphviz", "sphinx-inline-tabs", 'tomli; python_version < "3.11"', diff --git a/pytest_asdf/plugin.py b/pytest_asdf/plugin.py index c4a94241c..469cf7fe9 100644 --- a/pytest_asdf/plugin.py +++ b/pytest_asdf/plugin.py @@ -218,7 +218,7 @@ def from_parent( return result def runtest(self): - from asdf import AsdfFile, block, util + from asdf import AsdfFile, _block, generic_io, util from asdf._tests import _helpers as helpers from asdf.exceptions import AsdfDeprecationWarning @@ -239,12 +239,11 @@ def runtest(self): util.filepath_to_url(os.path.abspath(os.path.join(os.path.dirname(self.filename), "external.asdf"))) ] = ff2 - # Add some dummy blocks so that the ndarray examples work - for _ in range(3): - b = block.Block(np.zeros((1024 * 1024 * 8), dtype=np.uint8)) - b._used = True - ff._blocks.add(b) - b._array_storage = "streamed" + wb = _block.writer.WriteBlock(np.zeros(1024 * 1024 * 8, dtype=np.uint8)) + with generic_io.get_file(buff, mode="rw") as f: + f.seek(0, 2) + _block.writer.write_blocks(f, [wb, wb], streamed_block=wb) + f.seek(0) try: # Do not tolerate any warnings that occur during schema validation