From 9880d15715dd1af9120e452afc52490c270f2b4d Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 7 Oct 2024 19:03:39 +0200 Subject: [PATCH 1/8] Allow reading BAM from memory without a header --- src/dnaio/_bam.py | 35 ++++++++++ src/dnaio/_core.pyi | 2 +- src/dnaio/_core.pyx | 63 ++++++++---------- src/dnaio/readers.py | 5 +- src/dnaio/singleend.py | 5 ++ .../missing_header_no_bgzip_raw_bam_bytes | Bin 0 -> 197 bytes tests/test_internal.py | 1 + tests/test_open.py | 10 +++ 8 files changed, 84 insertions(+), 37 deletions(-) create mode 100644 src/dnaio/_bam.py create mode 100644 tests/data/missing_header_no_bgzip_raw_bam_bytes diff --git a/src/dnaio/_bam.py b/src/dnaio/_bam.py new file mode 100644 index 00000000..c377ce30 --- /dev/null +++ b/src/dnaio/_bam.py @@ -0,0 +1,35 @@ +from typing import BinaryIO + + +def read_bam_header(fileobj: BinaryIO) -> bytes: + magic_and_header_size = fileobj.read(8) + if not isinstance(magic_and_header_size, bytes): + raise TypeError( + f"fileobj {fileobj} is not a binary IO type, " f"got {type(fileobj)}" + ) + if len(magic_and_header_size) < 8: + raise EOFError("Truncated BAM file") + if magic_and_header_size[:4] != b"BAM\1": + raise ValueError( + f"fileobj: {fileobj}, is not a BAM file. No BAM magic, instead " + f"found {magic_and_header_size[:4]}" + ) + l_text = int.from_bytes(magic_and_header_size[4:], "little", signed=False) + header = fileobj.read(l_text) + if len(header) < l_text: + raise EOFError("Truncated BAM file") + n_ref_obj = fileobj.read(4) + if len(n_ref_obj) < 4: + raise EOFError("Truncated BAM file") + n_ref = int.from_bytes(n_ref_obj, "little", signed=False) + for i in range(n_ref): + l_name_obj = fileobj.read(4) + if len(l_name_obj) < 4: + raise EOFError("Truncated BAM file") + l_name = int.from_bytes(l_name_obj, "little", signed=False) + reference_chunk_size = l_name + 4 # Include name and uint32_t of size + reference_chunk = fileobj.read(reference_chunk_size) + if len(reference_chunk) < reference_chunk_size: + raise EOFError("Truncated BAM file") + # Fileobj is now skipped ahead and at the start of the BAM records + return header diff --git a/src/dnaio/_core.pyi b/src/dnaio/_core.pyi index 5a729a4f..f519b266 100644 --- a/src/dnaio/_core.pyi +++ b/src/dnaio/_core.pyi @@ -54,7 +54,7 @@ class FastqIter(Generic[T]): def number_of_records(self) -> int: ... class BamIter: - def __init__(self, file: BinaryIO, buffer_size: int): ... + def __init__(self, file: BinaryIO, read_in_size: int, with_header: bool = True): ... def __iter__(self) -> Iterator[SequenceRecord]: ... def __next__(self) -> SequenceRecord: ... @property diff --git a/src/dnaio/_core.pyx b/src/dnaio/_core.pyx index c726de67..50624c61 100644 --- a/src/dnaio/_core.pyx +++ b/src/dnaio/_core.pyx @@ -12,6 +12,8 @@ from libc.stdint cimport uint8_t, uint16_t, uint32_t, int32_t cimport cython +from ._bam import read_bam_header + cdef extern from "Python.h": void *PyUnicode_DATA(object o) bint PyUnicode_IS_COMPACT_ASCII(object o) @@ -688,6 +690,22 @@ cdef struct BamRecordHeader: int32_t tlen cdef class BamIter: + """ + Parse a uBAM file and yield SequenceRecord objects + + Arguments: + file: a file-like object, opened in binary mode (it must have a readinto + method) + + buffer_size: size of the initial buffer. This is automatically grown + if a BAM record is encountered that does not fit. + + with_header: The BAM file has a header that needs parsing. Default is True. + False can be used in circumstances where chunks of BAM records are read. + + Yields: + SequenceRecord Objects + """ cdef: uint8_t *record_start uint8_t *buffer_end @@ -701,42 +719,16 @@ cdef class BamIter: def __dealloc__(self): PyMem_Free(self.read_in_buffer) - def __cinit__(self, fileobj, read_in_size = 48 * 1024): + def __cinit__(self, fileobj, read_in_size = 48 * 1024, with_header = True): if read_in_size < 4: raise ValueError(f"read_in_size must be at least 4 got " f"{read_in_size}") - # Skip ahead and save the BAM header for later inspection - magic_and_header_size = fileobj.read(8) - if not isinstance(magic_and_header_size, bytes): - raise TypeError(f"fileobj {fileobj} is not a binary IO type, " - f"got {type(fileobj)}") - if len(magic_and_header_size) < 8: - raise EOFError("Truncated BAM file") - if magic_and_header_size[:4] != b"BAM\1": - raise ValueError( - f"fileobj: {fileobj}, is not a BAM file. No BAM magic, instead " - f"found {magic_and_header_size[:4]}") - l_text = int.from_bytes(magic_and_header_size[4:], "little", signed=False) - header = fileobj.read(l_text) - if len(header) < l_text: - raise EOFError("Truncated BAM file") - n_ref_obj = fileobj.read(4) - if len(n_ref_obj) < 4: - raise EOFError("Truncated BAM file") - n_ref = int.from_bytes(n_ref_obj, "little", signed=False) - for i in range(n_ref): - l_name_obj = fileobj.read(4) - if len(l_name_obj) < 4: - raise EOFError("Truncated BAM file") - l_name = int.from_bytes(l_name_obj, "little", signed=False) - reference_chunk_size = l_name + 4 # Include name and uint32_t of size - reference_chunk = fileobj.read(reference_chunk_size) - if len(reference_chunk) < reference_chunk_size: - raise EOFError("Truncated BAM file") - # Fileobj is now skipped ahead and at the start of the BAM records - - self.header = header + if with_header: + # Skip ahead and save the BAM header for later inspection + self.header = read_bam_header(fileobj) + else: + self.header = b"" self.read_in_size = read_in_size self.file = fileobj self.read_in_buffer = NULL @@ -746,9 +738,9 @@ cdef class BamIter: def __iter__(self): return self - + cdef _read_into_buffer(self): - cdef size_t read_in_size + cdef size_t read_in_size cdef size_t leftover_size = self.buffer_end - self.record_start cdef uint32_t block_size memmove(self.read_in_buffer, self.record_start, leftover_size) @@ -769,7 +761,7 @@ cdef class BamIter: raise StopIteration() elif new_bytes_size == 0: raise EOFError("Incomplete record at the end of file") - cdef uint8_t *tmp + cdef uint8_t *tmp if new_buffer_size > self.read_in_buffer_size: tmp = PyMem_Realloc(self.read_in_buffer, new_buffer_size) if tmp == NULL: @@ -802,6 +794,7 @@ cdef class BamIter: self._read_into_buffer() continue record_size = (record_start)[0] + print(record_size) record_end = record_start + 4 + record_size if record_end > buffer_end: self._read_into_buffer() diff --git a/src/dnaio/readers.py b/src/dnaio/readers.py index 254e9698..f4f7b61c 100644 --- a/src/dnaio/readers.py +++ b/src/dnaio/readers.py @@ -212,13 +212,16 @@ def __init__( buffer_size: int = 128 * 1024, # Buffer size used by cat, pigz etc. opener=xopen, _close_file: Optional[bool] = None, + with_header: bool = True, ): super().__init__(file, opener=opener, _close_file=_close_file) self.sequence_class = sequence_class self.delivers_qualities = True self.buffer_size = buffer_size try: - self._iter: Iterator[SequenceRecord] = BamIter(self._file, self.buffer_size) + self._iter: Iterator[SequenceRecord] = BamIter( + self._file, read_in_size=self.buffer_size, with_header=with_header + ) except Exception: self.close() raise diff --git a/src/dnaio/singleend.py b/src/dnaio/singleend.py index f7ab09f9..3651c189 100644 --- a/src/dnaio/singleend.py +++ b/src/dnaio/singleend.py @@ -64,6 +64,11 @@ def _open_single( return BamReader(file, _close_file=close_file) # This should not be reached raise NotImplementedError("Only reading is supported for BAM files") + elif fileformat == "bam_no_header": + if "r" in mode: + return BamReader(file, _close_file=close_file, with_header=False) + # This should not be reached + raise NotImplementedError("Only reading is supported for headerless BAM files") if close_file: file.close() raise UnknownFileFormat( diff --git a/tests/data/missing_header_no_bgzip_raw_bam_bytes b/tests/data/missing_header_no_bgzip_raw_bam_bytes new file mode 100644 index 0000000000000000000000000000000000000000..dd804d6e438a1380eb29fd3d911db9a25a99ae81 GIT binary patch literal 197 zcmcCyU|{$U1)M-O0|N^KJCYbk#J4gdH8CZ%h{3T#NI)F|g50AV8SIc0@_|%h(d?L) yUy_kp None: assert record.name == "Myheader" +def test_read_raw_bam_no_header_from_memory() -> None: + with open("tests/data/missing_header_no_bgzip_raw_bam_bytes", "rb") as f: + raw_bam = f.read() + in_memory_bam = io.BytesIO(raw_bam) + with dnaio.open(in_memory_bam, fileformat="bam_no_header") as f: + record = next(iter(f)) + assert record.name == "Myheader" + + def test_write(tmp_path, extension) -> None: out_fastq = tmp_path / ("out.fastq" + extension) with dnaio.open(str(out_fastq), mode="w") as f: From 4bbcc2f137be298cc1e41d63a996e309c116bf30 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Tue, 8 Oct 2024 08:42:21 +0200 Subject: [PATCH 2/8] Add chunking for BAM files. --- src/dnaio/_bam.py | 24 ++++++++++++++++-------- src/dnaio/_core.pyi | 2 ++ src/dnaio/_core.pyx | 20 ++++++++++++++++++++ src/dnaio/chunks.py | 14 ++++++++++---- tests/test_chunks.py | 22 ++++++++++++++++++++++ 5 files changed, 70 insertions(+), 12 deletions(-) diff --git a/src/dnaio/_bam.py b/src/dnaio/_bam.py index c377ce30..1bc04012 100644 --- a/src/dnaio/_bam.py +++ b/src/dnaio/_bam.py @@ -1,32 +1,40 @@ +import struct from typing import BinaryIO def read_bam_header(fileobj: BinaryIO) -> bytes: - magic_and_header_size = fileobj.read(8) - if not isinstance(magic_and_header_size, bytes): + magic = fileobj.read(4) + if not isinstance(magic, bytes): raise TypeError( f"fileobj {fileobj} is not a binary IO type, " f"got {type(fileobj)}" ) - if len(magic_and_header_size) < 8: + if len(magic) < 4: raise EOFError("Truncated BAM file") - if magic_and_header_size[:4] != b"BAM\1": + if magic[:4] != b"BAM\1": raise ValueError( f"fileobj: {fileobj}, is not a BAM file. No BAM magic, instead " - f"found {magic_and_header_size[:4]}" + f"found {magic}" ) - l_text = int.from_bytes(magic_and_header_size[4:], "little", signed=False) + return read_bam_header_after_magic(fileobj) + + +def read_bam_header_after_magic(fileobj: BinaryIO) -> bytes: + header_size = fileobj.read(4) + if len(header_size) < 4: + raise EOFError("Truncated BAM file") + (l_text,) = struct.unpack(" Tuple[int, int]: ... +def bam_head(buf: ByteString, end: int = sys.maxsize) -> int: ... def records_are_mates( __first_record: SequenceRecord, __second_record: SequenceRecord, diff --git a/src/dnaio/_core.pyx b/src/dnaio/_core.pyx index 50624c61..bdefb6cd 100644 --- a/src/dnaio/_core.pyx +++ b/src/dnaio/_core.pyx @@ -5,6 +5,7 @@ from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_AS_STRING, PyBytes from cpython.mem cimport PyMem_Free, PyMem_Malloc, PyMem_Realloc from cpython.unicode cimport PyUnicode_CheckExact, PyUnicode_GET_LENGTH, PyUnicode_DecodeASCII from cpython.object cimport Py_TYPE, PyTypeObject +from cpython.pyport cimport PY_SSIZE_T_MAX from cpython.ref cimport PyObject from cpython.tuple cimport PyTuple_GET_ITEM from libc.string cimport memcmp, memcpy, memchr, strcspn, strspn, memmove @@ -433,6 +434,25 @@ def paired_fastq_heads(buf1, buf2, Py_ssize_t end1, Py_ssize_t end2): return record_start1 - data1, record_start2 - data2 +def bam_head(buf, Py_ssize_t end = PY_SSIZE_T_MAX): + """Return the end of the last complete BAM record in the buf.""" + cdef Py_buffer buffer; + PyObject_GetBuffer(buf, &buffer, PyBUF_SIMPLE) + cdef: + uint8_t *buffer_start = buffer.buf + uint8_t *record_start = buffer_start + uint8_t *buffer_end = buffer_start + min(end, buffer.len) + uint32_t block_size + size_t record_size + + while (record_start + 4) < buffer_end: + record_size = (record_start)[0] + 4 + if (record_start + record_size) > buffer_end: + break + record_start += record_size + return (record_start - buffer_start) + + cdef class FastqIter: """ Parse a FASTQ file and yield SequenceRecord objects diff --git a/src/dnaio/chunks.py b/src/dnaio/chunks.py index 233d1f13..53b12bc5 100644 --- a/src/dnaio/chunks.py +++ b/src/dnaio/chunks.py @@ -9,7 +9,9 @@ from io import RawIOBase from typing import Optional, Iterator, Tuple +from ._bam import read_bam_header_after_magic from ._core import paired_fastq_heads as _paired_fastq_heads +from ._core import bam_head as _bam_head from .exceptions import FileFormatError, FastaFormatError, UnknownFileFormat @@ -104,19 +106,23 @@ def read_chunks(f: RawIOBase, buffer_size: int = 4 * 1024**2) -> Iterator[memory # Read one byte to determine file format. # If there is a comment char, we assume FASTA! - start = f.readinto(memoryview(buf)[0:1]) + start = f.readinto(memoryview(buf)[0:4]) if start == 0: # Empty file return - assert start == 1 + assert start == 4 if buf[0:1] == b"@": head = _fastq_head elif buf[0:1] == b"#" or buf[0:1] == b">": head = _fasta_head + elif buf[0:4] == b"BAM\x01": + head = _bam_head + _ = read_bam_header_after_magic(f) + start = 0 # Skip header and start at the records. else: raise UnknownFileFormat( - f"Cannnot determine input file format: First character expected to be '>' or '@', " - f"but found {repr(chr(buf[0]))}" + f"Cannnot determine input file format: First characters expected " + f"to be '>'. '@', or 'BAM\1', but found {repr(buf[0:4])}" ) # Layout of buf diff --git a/tests/test_chunks.py b/tests/test_chunks.py index 366e545c..6c48c4e4 100644 --- a/tests/test_chunks.py +++ b/tests/test_chunks.py @@ -1,3 +1,5 @@ +import gzip +import struct import textwrap from pytest import raises @@ -7,6 +9,7 @@ from dnaio import UnknownFileFormat, FileFormatError from dnaio._core import paired_fastq_heads from dnaio.chunks import ( + _bam_head, _fastq_head, _fasta_head, read_chunks, @@ -41,6 +44,13 @@ def test_fasta_head_with_comment(): assert _fasta_head(b"#\n>3\n5\n>") == 7 +def test_bam_head(): + assert _bam_head(struct.pack(" Date: Tue, 8 Oct 2024 10:28:17 +0200 Subject: [PATCH 3/8] Ensure buffer is properly released before exit --- src/dnaio/_core.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/dnaio/_core.pyx b/src/dnaio/_core.pyx index bdefb6cd..bf312e30 100644 --- a/src/dnaio/_core.pyx +++ b/src/dnaio/_core.pyx @@ -436,7 +436,7 @@ def paired_fastq_heads(buf1, buf2, Py_ssize_t end1, Py_ssize_t end2): def bam_head(buf, Py_ssize_t end = PY_SSIZE_T_MAX): """Return the end of the last complete BAM record in the buf.""" - cdef Py_buffer buffer; + cdef Py_buffer buffer PyObject_GetBuffer(buf, &buffer, PyBUF_SIMPLE) cdef: uint8_t *buffer_start = buffer.buf @@ -450,8 +450,9 @@ def bam_head(buf, Py_ssize_t end = PY_SSIZE_T_MAX): if (record_start + record_size) > buffer_end: break record_start += record_size - return (record_start - buffer_start) - + cdef Py_ssize_t head = (record_start - buffer_start) + PyBuffer_Release(&buffer) + return head cdef class FastqIter: """ From 4d2a4b509f49142cb26c473b762f24974912f107 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Tue, 8 Oct 2024 11:12:18 +0200 Subject: [PATCH 4/8] Remove rogue import --- tests/test_internal.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_internal.py b/tests/test_internal.py index 864d5538..16ea8ab5 100644 --- a/tests/test_internal.py +++ b/tests/test_internal.py @@ -33,8 +33,6 @@ from dnaio.readers import BinaryFileReader from dnaio._core import bytes_ascii_check -from tests.test_open import fileformat - TEST_DATA = Path(__file__).parent / "data" SIMPLE_FASTQ = str(TEST_DATA / "simple.fastq") # files tests/data/simple.fast{q,a} From 596ef7bc08f7bc4a2a30d0aa7f6416559725201e Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Tue, 8 Oct 2024 11:18:52 +0200 Subject: [PATCH 5/8] Silence too complex warning --- src/dnaio/singleend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dnaio/singleend.py b/src/dnaio/singleend.py index 3651c189..7d8d79df 100644 --- a/src/dnaio/singleend.py +++ b/src/dnaio/singleend.py @@ -6,7 +6,7 @@ from .writers import FastaWriter, FastqWriter -def _open_single( +def _open_single( # noqa: C901 file_or_path: Union[str, os.PathLike, BinaryIO], opener, *, From 8ed8a92bbc35961191fa36c8160ec64c7b4be8fa Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Tue, 8 Oct 2024 11:30:47 +0200 Subject: [PATCH 6/8] Fix several typing errors --- src/dnaio/_bam.py | 8 ++++---- src/dnaio/_core.pyi | 2 +- src/dnaio/_core.pyx | 7 +++++-- src/dnaio/chunks.py | 16 +++++++++------- 4 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/dnaio/_bam.py b/src/dnaio/_bam.py index 1bc04012..625b80c7 100644 --- a/src/dnaio/_bam.py +++ b/src/dnaio/_bam.py @@ -1,8 +1,8 @@ import struct -from typing import BinaryIO +from io import BufferedIOBase -def read_bam_header(fileobj: BinaryIO) -> bytes: +def read_bam_header(fileobj: BufferedIOBase) -> bytes: magic = fileobj.read(4) if not isinstance(magic, bytes): raise TypeError( @@ -13,12 +13,12 @@ def read_bam_header(fileobj: BinaryIO) -> bytes: if magic[:4] != b"BAM\1": raise ValueError( f"fileobj: {fileobj}, is not a BAM file. No BAM magic, instead " - f"found {magic}" + f"found {magic!r}" ) return read_bam_header_after_magic(fileobj) -def read_bam_header_after_magic(fileobj: BinaryIO) -> bytes: +def read_bam_header_after_magic(fileobj: BufferedIOBase) -> bytes: header_size = fileobj.read(4) if len(header_size) < 4: raise EOFError("Truncated BAM file") diff --git a/src/dnaio/_core.pyi b/src/dnaio/_core.pyi index 7685d05a..0a4cee35 100644 --- a/src/dnaio/_core.pyi +++ b/src/dnaio/_core.pyi @@ -37,7 +37,7 @@ class SequenceRecord: def paired_fastq_heads( buf1: ByteString, buf2: ByteString, end1: int, end2: int ) -> Tuple[int, int]: ... -def bam_head(buf: ByteString, end: int = sys.maxsize) -> int: ... +def bam_head(buf: bytes, end: Optional[int] = None) -> int: ... def records_are_mates( __first_record: SequenceRecord, __second_record: SequenceRecord, diff --git a/src/dnaio/_core.pyx b/src/dnaio/_core.pyx index bf312e30..d746a31d 100644 --- a/src/dnaio/_core.pyx +++ b/src/dnaio/_core.pyx @@ -434,14 +434,17 @@ def paired_fastq_heads(buf1, buf2, Py_ssize_t end1, Py_ssize_t end2): return record_start1 - data1, record_start2 - data2 -def bam_head(buf, Py_ssize_t end = PY_SSIZE_T_MAX): +def bam_head(buf, end = None): """Return the end of the last complete BAM record in the buf.""" + cdef Py_ssize_t c_end = PY_SSIZE_T_MAX + if end is not None: + c_end = end cdef Py_buffer buffer PyObject_GetBuffer(buf, &buffer, PyBUF_SIMPLE) cdef: uint8_t *buffer_start = buffer.buf uint8_t *record_start = buffer_start - uint8_t *buffer_end = buffer_start + min(end, buffer.len) + uint8_t *buffer_end = buffer_start + min(c_end, buffer.len) uint32_t block_size size_t record_size diff --git a/src/dnaio/chunks.py b/src/dnaio/chunks.py index 53b12bc5..9b13177e 100644 --- a/src/dnaio/chunks.py +++ b/src/dnaio/chunks.py @@ -6,7 +6,7 @@ or subprocess and be parsed and processed there. """ -from io import RawIOBase +from io import BufferedIOBase from typing import Optional, Iterator, Tuple from ._bam import read_bam_header_after_magic @@ -81,7 +81,9 @@ def _fastq_head(buf: bytes, end: Optional[int] = None) -> int: return right + 1 # type: ignore -def read_chunks(f: RawIOBase, buffer_size: int = 4 * 1024**2) -> Iterator[memoryview]: +def read_chunks( + f: BufferedIOBase, buffer_size: int = 4 * 1024**2 +) -> Iterator[memoryview]: """ Read chunks of complete FASTA or FASTQ records from a file. If the format is detected to be FASTQ, all chunks except possibly the last contain @@ -141,7 +143,7 @@ def read_chunks(f: RawIOBase, buffer_size: int = 4 * 1024**2) -> Iterator[memory while True: if start == len(buf): raise OverflowError("FASTA/FASTQ record does not fit into buffer") - bufend = f.readinto(memoryview(buf)[start:]) + start # type: ignore + bufend = f.readinto(memoryview(buf)[start:]) + start if start == bufend: # End of file break @@ -158,8 +160,8 @@ def read_chunks(f: RawIOBase, buffer_size: int = 4 * 1024**2) -> Iterator[memory def read_paired_chunks( - f: RawIOBase, - f2: RawIOBase, + f: BufferedIOBase, + f2: BufferedIOBase, buffer_size: int = 4 * 1024**2, ) -> Iterator[Tuple[memoryview, memoryview]]: """ @@ -228,8 +230,8 @@ def read_paired_chunks( raise ValueError( f"FASTA/FASTQ records do not fit into buffer of size {buffer_size}" ) - bufend1 = f.readinto(memoryview(buf1)[start1:]) + start1 # type: ignore - bufend2 = f2.readinto(memoryview(buf2)[start2:]) + start2 # type: ignore + bufend1 = f.readinto(memoryview(buf1)[start1:]) + start1 + bufend2 = f2.readinto(memoryview(buf2)[start2:]) + start2 if start1 == bufend1 and start2 == bufend2: break From 8a35f9ec8fbe54f2887c4948040d1a538a017fb7 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Tue, 8 Oct 2024 14:03:38 +0200 Subject: [PATCH 7/8] Remove print statement --- src/dnaio/_core.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/src/dnaio/_core.pyx b/src/dnaio/_core.pyx index d746a31d..714e0c59 100644 --- a/src/dnaio/_core.pyx +++ b/src/dnaio/_core.pyx @@ -818,7 +818,6 @@ cdef class BamIter: self._read_into_buffer() continue record_size = (record_start)[0] - print(record_size) record_end = record_start + 4 + record_size if record_end > buffer_end: self._read_into_buffer() From 497310f589f4c1d4be103a9d6045d27dcab977fa Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 4 Nov 2024 08:43:27 +0100 Subject: [PATCH 8/8] Improve user messages --- src/dnaio/_bam.py | 4 ++-- tests/test_internal.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dnaio/_bam.py b/src/dnaio/_bam.py index 625b80c7..11ea1e72 100644 --- a/src/dnaio/_bam.py +++ b/src/dnaio/_bam.py @@ -6,13 +6,13 @@ def read_bam_header(fileobj: BufferedIOBase) -> bytes: magic = fileobj.read(4) if not isinstance(magic, bytes): raise TypeError( - f"fileobj {fileobj} is not a binary IO type, " f"got {type(fileobj)}" + f"fileobj {fileobj} (type: {type(fileobj)}), was not opened in binary mode." ) if len(magic) < 4: raise EOFError("Truncated BAM file") if magic[:4] != b"BAM\1": raise ValueError( - f"fileobj: {fileobj}, is not a BAM file. No BAM magic, instead " + f"fileobj: {fileobj}, is not a BAM file. No BAM file signature, instead " f"found {magic!r}" ) return read_bam_header_after_magic(fileobj) diff --git a/tests/test_internal.py b/tests/test_internal.py index 16ea8ab5..7b463574 100644 --- a/tests/test_internal.py +++ b/tests/test_internal.py @@ -785,7 +785,7 @@ def test_bam_parser_not_binary_error(self): ) with pytest.raises(TypeError) as error: BamReader(file) - error.match("binary IO") + error.match("binary mode") @pytest.mark.parametrize("buffersize", [4, 8, 10, 20, 40]) def test_small_buffersize(self, buffersize):