Merge pull request #116 from rhpvorderman/bamparser
Add a simple BAM parser to dnaio
marcelm authored Nov 2, 2023
2 parents 2ea2078 + 9afbee7 commit 650e423
Showing 18 changed files with 792 additions and 15 deletions.
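
The new reader is exported as dnaio.BamReader (see src/dnaio/__init__.py below) and is aimed at unmapped single-read BAM files such as ONT uBAM output. A minimal usage sketch, assuming dnaio.open() picks up the ".bam" extension the same way it does for FASTA/FASTQ (the exact entry point may differ):

    import dnaio

    # Minimal sketch; assumes dnaio.open() recognizes the ".bam" extension
    # and that the file contains unmapped single-end reads (e.g. an ONT uBAM).
    with dnaio.open("reads.bam") as reader:
        for record in reader:
            print(record.name, record.sequence[:20], record.qualities[:20])
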
1 change: 1 addition & 0 deletions .gitattributes
@@ -1,2 +1,3 @@
*.fastq -crlf
*.fasta -crlf
*.sam -crlf
6 changes: 6 additions & 0 deletions .github/workflows/ci.yml
@@ -51,11 +51,15 @@ jobs:
matrix:
os: [ubuntu-latest]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
compile_flags: [""]
include:
- os: macos-latest
python-version: "3.10"
- os: windows-latest
python-version: "3.10"
- os: ubuntu-latest
python-version: "3.10"
compile_flags: "-mssse3"
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
@@ -66,6 +70,8 @@ jobs:
run: python -m pip install tox
- name: Test
run: tox -e py
env:
CFLAGS: ${{ matrix.compile_flags }}
- name: Upload coverage report
uses: codecov/codecov-action@v3

5 changes: 4 additions & 1 deletion pyproject.toml
@@ -46,7 +46,10 @@ write_to = "src/dnaio/_version.py"
testpaths = ["tests"]

[tool.cibuildwheel]
environment = "CFLAGS=-g0"
environment_windows = "CFLAGS=-g0 -DNDEBUG"
environment_macos = "CFLAGS=-g0 -DNDEBUG"
environment_linux = "CFLAGS=-g0 -DNDEBUG -mssse3"

test-requires = "pytest"
test-command = ["cd {project}", "pytest tests"]

3 changes: 2 additions & 1 deletion src/dnaio/__init__.py
@@ -4,6 +4,7 @@

__all__ = [
"open",
"BamReader",
"SequenceRecord",
"SingleEndReader",
"PairedEndReader",
@@ -41,7 +42,7 @@
)
from ._core import record_names_match # noqa: F401 # deprecated
from ._core import records_are_mates
from .readers import FastaReader, FastqReader
from .readers import BamReader, FastaReader, FastqReader
from .writers import FastaWriter, FastqWriter
from .singleend import _open_single
from .pairedend import (
9 changes: 9 additions & 0 deletions src/dnaio/_core.pyi
@@ -53,6 +53,15 @@ class FastqIter(Generic[T]):
@property
def number_of_records(self) -> int: ...

class BamIter:
def __init__(self, file: BinaryIO, buffer_size: int): ...
def __iter__(self) -> Iterator[SequenceRecord]: ...
def __next__(self) -> SequenceRecord: ...
@property
def header(self) -> bytes: ...
@property
def number_of_records(self) -> int: ...

# Deprecated
def record_names_match(header1: str, header2: str) -> bool: ...

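The stub above also exposes the lower-level iterator directly. A rough sketch of driving dnaio._core.BamIter by hand, assuming the stream handed to it is already decompressed (BAM files are BGZF-compressed, and BGZF blocks are ordinary gzip members, so Python's gzip module can read them):

    import gzip
    from dnaio._core import BamIter

    # Rough sketch; BamIter expects an uncompressed BAM stream, so the
    # BGZF-compressed file is opened through gzip first. The 48 KiB buffer
    # mirrors the default used in the implementation below.
    with gzip.open("reads.bam", "rb") as f:
        records = BamIter(f, 48 * 1024)
        print(records.header.decode())      # SAM header text stored in the BAM
        count = sum(1 for _ in records)
        print(count, records.number_of_records)
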
168 changes: 168 additions & 0 deletions src/dnaio/_core.pyx
@@ -8,6 +8,8 @@ from cpython.object cimport Py_TYPE, PyTypeObject
from cpython.ref cimport PyObject
from cpython.tuple cimport PyTuple_GET_ITEM
from libc.string cimport memcmp, memcpy, memchr, strcspn, strspn, memmove
from libc.stdint cimport uint8_t, uint16_t, uint32_t, int32_t

cimport cython

cdef extern from "Python.h":
@@ -21,6 +23,10 @@ cdef extern from "ascii_check.h":
cdef extern from "_conversions.h":
const char NUCLEOTIDE_COMPLEMENTS[256]

cdef extern from "bam.h":
void decode_bam_sequence(void *dest, void *encoded_sequence, size_t length)
void decode_bam_qualities(uint8_t *dest, uint8_t *encoded_qualities, size_t length)

from .exceptions import FastqFormatError
from ._util import shorten

@@ -667,6 +673,168 @@ cdef class FastqIter:
self.record_start = qualities_end + 1
return ret_val

cdef struct BamRecordHeader:
uint32_t block_size
int32_t reference_id
int32_t pos
uint8_t l_read_name
uint8_t mapq
uint16_t bin
uint16_t n_cigar_op
uint16_t flag
uint32_t l_seq
int32_t next_ref_id
int32_t next_pos
int32_t tlen

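The struct above mirrors the fixed-length, little-endian portion of every alignment record in the BAM specification (36 bytes including block_size). For illustration, the same layout expressed with Python's struct module (unpack_fixed_header is a hypothetical helper, not part of dnaio):

    import struct

    # block_size, refID, pos, l_read_name, mapq, bin, n_cigar_op, flag,
    # l_seq, next_refID, next_pos, tlen -- 36 bytes, little-endian.
    BAM_FIXED_HEADER = struct.Struct("<IiiBBHHHIiii")

    def unpack_fixed_header(buf: bytes, offset: int = 0) -> tuple:
        """Illustrative only: unpack the fixed header of one BAM record."""
        return BAM_FIXED_HEADER.unpack_from(buf, offset)
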
cdef class BamIter:
cdef:
uint8_t *record_start
uint8_t *buffer_end
size_t read_in_size
uint8_t *read_in_buffer
size_t read_in_buffer_size
object file
readonly object header
readonly Py_ssize_t number_of_records

def __dealloc__(self):
PyMem_Free(self.read_in_buffer)

def __cinit__(self, fileobj, read_in_size = 48 * 1024):
if read_in_size < 4:
raise ValueError(f"read_in_size must be at least 4 got "
f"{read_in_size}")

# Skip ahead and save the BAM header for later inspection
magic_and_header_size = fileobj.read(8)
if not isinstance(magic_and_header_size, bytes):
raise TypeError(f"fileobj {fileobj} is not a binary IO type, "
f"got {type(fileobj)}")
if len(magic_and_header_size) < 8:
raise EOFError("Truncated BAM file")
if magic_and_header_size[:4] != b"BAM\1":
raise ValueError(
f"fileobj: {fileobj}, is not a BAM file. No BAM magic, instead "
f"found {magic_and_header_size[:4]}")
l_text = int.from_bytes(magic_and_header_size[4:], "little", signed=False)
header = fileobj.read(l_text)
if len(header) < l_text:
raise EOFError("Truncated BAM file")
n_ref_obj = fileobj.read(4)
if len(n_ref_obj) < 4:
raise EOFError("Truncated BAM file")
n_ref = int.from_bytes(n_ref_obj, "little", signed=False)
for i in range(n_ref):
l_name_obj = fileobj.read(4)
if len(l_name_obj) < 4:
raise EOFError("Truncated BAM file")
l_name = int.from_bytes(l_name_obj, "little", signed=False)
reference_chunk_size = l_name + 4 # Include name and uint32_t of size
reference_chunk = fileobj.read(reference_chunk_size)
if len(reference_chunk) < reference_chunk_size:
raise EOFError("Truncated BAM file")
# Fileobj is now skipped ahead and at the start of the BAM records

self.header = header
self.read_in_size = read_in_size
self.file = fileobj
self.read_in_buffer = NULL
self.read_in_buffer_size = 0
self.record_start = self.read_in_buffer
self.buffer_end = self.record_start

def __iter__(self):
return self

cdef _read_into_buffer(self):
cdef size_t read_in_size
cdef size_t leftover_size = self.buffer_end - self.record_start
cdef uint32_t block_size
memmove(self.read_in_buffer, self.record_start, leftover_size)
self.record_start = self.read_in_buffer
self.buffer_end = self.record_start + leftover_size
if leftover_size >= 4:
# Immediately check how much data is at least required
block_size = (<uint32_t *>self.record_start)[0]
read_in_size = max(block_size, self.read_in_size)
else:
read_in_size = self.read_in_size - leftover_size
new_bytes = self.file.read(read_in_size)
cdef size_t new_bytes_size = PyBytes_GET_SIZE(new_bytes)
cdef uint8_t *new_bytes_buf = <uint8_t *>PyBytes_AS_STRING(new_bytes)
cdef size_t new_buffer_size = leftover_size + new_bytes_size
if new_buffer_size == 0:
# File completely read
raise StopIteration()
elif new_bytes_size == 0:
raise EOFError("Incomplete record at the end of file")
cdef uint8_t *tmp
if new_buffer_size > self.read_in_buffer_size:
tmp = <uint8_t *>PyMem_Realloc(self.read_in_buffer, new_buffer_size)
if tmp == NULL:
raise MemoryError()
self.read_in_buffer = tmp
self.read_in_buffer_size = new_buffer_size
memcpy(self.read_in_buffer + leftover_size, new_bytes_buf, new_bytes_size)
self.record_start = self.read_in_buffer
self.buffer_end = self.record_start + new_buffer_size

def __next__(self):
cdef:
SequenceRecord seq_record
uint8_t *record_start
uint8_t *buffer_end
uint32_t record_size
uint8_t *record_end
cdef BamRecordHeader header
cdef uint8_t *bam_name_start
uint32_t name_length
uint8_t *bam_seq_start
uint32_t seq_length
uint8_t *bam_qual_start
uint32_t encoded_seq_length

while True:
record_start = self.record_start
buffer_end = self.buffer_end
if record_start + 4 >= buffer_end:
self._read_into_buffer()
continue
record_size = (<uint32_t *>record_start)[0]
record_end = record_start + 4 + record_size
if record_end > buffer_end:
self._read_into_buffer()
continue
header = (<BamRecordHeader *>record_start)[0]
if header.flag != 4:
raise NotImplementedError(
"The BAM parser has been implemented with unmapped single "
"reads in mind to support ONT sequencing input. Mapped "
"BAM files or files with multiple reads are not supported. "
"Please use samtools fastq to make the appropriate "
"conversion to FASTQ format."
)
bam_name_start = record_start + sizeof(BamRecordHeader)
name_length = header.l_read_name
bam_seq_start = bam_name_start + name_length + header.n_cigar_op * sizeof(uint32_t)
name_length -= 1 # Do not include the null byte
seq_length = header.l_seq
encoded_seq_length = (seq_length + 1) // 2
bam_qual_start = bam_seq_start + encoded_seq_length
name = PyUnicode_New(name_length, 127)
sequence = PyUnicode_New(seq_length, 127)
qualities = PyUnicode_New(seq_length, 127)
memcpy(<uint8_t *>PyUnicode_DATA(name), bam_name_start, name_length)
decode_bam_sequence(<uint8_t *>PyUnicode_DATA(sequence), bam_seq_start, seq_length)
decode_bam_qualities(<uint8_t *>PyUnicode_DATA(qualities), bam_qual_start, seq_length)
seq_record = SequenceRecord.__new__(SequenceRecord)
seq_record._name = name
seq_record._sequence = sequence
seq_record._qualities = qualities
self.number_of_records += 1
self.record_start = record_end
return seq_record
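
The pointer arithmetic in __next__ boils down to fixed offsets within one record's bytes. A hedged Python sketch of the same slicing (split_bam_record is illustrative only and skips the CIGAR data just like the parser above):

    def split_bam_record(record: bytes, l_read_name: int, n_cigar_op: int, l_seq: int):
        """Illustrative slicing of one record, given fields from its fixed
        header; `record` starts at the 4-byte block_size field."""
        name_start = 36                                          # fixed header size
        name = record[name_start:name_start + l_read_name - 1]   # drop trailing NUL
        seq_start = name_start + l_read_name + n_cigar_op * 4    # skip CIGAR ops
        packed_len = (l_seq + 1) // 2                            # two bases per byte
        packed_seq = record[seq_start:seq_start + packed_len]
        quals = record[seq_start + packed_len:seq_start + packed_len + l_seq]
        return name, packed_seq, quals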

def record_names_match(header1: str, header2: str):
"""
127 changes: 127 additions & 0 deletions src/dnaio/bam.h
@@ -0,0 +1,127 @@
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <assert.h>

#ifdef __SSE2__
#include "emmintrin.h"
#endif

#ifdef __SSSE3__
#include "tmmintrin.h"
#endif

static void
decode_bam_sequence(uint8_t *dest, const uint8_t *encoded_sequence, size_t length)
{
/* Reuse a trick from sam_internal.h in htslib. Have a table to lookup
two characters simultaneously.*/
static const char code2base[512] =
"===A=C=M=G=R=S=V=T=W=Y=H=K=D=B=N"
"A=AAACAMAGARASAVATAWAYAHAKADABAN"
"C=CACCCMCGCRCSCVCTCWCYCHCKCDCBCN"
"M=MAMCMMMGMRMSMVMTMWMYMHMKMDMBMN"
"G=GAGCGMGGGRGSGVGTGWGYGHGKGDGBGN"
"R=RARCRMRGRRRSRVRTRWRYRHRKRDRBRN"
"S=SASCSMSGSRSSSVSTSWSYSHSKSDSBSN"
"V=VAVCVMVGVRVSVVVTVWVYVHVKVDVBVN"
"T=TATCTMTGTRTSTVTTTWTYTHTKTDTBTN"
"W=WAWCWMWGWRWSWVWTWWWYWHWKWDWBWN"
"Y=YAYCYMYGYRYSYVYTYWYYYHYKYDYBYN"
"H=HAHCHMHGHRHSHVHTHWHYHHHKHDHBHN"
"K=KAKCKMKGKRKSKVKTKWKYKHKKKDKBKN"
"D=DADCDMDGDRDSDVDTDWDYDHDKDDDBDN"
"B=BABCBMBGBRBSBVBTBWBYBHBKBDBBBN"
"N=NANCNMNGNRNSNVNTNWNYNHNKNDNBNN";
static const uint8_t *nuc_lookup = (uint8_t *)"=ACMGRSVTWYHKDBN";
const uint8_t *dest_end_ptr = dest + length;
uint8_t *dest_cursor = dest;
const uint8_t *encoded_cursor = encoded_sequence;
#ifdef __SSSE3__
const uint8_t *dest_vec_end_ptr = dest_end_ptr - (2 * sizeof(__m128i));
__m128i first_upper_shuffle = _mm_setr_epi8(
0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7, 0xff);
__m128i first_lower_shuffle = _mm_setr_epi8(
0xff, 0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7);
__m128i second_upper_shuffle = _mm_setr_epi8(
8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15, 0xff);
__m128i second_lower_shuffle = _mm_setr_epi8(
0xff, 8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15);
__m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)nuc_lookup);
/* Work on 16 encoded characters at the time resulting in 32 decoded characters
Examples are given for 8 encoded characters A until H to keep it readable.
Encoded stored as |AB|CD|EF|GH|
Shuffle into |AB|00|CD|00|EF|00|GH|00| and
|00|AB|00|CD|00|EF|00|GH|
Shift upper to the right resulting into
|0A|B0|0C|D0|0E|F0|0G|H0| and
|00|AB|00|CD|00|EF|00|GH|
Merge with or resulting into (X stands for garbage)
|0A|XB|0C|XD|0E|XF|0G|XH|
Bitwise and with 0b1111 leads to:
|0A|0B|0C|0D|0E|0F|0G|0H|
We can use the resulting 4-bit integers as indexes for the shuffle of
the nucleotide lookup. */
while (dest_cursor < dest_vec_end_ptr) {
__m128i encoded = _mm_lddqu_si128((__m128i *)encoded_cursor);

__m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle);
__m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle);
__m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4);
__m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower);
__m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(0b1111));
__m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes);
_mm_storeu_si128((__m128i *)dest_cursor, first_nucleotides);

__m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle);
__m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle);
__m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4);
__m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower);
__m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(0b1111));
__m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes);
_mm_storeu_si128((__m128i *)(dest_cursor + 16), second_nucleotides);

encoded_cursor += sizeof(__m128i);
dest_cursor += 2 * sizeof(__m128i);
}
#endif
/* Do two at the time until it gets to the last even address. */
const uint8_t *dest_end_ptr_twoatatime = dest + (length & (~1ULL));
while (dest_cursor < dest_end_ptr_twoatatime) {
/* According to htslib, size_t cast helps the optimizer.
Code confirmed to indeed run faster. */
memcpy(dest_cursor, code2base + ((size_t)*encoded_cursor * 2), 2);
dest_cursor += 2;
encoded_cursor += 1;
}
assert((dest_end_ptr - dest_cursor) < 2);
if (dest_cursor != dest_end_ptr) {
/* There is a single encoded nuc left */
uint8_t encoded_nucs = *encoded_cursor;
uint8_t upper_nuc_index = encoded_nucs >> 4;
dest_cursor[0] = nuc_lookup[upper_nuc_index];
}
}

static void
decode_bam_qualities(uint8_t *dest, const uint8_t *encoded_qualities, size_t length)
{
const uint8_t *end_ptr = encoded_qualities + length;
const uint8_t *cursor = encoded_qualities;
uint8_t *dest_cursor = dest;
#ifdef __SSE2__
const uint8_t *vec_end_ptr = end_ptr - sizeof(__m128i);
while (cursor < vec_end_ptr) {
__m128i quals = _mm_loadu_si128((__m128i *)cursor);
__m128i phreds = _mm_add_epi8(quals, _mm_set1_epi8(33));
_mm_storeu_si128((__m128i *)dest_cursor, phreds);
cursor += sizeof(__m128i);
dest_cursor += sizeof(__m128i);
}
#endif
while (cursor < end_ptr) {
*dest_cursor = *cursor + 33;
cursor += 1;
dest_cursor += 1;
}
}
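
For reference, a plain Python rendering of what these two functions compute (the scalar fallback paths only, without the SSE2/SSSE3 vectorization; the _py names are illustrative):

    _NUC_LOOKUP = "=ACMGRSVTWYHKDBN"

    def decode_bam_sequence_py(packed: bytes, length: int) -> str:
        # Each byte packs two 4-bit nucleotide codes, high nibble first.
        bases = []
        for byte in packed:
            bases.append(_NUC_LOOKUP[byte >> 4])
            bases.append(_NUC_LOOKUP[byte & 0x0F])
        return "".join(bases[:length])   # drop the padding nibble for odd lengths

    def decode_bam_qualities_py(raw: bytes) -> str:
        # BAM stores raw phred scores; adding 33 gives printable FASTQ qualities.
        return "".join(chr(q + 33) for q in raw)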
