diff --git a/.gitattributes b/.gitattributes index 985eb903..bd538c3a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,3 @@ *.fastq -crlf *.fasta -crlf +*.sam -crlf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 74930a2b..26d18fa8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -51,11 +51,15 @@ jobs: matrix: os: [ubuntu-latest] python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + compile_flags: [""] include: - os: macos-latest python-version: "3.10" - os: windows-latest python-version: "3.10" + - os: ubuntu-latest + python-version: "3.10" + compile_flags: "-mssse3" steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} @@ -66,6 +70,8 @@ jobs: run: python -m pip install tox - name: Test run: tox -e py + env: + CFLAGS: ${{ matrix.compile_flags }} - name: Upload coverage report uses: codecov/codecov-action@v3 diff --git a/pyproject.toml b/pyproject.toml index 2e4ab70a..2f15e953 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,10 @@ write_to = "src/dnaio/_version.py" testpaths = ["tests"] [tool.cibuildwheel] -environment = "CFLAGS=-g0" +environment_windows = "CFLAGS=-g0 -DNDEBUG" +environment_macos = "CFLAGS=-g0 -DNDEBUG" +environment_linux = "CFLAGS=-g0 -DNDEBUG -mssse3" + test-requires = "pytest" test-command = ["cd {project}", "pytest tests"] diff --git a/src/dnaio/__init__.py b/src/dnaio/__init__.py index b7048d4e..07506dc2 100644 --- a/src/dnaio/__init__.py +++ b/src/dnaio/__init__.py @@ -4,6 +4,7 @@ __all__ = [ "open", + "BamReader", "SequenceRecord", "SingleEndReader", "PairedEndReader", @@ -41,7 +42,7 @@ ) from ._core import record_names_match # noqa: F401 # deprecated from ._core import records_are_mates -from .readers import FastaReader, FastqReader +from .readers import BamReader, FastaReader, FastqReader from .writers import FastaWriter, FastqWriter from .singleend import _open_single from .pairedend import ( diff --git a/src/dnaio/_core.pyi b/src/dnaio/_core.pyi index ceacfc4c..5a729a4f 100644 --- a/src/dnaio/_core.pyi +++ b/src/dnaio/_core.pyi @@ -53,6 +53,15 @@ class FastqIter(Generic[T]): @property def number_of_records(self) -> int: ... +class BamIter: + def __init__(self, file: BinaryIO, buffer_size: int): ... + def __iter__(self) -> Iterator[SequenceRecord]: ... + def __next__(self) -> SequenceRecord: ... + @property + def header(self) -> bytes: ... + @property + def number_of_records(self) -> int: ... + # Deprecated def record_names_match(header1: str, header2: str) -> bool: ... diff --git a/src/dnaio/_core.pyx b/src/dnaio/_core.pyx index 1cbcb09f..288a9f8f 100644 --- a/src/dnaio/_core.pyx +++ b/src/dnaio/_core.pyx @@ -8,6 +8,8 @@ from cpython.object cimport Py_TYPE, PyTypeObject from cpython.ref cimport PyObject from cpython.tuple cimport PyTuple_GET_ITEM from libc.string cimport memcmp, memcpy, memchr, strcspn, strspn, memmove +from libc.stdint cimport uint8_t, uint16_t, uint32_t, int32_t + cimport cython cdef extern from "Python.h": @@ -21,6 +23,10 @@ cdef extern from "ascii_check.h": cdef extern from "_conversions.h": const char NUCLEOTIDE_COMPLEMENTS[256] +cdef extern from "bam.h": + void decode_bam_sequence(void *dest, void *encoded_sequence, size_t length) + void decode_bam_qualities(uint8_t *dest, uint8_t *encoded_qualities, size_t length) + from .exceptions import FastqFormatError from ._util import shorten @@ -667,6 +673,168 @@ cdef class FastqIter: self.record_start = qualities_end + 1 return ret_val +cdef struct BamRecordHeader: + uint32_t block_size + int32_t reference_id + int32_t pos + uint8_t l_read_name + uint8_t mapq + uint16_t bin + uint16_t n_cigar_op + uint16_t flag + uint32_t l_seq + int32_t next_ref_id + int32_t next_pos + int32_t tlen + +cdef class BamIter: + cdef: + uint8_t *record_start + uint8_t *buffer_end + size_t read_in_size + uint8_t *read_in_buffer + size_t read_in_buffer_size + object file + readonly object header + readonly Py_ssize_t number_of_records + + def __dealloc__(self): + PyMem_Free(self.read_in_buffer) + + def __cinit__(self, fileobj, read_in_size = 48 * 1024): + if read_in_size < 4: + raise ValueError(f"read_in_size must be at least 4 got " + f"{read_in_size}") + + # Skip ahead and save the BAM header for later inspection + magic_and_header_size = fileobj.read(8) + if not isinstance(magic_and_header_size, bytes): + raise TypeError(f"fileobj {fileobj} is not a binary IO type, " + f"got {type(fileobj)}") + if len(magic_and_header_size) < 8: + raise EOFError("Truncated BAM file") + if magic_and_header_size[:4] != b"BAM\1": + raise ValueError( + f"fileobj: {fileobj}, is not a BAM file. No BAM magic, instead " + f"found {magic_and_header_size[:4]}") + l_text = int.from_bytes(magic_and_header_size[4:], "little", signed=False) + header = fileobj.read(l_text) + if len(header) < l_text: + raise EOFError("Truncated BAM file") + n_ref_obj = fileobj.read(4) + if len(n_ref_obj) < 4: + raise EOFError("Truncated BAM file") + n_ref = int.from_bytes(n_ref_obj, "little", signed=False) + for i in range(n_ref): + l_name_obj = fileobj.read(4) + if len(l_name_obj) < 4: + raise EOFError("Truncated BAM file") + l_name = int.from_bytes(l_name_obj, "little", signed=False) + reference_chunk_size = l_name + 4 # Include name and uint32_t of size + reference_chunk = fileobj.read(reference_chunk_size) + if len(reference_chunk) < reference_chunk_size: + raise EOFError("Truncated BAM file") + # Fileobj is now skipped ahead and at the start of the BAM records + + self.header = header + self.read_in_size = read_in_size + self.file = fileobj + self.read_in_buffer = NULL + self.read_in_buffer_size = 0 + self.record_start = self.read_in_buffer + self.buffer_end = self.record_start + + def __iter__(self): + return self + + cdef _read_into_buffer(self): + cdef size_t read_in_size + cdef size_t leftover_size = self.buffer_end - self.record_start + cdef uint32_t block_size + memmove(self.read_in_buffer, self.record_start, leftover_size) + self.record_start = self.read_in_buffer + self.buffer_end = self.record_start + leftover_size + if leftover_size >= 4: + # Immediately check how much data is at least required + block_size = (self.record_start)[0] + read_in_size = max(block_size, self.read_in_size) + else: + read_in_size = self.read_in_size - leftover_size + new_bytes = self.file.read(read_in_size) + cdef size_t new_bytes_size = PyBytes_GET_SIZE(new_bytes) + cdef uint8_t *new_bytes_buf = PyBytes_AS_STRING(new_bytes) + cdef size_t new_buffer_size = leftover_size + new_bytes_size + if new_buffer_size == 0: + # File completely read + raise StopIteration() + elif new_bytes_size == 0: + raise EOFError("Incomplete record at the end of file") + cdef uint8_t *tmp + if new_buffer_size > self.read_in_buffer_size: + tmp = PyMem_Realloc(self.read_in_buffer, new_buffer_size) + if tmp == NULL: + raise MemoryError() + self.read_in_buffer = tmp + self.read_in_buffer_size = new_buffer_size + memcpy(self.read_in_buffer + leftover_size, new_bytes_buf, new_bytes_size) + self.record_start = self.read_in_buffer + self.buffer_end = self.record_start + new_buffer_size + + def __next__(self): + cdef: + SequenceRecord seq_record + uint8_t *record_start + uint8_t *buffer_end + uint32_t record_size + uint8_t *record_end + cdef BamRecordHeader header + cdef uint8_t *bam_name_start + uint32_t name_length + uint8_t *bam_seq_start + uint32_t seq_length + uint8_t *bam_qual_start + uint32_t encoded_seq_length + + while True: + record_start = self.record_start + buffer_end = self.buffer_end + if record_start + 4 >= buffer_end: + self._read_into_buffer() + continue + record_size = (record_start)[0] + record_end = record_start + 4 + record_size + if record_end > buffer_end: + self._read_into_buffer() + continue + header = (record_start)[0] + if header.flag != 4: + raise NotImplementedError( + "The BAM parser has been implemented with unmapped single " + "reads in mind to support ONT sequencing input. Mapped " + "BAM files or files with multiple reads are not supported. " + "Please use samtools fastq to make the appropriate " + "conversion to FASTQ format." + ) + bam_name_start = record_start + sizeof(BamRecordHeader) + name_length = header.l_read_name + bam_seq_start = bam_name_start + name_length + header.n_cigar_op * sizeof(uint32_t) + name_length -= 1 # Do not include the null byte + seq_length = header.l_seq + encoded_seq_length = (seq_length + 1) // 2 + bam_qual_start = bam_seq_start + encoded_seq_length + name = PyUnicode_New(name_length, 127) + sequence = PyUnicode_New(seq_length, 127) + qualities = PyUnicode_New(seq_length, 127) + memcpy(PyUnicode_DATA(name), bam_name_start, name_length) + decode_bam_sequence(PyUnicode_DATA(sequence), bam_seq_start, seq_length) + decode_bam_qualities(PyUnicode_DATA(qualities), bam_qual_start, seq_length) + seq_record = SequenceRecord.__new__(SequenceRecord) + seq_record._name = name + seq_record._sequence = sequence + seq_record._qualities = qualities + self.number_of_records += 1 + self.record_start = record_end + return seq_record def record_names_match(header1: str, header2: str): """ diff --git a/src/dnaio/bam.h b/src/dnaio/bam.h new file mode 100644 index 00000000..a718feaa --- /dev/null +++ b/src/dnaio/bam.h @@ -0,0 +1,127 @@ +#include +#include +#include +#include + +#ifdef __SSE2__ +#include "emmintrin.h" +#endif + +#ifdef __SSSE3__ +#include "tmmintrin.h" +#endif + +static void +decode_bam_sequence(uint8_t *dest, const uint8_t *encoded_sequence, size_t length) +{ + /* Reuse a trick from sam_internal.h in htslib. Have a table to lookup + two characters simultaneously.*/ + static const char code2base[512] = + "===A=C=M=G=R=S=V=T=W=Y=H=K=D=B=N" + "A=AAACAMAGARASAVATAWAYAHAKADABAN" + "C=CACCCMCGCRCSCVCTCWCYCHCKCDCBCN" + "M=MAMCMMMGMRMSMVMTMWMYMHMKMDMBMN" + "G=GAGCGMGGGRGSGVGTGWGYGHGKGDGBGN" + "R=RARCRMRGRRRSRVRTRWRYRHRKRDRBRN" + "S=SASCSMSGSRSSSVSTSWSYSHSKSDSBSN" + "V=VAVCVMVGVRVSVVVTVWVYVHVKVDVBVN" + "T=TATCTMTGTRTSTVTTTWTYTHTKTDTBTN" + "W=WAWCWMWGWRWSWVWTWWWYWHWKWDWBWN" + "Y=YAYCYMYGYRYSYVYTYWYYYHYKYDYBYN" + "H=HAHCHMHGHRHSHVHTHWHYHHHKHDHBHN" + "K=KAKCKMKGKRKSKVKTKWKYKHKKKDKBKN" + "D=DADCDMDGDRDSDVDTDWDYDHDKDDDBDN" + "B=BABCBMBGBRBSBVBTBWBYBHBKBDBBBN" + "N=NANCNMNGNRNSNVNTNWNYNHNKNDNBNN"; + static const uint8_t *nuc_lookup = (uint8_t *)"=ACMGRSVTWYHKDBN"; + const uint8_t *dest_end_ptr = dest + length; + uint8_t *dest_cursor = dest; + const uint8_t *encoded_cursor = encoded_sequence; + #ifdef __SSSE3__ + const uint8_t *dest_vec_end_ptr = dest_end_ptr - (2 * sizeof(__m128i)); + __m128i first_upper_shuffle = _mm_setr_epi8( + 0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7, 0xff); + __m128i first_lower_shuffle = _mm_setr_epi8( + 0xff, 0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7); + __m128i second_upper_shuffle = _mm_setr_epi8( + 8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15, 0xff); + __m128i second_lower_shuffle = _mm_setr_epi8( + 0xff, 8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15); + __m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)nuc_lookup); + /* Work on 16 encoded characters at the time resulting in 32 decoded characters + Examples are given for 8 encoded characters A until H to keep it readable. + Encoded stored as |AB|CD|EF|GH| + Shuffle into |AB|00|CD|00|EF|00|GH|00| and + |00|AB|00|CD|00|EF|00|GH| + Shift upper to the right resulting into + |0A|B0|0C|D0|0E|F0|0G|H0| and + |00|AB|00|CD|00|EF|00|GH| + Merge with or resulting into (X stands for garbage) + |0A|XB|0C|XD|0E|XF|0G|XH| + Bitwise and with 0b1111 leads to: + |0A|0B|0C|0D|0E|0F|0G|0H| + We can use the resulting 4-bit integers as indexes for the shuffle of + the nucleotide lookup. */ + while (dest_cursor < dest_vec_end_ptr) { + __m128i encoded = _mm_lddqu_si128((__m128i *)encoded_cursor); + + __m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle); + __m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle); + __m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4); + __m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower); + __m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(0b1111)); + __m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes); + _mm_storeu_si128((__m128i *)dest_cursor, first_nucleotides); + + __m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle); + __m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle); + __m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4); + __m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower); + __m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(0b1111)); + __m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes); + _mm_storeu_si128((__m128i *)(dest_cursor + 16), second_nucleotides); + + encoded_cursor += sizeof(__m128i); + dest_cursor += 2 * sizeof(__m128i); + } + #endif + /* Do two at the time until it gets to the last even address. */ + const uint8_t *dest_end_ptr_twoatatime = dest + (length & (~1ULL)); + while (dest_cursor < dest_end_ptr_twoatatime) { + /* According to htslib, size_t cast helps the optimizer. + Code confirmed to indeed run faster. */ + memcpy(dest_cursor, code2base + ((size_t)*encoded_cursor * 2), 2); + dest_cursor += 2; + encoded_cursor += 1; + } + assert((dest_end_ptr - dest_cursor) < 2); + if (dest_cursor != dest_end_ptr) { + /* There is a single encoded nuc left */ + uint8_t encoded_nucs = *encoded_cursor; + uint8_t upper_nuc_index = encoded_nucs >> 4; + dest_cursor[0] = nuc_lookup[upper_nuc_index]; + } +} + +static void +decode_bam_qualities(uint8_t *dest, const uint8_t *encoded_qualities, size_t length) +{ + const uint8_t *end_ptr = encoded_qualities + length; + const uint8_t *cursor = encoded_qualities; + uint8_t *dest_cursor = dest; + #ifdef __SSE2__ + const uint8_t *vec_end_ptr = end_ptr - sizeof(__m128i); + while (cursor < vec_end_ptr) { + __m128i quals = _mm_loadu_si128((__m128i *)cursor); + __m128i phreds = _mm_add_epi8(quals, _mm_set1_epi8(33)); + _mm_storeu_si128((__m128i *)dest_cursor, phreds); + cursor += sizeof(__m128i); + dest_cursor += sizeof(__m128i); + } + #endif + while (cursor < end_ptr) { + *dest_cursor = *cursor + 33; + cursor += 1; + dest_cursor += 1; + } +} \ No newline at end of file diff --git a/src/dnaio/readers.py b/src/dnaio/readers.py index 1ee59129..943b05f6 100644 --- a/src/dnaio/readers.py +++ b/src/dnaio/readers.py @@ -9,7 +9,7 @@ from xopen import xopen -from ._core import FastqIter, SequenceRecord +from ._core import BamIter, FastqIter, SequenceRecord from ._util import shorten as _shorten from .exceptions import FastaFormatError from .interfaces import SingleEndReader @@ -191,3 +191,51 @@ def number_of_records(self): return self._iter.number_of_records except AttributeError: return 0 + + +class BamReader(BinaryFileReader, SingleEndReader): + """ + Reader for BAM files. + + While this class can be instantiated directly, the recommended way is to + use `dnaio.open` with appropriate arguments. + """ + + def __init__( + self, + file: Union[PathLike, str, BinaryIO], + *, + sequence_class=SequenceRecord, + buffer_size: int = 128 * 1024, # Buffer size used by cat, pigz etc. + opener=xopen, + _close_file: Optional[bool] = None, + ): + super().__init__(file, opener=opener, _close_file=_close_file) + self.sequence_class = sequence_class + self.delivers_qualities = True + self.buffer_size = buffer_size + try: + self._iter: Iterator[SequenceRecord] = BamIter(self._file, self.buffer_size) + except Exception: + self.close() + raise + + self.two_headers: bool = False + + def __iter__(self) -> Iterator[SequenceRecord]: + """Iterate over the records in this BAM file.""" + return self._iter + + @property + def number_of_records(self): + try: + return self._iter.number_of_records + except AttributeError: + return 0 + + @property + def header(self): + try: + return self._iter.header + except AttributeError: + return b"" diff --git a/src/dnaio/singleend.py b/src/dnaio/singleend.py index 61f79f29..f7ab09f9 100644 --- a/src/dnaio/singleend.py +++ b/src/dnaio/singleend.py @@ -2,7 +2,7 @@ from typing import Optional, Union, BinaryIO, Tuple from .exceptions import UnknownFileFormat -from .readers import FastaReader, FastqReader +from .readers import BamReader, FastaReader, FastqReader from .writers import FastaWriter, FastqWriter @@ -13,7 +13,7 @@ def _open_single( fileformat: Optional[str] = None, mode: str = "r", qualities: Optional[bool] = None, -) -> Union[FastaReader, FastaWriter, FastqReader, FastqWriter]: +) -> Union[BamReader, FastaReader, FastaWriter, FastqReader, FastqWriter]: """ Open a single sequence file. """ @@ -59,7 +59,11 @@ def _open_single( if "r" in mode: return FastqReader(file, _close_file=close_file) return FastqWriter(file, _close_file=close_file) - + elif fileformat == "bam": + if "r" in mode: + return BamReader(file, _close_file=close_file) + # This should not be reached + raise NotImplementedError("Only reading is supported for BAM files") if close_file: file.close() raise UnknownFileFormat( @@ -108,6 +112,8 @@ def _detect_format_from_name(name: str) -> Optional[str]: return "fasta" elif ext in [".fastq", ".fq"] or (ext == ".txt" and name.endswith("_sequence")): return "fastq" + elif ext == "bam": + return "bam" return None @@ -117,15 +123,17 @@ def _detect_format_from_content(file: BinaryIO) -> Optional[str]: """ if file.seekable(): original_position = file.tell() - first_char = file.read(1) + magic = file.read(4) file.seek(original_position) else: # We cannot always use peek() because BytesIO objects do not suppert it - first_char = file.peek(1)[0:1] # type: ignore - formats = { - b"@": "fastq", - b">": "fasta", - b"#": "fasta", # Some FASTA variants allow comments - b"": "fastq", # Pretend FASTQ for empty input - } - return formats.get(first_char, None) + magic = file.peek(4)[0:4] # type: ignore + if magic.startswith(b"@") or magic == b"": + # Pretend FASTQ for empty input + return "fastq" + elif magic.startswith(b">") or magic.startswith(b"#"): + # Some FASTA variants allow comments + return "fasta" + elif magic == b"BAM\1": + return "bam" + return None diff --git a/tests/data/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.bam b/tests/data/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.bam new file mode 100644 index 00000000..c4933703 Binary files /dev/null and b/tests/data/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.bam differ diff --git a/tests/data/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.header.sam b/tests/data/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.header.sam new file mode 100644 index 00000000..72edc830 --- /dev/null +++ b/tests/data/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.header.sam @@ -0,0 +1,97 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:chrM LN:16571 +@SQ SN:chr1 LN:249250621 +@SQ SN:chr2 LN:243199373 +@SQ SN:chr3 LN:198022430 +@SQ SN:chr4 LN:191154276 +@SQ SN:chr5 LN:180915260 +@SQ SN:chr6 LN:171115067 +@SQ SN:chr7 LN:159138663 +@SQ SN:chr8 LN:146364022 +@SQ SN:chr9 LN:141213431 +@SQ SN:chr10 LN:135534747 +@SQ SN:chr11 LN:135006516 +@SQ SN:chr12 LN:133851895 +@SQ SN:chr13 LN:115169878 +@SQ SN:chr14 LN:107349540 +@SQ SN:chr15 LN:102531392 +@SQ SN:chr16 LN:90354753 +@SQ SN:chr17 LN:81195210 +@SQ SN:chr18 LN:78077248 +@SQ SN:chr19 LN:59128983 +@SQ SN:chr20 LN:63025520 +@SQ SN:chr21 LN:48129895 +@SQ SN:chr22 LN:51304566 +@SQ SN:chrX LN:155270560 +@SQ SN:chrY LN:59373566 +@SQ SN:chr1_gl000191_random LN:106433 +@SQ SN:chr1_gl000192_random LN:547496 +@SQ SN:chr4_ctg9_hap1 LN:590426 +@SQ SN:chr4_gl000193_random LN:189789 +@SQ SN:chr4_gl000194_random LN:191469 +@SQ SN:chr6_apd_hap1 LN:4622290 +@SQ SN:chr6_cox_hap2 LN:4795371 +@SQ SN:chr6_dbb_hap3 LN:4610396 +@SQ SN:chr6_mann_hap4 LN:4683263 +@SQ SN:chr6_mcf_hap5 LN:4833398 +@SQ SN:chr6_qbl_hap6 LN:4611984 +@SQ SN:chr6_ssto_hap7 LN:4928567 +@SQ SN:chr7_gl000195_random LN:182896 +@SQ SN:chr8_gl000196_random LN:38914 +@SQ SN:chr8_gl000197_random LN:37175 +@SQ SN:chr9_gl000198_random LN:90085 +@SQ SN:chr9_gl000199_random LN:169874 +@SQ SN:chr9_gl000200_random LN:187035 +@SQ SN:chr9_gl000201_random LN:36148 +@SQ SN:chr11_gl000202_random LN:40103 +@SQ SN:chr17_ctg5_hap1 LN:1680828 +@SQ SN:chr17_gl000203_random LN:37498 +@SQ SN:chr17_gl000204_random LN:81310 +@SQ SN:chr17_gl000205_random LN:174588 +@SQ SN:chr17_gl000206_random LN:41001 +@SQ SN:chr18_gl000207_random LN:4262 +@SQ SN:chr19_gl000208_random LN:92689 +@SQ SN:chr19_gl000209_random LN:159169 +@SQ SN:chr21_gl000210_random LN:27682 +@SQ SN:chrUn_gl000211 LN:166566 +@SQ SN:chrUn_gl000212 LN:186858 +@SQ SN:chrUn_gl000213 LN:164239 +@SQ SN:chrUn_gl000214 LN:137718 +@SQ SN:chrUn_gl000215 LN:172545 +@SQ SN:chrUn_gl000216 LN:172294 +@SQ SN:chrUn_gl000217 LN:172149 +@SQ SN:chrUn_gl000218 LN:161147 +@SQ SN:chrUn_gl000219 LN:179198 +@SQ SN:chrUn_gl000220 LN:161802 +@SQ SN:chrUn_gl000221 LN:155397 +@SQ SN:chrUn_gl000222 LN:186861 +@SQ SN:chrUn_gl000223 LN:180455 +@SQ SN:chrUn_gl000224 LN:179693 +@SQ SN:chrUn_gl000225 LN:211173 +@SQ SN:chrUn_gl000226 LN:15008 +@SQ SN:chrUn_gl000227 LN:128374 +@SQ SN:chrUn_gl000228 LN:129120 +@SQ SN:chrUn_gl000229 LN:19913 +@SQ SN:chrUn_gl000230 LN:43691 +@SQ SN:chrUn_gl000231 LN:27386 +@SQ SN:chrUn_gl000232 LN:40652 +@SQ SN:chrUn_gl000233 LN:45941 +@SQ SN:chrUn_gl000234 LN:40531 +@SQ SN:chrUn_gl000235 LN:34474 +@SQ SN:chrUn_gl000236 LN:41934 +@SQ SN:chrUn_gl000237 LN:45867 +@SQ SN:chrUn_gl000238 LN:39939 +@SQ SN:chrUn_gl000239 LN:33824 +@SQ SN:chrUn_gl000240 LN:41933 +@SQ SN:chrUn_gl000241 LN:42152 +@SQ SN:chrUn_gl000242 LN:43523 +@SQ SN:chrUn_gl000243 LN:43341 +@SQ SN:chrUn_gl000244 LN:39929 +@SQ SN:chrUn_gl000245 LN:36651 +@SQ SN:chrUn_gl000246 LN:38154 +@SQ SN:chrUn_gl000247 LN:36422 +@SQ SN:chrUn_gl000248 LN:39786 +@SQ SN:chrUn_gl000249 LN:38502 +@RG ID:1 PL:Illumina PU:H7AP8ADXX_TAAGGCGA_1_NA12878 LB:library DS:SampleProject:NIST SM:NIST7035 CN:illuminafacility +@PG ID:bwa PN:bwa VN:0.7.5a-r405 +@PG ID:MarkDuplicates PN:MarkDuplicates CL:net.sf.picard.sam.MarkDuplicates INPUT=[/misc/ClinicalGenomicsPipeline/dev/projects/CLINICAL_NIST/work_dirs/2013_1211_162011/run/phase1_phase1_list/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.bam] OUTPUT=/misc/ClinicalGenomicsPipeline/dev/projects/CLINICAL_NIST/work_dirs/2013_1211_162011/run/phase1_phase1_list/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.bam METRICS_FILE=/misc/ClinicalGenomicsPipeline/dev/projects/CLINICAL_NIST/work_dirs/2013_1211_162011/run/phase1_phase1_list/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.metrics TMP_DIR=[/misc/ClinicalGenomicsPipeline/dev/projects/CLINICAL_NIST/work_dirs/2013_1211_162011/run/phase1_phase1_list/.queue/tmp] VALIDATION_STRINGENCY=SILENT CREATE_INDEX=true PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates REMOVE_DUPLICATES=false ASSUME_SORTED=false MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 READ_NAME_REGEX=[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).* OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_MD5_FILE=false diff --git a/tests/data/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.sam b/tests/data/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.sam new file mode 100644 index 00000000..bdbab665 --- /dev/null +++ b/tests/data/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.sam @@ -0,0 +1,100 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:chrM LN:16571 +@SQ SN:chr1 LN:249250621 +@SQ SN:chr2 LN:243199373 +@SQ SN:chr3 LN:198022430 +@SQ SN:chr4 LN:191154276 +@SQ SN:chr5 LN:180915260 +@SQ SN:chr6 LN:171115067 +@SQ SN:chr7 LN:159138663 +@SQ SN:chr8 LN:146364022 +@SQ SN:chr9 LN:141213431 +@SQ SN:chr10 LN:135534747 +@SQ SN:chr11 LN:135006516 +@SQ SN:chr12 LN:133851895 +@SQ SN:chr13 LN:115169878 +@SQ SN:chr14 LN:107349540 +@SQ SN:chr15 LN:102531392 +@SQ SN:chr16 LN:90354753 +@SQ SN:chr17 LN:81195210 +@SQ SN:chr18 LN:78077248 +@SQ SN:chr19 LN:59128983 +@SQ SN:chr20 LN:63025520 +@SQ SN:chr21 LN:48129895 +@SQ SN:chr22 LN:51304566 +@SQ SN:chrX LN:155270560 +@SQ SN:chrY LN:59373566 +@SQ SN:chr1_gl000191_random LN:106433 +@SQ SN:chr1_gl000192_random LN:547496 +@SQ SN:chr4_ctg9_hap1 LN:590426 +@SQ SN:chr4_gl000193_random LN:189789 +@SQ SN:chr4_gl000194_random LN:191469 +@SQ SN:chr6_apd_hap1 LN:4622290 +@SQ SN:chr6_cox_hap2 LN:4795371 +@SQ SN:chr6_dbb_hap3 LN:4610396 +@SQ SN:chr6_mann_hap4 LN:4683263 +@SQ SN:chr6_mcf_hap5 LN:4833398 +@SQ SN:chr6_qbl_hap6 LN:4611984 +@SQ SN:chr6_ssto_hap7 LN:4928567 +@SQ SN:chr7_gl000195_random LN:182896 +@SQ SN:chr8_gl000196_random LN:38914 +@SQ SN:chr8_gl000197_random LN:37175 +@SQ SN:chr9_gl000198_random LN:90085 +@SQ SN:chr9_gl000199_random LN:169874 +@SQ SN:chr9_gl000200_random LN:187035 +@SQ SN:chr9_gl000201_random LN:36148 +@SQ SN:chr11_gl000202_random LN:40103 +@SQ SN:chr17_ctg5_hap1 LN:1680828 +@SQ SN:chr17_gl000203_random LN:37498 +@SQ SN:chr17_gl000204_random LN:81310 +@SQ SN:chr17_gl000205_random LN:174588 +@SQ SN:chr17_gl000206_random LN:41001 +@SQ SN:chr18_gl000207_random LN:4262 +@SQ SN:chr19_gl000208_random LN:92689 +@SQ SN:chr19_gl000209_random LN:159169 +@SQ SN:chr21_gl000210_random LN:27682 +@SQ SN:chrUn_gl000211 LN:166566 +@SQ SN:chrUn_gl000212 LN:186858 +@SQ SN:chrUn_gl000213 LN:164239 +@SQ SN:chrUn_gl000214 LN:137718 +@SQ SN:chrUn_gl000215 LN:172545 +@SQ SN:chrUn_gl000216 LN:172294 +@SQ SN:chrUn_gl000217 LN:172149 +@SQ SN:chrUn_gl000218 LN:161147 +@SQ SN:chrUn_gl000219 LN:179198 +@SQ SN:chrUn_gl000220 LN:161802 +@SQ SN:chrUn_gl000221 LN:155397 +@SQ SN:chrUn_gl000222 LN:186861 +@SQ SN:chrUn_gl000223 LN:180455 +@SQ SN:chrUn_gl000224 LN:179693 +@SQ SN:chrUn_gl000225 LN:211173 +@SQ SN:chrUn_gl000226 LN:15008 +@SQ SN:chrUn_gl000227 LN:128374 +@SQ SN:chrUn_gl000228 LN:129120 +@SQ SN:chrUn_gl000229 LN:19913 +@SQ SN:chrUn_gl000230 LN:43691 +@SQ SN:chrUn_gl000231 LN:27386 +@SQ SN:chrUn_gl000232 LN:40652 +@SQ SN:chrUn_gl000233 LN:45941 +@SQ SN:chrUn_gl000234 LN:40531 +@SQ SN:chrUn_gl000235 LN:34474 +@SQ SN:chrUn_gl000236 LN:41934 +@SQ SN:chrUn_gl000237 LN:45867 +@SQ SN:chrUn_gl000238 LN:39939 +@SQ SN:chrUn_gl000239 LN:33824 +@SQ SN:chrUn_gl000240 LN:41933 +@SQ SN:chrUn_gl000241 LN:42152 +@SQ SN:chrUn_gl000242 LN:43523 +@SQ SN:chrUn_gl000243 LN:43341 +@SQ SN:chrUn_gl000244 LN:39929 +@SQ SN:chrUn_gl000245 LN:36651 +@SQ SN:chrUn_gl000246 LN:38154 +@SQ SN:chrUn_gl000247 LN:36422 +@SQ SN:chrUn_gl000248 LN:39786 +@SQ SN:chrUn_gl000249 LN:38502 +@RG ID:1 PL:Illumina PU:H7AP8ADXX_TAAGGCGA_1_NA12878 LB:library DS:SampleProject:NIST SM:NIST7035 CN:illuminafacility +@PG ID:bwa PN:bwa VN:0.7.5a-r405 +@PG ID:MarkDuplicates PN:MarkDuplicates CL:net.sf.picard.sam.MarkDuplicates INPUT=[/misc/ClinicalGenomicsPipeline/dev/projects/CLINICAL_NIST/work_dirs/2013_1211_162011/run/phase1_phase1_list/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.bam] OUTPUT=/misc/ClinicalGenomicsPipeline/dev/projects/CLINICAL_NIST/work_dirs/2013_1211_162011/run/phase1_phase1_list/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.bam METRICS_FILE=/misc/ClinicalGenomicsPipeline/dev/projects/CLINICAL_NIST/work_dirs/2013_1211_162011/run/phase1_phase1_list/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.metrics TMP_DIR=[/misc/ClinicalGenomicsPipeline/dev/projects/CLINICAL_NIST/work_dirs/2013_1211_162011/run/phase1_phase1_list/.queue/tmp] VALIDATION_STRINGENCY=SILENT CREATE_INDEX=true PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates REMOVE_DUPLICATES=false ASSUME_SORTED=false MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 READ_NAME_REGEX=[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).* OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_MD5_FILE=false +HWI-D00119:50:H7AP8ADXX:1:1104:8519:18990 99 chrM 1 60 101M = 24 124 GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTATGCACGCGATAGCATTGCGAGACGCTGG CCCFFFFFHFFHHJIJJIJGGJJJJJJJJJJJJJIGHIIEHIJJJJJJIJJJJIBGGIIIHIIIIHHHHDD;9CCDEDDDDDDDDDDEDDDDDDDDDDDDD X0:i:1 X1:i:0 MD:Z:72G28 PG:Z:MarkDuplicates RG:Z:1 XG:i:0 AM:i:37 NM:i:1 SM:i:37 XM:i:1 XO:i:0 XT:A:U +HWI-D00119:50:H7AP8ADXX:1:2104:18479:82511 99 chrM 1 60 101M = 10 109 GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTATGCACGCGATAGCATTGCGAGACGCTGG CCCFFFFFHFFHHJJJJIJJJJIIJJJJJGIJJJJGIJJJJJJJJGJIJJIJJJGHIJJJJJJJIHHHHDD@>CDDEDDDDDDDDDDEDDCDDDDD?BBD9 X0:i:1 X1:i:0 MD:Z:72G28 PG:Z:MarkDuplicates RG:Z:1 XG:i:0 AM:i:37 NM:i:1 SM:i:37 XM:i:1 XO:i:0 XT:A:U +HWI-D00119:50:H7AP8ADXX:1:2105:7076:23015 163 chrM 1 60 101M = 59 159 GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTATGCACGCGATAGCATTGCGAGACGCTGG @@CFFFDFGFHHHJIIJIJIJJJJJJJIIJJJJIIJIJFIIJJJJIIIGIJJJJDHIJIIJIJJJHHGGCB>BDDDDDDDDDDDBDDEDDDDDDDDDDDDD X0:i:1 X1:i:0 MD:Z:72G28 PG:Z:MarkDuplicates RG:Z:1 XG:i:0 AM:i:37 NM:i:1 SM:i:37 XM:i:1 XO:i:0 XT:A:U diff --git a/tests/data/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.unmapped.bam b/tests/data/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.unmapped.bam new file mode 100644 index 00000000..441fe9b0 Binary files /dev/null and b/tests/data/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.unmapped.bam differ diff --git a/tests/data/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.unmapped.sam b/tests/data/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.unmapped.sam new file mode 100644 index 00000000..ffffdfd8 --- /dev/null +++ b/tests/data/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.unmapped.sam @@ -0,0 +1,100 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:chrM LN:16571 +@SQ SN:chr1 LN:249250621 +@SQ SN:chr2 LN:243199373 +@SQ SN:chr3 LN:198022430 +@SQ SN:chr4 LN:191154276 +@SQ SN:chr5 LN:180915260 +@SQ SN:chr6 LN:171115067 +@SQ SN:chr7 LN:159138663 +@SQ SN:chr8 LN:146364022 +@SQ SN:chr9 LN:141213431 +@SQ SN:chr10 LN:135534747 +@SQ SN:chr11 LN:135006516 +@SQ SN:chr12 LN:133851895 +@SQ SN:chr13 LN:115169878 +@SQ SN:chr14 LN:107349540 +@SQ SN:chr15 LN:102531392 +@SQ SN:chr16 LN:90354753 +@SQ SN:chr17 LN:81195210 +@SQ SN:chr18 LN:78077248 +@SQ SN:chr19 LN:59128983 +@SQ SN:chr20 LN:63025520 +@SQ SN:chr21 LN:48129895 +@SQ SN:chr22 LN:51304566 +@SQ SN:chrX LN:155270560 +@SQ SN:chrY LN:59373566 +@SQ SN:chr1_gl000191_random LN:106433 +@SQ SN:chr1_gl000192_random LN:547496 +@SQ SN:chr4_ctg9_hap1 LN:590426 +@SQ SN:chr4_gl000193_random LN:189789 +@SQ SN:chr4_gl000194_random LN:191469 +@SQ SN:chr6_apd_hap1 LN:4622290 +@SQ SN:chr6_cox_hap2 LN:4795371 +@SQ SN:chr6_dbb_hap3 LN:4610396 +@SQ SN:chr6_mann_hap4 LN:4683263 +@SQ SN:chr6_mcf_hap5 LN:4833398 +@SQ SN:chr6_qbl_hap6 LN:4611984 +@SQ SN:chr6_ssto_hap7 LN:4928567 +@SQ SN:chr7_gl000195_random LN:182896 +@SQ SN:chr8_gl000196_random LN:38914 +@SQ SN:chr8_gl000197_random LN:37175 +@SQ SN:chr9_gl000198_random LN:90085 +@SQ SN:chr9_gl000199_random LN:169874 +@SQ SN:chr9_gl000200_random LN:187035 +@SQ SN:chr9_gl000201_random LN:36148 +@SQ SN:chr11_gl000202_random LN:40103 +@SQ SN:chr17_ctg5_hap1 LN:1680828 +@SQ SN:chr17_gl000203_random LN:37498 +@SQ SN:chr17_gl000204_random LN:81310 +@SQ SN:chr17_gl000205_random LN:174588 +@SQ SN:chr17_gl000206_random LN:41001 +@SQ SN:chr18_gl000207_random LN:4262 +@SQ SN:chr19_gl000208_random LN:92689 +@SQ SN:chr19_gl000209_random LN:159169 +@SQ SN:chr21_gl000210_random LN:27682 +@SQ SN:chrUn_gl000211 LN:166566 +@SQ SN:chrUn_gl000212 LN:186858 +@SQ SN:chrUn_gl000213 LN:164239 +@SQ SN:chrUn_gl000214 LN:137718 +@SQ SN:chrUn_gl000215 LN:172545 +@SQ SN:chrUn_gl000216 LN:172294 +@SQ SN:chrUn_gl000217 LN:172149 +@SQ SN:chrUn_gl000218 LN:161147 +@SQ SN:chrUn_gl000219 LN:179198 +@SQ SN:chrUn_gl000220 LN:161802 +@SQ SN:chrUn_gl000221 LN:155397 +@SQ SN:chrUn_gl000222 LN:186861 +@SQ SN:chrUn_gl000223 LN:180455 +@SQ SN:chrUn_gl000224 LN:179693 +@SQ SN:chrUn_gl000225 LN:211173 +@SQ SN:chrUn_gl000226 LN:15008 +@SQ SN:chrUn_gl000227 LN:128374 +@SQ SN:chrUn_gl000228 LN:129120 +@SQ SN:chrUn_gl000229 LN:19913 +@SQ SN:chrUn_gl000230 LN:43691 +@SQ SN:chrUn_gl000231 LN:27386 +@SQ SN:chrUn_gl000232 LN:40652 +@SQ SN:chrUn_gl000233 LN:45941 +@SQ SN:chrUn_gl000234 LN:40531 +@SQ SN:chrUn_gl000235 LN:34474 +@SQ SN:chrUn_gl000236 LN:41934 +@SQ SN:chrUn_gl000237 LN:45867 +@SQ SN:chrUn_gl000238 LN:39939 +@SQ SN:chrUn_gl000239 LN:33824 +@SQ SN:chrUn_gl000240 LN:41933 +@SQ SN:chrUn_gl000241 LN:42152 +@SQ SN:chrUn_gl000242 LN:43523 +@SQ SN:chrUn_gl000243 LN:43341 +@SQ SN:chrUn_gl000244 LN:39929 +@SQ SN:chrUn_gl000245 LN:36651 +@SQ SN:chrUn_gl000246 LN:38154 +@SQ SN:chrUn_gl000247 LN:36422 +@SQ SN:chrUn_gl000248 LN:39786 +@SQ SN:chrUn_gl000249 LN:38502 +@RG ID:1 PL:Illumina PU:H7AP8ADXX_TAAGGCGA_1_NA12878 LB:library DS:SampleProject:NIST SM:NIST7035 CN:illuminafacility +@PG ID:bwa PN:bwa VN:0.7.5a-r405 +@PG ID:MarkDuplicates PN:MarkDuplicates CL:net.sf.picard.sam.MarkDuplicates INPUT=[/misc/ClinicalGenomicsPipeline/dev/projects/CLINICAL_NIST/work_dirs/2013_1211_162011/run/phase1_phase1_list/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.bam] OUTPUT=/misc/ClinicalGenomicsPipeline/dev/projects/CLINICAL_NIST/work_dirs/2013_1211_162011/run/phase1_phase1_list/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.bam METRICS_FILE=/misc/ClinicalGenomicsPipeline/dev/projects/CLINICAL_NIST/work_dirs/2013_1211_162011/run/phase1_phase1_list/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.metrics TMP_DIR=[/misc/ClinicalGenomicsPipeline/dev/projects/CLINICAL_NIST/work_dirs/2013_1211_162011/run/phase1_phase1_list/.queue/tmp] VALIDATION_STRINGENCY=SILENT CREATE_INDEX=true PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates REMOVE_DUPLICATES=false ASSUME_SORTED=false MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 READ_NAME_REGEX=[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).* OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_MD5_FILE=false +HWI-D00119:50:H7AP8ADXX:1:1104:8519:18990 4 chrM 1 60 101M = 24 124 GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTATGCACGCGATAGCATTGCGAGACGCTGG CCCFFFFFHFFHHJIJJIJGGJJJJJJJJJJJJJIGHIIEHIJJJJJJIJJJJIBGGIIIHIIIIHHHHDD;9CCDEDDDDDDDDDDEDDDDDDDDDDDDD X0:i:1 X1:i:0 MD:Z:72G28 PG:Z:MarkDuplicates RG:Z:1 XG:i:0 AM:i:37 NM:i:1 SM:i:37 XM:i:1 XO:i:0 XT:A:U +HWI-D00119:50:H7AP8ADXX:1:2104:18479:82511 4 chrM 1 60 101M = 10 109 GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTATGCACGCGATAGCATTGCGAGACGCTGG CCCFFFFFHFFHHJJJJIJJJJIIJJJJJGIJJJJGIJJJJJJJJGJIJJIJJJGHIJJJJJJJIHHHHDD@>CDDEDDDDDDDDDDEDDCDDDDD?BBD9 X0:i:1 X1:i:0 MD:Z:72G28 PG:Z:MarkDuplicates RG:Z:1 XG:i:0 AM:i:37 NM:i:1 SM:i:37 XM:i:1 XO:i:0 XT:A:U +HWI-D00119:50:H7AP8ADXX:1:2105:7076:23015 4 chrM 1 60 101M = 59 159 GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTATGCACGCGATAGCATTGCGAGACGCTGG @@CFFFDFGFHHHJIIJIJIJJJJJJJIIJJJJIIJIJFIIJJJJIIIGIJJJJDHIJIIJIJJJHHGGCB>BDDDDDDDDDDDBDDEDDDDDDDDDDDDD X0:i:1 X1:i:0 MD:Z:72G28 PG:Z:MarkDuplicates RG:Z:1 XG:i:0 AM:i:37 NM:i:1 SM:i:37 XM:i:1 XO:i:0 XT:A:U diff --git a/tests/data/simple.unaligned.bam b/tests/data/simple.unaligned.bam new file mode 100644 index 00000000..472806f8 Binary files /dev/null and b/tests/data/simple.unaligned.bam differ diff --git a/tests/data/simplebamnoextension b/tests/data/simplebamnoextension new file mode 100644 index 00000000..472806f8 Binary files /dev/null and b/tests/data/simplebamnoextension differ diff --git a/tests/test_internal.py b/tests/test_internal.py index be2a244e..96da8ec4 100644 --- a/tests/test_internal.py +++ b/tests/test_internal.py @@ -1,3 +1,5 @@ +import gzip +import io import os import shutil import subprocess @@ -12,6 +14,7 @@ import dnaio from dnaio import ( + BamReader, FileFormatError, FastaFormatError, FastqFormatError, @@ -706,3 +709,97 @@ def test_records_are_mates_one_argument(self) -> None: with pytest.raises(TypeError) as error: records_are_mates(SequenceRecord("A", "A", "A")) # type: ignore error.match("records_are_mates requires at least two arguments") + + +class TestBamReader: + bam_file = ( + TEST_DATA / "project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878" + ".bwa.markDuplicates.unmapped.bam" + ) + raw_bam_bytes = gzip.decompress(bam_file.read_bytes()) + complete_record_with_header = raw_bam_bytes[:6661] + complete_header = complete_record_with_header[:6359] + + def test_parse_bam(self): + with dnaio.open(self.bam_file) as reader: + records = list(reader) + assert len(records) == 3 + assert reader.number_of_records == 3 + assert records[0].name == "HWI-D00119:50:H7AP8ADXX:1:1104:8519:18990" + assert records[0].sequence == ( + "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCT" + "GGGGGGTATGCACGCGATAGCATTGCGAGACGCTGG" + ) + assert records[0].qualities == ( + "CCCFFFFFHFFHHJIJJIJGGJJJJJJJJJJJJJIGHIIEHIJJJJJJIJJJJIBGGIIIHIIII" + "HHHHDD;9CCDEDDDDDDDDDDEDDDDDDDDDDDDD" + ) + assert records[1].name == "HWI-D00119:50:H7AP8ADXX:1:2104:18479:82511" + assert records[1].sequence == ( + "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCT" + "GGGGGGTATGCACGCGATAGCATTGCGAGACGCTGG" + ) + assert records[1].qualities == ( + "CCCFFFFFHFFHHJJJJIJJJJIIJJJJJGIJJJJGIJJJJJJJJGJIJJIJJJGHIJJJJJJJI" + "HHHHDD@>CDDEDDDDDDDDDDEDDCDDDDD?BBD9" + ) + assert records[2].name == "HWI-D00119:50:H7AP8ADXX:1:2105:7076:23015" + assert records[2].sequence == ( + "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCT" + "GGGGGGTATGCACGCGATAGCATTGCGAGACGCTGG" + ) + assert records[2].qualities == ( + "@@CFFFDFGFHHHJIIJIJIJJJJJJJIIJJJJIIJIJFIIJJJJIIIGIJJJJDHIJIIJIJJJ" + "HHGGCB>BDDDDDDDDDDDBDDEDDDDDDDDDDDDD" + ) + + def test_parse_header(self): + header = ( + Path(__file__).parent + / "data" + / "project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa" + ".markDuplicates.header.sam" + ) + header_bytes = header.read_bytes() + with dnaio.open(self.bam_file) as bam: + assert bam.header == header_bytes + + @pytest.mark.parametrize( + "end", range(len(complete_header) + 1, len(complete_record_with_header)) + ) + def test_truncated_record(self, end: int): + file = io.BytesIO(self.complete_record_with_header[:end]) + with pytest.raises(EOFError) as e: + list(BamReader(file)) + e.match("Incomplete record at the end of file") + + @pytest.mark.parametrize("end", [3, 5, 2000, 6000]) + def test_truncated_header(self, end): + file = io.BytesIO(self.complete_record_with_header[:end]) + with pytest.raises(EOFError) as e: + list(BamReader(file)) + e.match("Truncated BAM file") + + def test_bam_parser_not_binary_error(self): + file = io.StringIO( + "Don't be too proud of this technological terror you have constructed." + ) + with pytest.raises(TypeError) as error: + BamReader(file) + error.match("binary IO") + + @pytest.mark.parametrize("buffersize", [4, 8, 10, 20, 40]) + def test_small_buffersize(self, buffersize): + reader = BamReader(str(self.bam_file), buffer_size=buffersize) + assert len(list(reader)) == 3 + + def test_error_on_mapped_bam(self): + bam = TEST_DATA / ( + "project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878" + ".bwa.markDuplicates.bam" + ) + reader = BamReader(str(bam)) + it = iter(reader) + with pytest.raises(NotImplementedError) as error: + next(it) + assert error.match("unmapped single reads") diff --git a/tests/test_open.py b/tests/test_open.py index 507d5ada..969b1dd5 100644 --- a/tests/test_open.py +++ b/tests/test_open.py @@ -182,6 +182,18 @@ def test_detect_compressed_fastq_from_content() -> None: assert record.name == "prefix:1_13_573/1" +def test_detect_bam_from_content() -> None: + with dnaio.open("tests/data/simplebamnoextension") as f: + record = next(iter(f)) + assert record.name == "Myheader" + + +def test_detect_bam_from_filename() -> None: + with dnaio.open("tests/data/simple.unaligned.bam") as f: + record = next(iter(f)) + assert record.name == "Myheader" + + def test_write(tmp_path, extension) -> None: out_fastq = tmp_path / ("out.fastq" + extension) with dnaio.open(str(out_fastq), mode="w") as f: