From fb12b76d349c8d94453c27348912433d85e150ae Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 9 Mar 2020 17:20:12 +0000 Subject: [PATCH 1/7] First draft of CRAM v4.0 specification This has been forked from CRAMv3.tex c831aba as given this is a major release we don't wish to keep markup for both versions in the same document. --- CRAMv4.tex | 2728 ++++++++++++++++++++++++++++++++++++++++++++++++++++ Makefile | 2 + 2 files changed, 2730 insertions(+) create mode 100644 CRAMv4.tex diff --git a/CRAMv4.tex b/CRAMv4.tex new file mode 100644 index 000000000..d6a8702bf --- /dev/null +++ b/CRAMv4.tex @@ -0,0 +1,2728 @@ +%&pdfLaTeX +% !TEX encoding = UTF-8 Unicode +\documentclass[a4paper]{article} +\usepackage{ifxetex} +\ifxetex +\usepackage{fontspec} +\setmainfont[Mapping=tex-text]{STIXGeneral} +\else +\usepackage[T1]{fontenc} +\usepackage[latin1]{inputenc} +\fi +\usepackage{textcomp} + +\usepackage{graphicx} +\usepackage{array} +\usepackage{fixltx2e} +\usepackage{amssymb} +\usepackage{fancyhdr} +\usepackage{amsmath} +\usepackage{algpseudocode} +\usepackage{threeparttable} +\usepackage{tikz} +\usetikzlibrary{positioning,shapes.multipart} +\usetikzlibrary{decorations.pathreplacing} +\renewcommand{\headrulewidth}{0pt} +\renewcommand{\footrulewidth}{0pt} + +\newcommand\bits{\,\mbox{bits}} +\newcommand\MB{\,\mbox{MB}} + +\makeatletter +\newcommand*{\bdiv}{% + \nonscript\mskip-\medmuskip\mkern5mu% + \mathbin{\operator@font div}\penalty900\mkern5mu% + \nonscript\mskip-\medmuskip +} +\newcommand*{\bitand}{% + \nonscript\mskip-\medmuskip\mkern5mu% + \mathbin{\operator@font AND}\penalty900\mkern5mu% + \nonscript\mskip-\medmuskip +} +\newcommand*{\logand}{% keyword rather than mathematical operator + \nonscript\mskip-\medmuskip\mkern5mu% + \mathbin{\operator@font \textbf{and}}\penalty900\mkern5mu% + \nonscript\mskip-\medmuskip +} +\newcommand*{\logor}{% keyword rather than mathematical operator + \nonscript\mskip-\medmuskip\mkern5mu% + \mathbin{\operator@font 
\textbf{or}}\penalty900\mkern5mu% + \nonscript\mskip-\medmuskip +} +\newcommand*{\bitor}{% + \nonscript\mskip-\medmuskip\mkern5mu% + \mathbin{\operator@font OR}\penalty900\mkern5mu% + \nonscript\mskip-\medmuskip +} +\newcommand*{\bitxor}{% + \nonscript\mskip-\medmuskip\mkern5mu% + \mathbin{\operator@font XOR}\penalty900\mkern5mu% + \nonscript\mskip-\medmuskip +} +\newcommand\concat{\mathbin{+\mkern-10mu+\,}} +\newcommand\shiftl{\mathbin{<\mkern-3mu<\,}} +\newcommand\shiftr{\mathbin{>\mkern-3mu>\,}} +\makeatother + +\setlength{\parindent}{0cm} +\setlength{\parskip}{0.18cm} +\usepackage[hmargin=2cm,vmargin=2.5cm,bindingoffset=0.0cm]{geometry} +\usepackage[pdfborder={0 0 0}]{hyperref} +\begin{document} + + + +\input{CRAMv3.ver} +\title{CRAM format specification (version 4.0)} +\author{samtools-devel@lists.sourceforge.net} +\date{\headdate} +\maketitle + + +\begin{quote}\small +The master version of this document can be found at +\url{https://github.com/samtools/hts-specs}.\\ +This printing is version~\commitdesc\ from that repository, +last modified on the date shown above. +\end{quote} + +\begin{center} +\textit{license: Apache 2.0} +\end{center} +\vspace*{1em} + +\section{\textbf{Overview}} + +This specification describes the CRAM 4.0 format. + +CRAM has the following major objectives: + +\begin{enumerate} +\item Significantly better lossless compression than BAM + +\item Full compatibility with BAM + +\item Effortless transition to CRAM from using BAM files + +\item Support for controlled loss of BAM data +\end{enumerate} + +The first three objectives allow users to take immediate advantage of the CRAM +format while offering a smooth transition path from using BAM files. The fourth +objective supports the exploration of different lossy compression strategies and +provides a framework in which to effect these choices. Please note that the CRAM +format does not impose any rules about what data should or should not be preserved. 
+Instead, CRAM supports a wide range of lossless and lossy data preservation strategies +enabling users to choose which data should be preserved. + +Data in CRAM is aggregated by data type (analogous to SAM columns) and +stored in blocks with each block compressed using a choice of general +purpose or custom compression codecs. Sequence is typically encoded +relative to a reference sequence\footnote{Markus Hsi-Yang Fritz, +Rasko Leinonen, Guy Cochrane, and Ewan Birney, +\textbf{Efficient storage of high throughput DNA sequencing data using reference-based compression}, +{\sl Genome Res.}~2011~21: 734--740; +\href{http://dx.doi.org/doi:10.1101/gr.114819.110}{doi:10.1101/gr.114819.110}; +{\sc pmid:}21245279.}, but this is not a requirement. Both +aligned and unaligned sequence is supported. + +\section{\textbf{Data types and formats}} + +The CRAM file format uses a combination of fixed sized and variable +sized integer types, which may be signed or unsigned. Additionally +arrays of any types may be used. + +We distinguish between abstract types and their on-disk formats, which +are described in more detail in the next section. 
+ +\begin{tabular}{llll} +\textbf{Name} & \textbf{Type} & \textbf{Format}& \textbf{Description}\\ +\hline +\\ +\textbf{Bool} & \textit{bool} & \textit{bool} & A boolean, 0 or 1 \\ +\\ +\textbf{Byte} & \textit{byte} & \textit{byte} & A single byte (8 bits) \\ +\\ +\textbf{Integer} & \textit{} & \textit{int32} & A 32-bit integer \textcolor{gray}{FIXME: cull need for this}\\ + & \textit{} & \textit{int64} & A 64-bit integer \textcolor{gray}{FIXME: cull need for this}\\ + & \textit{sint} & \textit{int7} & A variable length signed integer \\ + & \textit{uint} & \textit{uint7} & A variable length unsigned integer \\ +\\ +\textbf{Array (explicit)} + & \textit{array} & \textit{array} & An array preceded by an explicit length\\ +\\ +\textbf{Array (implicit)} + & \textit{[ ]} & \textit{fmt[]} & Zero or more items with unspecified length\\ + & \textit{[ ]} & \textit{fmt[4]} & A constant number of items\\ +\\ +\textbf{Encoding}& \textit{encoding} & \textit{encoding} & A description of how and + where a data series is stored\\ +\\ +\textbf{Map} & \textit{} & \textit{map} & A collection of key-value pairs\\ +\end{tabular} + +\vskip 10pt + +The \textit{types} are used to describe the contents of the data series. +These are agnostic of magnitude or implementation, but do define the sign of integer values. +The \textit{formats} are used in the fixed structures and the \textit{encoding} definitions and do take into account data sizes and/or sign. + +For example, the \textbf{BF} data series has a value data type of \textit{encoding}. +If the encoding is EXTERNAL then the values will be stored in \textit{uint7} format as specified in the EXTERNAL encoding section. + +{ +\color{gray} +FIXME: is this an overly complex way of defining things? Maybe we should define EXTERNAL\_SINT, EXTERNAL\_UINT and EXTERNAL\_BYTE? That makes it explicit and removes one level of indirection. This is infact how my code already works. 
The EXTERNAL decode initialisation specialises the decode function by data type: + +\begin{verbatim} +cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_external_type option, + int version, varint_vec *vv) { + //... + if (option == E_INT) + c->decode = cram_external_decode_int; + else if (option == E_SINT) + c->decode = cram_external_decode_sint; + else if (option == E_LONG) + c->decode = cram_external_decode_long; + else if (option == E_SLONG) + c->decode = cram_external_decode_slong; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->decode = cram_external_decode_char; + else + c->decode = cram_external_decode_block; + //... +} +\end{verbatim} +} + +\label{subsec:writing-bytes} + +CRAM uses \emph{little endianness} for bytes unless specified +otherwise. A more detailed description of each type and format follows. + +\begin{description} + +\item[Boolean (bool)]\ \newline +Boolean is written as 1-byte with 0x0 being `false' and 0x1 being `true'. + +\item[Byte (byte)]\ \newline +A single unsigned byte. + +\item[Signed integer (int32)]\ \newline +Signed 32-bit integer, written as 4 bytes in little-endian byte order. + +\item[Signed long integer (int64)]\ \newline +Signed 64-bit integer, written as 8 bytes in little-endian byte order. + +\item[Unsigned variable sized integer (uint7)]\ \newline +An unsigned integer of unspecified size, stored 7-bits at a time. + +The on-disk size is proportional to the magnitude of the integer +value encoded, and is known as Variable Length +Quantity\footnote{\url{https://en.wikipedia.org/wiki/Variable-length_quantity}}. + +If the value is larger than 7-bits then the top-bit of the byte is +set, indicating a subsequent byte must be read. This is repeated +until the top bit is unset. Bytes are written with the most +significant values first (big-endian format), which simplifies the +decode process. 
+ +Algorithmically this looks like: + +\begin{algorithmic}[1] +\Statex +\Statex \textit{Read a variable sized unsigned integer 7-bits at a time.} +\Function{ReadUint7}{} + \State $value \gets 0$ + \Repeat + \State $c \gets$ \Call{ReadUint8}{} + \State $value \gets (value \shiftl 7) + (c \bitand 127)$ + \Until{$c < 128$} + \State \Return $value$ + \EndFunction +\end{algorithmic} + + +\item[Signed variable sized integer (sint7)]\ \newline +This is a signed version of uint7 above. Attempting to store a +negative value with the uint7 method would lead to the signed value +being interpreted as a very large unsigned quantity, consuming many +bytes. (This effect is seen in ITF8 and LTF8 in CRAM 3.0.) + +Instead we map signed values to unsigned ones keeping their +magnitudes proportional by using zig-zag encoding, and then encode +that value using uint7. This maps signed values 0, -1, +1, -2, +2 to +unsigned values 0, 1, 2, 3, 4 respectively, and vice versa during +decode. + +The zig-zag encoding method involves shifting the value left 1 +bit and then XORing with all-bits-zero if positive or all-bits-one if +negative. I.e. $(v \shiftl 1) \bitxor (v \shiftr 31)$ for a +32-bit quantity using a 2's complement arithmetic shift right. + +Decoding a zig-zag value is similar. We XOR the value shifted right 1 +bit with 0 or -1, taken from the negation of the bottom bit, as seen +below. + +\begin{algorithmic}[1] +\Statex +\Statex \textit{Read a variable sized signed integer 7-bits at a time.} +\Function{ReadSint7}{} + \State $value \gets $ \Call{ReadUint7}{} + \State \Return $(value \shiftr 1) \bitxor -(value \bitand 1)$ + \EndFunction +\end{algorithmic} + +\item[{array}]\ \newline +Used for arrays where the size must be specified as part of the array itself. +The length is written first as an integer (uint7), followed by the elements of the +array which are encoded as per their individual type-specific format +(usually byte or uint7). 
+ +{\color{gray} +FIXME: arrays like this are used so rarely we could just replace \textit{array} with \textit{uint7 nlandmarks} followed by \textit{uint7[nlandmarks]}. +} + +\item[{fmt[ ]}] +\item[{fmt[len]}]\ \newline +Used for arrays of data where the length is either unspecified or +known up-front. In neither case is the length part of the byte stream. + +This is used within the specification for fixed sized items such as +the 4 byte magic number or as a marker for an unspecified block of +bytes such as \textit{byte[*] blocks}. + +\item[{Encoding}]\ \newline +{\color{gray} +TODO: Move me? +} +Encoding is a data type that specifies how data series have been compressed. Encodings +are defined as encoding\texttt{<}type\texttt{>} where the type is a logical data +type as opposed to a storage data type. + +An encoding is written as follows. The first integer (uint7) denotes the codec id +and the second integer (uint7) the number of bytes in the following encoding-specific +values. + +Subexponential encoding example: + +\begin{tabular}{|l|l|l|} +\hline +\textbf{Value} & \textbf{Type} & \textbf{Name}\tabularnewline +\hline +0x7 & uint7 & codec id\tabularnewline +\hline +0x2 & uint7 & number of bytes to follow\tabularnewline +\hline +0x0 & uint7 & offset\tabularnewline +\hline +0x1 & uint7 & K parameter\tabularnewline +\hline +\end{tabular} + +The first byte ``0x7'' is the codec id. + +The next byte ``0x2'' denotes the length of the bytes to follow (2). + +The subexponential encoding has 2 parameters: integer (uint7) offset and integer (uint7) K. + +offset = 0x0 = 0 + +K = 0x1 = 1 + + +\item[{Map}]\ \newline +A map is a collection of keys and associated values. A map with N keys is written +as follows: + +\begin{tabular}{|l|l|l|l|l|l|l|l|} +\hline +size in bytes & N & key 1 & value 1 & key ... & value ... & key N & value N\tabularnewline +\hline +\end{tabular} + +Both the size in bytes and the number of keys are written as integer (uint7). 
Keys +and values are written according to their data types and are specific to each map. + +\end{description} + + +\section{\textbf{Encodings }} + +Encoding is a data structure that captures information about compression details +of a data series that are required to uncompress it. This could be +the definition of a constant, a series of data transformations, or the +block content id for an external block. Some encodings may be nested, +such as \texttt{BYTE\_ARRAY\_LEN} which defines one sub-encoding for the +length and another for the bytes. + +Encoding notation is defined as the keyword `encoding' followed by its data type +in angular brackets, for example `encoding\texttt{<}byte\texttt{>}' stands for +an encoding that operates on a data series of data type `byte'. + +The encodings themselves are stored in the compression header block +(\ref{subsec:compression-header}). + +Each encoding type has its own set of parameters listed below. + +\begin{tabular}{|l|l|>{\raggedright}p{155pt}|>{\raggedright}p{160pt}|} +\hline +\textbf{Codec} & \textbf{ID} & \textbf{Parameters} & \textbf{Comment}\tabularnewline +\hline +NULL & 0 & none & series not preserved\tabularnewline +\hline +EXTERNAL & 1 & int block content id & the block content identifier used to associate +external data blocks with data series\tabularnewline +\hline +HUFFMAN & 3 & array\texttt{<}int\texttt{>} values, array\texttt{<}int\texttt{>} lengths & \textcolor{gray}{FIXME: replace with CONSTANT.} coding with int/byte values\tabularnewline +\hline +BYTE\_ARRAY\_LEN & 4 & encoding\texttt{<}int\texttt{>} array length, encoding\texttt{<}byte\texttt{>} +bytes & coding of byte arrays with array length\tabularnewline +\hline +BYTE\_ARRAY\_STOP & 5 & byte stop, int external block\linebreak{} +content id & coding of byte arrays with a stop value \tabularnewline +\hline +\end{tabular} + +See section~\ref{sec:encodings} for more detailed descriptions of all the above coding algorithms and their parameters. 
+ +\section{\textbf{Checksums}} +The checksumming is used to ensure data integrity. The following checksumming algorithms are used in CRAM. +\subsection{\textbf{CRC32}} +This is a cyclic redundancy checksum 32-bit long with the polynomial 0x04C11DB7. Please refer to \href{http://www.itu.int/rec/recommendation.asp?type=folders&lang=e&parent=T-REC-V.42}{ITU-T V.42} for more details. The value of the CRC32 hash function is written as an integer. + +\section{\textbf{File structure}} + +The overall CRAM file structure is described in this section. Please refer to other +sections of this document for more detailed information. + +A CRAM file consists of a fixed length file definition, followed by a CRAM header container, +then one or more data containers, and finally a special end-of-file container. + +\begin{center} +\begin{tikzpicture}[ + every node/.style={scale=1.0}, + boxes/.style={rectangle split,rectangle split parts=#1,draw,rectangle split horizontal,text width=5em,align=center,minimum height=1cm,fill=black!5,on grid}, + notes/.style={text width=20em,align=center,minimum height=1cm,on grid}, +] +\node (file) [boxes=6] { +\nodepart{one}File definition +\nodepart[text width=8em]{two}CRAM Header Container +\nodepart{three}Data Container +\nodepart{four}... +\nodepart{five}Data Container +\nodepart[text width=8em]{six}CRAM EOF Container +}; +\end{tikzpicture} + +Figure 1: A CRAM file consists of a file definition, followed by a header container, then other containers. +\end{center} + +Containers consist of one or more blocks. The first container, called the CRAM header container, +is used to store a textual header as described in the SAM specification (see the section 7.1). 
+ +\begin{center} +\begin{tikzpicture}[ + every node/.style={scale=1.0}, + boxes/.style={rectangle split,rectangle split parts=#1,draw,rectangle split horizontal,text width=5em,align=center,minimum height=1cm,fill=black!5,on grid}, + notes/.style={text width=20em,align=center,minimum height=1cm,on grid}, +] +\node (file) [boxes=6] { +\nodepart{one}File definition +\nodepart[text width=8em]{two}CRAM Header Container +\nodepart{three}Data Container +\nodepart{four}... +\nodepart{five}Data Container +\nodepart[text width=8em]{six}CRAM EOF Container +}; + +\node (header) [boxes=2,below=1 of file.three south, text width=15em] { +\nodepart{one}Block 1:\break +CRAM Header\break +(optionally compressed) +\nodepart{two}Optional Block 2:\break +nul padding bytes\break +(uncompressed) +}; +\draw (file.one split south) to (header.north west); +\draw (file.two split south) to (header.north east); +\end{tikzpicture} + +Figure 2: The first container holds the CRAM header text. +\end{center} + +Each container starts with a container header structure followed by +one or more blocks. +The first block in each container is the compression header block +giving details of how to decode data in subsequent blocks. +Each block starts with a block header structure followed by the block +data. + +\begin{center} +\begin{tikzpicture}[ + every node/.style={scale=1.0}, + boxes/.style={rectangle split,rectangle split parts=#1,draw,rectangle split horizontal,text width=5em,align=center,minimum height=1cm,fill=black!5,on grid}, + notes/.style={text width=20em,align=center,minimum height=1cm,on grid}, +] +\node (file) [boxes=6] { +\nodepart{one}File definition +\nodepart[text width=8em]{two}CRAM Header Container +\nodepart{three}Data Container +\nodepart{four}... 
+\nodepart{five}Data Container +\nodepart[text width=8em]{six}CRAM EOF Container +}; + +\node (container) [boxes=5,below=1 of file.three south,text width=8em] { +\nodepart{one}Container Header structure +\nodepart{two}Compression Header Block +\nodepart[text width=4em]{three}Block 1 +\nodepart[text width=3em]{four}... +\nodepart[text width=4em]{five}Block M +}; +\draw (file.two split south) to (container.north west); +\draw (file.three split south) to (container.north east); + +\node (blocks) [boxes=2,below=1 of container.two south,text width=6em] { +\nodepart{one}Block Header structure +\nodepart{two}Block data +}; +\draw (container.one split south) to (blocks.north west); +\draw (container.two split south) to (blocks.north east); +\end{tikzpicture} + +Figure 3: Containers as a series of blocks +\end{center} + +The blocks after the compression header are organised logically into slices. One +slice may contain, for example, a contiguous region of alignment data. Slices begin +with a slice header block and are followed by one or more data blocks. +It is these data blocks which hold the primary bulk of CRAM data. +The data blocks are further subdivided into a core data block and one or more external data blocks. + +\begin{center} +\begin{tikzpicture}[ + every node/.style={scale=1.0}, + boxes/.style={rectangle split,rectangle split parts=#1,draw,rectangle split horizontal,text width=5em,align=center,minimum height=1cm,fill=black!5,on grid}, + notes/.style={text width=20em,align=center,minimum height=1cm,on grid}, +] +\node (file) [boxes=6] { +\nodepart{one}File definition +\nodepart[text width=8em]{two}CRAM Header Container +\nodepart{three}Data Container +\nodepart{four}... 
+ +\nodepart{five}Data Container +\nodepart[text width=8em]{six}CRAM EOF Container +}; + +\node (container) [boxes=9,below=1 of file.three south,text width=8em] { +\nodepart{one}Container Header structure +\nodepart{two}Compression Header Block +\nodepart[text width=3em]{three}Block +\nodepart[text width=2em]{four}... +\nodepart[text width=3em]{five}Block +\nodepart[text width=7em]{six}. . . +\nodepart[text width=3em]{seven}Block +\nodepart[text width=2em]{eight}... +\nodepart[text width=3em]{nine}Block +}; +\draw (file.two split south) to (container.north west); +\draw (file.three split south) to (container.north east); + +\draw[decoration={brace,mirror,amplitude=5pt,raise=2pt},decorate] + (container.two split south) to (container.five split south); +\node [below=0.2 of container.four south] {Slice 1}; + +\draw[decoration={brace,mirror,amplitude=5pt,raise=2pt},decorate] + (container.six split south) to (container.south east); +\node [below=0.2 of container.eight south] {Slice N}; + +\node (slice) [boxes=5,below=1 of container.four south, text width=6.5em] { +\nodepart{one}Slice Header Block +\nodepart{two}Core Data Block +\nodepart{three}External Data Block 1 +\nodepart{four}... +\nodepart{five}External Data Block M +}; +\draw (container.two split south) to (slice.north west); +\draw (container.five split south) to (slice.north east); +\end{tikzpicture} + +Figure 4: Slices formed from a series of concatenated blocks +\end{center} + +\section{\textbf{File definition}} + +Each CRAM file starts with a fixed length (26 bytes) definition with the following +fields: + +\begin{tabular}{|l|l|l|} +\hline +\textbf{Data type} & \textbf{Name} & \textbf{Value}\tabularnewline +\hline +byte[4] & format magic number & CRAM (0x43 0x52 0x41 0x4d)\tabularnewline +\hline +unsigned byte & major format number & 4 (0x4)\tabularnewline +\hline +unsigned byte & minor format number & 0 (0x0)\tabularnewline +\hline +byte[20] & file id & CRAM file identifier (e.g. 
file name or SHA1 checksum)\tabularnewline +\hline +\end{tabular} + +Valid CRAM \textit{major}.\textit{minor} version numbers are as follows: + +\begin{itemize} +\item[\textit{1.0}] +The original public CRAM release. + +\item[\textit{2.0}] +The first CRAM release implemented in both Java and C; tidied up +implementation vs specification differences in \textit{1.0}. + +\item[\textit{2.1}] +Gained end of file markers; compatible with \textit{2.0}. + +\item[\textit{3.0}] +Additional compression methods; header and data checksums; +improvements for unsorted data. + +\item[\textit{3.1}] +Additional EXTERNAL compression codecs only. + +\item[\textit{4.0}] +This specification. Revised variable sized integers, MD/NM/RG tag +locators, and deduplication of read names. Removed some encodings. +See end for a more detailed list of changes. +\end {itemize} + +CRAM 3.0 and 3.1 differ only in the list of compression +methods available, so tools that output CRAM 3 without using any 3.1 +codecs should write the header to indicate 3.0 in order to permit +maximum compatibility. 
+ +\section{\textbf{Container header structure}} + +The file definition is followed by one or more containers with the following header +structure where the container content is stored in the `blocks' field: + +\begin{tabular}{|l|>{\raggedright}p{120pt}|>{\raggedright}p{260pt}|} +\hline +\textbf{Data type} & \textbf{Name} & \textbf{Value} +\tabularnewline +\hline +uint32 & length & \textcolor{gray}{FIXME: make uint7} the sum of the lengths of all blocks in this container (headers and data); +equal to the total byte length of the container minus the byte length of this header structure\tabularnewline +\hline +int7\textcolor{gray}{FIXME} & reference sequence id & reference sequence identifier or\linebreak{} +-1 for unmapped reads\linebreak{} +-2 for multiple reference sequences.\linebreak{} +All slices in this container must have a reference sequence id matching this value.\tabularnewline +\hline +uint7 & starting position on the reference & the alignment start position or\linebreak{} +0 if the container is multiple-reference +or contains unmapped unplaced reads\tabularnewline +\hline +uint7 & alignment span & the length of the alignment or\linebreak{} +0 if the container is multiple-reference +or contains unmapped unplaced reads\tabularnewline +\hline +uint7 & number of records & number of records in the container\tabularnewline +\hline +uint7 & record counter & 1-based sequential index of records in the file/stream.\tabularnewline +\hline +uint7 & bases & number of read bases\tabularnewline +\hline +uint7 & number of blocks & the total number of blocks in this container\tabularnewline +\hline +array & landmarks & the locations of slices in this container as byte offsets from the end of +this container header, used for random access indexing. 
+The landmark count must equal the slice count.\linebreak{} +Since the block before the first slice is the compression header, +landmarks[0] is equal to the byte length of the compression header.\tabularnewline +\hline +uint32 & crc32 & CRC32 hash of the all the preceding bytes in the container.\tabularnewline +\hline +byte[ ] & blocks & The blocks contained within the container.\tabularnewline +\hline +\end{tabular} + +\subsection{\textbf{CRAM header container}} + +The first container in a CRAM file contains a textual header in a single block, optionally +gzip compressed. This text header currently matches the SAM header specification. Only +gzip is allowed as compression method for this block. The CRAM header container does not +include a compression header block. + +It is recommended to reserve 50\% more space in the CRAM header container than +is required for the SAM header text by optionally padding the container with a second +raw block consisting of all zeroes. This can be used to subsequently expand the header +container in place, such as when updating @SQ records, while preserving the absolute +offsets of all subsequent containers. + +\section{\textbf{Block structure}} +\label{sec:block-struct} + +Containers consist of one or more blocks. Block compression is applied independently +and in addition to any encodings used to compress data within the block. 
The blocks +have the following header structure with the data stored in the `block data' field: + +\begin{tabular}{|l|>{\raggedright}p{120pt}|>{\raggedright}p{260pt}|} +\hline +\textbf{Data type} & \textbf{Name} & \textbf{Value} +\tabularnewline +\hline +byte & method & the block compression method (and first CRAM version): \linebreak{} +0: raw (none)*\linebreak{} +1: gzip\linebreak{} +2: bzip2 (v2.0)\linebreak{} +3: lzma (v3.0)\linebreak{} +4: rans4x8 (v3.0)\linebreak{} +5: rans4x16 (v3.1)\linebreak{} +6: adaptive arithmetic coder (v3.1)\linebreak{} +7: fqzcomp (v3.1)\linebreak{} +8: name tokeniser (v3.1) +\tabularnewline +\hline +byte & block content type id & the block content type identifier\tabularnewline +\hline +uint7 & block content id & the block content identifier used to associate external +data blocks with data series\tabularnewline +\hline +uint7 & size in bytes* & size of the block data after applying block compression\tabularnewline +\hline +uint7 & raw size in bytes* & size of the block data before applying block compression\tabularnewline +\hline +byte[ ] & block data & the data stored in the block:\linebreak{} +-- bit stream of CRAM records (core data block)\linebreak{} +-- byte stream (external data block)\linebreak{} +-- additional fields (header blocks)\tabularnewline +\hline +uint32 & CRC32 & CRC32 hash value for all preceding bytes in the block\tabularnewline +\hline +\end{tabular} + +* Note on raw method: both compressed and raw sizes must be set to the same value. 
+ +\subsection{\textbf{Block content types}} + +CRAM has the following block content types: + +\begin{threeparttable}[t] +\begin{tabular}{|>{\raggedright}p{143pt}|>{\raggedright}p{45pt}|>{\raggedright}p{116pt}|>{\raggedright}p{114pt}|} +\hline +\textbf{Block content type} & \textbf{Block content type id} & \textbf{Name} & \textbf{Contents}\tabularnewline +\hline +FILE\_HEADER & 0 & CRAM header block & CRAM header\tabularnewline +\hline +COMPRESSION\_HEADER & 1 & Compression header block & See specific section\tabularnewline +\hline +SLICE\_HEADER & 2 & Slice header block & See specific section\tabularnewline +\hline + & 3 & & reserved\tabularnewline +\hline +EXTERNAL\_DATA & 4 & external data block & data produced by external encodings\tabularnewline +\hline +CORE\_DATA & 5 & core data block & bit stream of all encodings except for external encodings\tabularnewline +\hline +\end{tabular} +\end{threeparttable} + +\subsection{\textbf{Block content id}} + +Block content id is used to distinguish between external blocks in the same slice. +Each external encoding has an id parameter which must be one of the external block +content ids. For external blocks the content id is a positive integer. For all +other blocks content id should be 0. Consequently, all external encodings must +not use content id less than 1. + +\subsubsection*{Data blocks} + +Data is stored in data blocks. There are two types of data blocks: core data blocks +and external data blocks. The difference between core and external data blocks is +that core data blocks consist of data series that are compressed using bit encodings +while the external data blocks are byte compressed. One core data block and any +number of external data blocks are associated with each slice. + +Writing to and reading from core and external data blocks is organised through +CRAM records. Each data series is associated with an encoding. 
In case of external +encodings the block content id is used to identify the block where the data series +is stored. Please note that external blocks can have multiple data series associated +with them; in this case the values from these data series will be interleaved. + + +\subsection{\textbf{CRAM header block}} + +The SAM header is stored in a single block within the first container. + +The following constraints apply to the SAM header: + +\begin{itemize} +\item The SQ:MD5 checksum is required unless the reference sequence has been embedded +into the file. +\end{itemize} + +\subsection{\textbf{Compression header block}} +\label{subsec:compression-header} + +The compression header block consists of 3 parts: preservation map, data series +encoding map and tag encoding map. + +These are meta-data on what is stored in the following slices, how it is encoded, and in which blocks the raw byte streams from these encodings reside. + +\begin{tabular}{|l|>{\raggedright}p{120pt}|>{\raggedright}p{260pt}|} +\hline +\textbf{Data type} & \textbf{Name} & \textbf{Value} +\tabularnewline +\hline +map & preservation map & meta-data about types of data stored and dictionaries \tabularnewline +\hline +map & data series encoding map & meta-data for how data series are encoded \tabularnewline +\hline +map & tag encoding map & meta-data for how tags are encoded \tabularnewline +\hline +\end{tabular} + +\subsubsection*{Preservation map} + +The preservation map contains information about which data was preserved in the +CRAM file. 
It is stored as a map with byte[2] keys: + +\begin{tabular}{|l|l|>{\raggedright}p{100pt}|>{\raggedright}p{220pt}|} +\hline +\textbf{Key} & \textbf{Value data type} & \textbf{Name} & \textbf{Value}\tabularnewline +\hline +RN & bool & read names included & true if read names are preserved for all reads\tabularnewline +\hline +AP & bool & AP data series delta & true if AP data series is delta, false otherwise\tabularnewline +\hline +RR & bool & reference required & true if reference sequence is required to restore +the data completely\tabularnewline +\hline +SM & byte[5] & substitution matrix & substitution matrix\tabularnewline +\hline +TD & array & tag ids dictionary & a list of lists of tag ids, see tag encoding +section\tabularnewline +\hline +QO & bool & qual orientation & true if quality values are in the same orientation as sequence. If false quality values are recorded in the orientation as produced by the sequencing instrument, which may also still match sequence orientation.\tabularnewline +\hline +\end{tabular} + +The boolean values are optional, defaulting to true when absent, although it is recommended to explicitly set them. SM and TD are mandatory. + +\subsubsection*{Data series encoding map} + +Each data series has an encoding. 
These encoding are stored in a map with byte[2] +keys and are decoded in approximately this order\footnote{The precise order is defined in section~\ref{sec:record}.}: + +\begin{threeparttable}[t] +\begin{tabular}{|l|l|>{\raggedright}p{100pt}|>{\raggedright}p{220pt}|} +\hline +\textbf{Key} & \textbf{Value data type} & \textbf{Name} & \textbf{Value}\tabularnewline +\hline +BF & encoding\texttt{<}uint\texttt{>} & BAM bit flags & see separate section\tabularnewline +\hline +CF & encoding\texttt{<}uint\texttt{>} & CRAM bit flags & see specific section\tabularnewline +\hline +RI & encoding\texttt{<}uint\texttt{>} & reference id & record reference id from +the SAM file header\tabularnewline +\hline +RL & encoding\texttt{<}uint\texttt{>} & read lengths & read lengths\tabularnewline +\hline +AP & encoding\texttt{<}sint\texttt{>} & in-seq positions & if \textbf{AP-Delta} = true: 0-based alignment start +delta from the AP value in the previous record. +Note this delta may be negative, for example when switching references in a multi-reference slice. +When the record is the first in the slice, the previous position used is the slice alignment-start field (hence the first delta should be zero for single-reference slices, or the AP value itself for multi-reference slices). \linebreak{} +if \textbf{AP-Delta} = false: encodes the alignment start position directly\tabularnewline +\hline +RG & encoding\texttt{<}sint\texttt{>} & read groups & read groups. 
Special value +`-1' stands for no group.\tabularnewline +\hline +RN\tnote{a} & encoding\texttt{<}byte[ ]\texttt{>} & read names & read names\tabularnewline +\hline +MF & encoding\texttt{<}uint\texttt{>} & next mate bit flags & see specific section\tabularnewline +\hline +NS & encoding\texttt{<}uint\texttt{>} & next fragment reference sequence id & reference +sequence ids for the next fragment \tabularnewline +\hline +NP & encoding\texttt{<}uint\texttt{>} & next mate alignment start & alignment positions +for the next fragment\tabularnewline +\hline +TS & encoding\texttt{<}uint\texttt{>} & template size & template sizes\tabularnewline +\hline +NF & encoding\texttt{<}uint\texttt{>} & distance to next fragment & number of records +to the next fragment\tnote{b}\tabularnewline +\hline +TL\tnote{c} & encoding\texttt{<}uint\texttt{>} & tag ids & list of tag ids, see tag encoding +section\tabularnewline +\hline +FN & encoding\texttt{<}uint\texttt{>} & number of read features & number of read +features in each record\tabularnewline +\hline +FC & encoding\texttt{<}byte\texttt{>} & read features codes & see separate section\tabularnewline +\hline +FP & encoding\texttt{<}uint\texttt{>} & in-read positions & positions of the read +features\tabularnewline +\hline +DL & encoding\texttt{<}uint\texttt{>} & deletion lengths & base-pair deletion lengths\tabularnewline +\hline +BB & encoding\texttt{<}byte[ ]\texttt{>} & stretches of bases & bases\tabularnewline +\hline +QQ & encoding\texttt{<}byte[ ]\texttt{>} & stretches of quality scores & quality scores\tabularnewline +\hline +BS & encoding\texttt{<}byte\texttt{>} & base substitution codes & base substitution +codes\tabularnewline +\hline +IN & encoding\texttt{<}byte[ ]\texttt{>} & insertion & inserted bases\tabularnewline +\hline +RS & encoding\texttt{<}uint\texttt{>} & reference skip length & number of skipped +bases for the `N' read feature\tabularnewline +\hline +PD & encoding\texttt{<}uint\texttt{>} & padding & number of 
padded bases\tabularnewline
+\hline
+HC & encoding\texttt{<}uint\texttt{>} & hard clip & number of hard clipped bases\tabularnewline
+\hline
+SC & encoding\texttt{<}byte[ ]\texttt{>} & soft clip & soft clipped bases\tabularnewline
+\hline
+MQ & encoding\texttt{<}uint\texttt{>} & mapping qualities & mapping quality scores\tabularnewline
+\hline
+BA & encoding\texttt{<}byte\texttt{>} & bases & bases\tabularnewline
+\hline
+QS & encoding\texttt{<}byte\texttt{>} & quality scores & quality scores\tabularnewline
+\hline
+\end{tabular}
+
+\begin{tablenotes}
+\item[a] Note RN is decoded after MF if the record is detached from the mate and we are attempting to auto-generate read names.
+\item[b] The count is reset for each slice so NF can only refer to a record later within this slice.
+\item[c] Decode of TL is followed by decoding the tag values themselves, in order of appearance in the tag dictionary.
+\end{tablenotes}
+\end{threeparttable}
+
+\subsubsection*{Tag encoding map}
+\label{subsubsec:tags}
+
+The tag dictionary (TD) describes the unique combinations of tag id / type that occur on each alignment record.
+For example if we search the id / types present in each record and find only two combinations -- X1:i BC:Z SA:Z and X1:i BC:Z -- then we have two dictionary entries in the TD map.
+
+Let $L_{i}=\{T_{i0}, T_{i1}, \ldots, T_{ix}\}$ be a list of all tag ids for a record $R_{i}$, where $i$ is the sequential record index and $T_{ij}$ denotes $j$-th tag id in the record.
+The list of unique $L_{i}$ is stored as the TD value in the preservation map.
+Maintaining the order is not a requirement for encoders (hence ``combinations''), but it is permissible and thus different permutations, each encoded with their own elements in TD, should be supported by the decoder.
+Each $L_{i}$ element in TD is assigned a sequential integer number starting with 0.
+These integer numbers are referred to by the TL data series. 
+
+Using TD, an integer from the TL data series can be mapped back into a list of tag ids.
+Thus per alignment record we only need to store tag values and not their ids and types.
+
+The TD is written as a byte array consisting of $L_{i}$ values separated with \textbackslash{}0.
+Each $L_{i}$ value is written as a concatenation of 3 byte $T_{ij}$ elements: tag id followed by BAM tag type code (one of A, c, C, s, S, i, I, f, F, Z, H or B, as described in the SAM specification) or \texttt{*}.
+Type code \texttt{*} is used as a placeholder to mark the presence and location of the MD, NM or RG tags.
+
+For example the TD for tag lists X1:i BC:Z SA:Z and X1:i MD:Z NM:i BC:Z may be encoded as\\
+X1CBCZSAZ\textbackslash{}0X1CMD*NM*BCZ\textbackslash{}0, with X1C indicating a 1 byte unsigned value for tag X1 and an assumption that all MD and NM tags present can be auto-generated.
+
+\subsubsection*{Tag values}
+
+The encodings used for different tags are stored in a map.
+The key is 3 bytes formed from the BAM tag id and type code, matching the TD dictionary described above.
+Unlike the Data Series Encoding Map, the key is stored in the map as a uint7 encoded integer, constructed using $(char1<<16) + (char2<<8) + type$.
+For example, the 3-byte representation of OQ:Z is \{0x4F, 0x51, 0x5A\} and these bytes are interpreted as the integer key 0x004F515A, leading to a uint7 byte stream \{0x82, 0xbd, 0xa2, 0x5a\}.
+
+\begin{tabular}{|l|l|l|>{\raggedright}p{160pt}|}
+\hline
+\textbf{Key} & \textbf{Value data type} & \textbf{Name} & \textbf{Value}
+\tabularnewline
+\hline
+TAG ID 1:TAG TYPE 1 & encoding\texttt{<}byte[ ]\texttt{>} & read tag 1 & tag values
+(names and types are available in the data series code)\tabularnewline
+\hline
+... & & ... & ...\tabularnewline
+\hline
+TAG ID N:TAG TYPE N & encoding\texttt{<}byte[ ]\texttt{>} & read tag N & ...\tabularnewline
+\hline
+\end{tabular}
+
+Note that tag values are encoded as array of bytes. 
The routines to convert tag
+values into byte array and back are the same as in BAM with the exception of value
+type being captured in the tag key rather than in the value.
+Hence consuming 1 byte for types `C' and `c', 2 bytes for types `S' and `s', 4 bytes for types `I', `i' and `f', and a variable number of bytes for types `H', `Z' and `B'.
+
+\subsection{\textbf{Slice header block}}
+
+The slice header block is never compressed (block method=raw). For reference mapped
+reads the slice header also defines the reference sequence context of the data
+blocks associated with the slice. Mapped reads can be stored along with
+\textbf{placed unmapped}\footnote{Unmapped reads can be \textit{placed} or \textit{unplaced}.
+A read that is unmapped according to bit 0x4 of the BF (BAM bit flags)
+data series, but has position and reference fields filled in, is
+\textit{placed unmapped}. In contrast, \textit{unplaced unmapped}
+reads have a reference sequence ID of -1 and alignment position of 0.}
+reads on the same reference within the same slice.
+
+Slices with the Multiple Reference flag (-2) set as the sequence ID in the header may contain reads
+mapped to multiple external references, including unmapped\footnotemark[\value{footnote}] reads (placed on these references or unplaced),
+but multiple embedded references cannot be combined in this way. When multiple references are
+used, the RI data series will be used to determine the reference sequence ID for each record. This
+data series is not present when only a single reference is used within a slice.
+
+The Unmapped (-1) sequence ID in the header is for slices containing only unplaced
+unmapped\footnotemark[\value{footnote}] reads.
+
+A slice containing data that does not use the external reference in
+any sequence may set the reference MD5 sum to zero. This can happen
+because the data is unmapped or the sequence has been stored verbatim
+instead of via reference-differencing. 
This latter scenario is +recommended for unsorted or non-coordinate-sorted data. + +The slice header block contains the following fields. + +\begin{tabular}{|l|l|>{\raggedright}p{200pt}|} +\hline +\textbf{Data type} & \textbf{Name} & \textbf{Value}\tabularnewline +\hline +int7\textcolor{gray}{FIXME} & reference sequence id & reference sequence identifier or\linebreak{} +-1 for unmapped reads\linebreak{} +-2 for multiple reference sequences.\linebreak{} +This value must match that of its enclosing container.\tabularnewline +\hline +uint7 & alignment start & the alignment start position.\linebreak{} +0 if the slice is multiple-reference +or contains unmapped unplaced reads\tabularnewline +\hline +uint7 & alignment span & the length of the alignment.\linebreak{} +0 if the slice is multiple-reference +or contains unmapped unplaced reads\tabularnewline +\hline +uint7 & number of records & the number of records in the slice\tabularnewline +\hline +uint7 & record counter & 1-based sequential index of records in the file/stream\tabularnewline +\hline +uint7 & number of blocks & the number of blocks in the slice\tabularnewline +\hline +uint7[ ] & block content ids & block content ids of the blocks in the slice\tabularnewline +\hline +int7\textcolor{gray}{FIXME} & embedded reference bases block content id & block content id for the embedded +reference sequence bases or -1 \textcolor{gray}{FIXME: why not 0?} for none\tabularnewline +\hline +byte[16] & reference md5 & MD5 checksum of the reference bases within the slice +boundaries. If this slice has reference sequence id of -1 (unmapped) or -2 (multi-ref) +the MD5 should be 16 bytes of \textbackslash{}0. For embedded references, the MD5 +can either be all-zeros or the MD5 of the embedded sequence.\tabularnewline +\hline +byte[] & optional tags & a series of tag,type,value tuples encoded as +per BAM auxiliary fields.\tabularnewline +\hline +\end{tabular} + +The optional tags are encoded in the same manner as BAM tags. I.e. 
a
+series of binary encoded tags concatenated together where each tag
+consists of a 2 byte key (matching [A-Za-z][A-Za-z0-9]) followed by a
+1 byte type ([AfZHcCsSiIB]) followed by a string of bytes in a format
+defined by the type.
+
+Tags starting with a capital letter are reserved while lowercase ones or
+those starting with X, Y or Z are user definable. Any tag not
+understood by a decoder should be skipped over without producing an
+error.
+
+At present no tags are defined, but potential uses include additional
+checksums and statistical information.
+
+% Details omitted until we fully work through all the corner cases,
+% such as seq/qual of *.
+%
+% Reserved tags are defined as follows:
+%
+% \begin{tabular}{|l|l|>{\raggedright}p{325pt}|}
+% \hline
+% \textbf{Tag type} & \textbf{BAM format} & \textbf{Meaning}\tabularnewline
+% \hline
+% BD & i & Sum over all reads of the CRC32 hash of sequence base. This
+% may be used to validate round-trips in and out of CRAM.
+% calls\tabularnewline
+% \hline
+% SD & i & Sum over all reads of the CRC32 hash of quality scores. (If
+% the quality string is ``*'' in SAM then the hash is of the BAM encoded
+% version - a string of bytes with value 255.)\tabularnewline
+% \hline
+% \end{tabular}
+
+
+\subsection{\textbf{Core data block}}
+
+{\color{gray}
+FIXME: candidate for culling, in order to simplify the format.
+}
+A core data block is a bit stream (most significant bit first) consisting of data from one
+or more CRAM records. Please note that one byte could hold more than one CRAM record
+as a minimal CRAM record could be just a few bits long. The core data block has
+the following fields:
+
+\begin{tabular}{|l|>{\raggedright}p{120pt}|>{\raggedright}p{260pt}|}
+\hline
+\textbf{Data type} & \textbf{Name} & \textbf{Value}
+\tabularnewline
+\hline
+bit[ ] & CRAM record 1 & The first CRAM record\tabularnewline
+\hline
+... & ... 
& ...\tabularnewline +\hline +bit[ ] & CRAM record N & The Nth CRAM record \tabularnewline +\hline +\end{tabular} + +\subsection{\textbf{External data blocks}} + +The relationship between the core data block and external data blocks is shown in the following +picture: + +\begin{center} +\begin{tikzpicture} + +\usetikzlibrary{shapes, shadows, positioning, arrows, decorations.markings} + +\pgfdeclarelayer{background} +\pgfsetlayers{background,main} + +\tikzstyle{dsbox} = [blockbox, text width=6em, minimum width=6em, minimum height=3em, rounded corners, drop shadow] +\tikzstyle{blockbox}=[draw, fill=white, text width=4.0em, text centered, minimum height=1.0em, drop shadow] +\tikzstyle{encodedblocklarge} = [blockbox, text width=6em, minimum width=8em, minimum height=4em, drop shadow] +\tikzstyle{encodedblock} = [blockbox, text width=6em, minimum width=8em, minimum height=3em, drop shadow] + +\tikzstyle{texto} = [above, text width=8em, text centered] + +\newcommand{\cramRecord}[6]{% +\begin{pgfonlayer}{background} + \path (#1.west |- #1.north) + (-0.5, 0.5) node (a1) {}; + \path (#1.east |- #6.south) + (+0.5,-0.25) node (a2) {}; + \path[fill=brown!10,rounded corners, draw=black!50, dashed] + (a1) rectangle (a2); + \path (a1.east |- a1.south) + (1.0,-0.3) node[texto] + {\scriptsize\textbf{CRAM Record}}; +\end{pgfonlayer}} + +\newcommand{\dsbox}[2]{node (p#1) [dsbox] +{\scriptsize{#2}}} +\newcommand{\encodedblock}[2]{node (p#1) [encodedblock] + {\scriptsize{#2}}} +\newcommand{\encodedblocklarge}[2]{node (p#1) [encodedblocklarge] + {\scriptsize{#2}}} + +\path +(-2.5,-1.5) \dsbox{1}{Bit Flags}; +\path (p1.south)+(0.0,-1.0) \dsbox{2}{Alignment Start}; +\path (p2.south)+(0.0,-1.0) \dsbox{3}{Read Length}; +\path (p3.south)+(0.0,-1.0) \dsbox{4}{Bases}; +\path (p4.south)+(0.0,-1.0) \dsbox{5}{Quality Scores}; +\path (p5.south)+(0.0,-1.0) \dsbox{6}{Tags...}; + +\cramRecord{p1}{p2}{p3}{p4}{p5}{p6} + +\newcommand{\blockStreams}[4]{% +\begin{pgfonlayer}{background} + \path (#1.west 
|- #1.north)+(-0.5, .7) node (a1) {}; + \path (#1.east |- #1.south)+(+0.5, -0.25) node (a2) {}; + \path (#2.west |- #2.north)+(-0.5, 0.5) node (a3) {}; + \path (#4.east |- #4.south)+(+0.5, -0.25) node (a4) {}; + \path[fill=yellow!10, rounded corners, draw=black!50, dashed] + (a1) rectangle (a2); + \path (a1.east |- a1.south)+(0.8,-0.3) node[texto] + {\scriptsize\textbf{Bit Stream}}; + \path[fill=yellow!10, rounded corners, draw=black!50, dashed] + (a3) rectangle (a4); + \path (a3.east |- a3.south)+(1.0,-0.3) node[texto]{\scriptsize\textbf{Byte Streams}}; +\end{pgfonlayer}} + +\path (a1.south) + (12.0, -2.3) \encodedblocklarge {7} {Core Data Block}; +\path (p7.south) + (0.0, -2.0) \encodedblock {8} {External Data Block}; +\path (p8.south) + (0.0, -1.0) \encodedblock {9} {External Data Block}; +\path (p9.south) + (0.0, -1.0) \encodedblock {10}{External Data Block...}; + +\blockStreams {p7} {p8} {p9} {p10} + +\node[minimum size=4cm, right = of p2, single arrow, draw, single arrow head indent=0ex, black] {Bit encodings (Huffman, ...) }; + +\node[minimum size=4cm, right = of p5, single arrow, draw, single arrow head indent=0ex, black] {Byte encodings (external, ...) }; + +\end{tikzpicture} + +Figure 5: The relationship between core and external encodings, and core and external data blocks. + +\end{center} + +{\color{gray} +FIXME: we may also wish to enforce one data series to one external +block (in which case the block IDs may as well also be fixed)? This +is a restriction, but also a simplification and matches current +practice (check Java)? +} +The picture shows how a CRAM record (on the left) is distributed between the core +data block and one or more external data blocks, via core or external encodings. The specific +encodings presented are only examples for purposes of illustration. 
The main point is +to distinguish between core bit encodings whose output is always stored in a core data +block, and external byte encodings whose output is always stored in external data +blocks. + +\section{\textbf{End of file container}} + +A special container is used to mark the end of a file or stream. It is required in version 3 or later. +The idea is to provide an easy and a quick way to detect that a CRAM file or stream is complete. +The marker is an empty container with ref seq id set to -1 (unaligned) and alignment start set to 4542278 (which is ``EOF'' when seen in ASCII). + +It is recommended that implementations of CRAM validate EOF by checking these values rather than direct comparison of byte values, as these checks will be valid for all versions of CRAM + +\section{\textbf{Record structure}} +\label{sec:record} + +CRAM record is based on the SAM record but has additional features allowing for +more efficient data storage. In contrast to BAM record CRAM record uses bits as +well as bytes for data storage. This way, for example, various coding techniques +which output variable length binary codes can be used directly in CRAM. On the +other hand, data series that do not require binary coding can be stored separately +in external blocks with some other compression applied to them independently. + +As CRAM data series may be interleaved within the same blocks\footnote{Interleaving can sometimes provide better compression, however it also adds dependency between types of data meaning it is not possible to selectively decode one data series if it co-locates with another data series in the same block.} understanding the order in which CRAM data series must be decoded is vital. + +The overall flowchart is below, with more detailed description in the subsequent sections. + +\algnewcommand\algorithmicto{\text{ \textbf{to} }} + +\subsection{\textbf{CRAM record}} + +Both mapped and unmapped reads start with the following fields. 
Please note that +the data series type refers to the logical data type and the data series name corresponds +to the data series encoding map. + +\begin{tabular}{|>{\raggedright}p{70pt}|>{\raggedright}p{75pt}|>{\raggedright}p{90pt}|>{\raggedright}p{171pt}|} +\hline +\textbf{Data series type} & \textbf{Data series name} & \textbf{Field} & \textbf{Description}\tabularnewline +\hline +uint & BF & BAM bit flags & see BAM bit flags below\tabularnewline +\hline +uint & CF & CRAM bit flags & see CRAM bit flags below\tabularnewline +\hline +- & - & Positional data & See section \ref{subsec:positions}\tabularnewline +\hline +- & - & Read names & See section \ref{subsec:names}\tabularnewline +\hline +- & - & Mate records & See section \ref{subsec:mate}\tabularnewline +\hline +- & - & Auxiliary tags & See section \ref{subsec:tags}\tabularnewline +\hline +- & - & Sequences & See sections \ref{subsec:mapped} and \ref{subsec:unmapped}\tabularnewline +\hline +\end{tabular} + +\subsubsection*{\textbf{BAM bit flags (BF data series)}} + +The following flags are duplicated from the SAM and BAM specification, with identical meaning. +Note however some of these flags can be derived during decode, so may be omitted in the CRAM file and the bits computed based on both reads of a pair-end library residing within the same slice. 
+
+\begin{threeparttable}[t]
+\begin{tabular}{|>{\raggedright}p{144pt}|>{\raggedright}p{144pt}|>{\raggedright}p{144pt}|}
+\hline
+\textbf{Bit flag} & \textbf{Comment} & \textbf{Description}\tabularnewline
+\hline
+0x1 & & template having multiple segments in sequencing\tabularnewline
+\hline
+0x2 & & each segment properly aligned according to the aligner\tabularnewline
+\hline
+0x4 & & segment unmapped\tnote{a}\tabularnewline
+\hline
+0x8 & calculated\tnote{b}\ \ or stored in the mate's info & next segment in template unmapped\tabularnewline
+\hline
+0x10 & & SEQ being reverse complemented\tabularnewline
+\hline
+0x20 & calculated\tnote{b}\ \ or stored in the mate's info & SEQ of the next segment in the
+template being reversed\tabularnewline
+\hline
+0x40 & & the first segment in the template\tnote{c}\tabularnewline
+\hline
+0x80 & & the last segment in the template\tnote{c}\tabularnewline
+\hline
+0x100 & & secondary alignment\tabularnewline
+\hline
+0x200 & & not passing quality controls\tabularnewline
+\hline
+0x400 & & PCR or optical duplicate\tabularnewline
+\hline
+0x800 & & Supplementary alignment\tabularnewline
+\hline
+\end{tabular}
+\begin{tablenotes}
+\item[a] Bit 0x4 is the only reliable place to tell whether the read is unmapped. If 0x4 is set, no assumptions may be made about bits 0x2, 0x100 and 0x800.
+\item[b] For segments within the same slice.
+\item[c] Bits 0x40 and 0x80 reflect the read ordering within each template inherent in the sequencing technology used, which may be independent from the actual mapping orientation.
+If 0x40 and 0x80 are both set, the read is part of a linear template (one where the template sequence is expected to be in a linear order), but it is neither the first nor the last read.
+If both 0x40 and 0x80 are unset, the index of the read in the template is unknown. 
+This may happen for a non-linear template (such as one constructed by stitching together other templates) or when this information is lost during data processing. +\end{tablenotes} +\end{threeparttable} + +\subsubsection*{\textbf{CRAM bit flags (CF data series)}} + +The CRAM bit flags (also known as compression bit flags) expressed as an integer represent the CF data series. +The following compression flags are defined for each CRAM read record: + +\begin{tabular}{|>{\raggedright}p{39pt}|>{\raggedright}p{150pt}|>{\raggedright}p{242pt}|} +\hline +\textbf{Bit flag} & \textbf{Name} & \textbf{Description}\tabularnewline +\hline +0x1 & quality scores stored as array & quality scores can be stored as read features +or as an array similar to read bases.\tabularnewline +\hline +0x2 & detached & mate information is stored verbatim (e.g. because the pair spans multiple slices or the fields differ to the CRAM computed method)\tabularnewline +\hline +0x4 & has mate downstream & tells if the next segment should be expected further +in the stream\tabularnewline +\hline +0x8 & decode sequence as ``*'' & informs the decoder that the sequence +is unknown and that any encoded reference differences are present only to +recreate the CIGAR string.\tabularnewline +\hline +0x10 & explicit template size & decode from the template size (TS) data series even for record in an attached pair.\tabularnewline +\hline +\end{tabular} + + +The following pseudocode describes the general process of decoding an entire CRAM record. +The sequence data itself is in one of two encoding formats depending on whether the record is aligned (mapped). 
+
+\subsubsection*{\textbf{Decode pseudocode}}
+\newlength{\maxwidth}
+\newcommand{\algalign}[2] % #1 = text to left, #2 = text to right
+{\makebox[\maxwidth][l]{$#1{}$}${}#2$}
+
+\begin{algorithmic}[1]
+\Procedure{DecodeRecord}{}
+\settowidth{\maxwidth}{CRAM\_flags\quad}
+\State \algalign{BAM\_flags}{\gets} \Call{ReadItem}{BF, Integer}
+\State \algalign{CRAM\_flags}{\gets} \Call{ReadItem}{CF, Integer}
+\State \Call{DecodePositions}{}\Comment{See section \ref{subsec:positions}}
+\State \Call{DecodeNames}{}\Comment{See section \ref{subsec:names}}
+\State \Call{DecodeMateData}{}\Comment{See section \ref{subsec:mate}}
+\State \Call{DecodeTagData}{}\Comment{See section \ref{subsec:tags}}
+\Statex
+
+\If{$(BF$ AND $4) = 0$}\Comment{Unmapped flag not set}
+ \State \Call{DecodeMappedRead}{}\Comment{See section \ref{subsec:mapped}}
+\Else
+ \State \Call{DecodeUnmappedRead}{}\Comment{See section \ref{subsec:unmapped}}
+\EndIf
+\EndProcedure
+\end{algorithmic}
+
+\subsection{\textbf{CRAM positional data}}
+\label{subsec:positions}
+
+Following the bit-wise BAM and CRAM flags, CRAM encodes positional related data including reference, alignment positions and length, and read-group.
+Positional data is stored for both mapped and unmapped sequences, as unmapped data may still be ``placed'' at a specific location in the genome (without being aligned).
+Typically this is done to keep a sequence pair (paired-end or mate-pair sequencing libraries) together when one of the pair aligns and the other does not.
+
+For reads stored in a position-sorted slice, the AP-delta flag in the compression header preservation map should be set and the AP data series will be delta encoded, using the slice alignment-start value as the first position to delta against.
+Note for multi-reference slices this may mean that the AP series includes negative values, such as when moving from an alignment to the end of one reference sequence to the start of the next or to unmapped unplaced data. 
When the AP-delta flag is not set the AP data series is stored as a normal integer value. + +\begin{tabular}{|>{\raggedright}p{70pt}|>{\raggedright}p{75pt}|>{\raggedright}p{90pt}|>{\raggedright}p{171pt}|} +\hline +\textbf{Data series type} & \textbf{Data series name} & \textbf{Field} & \textbf{Description}\tabularnewline +\hline +uint & RI & ref id & reference sequence id (only present in multiref slices)\tabularnewline +\hline +uint & RL & read length & the length of the read\tabularnewline +\hline +sint & AP & alignment start & the alignment start position\tabularnewline +\hline +sint & RG & read group & the read group identifier expressed as the N\textsuperscript{th} record in the header, starting from 0 with -1 for no group\tabularnewline +\hline +\end{tabular} + +\vskip 20pt +\begin{algorithmic}[1] +\Procedure{DecodePositions}{} +\If{$slice\_header.reference\_sequence\_id = -2$} + \State $reference\_id\gets$ \Call{ReadItem}{RI, Integer} +\Else + \State $reference\_id\gets slice\_header.reference\_sequence\_id$ +\EndIf +\State $read\_length \gets$ \Call{ReadItem}{RL, Integer} +\If{$container\_pmap.AP\_delta \ne 0$} + \If{$first\_record\_in\_slice$} + \State $last\_position\gets$ $slice\_header.alignment\_start$ + \EndIf + \State $alignment\_position \gets$ \Call{ReadItem}{AP, Integer} + $last\_position$ + \State $last\_position \gets alignment\_position$ +\Else + \State $alignment\_position \gets$ \Call{ReadItem}{AP, Integer} +\EndIf +\State $read\_group \gets$ \Call{ReadItem}{RG, Integer} +\EndProcedure +\end{algorithmic} + +\subsection{Read names (RN data series)} +\label{subsec:names} + +Read names can be preserved in the CRAM format, but this is optional and is governed by the \texttt{RN} preservation map key in the container compression header. See section \ref{subsec:compression-header}. 
+When read names are not preserved the CRAM decoder should generate names, typically based on the file name and a numeric ID of the read using the record counter field of the slice header block. +Note read names may still be preserved even when the \texttt{RN} compression header key indicates otherwise, such as where a read is part of a read-pair and the pair spans multiple slices. +In this situation the record will be marked as detached (see the CF data series) and the mate data below (section \ref{subsec:mate}) will contain the read name. + +\begin{tabular}{|>{\raggedright}p{70pt}|>{\raggedright}p{75pt}|>{\raggedright}p{90pt}|>{\raggedright}p{171pt}|} +\hline +\textbf{Data series type} & \textbf{Data series name} & \textbf{Field} & \textbf{Description}\tabularnewline +\hline +byte[] & RN & read names & read names\tabularnewline +\hline +\end{tabular} + +\vskip 20pt +\begin{algorithmic}[1] +\Procedure{DecodeNames}{} +\State $read\_name \gets empty$ +\If{$container\_pmap.read\_names\_included = 1$} + \State $read\_name \gets$ \Call{ReadItem}{RN, Byte[]} +\EndIf +\Statex +\EndProcedure +\end{algorithmic} + +\subsection{\textbf{Mate record}} +\label{subsec:mate} + +There are two ways in which mate information can be preserved in CRAM: number of records downstream (distance, within this slice) to the next fragment in the template and a special mate record if the next fragment is not in the current slice. +In the latter case the record is labelled as ``detached'', see the CF data series. + +For mates within the slice only the distance is captured, and only for the first record. The mate has neither detached nor downstream flags set in the CF data series. 
+ +\begin{tabular}{|>{\raggedright}p{68pt}|>{\raggedright}p{115pt}|>{\raggedright}p{228pt}|} +\hline +\textbf{Data series type} & \textbf{Data series name} & \textbf{Description}\tabularnewline +\hline +uint & NF & the number of records to the next fragment\tabularnewline +\hline +\end{tabular} + +In the above case, the NS (mate reference name), NP (mate position) and TS (template size) fields for both records should be derived once the mate has also been decoded. +Mate reference name and position are obvious and simply copied from the mate. +The template size is computed using the method described in the SAM specification; the inclusive distance from the leftmost to rightmost mapped bases with the sign being positive for the leftmost record and negative for the rightmost record. + +If the next fragment is not found within this slice then the following structure is included into the CRAM record. +Note there are cases where read-pairs within the same slice may be marked as detached and use this structure, such as to store mate-pair information that does not match the algorithm used by CRAM for computing the mate data on-the-fly. + +\begin{tabular}{|>{\raggedright}p{66pt}|>{\raggedright}p{117pt}|>{\raggedright}p{228pt}|} +\hline +\textbf{Data series type} & \textbf{Data series name} & \textbf{Description}\tabularnewline +\hline +uint & MF & next mate bit flags, see table below\tabularnewline +\hline +byte[ ] & RN & the read name (if and only if not known already)\tabularnewline +\hline +uint & NS & mate reference sequence identifier \tabularnewline +\hline +uint & NP & mate alignment start position \tabularnewline +\hline +uint & TS & the size of the template (insert size)\tabularnewline +\hline +\end{tabular} + +\subsubsection*{Next mate bit flags (MF data series)} + +The next mate bit flags expressed as an integer represent the MF data series. +These represent the missing bits we excluded from the BF data series (when compared to the full SAM/BAM flags). 
+The following bit flags are defined: + +\begin{tabular}{|>{\raggedright}p{47pt}|>{\raggedright}p{134pt}|>{\raggedright}p{250pt}|} +\hline +\textbf{Bit flag} & \textbf{Name} & \textbf{Description}\tabularnewline +\hline +0x1 & mate negative strand bit & the bit is set if the mate is on the negative +strand\tabularnewline +\hline +0x2 & mate mapped bit & the bit is set if the mate is mapped\tabularnewline +\hline +\end{tabular} + + +\subsubsection*{\textbf{Decode mate pseudocode}} + +In the following pseudocode we are assuming the current record is $this$ and its mate is $next\_frag$. + +\begin{algorithmic}[1] +\Procedure{DecodeMateData}{} +\If{$CF\bitand 2$}\Comment{Detached from mate} + \State $mate\_flags\gets $ \Call{ReadItem}{MF,Integer} + \If{$mate\_flags\bitand 1$} + \State $bam\_flags\gets bam\_flags\bitor$ 0x20\Comment{Mate is reverse-complemented} + \EndIf + \If{$mate\_flags\bitand 2$} + \State $bam\_flags\gets bam\_flags\bitor$ 0x08\Comment{Mate is unmapped} + \EndIf + \If{$container\_pmap.read\_names\_included \ne 1$} + \State $read\_name \gets$ \Call{ReadItem}{RN, Byte[]} + \EndIf +\settowidth{\maxwidth}{mate\_position\ } +\State \algalign{mate\_ref\_id}{\gets} \Call{ReadItem}{NS, Integer} +\State \algalign{mate\_position}{\gets} \Call{ReadItem}{NP, Integer} +\State \algalign{template\_size}{\gets} \Call{ReadItem}{TS, Integer} +\ElsIf{$CF\bitand 16$}\Comment{Explicit template size} + \State $template\_size\gets$ \Call{ReadItem}{TS, Integer} +\EndIf +\EndProcedure +\end{algorithmic} + +Note as with the SAM specification a template may be permitted to have more than two alignment records. +In this case the ``mate'' for each record is considered to be the next record, with the mate for the last record being the first to form a circular list. +The above algorithm is a simplification that does not deal with this scenario. +The full method needs to observe when record $this+NF$ is also labelled as having an additional mate downstream. 
+One recommended approach is to resolve the mate information in a second pass, once the entire slice has been decoded. +The final segment in the mate chain needs to set $bam\_flags$ fields 0x20 and 0x08 accordingly based on the first segment. +This is also not listed in the above algorithm, for brevity. + +\subsection{Auxiliary tags} +\label{subsec:tags} + +Tags are encoded using a tag line (TL data series) integer into the tag dictionary (TD field in the compression header preservation map, see section \ref{subsec:compression-header}). +See section \ref{subsubsec:tags} for a more detailed description of this process. + +\begin{tabular}{|>{\raggedright}p{70pt}|>{\raggedright}p{75pt}|>{\raggedright}p{90pt}|>{\raggedright}p{200pt}|} +\hline +\textbf{Data series type} & \textbf{Data series name} & \textbf{Field} & \textbf{Description}\tabularnewline +\hline +uint & TL & tag line & an index into the tag dictionary (TD)\tabularnewline +\hline +\textit{various} & \textit{various} & tag name/type & 3 character key (2 tag identifier and 1 tag type), as specified by the tag dictionary\tabularnewline +\hline +\end{tabular} + +\vskip 20pt +\begin{algorithmic}[1] +\Procedure{DecodeTagData}{} +\State $tag\_line\gets$ \Call{ReadItem}{TL,Integer} +\ForAll {$ele \in container\_pmap.tag\_dict(tag\_line)$} + \State $name\gets$ first two characters of $ele$ + \State $tag(type)\gets$ last character of $ele$ + \If{$tag(type) = ``*''$} + \State $tag(name)\gets$ \Call{GenerateTag}{$name$} + \Else + \State $tag(name)\gets$ \Call{ReadItem}{$ele$, Byte[]} + \EndIf +\EndFor +\EndProcedure +\end{algorithmic} + +In the above procedure, $name$ is a two letter tag name and $type$ is one of the permitted types documented in the SAM/BAM specification. 
+Type is \texttt{c} (signed 8-bit integer), \texttt{C} (unsigned 8-bit integer), \texttt{s} (signed 16-bit integer), \texttt{S} (unsigned 16-bit integer), \texttt{i} (signed 32-bit integer), \texttt{I} (unsigned 32-bit integer), \texttt{f} (32-bit float), \texttt{Z} (nul-terminated string), \texttt{H} (nul-terminated string of hex digits) and \texttt{B} (binary data in array format with the first byte being one of c,C,s,S,i,I,f using the meaning above, a 32-bit integer for the number of array elements, followed by array data encoded using the specified format). All integers are little endian encoded.
+
+For example a SAM tag \texttt{MQ:i} has name \texttt{MQ} and type \texttt{i} and will be decoded using one of MQc, MQC, MQs, MQS, MQi and MQI data series depending on size and sign of the integer value.
+
+Note some auxiliary tags can be created automatically during decode so they can optionally be removed by the encoder.
+However if the decoder finds a tag stored verbatim it should use this in preference to automatically computing the value.
+
+The RG (read group) auxiliary tag should be created if the read group (RG data series) value is not $-1$.
+
+The MD and NM auxiliary tags store the differences (an edit string) between the sequence and the reference along with the number of mismatches.
+These may optionally be created on-the-fly during reference-based sequence reconstruction and should match the description provided in the SAMtags document.
+An encoder may decide to store these verbatim when no reference is used or where the automatically constructed values differ from the input data.
+
+The tag type \texttt{*} is used in conjunction with MD, NM and RG as a flag to indicate that this tag was present but is not being stored verbatim.
+(If MD is being stored verbatim, we would expect MDZ in the TD dictionary for this tag line.)
+The \texttt{*} type is also useful for ensuring the ordering of tags does not change. 
+ +Note it is permitted for decoders to choose to auto-generate MD and NM tags even in the absence of MD* and NM* entries. + +\subsection{\textbf{Mapped reads}} +\label{subsec:mapped} + +\subsubsection*{\textbf{Read feature records}} +\label{subsec:features} + +Read features are used to store read details that are expressed using read coordinates +(e.g. base differences respective to the reference sequence). The read feature +records start with the number of read features followed by the read features themselves. +Finally the single mapping quality and per-base quality scores are stored. + +\begin{threeparttable}[t] +\begin{tabular}{|>{\raggedright}p{88pt}|>{\raggedright}p{83pt}|>{\raggedright}p{85pt}|>{\raggedright}p{180pt}|} +\hline +\textbf{Data series type} & \textbf{Data series name} & \textbf{Field} & \textbf{Description}\tabularnewline +\hline +uint & FN & number of read features & the number of read features\tabularnewline +\hline +uint & FP & in-read-position\tnote{a} & position of the read feature\tabularnewline +\hline +byte & FC & read feature code\tnote{a} & See feature codes below\tabularnewline +\hline +* & * & read feature data\tnote{a} & See feature codes below\tabularnewline +\hline +uint & MQ & mapping qualities & mapping quality score\tabularnewline +\hline +byte[read length] & QS & quality scores & the base qualities, if preserved\tabularnewline +\hline +\end{tabular} +\begin{tablenotes} +\item[a] Repeated FN times, once for each read feature. +\end{tablenotes} +\end{threeparttable} + +\subsubsection*{Read feature codes} + +Each feature code has its own associated data series containing further information specific to that feature. 
+The following codes are used to distinguish variations in read coordinates: + +\begin{tabular}{|>{\raggedright}p{91pt}|>{\raggedright}p{45pt}|>{\raggedright}p{72pt}|>{\raggedright}p{66pt}|>{\raggedright}p{132pt}|} +\hline +\textbf{Feature code} & \textbf{Id} & \textbf{Data series type} & \textbf{Data +series name} & \textbf{Description}\tabularnewline +\hline +Bases & b (0x62) & byte[ ] & BB & a stretch of bases\tabularnewline +\hline +Scores & q (0x71) & byte[ ] & QQ & a stretch of scores\tabularnewline +\hline +% Neither C nor Java implementations generator nor can decode the 'A' +% feature code, but if they did they'd be BB/QQ and not BA/QS. Best +% to omit it from published spec for now? +% +% Bases and scores & A (0x41) & byte[ ],byte[ ] & BB,QQ & A a stretch of bases and +% quality scores score\tabularnewline +% \hline +Read base & B (0x42) & byte,byte & BA,QS & A base and associated quality score\tabularnewline +\hline +Substitution & X (0x58) & byte & BS & base substitution codes, SAM operators X, +M and =\tabularnewline +\hline +Insertion & I (0x49) & byte[ ] & IN & inserted bases, SAM operator I\tabularnewline +\hline +Deletion & D (0x44) & uint & DL & number of deleted bases, SAM operator D\tabularnewline +\hline +Insert base & i (0x69) & byte & BA & single inserted base, SAM operator I\tabularnewline +\hline +Quality score & Q (0x51) & byte & QS & single quality score\tabularnewline +\hline +Reference skip & N (0x4E) & uint & RS & number of skipped bases, SAM operator N\tabularnewline +\hline +Soft clip & S (0x53) & byte[ ] & SC & soft clipped bases, SAM operator S\tabularnewline +\hline +Padding & P (0x50) & uint & PD & number of padded bases, SAM operator P\tabularnewline +\hline +Hard clip & H (0x48) & uint & HC & number of hard clipped bases, SAM operator H\tabularnewline +\hline +\end{tabular} + +\subsubsection*{Base substitution codes (BS data series)} + +A base substitution is defined as a change from one nucleotide base (reference base) to 
+another (read base), including N as an unknown or missing base. There are 5 possible reference
+bases (ACGTN), with 4 possible substitutions for each base, and 20 substitutions in total.
+The codes for all possible substitutions are stored in a substitution matrix. To restore a
+base, one would use the reference base and the substitution code, resolving the base via lookup
+in the substitution matrix.
+
+\subsubsection*{Substitution Matrix Format}
+
+Each of the 4 possible substitutions for a given reference base is assigned a 2-bit integer
+code (see below) with a value ranging from 0 to 3 inclusive. The 4 2-bit codes are packed
+into a single byte, high 2-bits first, for each base ACGTN (minus the reference base itself).
+The entire substitution matrix is written as 5 such bytes, one for each reference base, also
+in the order ACGTN.
+
+\subsubsection*{Substitution Code Assignment}
+
+To assign the substitution code for a given reference base/read base, the substitutions for
+each reference base may optionally be sorted by their frequencies, in descending order, with
+same-frequency ties broken using the fixed order ACGTN. Although sorting by substitution
+frequency is not required by the CRAM format, assigning substitution codes based on frequency
+maximizes compression by ensuring that the most frequent substitutions use the shortest possible
+codes.
+
+For example, let us assume the following substitution frequencies for base A:
+
+AC: 15\%
+
+AG: 25\%
+
+AT: 55\%
+
+AN: 5\%
+
+Then the substitution codes are:
+
+AC: 2
+
+AG: 1
+
+AT: 0
+
+AN: 3
+
+The first byte of the substitution matrix entry for reference base A is written as a single byte,
+with the codes in the order CGTN: 10 01 00 11 = 147 decimal, or 0x93 in this case. This will then
+be followed by 4 more bytes representing substitutions for reference bases C, G, T and N. 
+ +\subsubsection*{Decode mapped read pseudocode} + +\begin{algorithmic}[1] +\Procedure{DecodeMappedRead}{} + \State $feature\_number\gets$ \Call{ReadItem}{FN, Integer} + \For{$i\gets 1 \algorithmicto feature\_number$} + \State \Call{DecodeFeature}{} + \EndFor + \State $mapping\_quality\gets$ \Call{ReadItem}{MQ, Integer} + \If{$container\_pmap.preserve\_quality\_scores$} + \If{$(container\_pmap.qual\_orientation = 0) \logand (bam\_flags \bitand 0x10)$} + \For{$i\gets 0 \algorithmicto read\_length-1$} + \State $quality\_score_{read\_length-1-i}\gets$ \Call{ReadItem}{QS, Integer} + \EndFor + \Else + \For{$i\gets 0 \algorithmicto read\_length-1$} + \State $quality\_score_i\gets$ \Call{ReadItem}{QS, Integer} + \EndFor + \EndIf + \EndIf +\EndProcedure +\Statex +\Procedure{DecodeFeature}{} + \settowidth{\maxwidth}{feature\_position\ } + \State \algalign{feature\_code}{\gets} \Call{ReadItem}{FC, Integer} + \State \algalign{feature\_position}{\gets} \Call{ReadItem}{FP, Integer} + \settowidth{\maxwidth}{substitution\_code\ } + \If{$feature\_code = $`B'} + \State \algalign{base}{\gets} \Call{ReadItem}{BA, Byte} + \State \algalign{quality\_score}{\gets} \Call{ReadItem}{QS, Byte} + \ElsIf{$feature\_code = $`X'} + \State \algalign{substitution\_code}{\gets} \Call{ReadItem}{BS, Byte} + \ElsIf{$feature\_code = $`I'} + \State \algalign{inserted\_bases}{\gets} \Call{ReadItem}{IN, Byte[]} + \ElsIf{$feature\_code = $`S'} + \State \algalign{softclip\_bases}{\gets} \Call{ReadItem}{SC, Byte[]} + \ElsIf{$feature\_code = $`H'} + \State \algalign{hardclip\_length}{\gets} \Call{ReadItem}{HC, Integer} + \ElsIf{$feature\_code = $`P'} + \State \algalign{pad\_length}{\gets} \Call{ReadItem}{PD, Integer} + \ElsIf{$feature\_code = $`D'} + \State \algalign{deletion\_length}{\gets} \Call{ReadItem}{DL, Integer} + \ElsIf{$feature\_code = $`N'} + \State \algalign{ref\_skip\_length}{\gets} \Call{ReadItem}{RS, Integer} + \ElsIf{$feature\_code = $`i'} + \State \algalign{base}{\gets} \Call{ReadItem}{BA, 
Byte} + \ElsIf{$feature\_code = $`b'} + \State \algalign{bases}{\gets} \Call{ReadItem}{BB, Byte[]} + \ElsIf{$feature\_code = $`q'} + \State \algalign{quality\_scores}{\gets} \Call{ReadItem}{QQ, Byte[]} + \ElsIf{$feature\_code = $`Q'} + \State \algalign{quality\_score}{\gets} \Call{ReadItem}{QS, Byte} + \EndIf +\EndProcedure +\end{algorithmic} + +\subsection{\textbf{Unmapped reads}} +\label{subsec:unmapped} + +The CRAM record structure for unmapped reads has the following additional fields: + +\begin{tabular}{|>{\raggedright}p{88pt}|>{\raggedright}p{83pt}|>{\raggedright}p{85pt}|>{\raggedright}p{180pt}|} +\hline +\textbf{Data series type} & \textbf{Data series name} & \textbf{Field} & \textbf{Description}\tabularnewline +\hline +byte[read length] & BA & bases & the read bases\tabularnewline +\hline +byte[read length] & QS & quality scores & the base qualities, if preserved\tabularnewline +\hline +\end{tabular} + +\vskip20pt +\begin{algorithmic}[1] +\Procedure{DecodeUnmappedRead}{} + \For{$i\gets 1 \algorithmicto read\_length$} + \State $base\gets$ \Call{ReadItem}{BA, Byte} + \EndFor + \If{$container\_pmap.preserve\_quality\_scores$} + \If{$(container\_pmap.qual\_orientation = 0) \logand (bam\_flags \bitand 0x10)$} + \For{$i\gets 0 \algorithmicto read\_length-1$} + \State $quality\_score_{read\_length-1-i}\gets$ \Call{ReadItem}{QS, Integer} + \EndFor + \Else + \For{$i\gets 0 \algorithmicto read\_length-1$} + \State $quality\_score_i\gets$ \Call{ReadItem}{QS, Integer} + \EndFor + \EndIf + \EndIf +\EndProcedure +\end{algorithmic} + +\subsection{Resolving inter-record dependencies} + +Some entries in a record can only be filled out after decoding +the entire slice, such as the computed PNEXT, RNEXT and TLEN SAM +fields and reversing the deduplication of read names (QNAME). + +Thus \textsc{DecodeRecord} is considered as part of a wider \textsc{DecodeSlice} procedure. +This procedure is rather complicated so best understood with the addition of pseudocode. 
+
+If a record has read the next fragment (NF) data series, then it has some data that has been deduplicated (not stored verbatim for each record in the template) and is expected to be computed.
+All such records will have mate position updated and some BAM flags (mate unmapped, mate reverse) set, as well as setting the paired in sequencing flag.
+
+If such a record also has an undefined template size, meaning the TS data series has not yet been read, this is derived by finding the leftmost and rightmost alignment records and provided they all map to the same reference the template size (SAM TLEN field) is set. Normally the leftmost read has a positive TLEN value, but special care is taken for cases where multiple records are mapped to the leftmost position in which case the record with BAM flag READ1 is defined to be the positive one.
+
+Finally read names (SAM QNAME field) may have been deduplicated too.
+If a record has an empty name and also has the next fragment defined, the record name is copied from this next fragment.
+
+If the read name is still empty, the name is auto-generated. It is suggested that this is based on the global record count in the file and some predefined prefix.
+
+\begin{algorithmic}[1]
+\Procedure{DecodeSlice}{}
+\State \Call{DecodeSliceHeader}{}
+\For{$i\gets 0 \algorithmicto num\_recs-1$}
+  \State \Call{DecodeRecord}{$rec_i$}
+\EndFor
+% FIXME: should this be from num_recs-1 to 0 so next_frag works
+% with 3+ fragments, with last being actual copy? 
+\For{$i\gets 0 \algorithmicto num\_recs-1$}
+  \If{$rec_i.next\_frag \ne undefined$}
+    \State \Call{ResolvePair}{$i$}
+  \EndIf
+  \State \Call{ResolveName}{$i$}
+\EndFor
+\EndProcedure
+\end{algorithmic}
+
+\begin{algorithmic}[1]
+\Procedure{ResolvePair}{$rnum$}
+  \State $i \gets rnum$
+  \If{$rec_i.template\_size = undefined$}
+    \State $(leftmost,\ rightmost,\ same\_ref) \gets$ \Call{FindLeftRight}{$i$}
+    \State $tlen \gets rightmost - leftmost + 1$\Comment{Inclusive leftmost to rightmost distance}
+    \If{$same\_ref = 1$} \Comment{Template entirely mapped to one reference sequence}
+      \State $j \gets i$
+      \Repeat
+        \If{$rec_j.position = leftmost$}
+          \If{$left\_count = 1 \logor rec_j.bam\_flags \bitand$ 0x40}
+            \State $rec_j.template\_size \gets tlen$\Comment{Use READ1 flag as tie-breaker if same location}
+          \Else
+            \State $rec_j.template\_size \gets -tlen$
+          \EndIf
+        \Else
+          \State $rec_j.template\_size \gets -tlen$
+        \EndIf
+        \State $j \gets j + 1 + rec_j.next\_frag$
+      \Until{$j = i$}
+    \Else \Comment{Template spans multiple references}
+      \State $j \gets i$
+      \Repeat
+        \State $rec_j.template\_size \gets 0$
+        \State $j \gets j + 1 + rec_j.next\_frag$
+      \Until{$j = i$}
+    \EndIf
+  \EndIf
+
+  \Statex\Comment{Ensure other pairing data is complete}
+  \State $j \gets i + 1 + rec_i.next\_frag$
+  \State $rec_i.mate\_position \gets rec_j.position$
+  \State $rec_i.bam\_flags \gets rec_i.bam\_flags \bitor 1$ \Comment{Set BAM PAIRED flag}
+  \If{$rec_j.bam\_flags \bitand 16$}\Comment{BAM REVERSE flag}
+    \State $rec_i.bam\_flags \gets rec_i.bam\_flags \bitor 32$
+  \EndIf
+  \If{$rec_j.bam\_flags \bitand 4$}\Comment{BAM UNMAPPED flag}
+    \State $rec_i.bam\_flags \gets rec_i.bam\_flags \bitor 8$
+  \EndIf
+  \State
+\EndProcedure
+\end{algorithmic}
+
+\begin{algorithmic}[1]
+\Function{FindLeftRight}{$rnum$}
+  \State $i \gets rnum$
+  \State $leftmost \gets rec_i.position$\Comment{Find leftmost and rightmost record}
+  \State $rightmost \gets rec_i.end\_position$\Comment{Computed from POS + CIGAR}
+  \State $same\_ref \gets 1$
+  \State $left\_count \gets 0$\Comment{Count of number 
of records at leftmost pos}
+  \State $j \gets i$
+  \Repeat
+    \If{$leftmost > rec_j.position$}
+      \State $leftmost \gets rec_j.position$
+      \State $left\_count \gets 1$
+    \ElsIf{$leftmost = rec_j.position$}
+      \State $left\_count \gets left\_count + 1$
+    \EndIf
+    \If{$rightmost < rec_j.end\_position$}
+      \State $rightmost \gets rec_j.end\_position$
+    \EndIf
+    \If{$rec_i.ref \ne rec_j.ref$}
+      \State $same\_ref \gets 0$
+    \EndIf
+    \If{$rec_j.next\_frag \ne undefined$}
+      \State $j \gets j + 1 + rec_j.next\_frag$
+    \Else
+      \State $j \gets i$
+    \EndIf
+  \Until{$j = i$}
+  \Return $(leftmost,\ rightmost, \ same\_ref)$
+\EndFunction
+\end{algorithmic}
+
+\begin{algorithmic}[1]
+\Procedure{ResolveName}{rnum}
+  \State $i \gets rnum$
+  \If{$rec_i.read\_name = empty$}
+    \If{$rec_i.cram\_flag \bitand 4$}
+      \State $j \gets i + 1 + rec_i.next\_frag$
+      \State $rec_i.read\_name \gets rec_j.read\_name$
+    \EndIf
+  \EndIf
+  \If{$rec_i.read\_name = empty$}
+    \State $rec_i.read\_name \gets$ \Call{GenerateName}{}
+  \EndIf
+\EndProcedure
+\end{algorithmic}
+
+\section{\textbf{Reference sequences}}
+
+CRAM format is natively based upon usage of reference sequences even though in
+some cases they are not required. In contrast to BAM format CRAM format has strict
+rules about reference sequences.
+
+\begin{enumerate}
+\item M5 (sequence MD5 checksum) field of @SQ sequence record in the BAM header is
+required and UR (URI for the sequence fasta optionally gzipped file) field is strongly
+advised. The rule for calculating MD5 is to remove any non-base symbols (like \textbackslash{}n,
+sequence name or length and spaces) and upper case the rest. 
Here are some examples:
+
+\texttt{> samtools faidx human\_g1k\_v37.fasta 1 \textbar{} grep -v '\textasciicircum{}>' \textbar{} tr -d '\textbackslash{}n' \textbar{} tr a-z A-Z \textbar{} md5sum -\\
+1b22b98cdeb4a9304cb5d48026a85128 -}
+
+\texttt{> samtools faidx human\_g1k\_v37.fasta 1:10-20 \textbar{}grep -v '\textasciicircum{}\texttt{>}' \textbar{}tr -d '\textbackslash{}n' \textbar{}tr a-z A-Z \textbar{}md5sum -\\
+0f2a4865e3952676ffad2c3671f14057 -}
+
+Please note that the latter calculates the checksum for 11 bases from position
+10 (inclusive) to 20 (inclusive) and the bases are counted 1-based, so the first
+base position is 1.
+
+\item All CRAM reader implementations are expected to check for reference MD5 checksums
+and report any missing or mismatching entries. Consequently, all writer implementations
+are expected to ensure that all checksums are injected or checked during compression
+time.
+
+\item In some cases reads may be mapped beyond the reference sequence. All out of
+range reference bases are assumed to be `N'.
+
+\item MD5 checksum bytes in slice header should be ignored for unmapped or multiref
+slices.
+\end{enumerate}
+
+\section{\textbf{Indexing}}
+
+\subsubsection*{General notes}
+
+Indexing is only valid on coordinate (reference ID and then leftmost position) sorted files.
+
+Please note that CRAM indexing is external to the file format itself and may change
+independently of the file format specification in the future. For example, a new
+type of index file may appear.
+
+Individual records are not indexed in CRAM files; slices should be used instead
+as a unit of random access. Another important difference between CRAM and BAM indexing
+is that CRAM container header and compression header block (first block in container)
+must always be read before decoding a slice. Therefore two read operations are
+required for random access in CRAM. 
+ +Indexing a CRAM file is deemed to be a lightweight operation because it usually does not require any CRAM records to be read. +Indexing information can be obtained from container headers, namely sequence id, alignment start and span, container start byte offset and slice byte offset inside the container (landmarks). +The exception to this is with multi-reference containers, where the ``RI'' data series must be read. + +\subsubsection*{CRAM index} + +A CRAM index is a gzipped tab delimited file containing the following columns: + +\begin{enumerate} +\item Reference sequence id + +\item Alignment start (ignored on read for unmapped slices, set to 0 on write) + +\item Alignment span (ignored on read for unmapped slices, set to 0 on write) + +\item Absolute byte offset of Container header in the file. + +\item Relative byte offset of the Slice header block, from the end of +the container header. This is the same as the ``landmark'' field +in the container header. + +\item Slice size in bytes (including slice header and all blocks). +\end{enumerate} + +Each line represents a slice in the CRAM file. +Please note that all slices must be listed in the index file. + +Multi-reference slices may need to have multiple lines for the same slice; one for each reference contained within that slice. +In this case the index reference sequence ID will be the actual reference ID (from the ``RI'' data series) and not -2. + +Slices containing solely unmapped unplaced data (reference ID -1) still require values for all columns, although the alignment start and span will be ignored. +It is recommended that they are both set to zero. + +To illustrate this the absolute and relative offsets used in a three slice container are shown in the diagram below. 
+ +\begin{center} +\begin{tikzpicture}[ + boxed/.style={rectangle, draw=black, fill=black!10, minimum height=1.3cm, text width=1.8cm}, +] +\node(A) [boxed]{Container Header}; +\node(B) [boxed,right,text width=2.2cm] at (A.east){Compression Header Block}; +\node(C) [boxed,right] at (B.east){\quad Slice 1}; +\node(D) [boxed,right] at (C.east){\quad Slice 2}; +\node(E) [boxed,right] at (D.east){\quad Slice 3}; + +\draw[<-] (A.south west)+(1pt,-0.1cm) -- +(1pt,-1cm) + node[below, text width=1.5cm]{Container absolute offset}; + + +\draw[|<->|] ([yshift=-0.2cm]B.south west) node[below,xshift=1.2cm]{Slice offset 1} + -- ([yshift=-0.2cm]B.south east); +\draw[|<->|] ([yshift=-1.2cm]B.south west) node[below,xshift=1.2cm]{Slice offset 2} + -- ([yshift=-1.2cm]C.south east); +\draw[|<->|] ([yshift=-2.2cm]B.south west) node[below,xshift=1.2cm]{Slice offset 3} + -- ([yshift=-2.2cm]D.south east); + +\draw[|<->|] ([yshift=+0.2cm]C.north west) node[above,xshift=1cm]{Size 1} -- ([yshift=+0.2cm]C.north east); +\draw[|<->|] ([yshift=+0.2cm]D.north west) node[above,xshift=1cm]{Size 2} -- ([yshift=+0.2cm]D.north east); +\draw[|<->|] ([yshift=+0.2cm]E.north west) node[above,xshift=1cm]{Size 3} -- ([yshift=+0.2cm]E.north east); + + +\end{tikzpicture} +\end{center} + +\subsubsection*{BAM index} + +BAM indexes are supported by using 4-byte integer pointers called landmarks that +are stored in container header. BAM index pointer is a 64-bit value with 48 bits +reserved for the BAM block start position and 16 bits reserved for the in-block +offset. When used to index CRAM files, the first 48 bits are used to store the +CRAM container start position and the last 16 bits are used to store the index +of the landmark in the landmark array stored in container header. The landmark +index can be used to access the appropriate slice. + +The above indexing scheme treats CRAM slices as individual records in BAM file. 
+This allows BAM indexing to be applied to CRAM files; however, it introduces some
+overhead when seeking a specific alignment start because all preceding records in
+the slice must be read and discarded.
+
+\section{\textbf{Encodings}}
+\label{sec:encodings}
+
+% FIXME: we have a mishmash of coding, encoding and codec. We should
+% go through the entire document and be consistent.
+
+\subsection{\textbf{Introduction}}
+
+The basic idea for codings is to efficiently represent some values in binary format.
+This can be achieved in a number of ways that most frequently involve some knowledge
+about the nature of the values being encoded, for example, distribution statistics.
+The methods for choosing the best encoding and determining its parameters are very
+diverse and are not part of the CRAM format specification, which only describes
+how the information needed to decode the values should be stored.
+
+Note two of the encodings (Golomb and Golomb-Rice) are listed as deprecated.
+These are still formally part of the CRAM specification, but have not been used by the primary implementations and may not be well supported.
+Therefore their use is permitted, but not recommended.
+
+\subsubsection*{Offset}
+
+Many of the codings listed below encode positive integer numbers. An integer offset
+value is used to allow any integer numbers, and not just positive ones, to be encoded.
+It can also be used for monotonically decreasing distributions with the maximum
+not equal to zero. For example, given an offset of 10 and a value to be encoded
+of 1, the actual encoded value would be offset+value=11. Then when decoding,
+the offset would be subtracted from the decoded value.
+
+\subsection{EXTERNAL: codec ID 1}
+
+Can encode types \textit{Byte}, \textit{Integer}.
+
+The EXTERNAL coding is simply storage of data verbatim to an external block with a given ID. 
+If the type is \textit{Byte} the data is stored as-is, otherwise for \textit{Integer} type the data is stored in uint7 or sint7 depending on the sign of the data series. + +\subsubsection*{Parameters} + +CRAM format defines the following parameters of EXTERNAL coding: + +\begin{tabular}{|>{\raggedright}p{100pt}|>{\raggedright}p{100pt}|>{\raggedright}p{230pt}|} +\hline +\textbf{Data type} & \textbf{Name} & \textbf{Comment} +\tabularnewline +\hline +uint7 & external id & id of an external block containing the byte stream\tabularnewline +\hline +\end{tabular} + +\subsection{Huffman coding: codec ID 3} + +Can encode types \textit{Byte}, \textit{Integer}. + +Huffman coding replaces symbols (values to encode) by binary codewords, with common symbols having shorter codewords such that the total message of binary codewords is shorter than using uniform binary codeword lengths. +The general process consists of the following steps. + +\begin{itemize} +\item Obtain symbol code lengths. +\begin{itemize} +\item If encoding:\\ +- Compute symbol frequencies.\\ +- Compute code lengths from frequencies. +\item If decoding:\\ +- Read code lengths from codec parameters. +\end{itemize} + +\item Compute canonical Huffman codewords from code lengths\footnote{\url{https://en.wikipedia.org/wiki/Canonical_Huffman_code}}. + +\item Encode or decode bits as per the symbol to codeword table. +Codewords have the ``prefix property'' that no codeword is a prefix of another codeword, enabling unambiguous decode bit by bit. +\end{itemize} + +The use of canonical Huffman codes means that we only need to store the code lengths and use the same algorithm in both encoder and decoder to generate the codewords. +This is achieved by ensuring our symbol alphabet has a natural sort order and codewords are assigned in numerical order. 
+ +\textbf{Important note: for alphabets with only one value, the codeword will be zero bits long.} +This makes the Huffman codec an efficient mechanism for specifying constant values. + +\subsubsection*{Canonical code computation} + +\begin{enumerate} +\item Sort the alphabet ascending using bit-lengths and then using numerical order +of the values. + +\item The first symbol in the list gets assigned a codeword which is the same length +as the symbol's original codeword but all zeros. This will often be a single zero +('0'). + +\item Each subsequent symbol is assigned the next binary number in sequence, ensuring +that following codes are always higher in value. + +\item When you reach a longer codeword, then after incrementing, append zeros until +the length of the new codeword is equal to the length of the old codeword. +\end{enumerate} + +\subsubsection*{Examples} + +\begin{tabular}{|>{\raggedright}p{105pt}|>{\raggedright}p{105pt}|>{\raggedright}p{105pt}|} +\hline +\textbf{Symbol} & \textbf{Code length} & \textbf{Codeword}\tabularnewline +\hline +A & 1 & 0\tabularnewline +\hline +B & 3 & 100\tabularnewline +C & 3 & 101\tabularnewline +D & 3 & 110\tabularnewline +\hline +E & 4 & 1110\tabularnewline +F & 4 & 1111\tabularnewline +\hline +\end{tabular} + +\subsubsection*{Parameters} + +\begin{tabular}{|>{\raggedright}p{100pt}|>{\raggedright}p{100pt}|>{\raggedright}p{230pt}|} +\hline +\textbf{Data type} & \textbf{Name} & \textbf{Comment} +\tabularnewline +\hline +array & alphabet & list of all encoded symbols (values)\tabularnewline +\hline +array & bit-lengths & array of bit-lengths for each symbol in the alphabet\tabularnewline +\hline +\end{tabular} + +\subsection{Byte array coding} + +Often there is a need to encode an array of bytes where the length is not predetermined. +For example the read identifiers differ per alignment record, possibly with different lengths, and this length must be stored somewhere. 
+There are two choices available: storing the length explicitly (BYTE\_ARRAY\_LEN) or continuing to read bytes until a termination value is seen (BYTE\_ARRAY\_STOP). + +Note in contrast to this, quality values are known to be the same length as the sequence which is an already known quantity, so this does not need to be encoded using the byte array codecs. + +\subsubsection*{BYTE\_ARRAY\_LEN: codec ID 4} + +Can encode types \textit{Byte[]}. + +Byte arrays are captured length-first, meaning that the length of every array element is written using an additional encoding. +For example this could be a HUFFMAN encoding or another EXTERNAL block. +The length is decoded first followed by the data, followed by the next length and data, and so on. + +This encoding can therefore be considered as a nested encoding, with each pair of nested encodings containing their own set of parameters. +The byte stream for parameters of the BYTE\_ARRAY\_LEN encoding is therefore the concatenation of the length and value encoding parameters as described in section~\ref{subsec:writing-bytes}. + +The parameter for BYTE\_ARRAY\_LEN are listed below: + +\begin{tabular}{|>{\raggedright}p{100pt}|>{\raggedright}p{100pt}|>{\raggedright}p{230pt}|} +\hline +\textbf{Data type} & \textbf{Name} & \textbf{Comment} +\tabularnewline +\hline +encoding\texttt{<}int\texttt{>} & lengths encoding & an encoding describing how +the arrays lengths are captured\tabularnewline +\hline +encoding\texttt{<}byte\texttt{>} & values encoding & an encoding describing how +the values are captured\tabularnewline +\hline +\end{tabular} + +For example, the bytes specifying a BYTE\_ARRAY\_LEN encoding, including the codec and parameters, for a 16-bit X0 auxiliary tag (``X0C'') may use HUFFMAN encoding to specify the length (always 2 bytes) and an EXTERNAL encoding to store the value to an external block with ID 200. 
+
+\begin{tabular}{lll}
+\hline
+\textbf{Bytes} & & \textbf{Meaning}\\
+\hline
+\texttt{0x04} & & BYTE\_ARRAY\_LEN codec ID \\
+\texttt{0x0a} & & 10 remaining bytes of BYTE\_ARRAY\_LEN parameters \\
+\\
+\texttt{0x03} & & HUFFMAN codec ID, for aux tag lengths \\
+\texttt{0x04} & & 4 more bytes of HUFFMAN parameters \\
+\texttt{0x01} & & Alphabet array size = 1 \\
+\texttt{0x02} & & alphabet symbol; (length = 2) \\
+\texttt{0x01} & & Codeword array size = 1 \\
+\texttt{0x00} & & Code length = 0 (zero bits needed as alphabet is size 1) \\
+\\
+\texttt{0x01} & & EXTERNAL codec ID, for aux tag values \\
+\texttt{0x02} & & 2 more bytes of EXTERNAL parameters \\
+\texttt{0x81 0x48} & & uint7 encoding for block ID 200 \\
+\hline
+\end{tabular}
+
+\subsubsection*{BYTE\_ARRAY\_STOP: codec ID 5}
+
+Can encode types \textit{Byte[]}.
+
+Byte arrays are captured as a sequence of bytes terminated by a special stop byte.
+The data returned does not include the stop byte itself.
+In contrast to BYTE\_ARRAY\_LEN the value is always encoded with EXTERNAL so the parameter is an external id instead of another encoding.
+
+\begin{tabular}{|>{\raggedright}p{100pt}|>{\raggedright}p{100pt}|>{\raggedright}p{230pt}|}
+\hline
+\textbf{Data type} & \textbf{Name} & \textbf{Comment}
+\tabularnewline
+\hline
+byte & stop byte & a special byte treated as a delimiter\tabularnewline
+\hline
+uint7 & external id & id of an external block containing the byte stream\tabularnewline
+\hline
+\end{tabular}
+
+% \subsection{Beta coding: codec ID 6}
+%
+% Can encode types \textit{Integer}.
+%
+% \subsubsection*{Definition}
+%
+% Beta coding is a most common way to represent numbers in \emph{binary notation} and is sometimes referred to as binary coding.
+% The decoder reads the specified fixed number of bits (most significant first) and subtracts the offset value to get the decoded integer. 
+% +% \subsubsection*{Parameters} +% +% CRAM format defines the following parameters of beta coding: +% +% \begin{tabular}{|>{\raggedright}p{144pt}|>{\raggedright}p{144pt}|>{\raggedright}p{144pt}|} +% \hline +% \textbf{Data type} & \textbf{Name} & \textbf{Comment}\tabularnewline +% \hline +% itf8 & offset & offset is subtracted from each value during decode\tabularnewline +% \hline +% itf8 & length & the number of bits used\tabularnewline +% \hline +% \end{tabular} +% +% \subsubsection*{Examples} +% +% If we have integer values in the range 10 to 15 inclusive, the largest value would traditionally need 4 bits, but with an offset of -10 we can hold values 0 to 5, using a fixed size of 3 bits. +% Using fixed Offset and Length coming from the beta parameters, we decode these values as: +% +% \begin{tabular}{|>{\raggedright}p{105pt}|>{\raggedright}p{105pt}|>{\raggedright}p{105pt}|>{\raggedright}p{105pt}|} +% \hline +% Offset & Length & \textbf{Bits} & \textbf{Value}\tabularnewline +% \hline +% -10 & 3 & 000 & 10\tabularnewline +% \hline +% -10 & 3 & 001 & 11\tabularnewline +% \hline +% -10 & 3 & 010 & 12\tabularnewline +% \hline +% -10 & 3 & 011 & 13\tabularnewline +% \hline +% -10 & 3 & 100 & 14\tabularnewline +% \hline +% -10 & 3 & 101 & 15\tabularnewline +% \hline +% \end{tabular} +% +% \subsection{Subexponential coding: codec ID 7} +% +% Can encode types \textit{Integer}. +% +% \subsubsection*{Definition} +% +% Subexponential coding\footnote{Fast progressive lossless image compression, Paul G. Howard and Jeffrey Scott Vitter, 1994. \url{http://www.ittc.ku.edu/~jsv/Papers/HoV94.progressive_FELICS.pdf}} is parametrized by a non-negative integer $k$. +% For values $n < 2^{k+1}$ subexponential coding produces codewords identical to Rice coding \footnote{\url{https://en.wikipedia.org/wiki/Golomb_coding\#Rice_coding}}. For larger values it grows logarithmically with $n$. +% +% \subsubsection*{Encoding} +% +% \begin{enumerate} +% \item Add $\mathit{offset}$ to $n$. 
+% +% \item Determine $u$ and $b$ values from $n$ +% \begin{align*} +% b = +% \begin{cases} +% \ k & \text{ if $n < 2^k$} \\ +% \ \lfloor log_{2}n \rfloor & \text{ if $n \ge 2^k$} +% \end{cases} +% &\ +% &u = +% \begin{cases} +% \ 0 & \text{ if $n < 2^k$} \\ +% \ b-k+1 & \text{ if $n \ge 2^k$} +% \end{cases} +% \end{align*} +% +% \item Write $u$ in unary form; $u$ 1 bits followed by a single 0 bit. +% +% \item Write the bottom $b$-bits of $n$ in binary form. +% \end{enumerate} +% +% \subsubsection*{Decoding} +% +% \begin{enumerate} +% \item Read $u$ in unary form, counting the number of leading 1s (prefix) in the codeword (discard the trailing 0 bit). +% +% \item Determine $n$ via: +% \begin{enumerate} +% \item if $u = 0$ then read $n$ as a $k$-bit binary number. +% \item if $u \ge 1$ then read $x$ as a $(u + k - 1)$-bit binary. Let $n = 2^{u+k-1} + x$. +% \end{enumerate} +% +% \item Subtract $\mathit{offset}$ from $n$. +% \end{enumerate} +% +% \subsubsection*{Examples} +% +% \begin{tabular}{|>{\raggedright}p{105pt}|>{\raggedright}p{105pt}|>{\raggedright}p{105pt}|>{\raggedright}p{105pt}|} +% \hline +% \textbf{Number} & \textbf{Codeword, k=0} & \textbf{Codeword, k=1} & \textbf{Codeword, +% k=2}\tabularnewline +% \hline +% 0 & 0 & 00 & 000\tabularnewline +% \hline +% 1 & 10 & 01 & 001\tabularnewline +% \hline +% 2 & 1100 & 100 & 010\tabularnewline +% \hline +% 3 & 1101 & 101 & 011\tabularnewline +% \hline +% 4 & 111000 & 11000 & 1000\tabularnewline +% \hline +% 5 & 111001 & 11001 & 1001\tabularnewline +% \hline +% 6 & 111010 & 11010 & 1010\tabularnewline +% \hline +% 7 & 111011 & 11011 & 1011\tabularnewline +% \hline +% 8 & 11110000 & 1110000 & 110000\tabularnewline +% \hline +% 9 & 11110001 & 1110001 & 110001\tabularnewline +% \hline +% 10 & 11110010 & 1110010 & 110010\tabularnewline +% \hline +% \end{tabular} +% +% \subsubsection*{Parameters} +% +% \begin{tabular}{|>{\raggedright}p{100pt}|>{\raggedright}p{100pt}|>{\raggedright}p{230pt}|} +% \hline +% \textbf{Data 
type} & \textbf{Name} & \textbf{Comment} +% \tabularnewline +% \hline +% itf8 & offset & offset is subtracted from each value during decode\tabularnewline +% \hline +% itf8 & k & the order of the subexponential coding\tabularnewline +% \hline +% \end{tabular} +% +% \subsection{Gamma coding: codec ID 9} +% +% Can encode types \textit{Integer}. +% +% \subsubsection*{Definition} +% +% \emph{Elias gamma code} is a prefix encoding of positive integers. This is a combination +% of unary coding and beta coding. The first is used to capture the number of bits +% required for beta coding to capture the value. +% +% \subsubsection*{Encoding} +% +% \begin{enumerate} +% \item Write it in binary. +% +% \item Subtract $1$ from the number of bits written in step 1 and prepend that many zeros. +% +% \item An equivalent way to express the same process: +% +% \item Separate the integer into the highest power of $2$ it contains ($2N$) and the remaining +% $N$ binary digits of the integer. +% +% \item Encode $N$ in unary; that is, as $N$ zeroes followed by a one. +% +% \item Append the remaining $N$ binary digits to this representation of $N$. +% \end{enumerate} +% +% \subsubsection*{Decoding} +% +% \begin{enumerate} +% \item Read and count 0s from the stream until you reach the first 1. Call this count +% of zeroes $N$. +% +% \item Considering the one that was reached to be the first digit of the integer, with +% a value of $2N$, read the remaining $N$ digits of the integer. 
+% \end{enumerate} +% +% \subsubsection*{Examples} +% +% \begin{tabular}{|>{\raggedright}p{76pt}|>{\raggedright}p{107pt}|} +% \hline +% \textbf{Value} & \textbf{Codeword}\tabularnewline +% \hline +% 1 & 1\tabularnewline +% \hline +% 2 & 010\tabularnewline +% \hline +% 3 & 011\tabularnewline +% \hline +% 4 & 00100\tabularnewline +% \hline +% \end{tabular} +% +% \subsubsection*{Parameters} +% +% \begin{tabular}{|>{\raggedright}p{144pt}|>{\raggedright}p{144pt}|>{\raggedright}p{144pt}|} +% \hline +% \textbf{Data type} & \textbf{Name} & \textbf{Comment}\tabularnewline +% \hline +% itf8 & offset & offset to subtract from each value after decode\tabularnewline +% \hline +% \end{tabular} +% +% \subsection{DEPRECATED: Golomb coding: codec ID 2} +% +% Can encode types \textit{Integer}. +% +% Note this codec has not been used in any known CRAM implementation since before CRAM v1.0. +% Nor is it implemented in some of the major software. +% Therefore its use is not recommended. +% +% \subsubsection*{Definition} +% +% \emph{Golomb encoding} is a prefix encoding optimal for representation of random +% positive numbers following geometric distribution. +% +% \subsubsection*{Encoding} +% +% \begin{enumerate} +% \item Fix the parameter $M$ to an integer value. +% +% \item For $N$, the number to be encoded, find +% +% \begin{enumerate} +% \item quotient $q = \lfloor N/M \rfloor$ +% +% \item remainder $r = N \bmod M$ +% \end{enumerate} +% +% \item Generate Codeword +% +% \begin{enumerate} +% \item The Code format : \texttt{<}Quotient Code\texttt{>}\texttt{<}Remainder Code\texttt{>}, +% where +% +% \item Quotient Code (in unary coding) +% +% \begin{enumerate} +% \item Write a $q$-length string of 1 bits +% +% \item Write a 0 bit +% \end{enumerate} +% +% \item Remainder Code (in truncated binary encoding) +% +% Set $b=\lceil log_{2}(M) \rceil$ +% +% \begin{enumerate} +% \item If $r < 2^{b}-M$ code $r$ as plain binary using $b-1$ bits. 
+% +% \item If $r \ge 2^{b}-M$ code the number $r+2^{b}-M$ in plain binary representation +% using $b$ bits. +% \end{enumerate} +% \end{enumerate} +% \end{enumerate} +% +% \subsubsection*{Decoding} +% +% \begin{enumerate} +% \item Read $q$ via unary coding: count the number of 1 bits and consume the following 0 bits. +% \item Set $b=\lceil log_{2}(M) \rceil$ +% \item Read $r$ via $b-1$ bits of binary coding +% \item If $r \ge 2^{b}-M$ +% \begin{enumerate} +% \item Read 1 single bit, $x$. +% \item Set $r = r*2 + x - (2^{b}-M)$ +% \end{enumerate} +% \item Value is $q*M + r - \mathit{offset}$ +% \end{enumerate} +% +% \subsubsection*{Examples} +% +% \begin{tabular}{|>{\raggedright}p{76pt}|>{\raggedright}p{107pt}|} +% \hline +% \textbf{Number} & \textbf{Codeword, M=10, (thus b=4)}\tabularnewline +% \hline +% 0 & 0000\tabularnewline +% \hline +% 4 & 0100\tabularnewline +% \hline +% 10 & 10000\tabularnewline +% \hline +% 26 & 1101100\tabularnewline +% \hline +% 42 & 11110010\tabularnewline +% \hline +% \end{tabular} +% +% \subsubsection*{Parameters} +% +% Golomb coding takes the following parameters: +% +% \begin{tabular}{|>{\raggedright}p{144pt}|>{\raggedright}p{144pt}|>{\raggedright}p{144pt}|} +% \hline +% \textbf{Data type} & \textbf{Name} & \textbf{Comment}\tabularnewline +% \hline +% itf8 & offset & offset is added to each value\tabularnewline +% \hline +% itf8 & M & the golomb parameter (number of bins)\tabularnewline +% \hline +% \end{tabular} +% +% \subsection{DEPRECATED: Golomb-Rice coding: codec ID 8} +% +% Can encode types \textit{Integer}. +% +% Note this codec has not been used in any known CRAM implementation since before CRAM v1.0. +% Nor is it implemented in some of the major software. +% Therefore its use is not recommended. +% +% Golomb-Rice coding is a special case of Golomb coding when the M parameter is a power of 2. 
+% The reason for this coding is that the division operations in Golomb coding can be replaced with bit shift operators as well as avoiding the extra $r < 2^{b}-M$ check. + +\section{\textbf{External compression methods}} + +External encoding operates on bytes only. Therefore any data series must be translated into bytes before sending data into an external block. +The following methods are defined. +Exact definitions of these methods are in their respective internet links or the ancillary \textit{CRAMcodecs} document found along side this specification. + +Integer values are written as ITF8, which then can be translated into an array +of bytes. + +Strings, like read name, are translated into bytes according to UTF8 rules. In +most cases these should coincide with ASCII, making the translation trivial. + +Each method has an associated numeric code which is defined in Section~\ref{sec:block-struct}. + +\subsection{\textbf{Gzip}} + +The Gzip specification is defined in RFC 1952. +Gzip in turn is an encapsulation on the Deflate algorithm defined in RFC 1951. + +\subsection{\textbf{Bzip2}} + +First available in CRAM v2.0. + +Bzip2 is a compression method utilising the Burrows Wheeler Transform, Move To Front transform, Run Length Encoding and a Huffman entropy encoder. +It is often superior to Gzip for textual data. + +An informal format specification exists:\\ +\url{https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf} + +\subsection{\textbf{LZMA}} + +First available in CRAM v3.0. + +LZMA is the Lempel-Ziv Markov chain algorithm. +CRAM uses the xz Stream format to encapsulate this algorithm, as defined in \url{https://tukaani.org/xz/xz-file-format.txt}. + +\subsection{\textbf{rANS4x8 codec}} + +First available in CRAM v3.0. + +rANS is the range-coder variant of the Asymmetric Numerical +System\footnote{J. 
Duda, \textit{Asymmetric numeral systems: entropy
+  coding combining speed of Huffman coding with compression rate of
+  arithmetic coding}, \url{http://arxiv.org/abs/1311.2540}}.
+
+``4x8'' refers to 4-way interleaving with 8-bit renormalisation.\newline
+This variant of rANS first appeared in CRAM v3.0.
+
+Details of this algorithm have been moved to the \textit{CRAMcodecs} document.
+
+\subsection{\textbf{rANS4x16 codec}}
+
+First available in CRAM v3.1.
+
+``4x16'' refers to 4-way interleaving with 16-bit renormalisation.\newline
+This variant of rANS first appeared in CRAM v3.1.
+
+Details of this algorithm are listed in the \textit{CRAMcodecs} document.
+
+\subsection{\textbf{adaptive arithmetic coding}}
+
+First available in CRAM v3.1.
+
+An entropy encoder that is slower but slightly more concise than
+rANS. It achieves this by adapting the probabilities as it compresses
+and decompresses instead of using a fixed table.
+
+Details of this algorithm are listed in the \textit{CRAMcodecs} document.
+
+\subsection{\textbf{fqzcomp codec}}
+
+First available in CRAM v3.1.
+
+This is a method dedicated to compression of quality values.
+
+Details of this algorithm are listed in the \textit{CRAMcodecs} document.
+
+\subsection{\textbf{name tokeniser}}
+
+First available in CRAM v3.1.
+
+This is a method dedicated to compression of read names.
+
+Details of this algorithm are listed in the \textit{CRAMcodecs} document.
+
+\section*{Appendix 1: Choosing the container size}
+
+The CRAM format does not constrain the size of the containers.
+However, the following should be considered when deciding the container size:
+
+-- Data can be compressed better by using larger containers.
+
+-- Random access performance is better for smaller containers.
+
+-- Streaming is more convenient for small containers.
+
+-- Applications typically buffer containers into memory.
+
+-- Multi-threaded applications likely have a granularity of 1
+container for each unit of work.
+
+As guidance, the default container size for htslib and htsjdk is one
+slice of 10,000 short reads (fewer for long reads), but some users
+find 1,000 is more appropriate if they need a lot of random access.
+
+\section*{Appendix 2: CRAM 4.0 changes and rationale}
+
+CRAM 4.0 has a number of changes. For those familiar with the version
+3.0 format, this is a list of changes along with a rationale for
+making them.
+
+\begin{description}
+\item[New variable sized integer encoding]\ \newline
+  Use VLQ (\url{https://en.wikipedia.org/wiki/Variable-length_quantity})
+  instead of ITF8 and LTF8.
+
+  This has the impact of not needing to distinguish between 32-bit and
+  64-bit quantities, which we previously needed to do with ITF8 vs
+  LTF8. It also gives a small reduction in compressed sizes.
+
+
+\item[Long chromosome support]\ \newline
+  A corollary of the removal of ITF8 and LTF8 means all fields
+  can now be 64-bit if appropriate. Importantly this removes the
+  32-bit limit on the size of AP (alignment pos), TS (template size),
+  NP (next mate pos).
+
+
+\item[Signed numbers]\ \newline
+  Variable sized integer encoding can now also do signed numbers.
+  Previously -1 was stored as FF FF FF 0F (ITF8) and even more FFs
+  for LTF8. This is unwieldy and also bakes in knowledge of the
+  size of the intended data type, making it impossible to increase
+  data type sizes without changing the format.
+
+  Signed fields are used for AP (alignment pos), TS (template size)
+  and RG (read group, which can be -1 for not present).
+
+
+\item[MD, NM presence and location]\ \newline
+  CRAM derives MD and NM tags on-the-fly where possible. However it
+  doesn't record whether they were in the original data.
+
+  The presence of MD and NM is now recorded, so if the input data didn't
+  have them we won't reproduce them on decode, and vice versa. The
+  location of MD, NM and RG in the tag stream is now also recorded,
+  meaning CRAM 4.0 will round-trip more precisely.
+
+
+\item[Quality value orientation]\ \newline
+  On some technologies it is better to record qualities in their
+  original orientation rather than the alignment orientation as this
+  improves compression ratios.
+
+  This is an optional setting in the compression header. The data
+  should be automatically flipped back during decode so the operation
+  is transparent to the user.
+
+
+\item[New CF explicit template size flag]\ \newline
+  In CRAM RNEXT, PNEXT and TLEN are generated on-the-fly for read
+  pairs residing in the same slice. If we wish to preserve data
+  verbatim for exceptions, such as TLEN off-by-one errors (common in
+  old data) or TLEN 5' to 5' vs leftmost to rightmost differences then
+  we had to mark the record as ``detached'' and store all 3 fields.
+
+  We now have a flag to state that TLEN is explicitly stored without
+  needing to use the CF detached flag. This can have a substantial
+  impact on file sizes for some older data sets.
+
+
+\item[Deduplication of read names]\ \newline
+  For read pairs residing in the same slice we can use the same logic
+  we previously used for RNEXT, PNEXT and TLEN generation to also
+  deduplicate read names so each name only needs storing once.
+
+  This roughly halves the size of the uncompressed RN data series,
+  reducing the compressed size.
+
+{\color{gray}
+\item[(Planned) Removal of CORE block and bit-encodings]\ \newline
+  The bit-based mechanisms are now predominantly unused in htslib and
+  htsjdk for CRAM 3.0 as separating into external blocks offers
+  greater flexibility for partial decoding.
+
+  The sole use of HUFFMAN in htslib is to specify constant values,
+  which by definition have zero length codes (and consume no
+  additional space). Hence HUFFMAN is replaced by a new CONSTANT
+  encoding schema.
+}
+
+{\color{gray}
+\item[(For consideration) Removal of slices]\ \newline
+  CRAM 3.0 offers the ability to have multiple slices per container.
+ + This is largely unused and does not provide the originally planned + benefits of sharing compression meta-data between slices. The + defaults from both htslib and htsjdk are one slice per container. + + Removal of multi-slice containers, and by extension slices + themselves, offers a simplified interface. +} + +\item[TODO: add PACK, RLE and DELTA encodings]\ \newline + +\item[TODO: sanitize read\_names\_included]\ \newline + Given we now state a blank name will copy from the mate pair and + auto-generate if not, there's no need for this field. We can just + store blank names and cull the whole RN in DecodeMateData vs in-line + shenanigans. + +\end{description} + +\end{document} diff --git a/Makefile b/Makefile index c53aa6d74..0b4e01f7b 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,7 @@ PDFS = BCFv1_qref.pdf \ BCFv2_qref.pdf \ CRAMv2.1.pdf \ CRAMv3.pdf \ + CRAMv4.pdf \ crypt4gh.pdf \ CSIv1.pdf \ SAMv1.pdf \ @@ -21,6 +22,7 @@ pdf: $(PDFS:%=new/%) new/CRAMv2.1.pdf diff/CRAMv2.1.pdf: CRAMv2.1.tex new/CRAMv2.1.ver new/CRAMv3.pdf diff/CRAMv3.pdf: CRAMv3.tex new/CRAMv3.ver +new/CRAMv4.pdf diff/CRAMv4.pdf: CRAMv4.tex new/CRAMv4.ver new/crypt4gh.pdf diff/crypt4gh.pdf: crypt4gh.tex new/crypt4gh.ver new/SAMv1.pdf diff/SAMv1.pdf: SAMv1.tex new/SAMv1.ver new/SAMtags.pdf diff/SAMtags.pdf: SAMtags.tex new/SAMtags.ver From 289c70e2e384e306330c52ab08ffc25e81949ad0 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 11 Mar 2020 14:41:37 +0000 Subject: [PATCH 2/7] Further CRAMv4.0 updates This includes a massive reorganisation of the document, which is long overdue. It also now replaces EXTERNAL with a variety of type specific codecs for signed, unsigned and bytes; ditches HUFFMAN (along with all other bit codecs) and adds CONST codecs instead; removes the CORE block; removes "external" for block descriptions, instead favouring data blocks as a way to distinguish from header / structure containing blocks. As yet, nothing implements this variant of V4.0! 
However it's far cleaner to describe and hopefully far simpler to implement due to the aggressive culling of largely unused features. --- CRAMv4.tex | 1144 ++++++++++++++++++++++++++-------------------------- 1 file changed, 564 insertions(+), 580 deletions(-) diff --git a/CRAMv4.tex b/CRAMv4.tex index d6a8702bf..0d3e944c0 100644 --- a/CRAMv4.tex +++ b/CRAMv4.tex @@ -91,7 +91,10 @@ \end{center} \vspace*{1em} -\section{\textbf{Overview}} +\tableofcontents +\newpage + +\section{Overview} This specification describes the CRAM 4.0 format. @@ -112,115 +115,52 @@ \section{\textbf{Overview}} objective supports the exploration of different lossy compression strategies and provides a framework in which to effect these choices. Please note that the CRAM format does not impose any rules about what data should or should not be preserved. -Instead, CRAM supports a wide range of lossless and lossy data preservation strategies -enabling users to choose which data should be preserved. - -Data in CRAM is aggregated by data type (analogous to SAM columns) and -stored in blocks with each block compressed using a choice of general -purpose or custom compression codecs. Sequence is typically encoded -relative to a reference sequence\footnote{Markus Hsi-Yang Fritz, -Rasko Leinonen, Guy Cochrane, and Ewan Birney, -\textbf{Efficient storage of high throughput DNA sequencing data using reference-based compression}, -{\sl Genome Res.}~2011~21: 734--740; -\href{http://dx.doi.org/doi:10.1101/gr.114819.110}{doi:10.1101/gr.114819.110}; -{\sc pmid:}21245279.}, but this is not a requirement. Both -aligned and unaligned sequence is supported. - -\section{\textbf{Data types and formats}} -The CRAM file format uses a combination of fixed sized and variable -sized integer types, which may be signed or unsigned. Additionally -arrays of any types may be used. 
+Data in CRAM is aggregated by data type (analogous to SAM columns) +known as Data Series and stored in blocks with each block compressed +using a choice of general purpose or custom compression codecs. +Sequence is typically encoded relative to a reference +sequence\footnote{Markus Hsi-Yang Fritz, Rasko Leinonen, Guy Cochrane, + and Ewan Birney, \textbf{Efficient storage of high throughput DNA + sequencing data using reference-based compression}, {\sl Genome + Res.}~2011~21: 734--740; + \href{http://dx.doi.org/doi:10.1101/gr.114819.110}{doi:10.1101/gr.114819.110}; + {\sc pmid:}21245279.}, but this is not a requirement. Both aligned +and unaligned sequence is supported. + +\section{Data types and formats} +\subsection*{Structures} + +The CRAM format consists of header structures (container, compression, slice and block) and data series stored within the blocks themselves. +The fields of these structures use a mixture of fundamental types - boolean, byte, integer or array thereof - which are defined below. + +% Types used in structures. +% Container: uint, sint, uint[], byte[4](CRC), byte[] +% Block: byte, uint, byte[], byte[4](CRC) +% CompHdr: map, bool, byte[5], uint, byte[], encoding<> +% Slice: sint, uint, uint[], byte[16], byte[] -We distinguish between abstract types and their on-disk formats, which -are described in more detail in the next section. 
- -\begin{tabular}{llll} -\textbf{Name} & \textbf{Type} & \textbf{Format}& \textbf{Description}\\ -\hline -\\ -\textbf{Bool} & \textit{bool} & \textit{bool} & A boolean, 0 or 1 \\ -\\ -\textbf{Byte} & \textit{byte} & \textit{byte} & A single byte (8 bits) \\ -\\ -\textbf{Integer} & \textit{} & \textit{int32} & A 32-bit integer \textcolor{gray}{FIXME: cull need for this}\\ - & \textit{} & \textit{int64} & A 64-bit integer \textcolor{gray}{FIXME: cull need for this}\\ - & \textit{sint} & \textit{int7} & A variable length signed integer \\ - & \textit{uint} & \textit{uint7} & A variable length unsigned integer \\ -\\ -\textbf{Array (explicit)} - & \textit{array} & \textit{array} & An array preceded by an explicit length\\ -\\ -\textbf{Array (implicit)} - & \textit{[ ]} & \textit{fmt[]} & Zero or more items with unspecified length\\ - & \textit{[ ]} & \textit{fmt[4]} & A constant number of items\\ -\\ -\textbf{Encoding}& \textit{encoding} & \textit{encoding} & A description of how and - where a data series is stored\\ +\begin{tabular}{lll} +\textbf{Field type} & \textbf{Encoding} & \textbf{Description}\\ +\textit{bool} & BYTE & A boolean, 0 (false) or 1 (true), stored as a byte. \\ +\textit{byte} & BYTE & An unsigned single byte (8 bits) \\ +\textit{uint} & VARINT\_UNSIGNED & A variable length integer $x >= 0$\\ +\textit{sint} & VARINT\_SIGNED & A variable length integer (may be negative)\\ +\textit{array\texttt{<}type\texttt{>}} & ARRAY & An array of items of \textit{type} including the dimension\\ \\ -\textbf{Map} & \textit{} & \textit{map} & A collection of key-value pairs\\ +\textit{type[]} & \textit{as appropriate} & Zero or more items with unspecified length\\ +\textit{type[4]} & \textit{as appropriate} & A constant number of items, dimension not stored\\ \end{tabular} - \vskip 10pt -The \textit{types} are used to describe the contents of the data series. -These are agnostic of magnitude or implementation, but do define the sign of integer values. 
-The \textit{formats} are used in the fixed structures and the \textit{encoding} definitions and do take into account data sizes and/or sign. - -For example, the \textbf{BF} data series has a value data type of \textit{encoding}. -If the encoding is EXTERNAL then the values will be stored in \textit{uint7} format as specified in the EXTERNAL encoding section. - -{ -\color{gray} -FIXME: is this an overly complex way of defining things? Maybe we should define EXTERNAL\_SINT, EXTERNAL\_UINT and EXTERNAL\_BYTE? That makes it explicit and removes one level of indirection. This is infact how my code already works. The EXTERNAL decode initialisation specialises the decode function by data type: - -\begin{verbatim} -cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr, - char *data, int size, - enum cram_external_type option, - int version, varint_vec *vv) { - //... - if (option == E_INT) - c->decode = cram_external_decode_int; - else if (option == E_SINT) - c->decode = cram_external_decode_sint; - else if (option == E_LONG) - c->decode = cram_external_decode_long; - else if (option == E_SLONG) - c->decode = cram_external_decode_slong; - else if (option == E_BYTE_ARRAY || option == E_BYTE) - c->decode = cram_external_decode_char; - else - c->decode = cram_external_decode_block; - //... -} -\end{verbatim} -} - -\label{subsec:writing-bytes} - -CRAM uses \emph{little endianness} for bytes unless specified -otherwise. A more detailed description of the each type and format follows. +The on-disk format for each of these encodings is listed below: \begin{description} +\item[BYTE]\ \newline +A single byte -\item[Boolean (bool)]\ \newline -Boolean is written as 1-byte with 0x0 being `false' and 0x1 being `true'. - -\item[Byte (byte)]\ \newline -A single unsigned byte. - -\item[Signed integer (int32)]\ \newline -Signed 32-bit integer, written as 4 bytes in little-endian byte order. 
- -\item[Signed long integer (int64)]\ \newline -Signed 64-bit integer, written as 8 bytes in little-endian byte order. - -\item[Unsigned variable sized integer (uint7)]\ \newline -An unsigned integer of unspecified size, stored 7-bits at a time. - -The on-disk size being proportional to the magnitude of the integer -value encode and is known as Variable Length +\item[VARINT\_UNSIGNED]\ \newline +Integer values are encoded using Variable Length Quantity\footnote{\url{https://en.wikipedia.org/wiki/Variable-length_quantity}}. If the value is larger than 7-bits then the top-bit of the byte is @@ -244,18 +184,9 @@ \section{\textbf{Data types and formats}} \EndFunction \end{algorithmic} - -\item[Signed variable sized integer (sint7)]\ \newline -This is a signed version of uint7 above. Attempting to store a -negative value with the uint7 method would lead to the signed value -being interpreted as a very large unsigned quantity, consuming many -bytes. (This effect is seen in ITF8 and LTF8 in CRAM 3.0.) - -Instead we map signed values to unsigned ones keeping their -magnitudes proportional by using zig-zag encoding, and then encode -that value using uint7. This maps signed values 0, -1, +1, -2, +2 to -unsigned values 0, 1, 2, 3, 4 respectively, and vice versa during -decode. +\item[VARINT\_SIGNED]\ \newline +Signed integers are initially transformed to unsigned values using a zig-zag transformation, and then encoded as per unsigned values. +This maps signed values 0, -1, +1, -2, +2 to unsigned values 0, 1, 2, 3, 4 respectively, and vice versa during decode. The zig-zag encoding method involves shifting the value left 1 bit and then XORing with all-bits-zero if positive or all-bits-one if @@ -275,131 +206,54 @@ \section{\textbf{Data types and formats}} \EndFunction \end{algorithmic} -\item[{array}]\ \newline -Used for arrays where the size must be specified as part of the array itself. 
-The length is written first as an integer (uint7), followed by the elements of the -array which are encoded as per their individual type-specific format -(usually byte or uint7). - -{\color{gray} -FIXME: arrays like this are used so rarely we could just replace \textit{array} with \textit{uint7 nlandmarks} followed by \textit{uint7[nlandmarks]}. -} - -\item[{fmt[ ]}] -\item[{fmt[len]}]\ \newline -Used for arrays of data where the length is either unspecified or -known up-front. In neither case is the length part of the byte stream. - -This is used within the specification for fixed sized items such as -the 4 byte magic number or as a marker for an unspecified block of -bytes such as \textit{byte[*] blocks}. - -\item[{Encoding}]\ \newline -{\color{gray} -TODO: Move me? -} -Encoding is a data type that specifies how data series have been compressed. Encodings -are defined as encoding\texttt{<}type\texttt{>} where the type is a logical data -type as opposed to a storage data type. - -An encoding is written as follows. The first integer (uint7) denotes the codec id -and the second integer (uint7) the number of bytes in the following encoding-specific -values. - -Subexponential encoding example: - -\begin{tabular}{|l|l|l|} -\hline -\textbf{Value} & \textbf{Type} & \textbf{Name}\tabularnewline -\hline -0x7 & uint7 & codec id\tabularnewline -\hline -0x2 & uint7 & number of bytes to follow\tabularnewline -\hline -0x0 & uint7 & offset\tabularnewline -\hline -0x1 & uint7 & K parameter\tabularnewline -\hline -\end{tabular} - -The first byte ``0x7'' is the codec id. - -The next byte ``0x2'' denotes the length of the bytes to follow (2). - -The subexponential encoding has 2 parameters: integer (itf8) offset and integer (itf8) K. - -offset = 0x0 = 0 - -K = 0x1 = 1 - - -\item[{Map}]\ \newline -A map is a collection of keys and associated values. A map with N keys is written -as follows: - -\begin{tabular}{|l|l|l|l|l|l|l|l|} -\hline -size in bytes & N & key 1 & value 1 & key ... 
& value ... & key N & value N\tabularnewline -\hline -\end{tabular} - -Both the size in bytes and the number of keys are written as integer (uint7). Keys -and values are written according to their data types and are specific to each map. +\item[ARRAY]\ \newline +An array of items with the array dimension explicitly stored. +The array dimen is first, in VARINT\_UNSIGNED encoding, followed by the array elements stored as per their \textit{type}. \end{description} +Other custom data types exist, but will be introduced in the relevant sections. -\section{\textbf{Encodings }} +\subsection*{Data series} -Encoding is a data structure that captures information about compression details -of a data series that are required to uncompress it. This could be -the definition of a constant, a series of data transformations, or the -block content id for an external block. Some encodings may be nested, -such as \texttt{BYTE\_ARRAY\_LEN} which defines one sub-encoding for the -length and another for the bytes. - -Encoding notation is defined as the keyword `encoding' followed by its data type -in angular brackets, for example `encoding\texttt{<}byte\texttt{>}' stands for -an encoding that operates on a data series of data type `byte'. +Each data series in CRAM has a designated type - boolean, byte, integer or array thereof - but unlike the structures which explicitly state sign and encoding, the encoder has choice of how these types are stored. +This type encoding meta-data is held in the compression header structure, once per data series. +Hence the decoder has baked-in knowledge of the fundamental type of each data series (e.g. \textit{byte} vs \textit{int}) but the description of how that data is stored, whether it is constant or variable and whether it may be negative, is part of the file itself. -The encodings themselves are stored in the compression header block -(\ref{subsec:compression-header}). 
+Integers are stored as a variable length quantity, with two distinct encodings for unsigned and unsigned values (a departure from CRAM 3.0 and earlier). +The latter may still store positive numbers, but permits a mixture of positives and negatives in the same block. -Each encoding type has its own set of parameters listed below. +A full description of the encoding methods can be found later.(FIXME add ref) +Where \textit{N/A} is listed in the Encoding column below this data type only occurs within a structure and so has no encoding associated with it. -\begin{tabular}{|l|l|>{\raggedright}p{155pt}|>{\raggedright}p{160pt}|} -\hline -\textbf{Codec} & \textbf{ID} & \textbf{Parameters} & \textbf{Comment}\tabularnewline -\hline -NULL & 0 & none & series not preserved\tabularnewline -\hline -EXTERNAL & 1 & int block content id & the block content identifier used to associate -external data blocks with data series\tabularnewline -\hline -HUFFMAN & 3 & array\texttt{<}int\texttt{>} values, array\texttt{<}int\texttt{>} lengths & \textcolor{gray}{FIXME: replace with CONSTANT.} coding with int/byte values\tabularnewline -\hline -BYTE\_ARRAY\_LEN & 4 & encoding\texttt{<}int\texttt{>} array length, encoding\texttt{<}byte\texttt{>} -bytes & coding of byte arrays with array length\tabularnewline -\hline -BYTE\_ARRAY\_STOP & 5 & byte stop, int external block\linebreak{} -content id & coding of byte arrays with a stop value \tabularnewline +\begin{tabular}{lll} +\textbf{Data Type} & \textbf{Encoding} & \textbf{Description}\\ \hline +\\ +\textit{byte} & BYTE & An unsigned single byte (8 bits) \\ + & CONST\_BYTE & A fixed byte value\\ +\\ +\textit{byte[]}& BYTE\_ARRAY\_LEN & Zero or more items with unspecified length\\ +\textit{byte[]}& BYTE\_ARRAY\_STOP& Zero or more items with unspecified length\\ +\\ +\textit{int} & VARINT\_UNSIGNED & A variable length integer $x >= 0$\\ + & VARINT\_SIGNED & A variable length integer (may be negative)\\ + & CONST\_INT & A fixed integer value 
(signed)\\ \end{tabular} -See section~\ref{sec:encodings} for more detailed descriptions of all the above coding algorithms and their parameters. - -\section{\textbf{Checksums}} -The checksumming is used to ensure data integrity. The following checksumming algorithms are used in CRAM. -\subsection{\textbf{CRC32}} -This is a cyclic redundancy checksum 32-bit long with the polynomial 0x04C11DB7. Please refer to \href{http://www.itu.int/rec/recommendation.asp?type=folders&lang=e&parent=T-REC-V.42}{ITU-T V.42} for more details. The value of the CRC32 hash function is written as an integer. +\vskip 10pt -\section{\textbf{File structure}} +For example, the reference ID (\textbf{RI}) data series has a value data type of \textit{encoding}. +If all alignments in tihs slice are mapped ($RI >= 0$) then the encoding may be VARINT\_UNSIGNED. +If some data is unmapped ($RI = -1$) then the encoding will be VARINT\_SIGNED. -The overall CRAM file structure is described in this section. Please refer to other -sections of this document for more detailed information. +\section{File Layout} -A CRAM file consists of a fixed length file definition, followed by a CRAM header container, -then one or more data containers, and finally a special end-of-file container. +The basic structure of a CRAM file is a file header (magic number) +followed by a series of containers. The first of these containers +holds textual meta-data (CRAM Header Container) and the last is used +as an end-of-file marker (CRAM EOF Container) with everyone inbetween +holding the sequence records themselves (Data Contaners). \begin{center} \begin{tikzpicture}[ @@ -420,8 +274,11 @@ \section{\textbf{File structure}} Figure 1: A CRAM file consists of a file definition, followed by a header container, then other containers. \end{center} -Containers consist of one or more blocks. 
The first container, called the CRAM header container, -is used to store a textual header as described in the SAM specification (see the section 7.1). +Containers are just a Container Header Structure followed by one or +more Blocks. Blocks have different types (see +~\ref{subsec:block-content-types}) corresponding to their usage. +Note the block itself also consists of a defined structure followed by +the block contents data. \begin{center} \begin{tikzpicture}[ @@ -448,17 +305,26 @@ \section{\textbf{File structure}} }; \draw (file.one split south) to (header.north west); \draw (file.two split south) to (header.north east); + +\node (blocks) [boxes=2,below=1 of header.one south,text width=6em] { +\nodepart{one}Block Header structure +\nodepart{two}Block data +}; +\draw (header.south west) to (blocks.north west); +\draw (header.two split south) to (blocks.north east); \end{tikzpicture} Figure 2: The the first container holds the CRAM header text. \end{center} -Each container starts with a container header structure followed by -one or more blocks. -The first block in each container is the compression header block -giving details of how to decode data in subsequent blocks. -Each block starts with a block header structure followed by the block -data. +The first container, called the CRAM header container, is used to +store a textual header as described in the SAM specification (see the +section 7.1). This is optionally followed by another uncompressed +block of nul padding bytes. The purpose of this additional block is +to permit in-situ modification of the CRAM header without the +requirement to rewrite the entire file (instead needing just this +container to be rewritten, provided it can be padded to an identical +size). 
\begin{center} \begin{tikzpicture}[ @@ -484,23 +350,15 @@ \section{\textbf{File structure}} }; \draw (file.two split south) to (container.north west); \draw (file.three split south) to (container.north east); - -\node (blocks) [boxes=2,below=1 of container.two south,text width=6em] { -\nodepart{one}Block Header structure -\nodepart{two}Block data -}; -\draw (container.one split south) to (blocks.north west); -\draw (container.two split south) to (blocks.north east); \end{tikzpicture} Figure 3: Containers as a series of blocks \end{center} -The blocks after the compression header are organised logically into slices. One -slice may contain, for example, a contiguous region of alignment data. Slices begin -with a slice header block and are followed by one or more data blocks. -It is these data blocks which hold the primary bulk of CRAM data. -The data blocks are further subdivided into a core data block and one or more external data blocks. +The Data Containers hold the sequence records themselves. +These start with a Compression Header block which holds meta-data +describing how and where the Data Series are encoded within this +container. \begin{center} \begin{tikzpicture}[ @@ -539,12 +397,11 @@ \section{\textbf{File structure}} (container.six split south) to (container.south east); \node [below=0.2 of container.eight south] {Slice N}; -\node (slice) [boxes=5,below=1 of container.four south, text width=6.5em] { +\node (slice) [boxes=4,below=1 of container.four south, text width=6.5em] { \nodepart{one}Slice Header Block -\nodepart{two}Core Data Block -\nodepart{three}External Data Block 1 -\nodepart{four}... -\nodepart{five}External Data Block M +\nodepart{two}External Data Block +\nodepart{three}... 
+\nodepart{four}External Data Block }; \draw (container.two split south) to (slice.north west); \draw (container.five split south) to (slice.north east); @@ -553,7 +410,16 @@ \section{\textbf{File structure}} Figure 4: Slices formed from a series of concatenated blocks \end{center} -\section{\textbf{File definition}} +The blocks after the compression header are organised logically into slices. One +slice may contain, for example, a contiguous region of alignment data. Slices begin +with a slice header block and are followed by one or more data blocks. +It is these data blocks which hold the primary bulk of CRAM data, the +Data Series. + +Note many CRAM files will just use one slice per contaner as this is +the default output format of several major implementations. + +\subsection{File definition} Each CRAM file starts with a fixed length (26 bytes) definition with the following fields: @@ -590,12 +456,12 @@ \section{\textbf{File definition}} improvements for unsorted data. \item[\textit{3.1}] -Additional EXTERNAL compression codecs only. +Additional block compression codecs only. \item[\textit{4.0}] This specification. Revised variable sized integers, MD/NM/RG tag locators, and deduplication of read names. Removed some encodings. -See end for a more detailed list of changes. +See \ref{sec:cram4changes} for a more detailed list of changes. \end {itemize} CRAM 3.0 and 3.1 differ only in the list of compression @@ -603,7 +469,7 @@ \section{\textbf{File definition}} codecs should write the header to indicate 3.0 in order to permit maximum compatibility. 
-\section{\textbf{Container header structure}} +\subsection{Container header structure} The file definition is followed by one or more containers with the following header structure where the container content is stored in the `blocks' field: @@ -613,10 +479,10 @@ \section{\textbf{Container header structure}} \textbf{Data type} & \textbf{Name} & \textbf{Value} \tabularnewline \hline -uint32 & length & \textcolor{gray}{FIXME: make uint7} the sum of the lengths of all blocks in this container (headers and data); +uint7 & length & the sum of the lengths of all blocks in this container (headers and data); equal to the total byte length of the container minus the byte length of this header structure\tabularnewline \hline -int7\textcolor{gray}{FIXME} & reference sequence id & reference sequence identifier or\linebreak{} +int7 & reference sequence id & reference sequence identifier or\linebreak{} -1 for unmapped reads\linebreak{} -2 for multiple reference sequences.\linebreak{} All slices in this container must have a reference sequence id matching this value.\tabularnewline @@ -649,20 +515,11 @@ \section{\textbf{Container header structure}} \hline \end{tabular} -\subsection{\textbf{CRAM header container}} - -The first container in a CRAM file contains a textual header in a single block, optionally -gzip compressed. This text header currently matches the SAM header specification. Only -gzip is allowed as compression method for this block. The CRAM header container does not -include a compression header block. +\subsubsection*{CRC32} +This is a cyclic redundancy checksum 32-bit long with the polynomial 0x04C11DB7. Please refer to \href{http://www.itu.int/rec/recommendation.asp?type=folders&lang=e&parent=T-REC-V.42}{ITU-T V.42} for more details. The value of the CRC32 hash function is written as an integer. 
-It is recommended to reserve 50\% more space in the CRAM header container than -is required for the SAM header text by optionally padding the container with a second -raw block consisting of all zeroes. This can be used to subsequently expand the header -container in place, such as when updating @SQ records, while preserving the absolute -offsets of all subsequent containers. -\section{\textbf{Block structure}} +\subsection{Block structure} \label{sec:block-struct} Containers consist of one or more blocks. Block compression is applied independently @@ -688,17 +545,14 @@ \section{\textbf{Block structure}} \hline byte & block content type id & the block content type identifier\tabularnewline \hline -uint7 & block content id & the block content identifier used to associate external +uint7 & block content id & the block content identifier used to associate data blocks with data series\tabularnewline \hline uint7 & size in bytes* & size of the block data after applying block compression\tabularnewline \hline uint7 & raw size in bytes* & size of the block data before applying block compression\tabularnewline \hline -byte[ ] & block data & the data stored in the block:\linebreak{} --- bit stream of CRAM records (core data block)\linebreak{} --- byte stream (external data block)\linebreak{} --- additional fields ( header blocks)\tabularnewline +byte[ ] & block data & the data stored in the block\tabularnewline \hline uint32 & CRC32 & CRC32 hash value for all preceding bytes in the block\tabularnewline \hline @@ -706,7 +560,8 @@ \section{\textbf{Block structure}} * Note on raw method: both compressed and raw sizes must be set to the same value. 
-\subsection{\textbf{Block content types}} +\subsubsection*{Block content types} +\label{subsec:block-content-types} CRAM has the following block content types: @@ -723,39 +578,25 @@ \subsection{\textbf{Block content types}} \hline & 3 & & reserved\tabularnewline \hline -EXTERNAL\_DATA & 4 & external data block & data produced by external encodings\tabularnewline -\hline -CORE\_DATA & 5 & core data block & bit stream of all encodings except for external encodings\tabularnewline +EXTERNAL\_DATA & 4 & data block & data produced by encodings\tabularnewline \hline \end{tabular} \end{threeparttable} -\subsection{\textbf{Block content id}} +\subsubsection*{Block content id} -Block content id is used to distinguish between external blocks in the same slice. -Each external encoding has an id parameter which must be one of the external block -content ids. For external blocks the content id is a positive integer. For all -other blocks content id should be 0. Consequently, all external encodings must +Block content id is used to distinguish between data blocks in the same slice. +Each encoding has an id parameter which must be one of the block +content ids. For data blocks the content id is a positive integer. For all +other blocks content id should be 0. Consequently, all data encodings must not use content id less than 1. -\subsubsection*{Data blocks} - -Data is stored in data blocks. There are two types of data blocks: core data blocks -and external data blocks.The difference between core and external data blocks is -that core data blocks consist of data series that are compressed using bit encodings -while the external data blocks are byte compressed. One core data block and any -number of external data blocks are associated with each slice. - -Writing to and reading from core and external data blocks is organised through -CRAM records. Each data series is associated with an encoding. 
In case of external -encodings the block content id is used to identify the block where the data series -is stored. Please note that external blocks can have multiple data series associated -with them; in this case the values from these data series will be interleaved. - +\subsection{CRAM header container} -\subsection{\textbf{CRAM header block}} - -The SAM header is stored in a single block within the first container. +The first container in a CRAM file contains a textual header in a single block, optionally +gzip compressed. This text header currently matches the SAM header specification. Only +gzip is allowed as compression method for this block. The CRAM header container does not +include a compression header block. The following constraints apply to the SAM header: @@ -764,11 +605,17 @@ \subsection{\textbf{CRAM header block}} into the file. \end{itemize} -\subsection{\textbf{Compression header block}} +It is recommended to reserve 50\% more space in the CRAM header container than +is required for the SAM header text by optionally padding the container with a second +raw block consisting of all zeroes. This can be used to subsequently expand the header +container in place, such as when updating @SQ records, while preserving the absolute +offsets of all subsequent containers. + +\subsection{Compression header block} \label{subsec:compression-header} The compression header block consists of 3 parts: preservation map, data series -encoding map and tag encoding map. +encoding map and tag encoding map. See below for the data format of a map. These are meta-data on what is stored in the following slices, how it is encoded, and in which blocks the raw byte streams from these encodings reside. @@ -785,7 +632,41 @@ \subsection{\textbf{Compression header block}} \hline \end{tabular} -\subsubsection*{Preservation map} +\subsubsection{Map structure} + +% FIXME: is this better defined as a sub-structure in the relevant section itself? +% Possibly... 
+A \textit{Map} is a collection of keys and associated values. + +Both the size in bytes and the number of keys are written as integer (uint7). Keys +and values are written according to their data types and are specific to each map. +Keys have a fixed size for all items in a map, but this size is not +the same for all map types. The order of keys is not defined. Values +are stored immediately after their key in a key-defined format. Some +keys will have simple boolean values, some are fixed size, while +others have a more complex sub-structures. + +The example below is for a two byte key map. + +\begin{tabular}{|l|>{\raggedright}p{120pt}|>{\raggedright}p{260pt}|} +\hline +\textbf{Data type} & \textbf{Name} & \textbf{Value} +\tabularnewline +\hline +uint7 & size & the remaining size in bytes of this map structure\tabularnewline +\hline +uint7 & num\_keys & the number of key-value pairs in this map\tabularnewline +\hline +byte[2] & key & first key\tabularnewline +\hline +byte[] & value & first value, with a key-specific size (see below).\tabularnewline +\hline +... & ... & \textit{repeated num\_keys time}\tabularnewline +\hline +\end{tabular} + + +\subsubsection{Preservation map} The preservation map contains information about which data was preserved in the CRAM file. It is stored as a map with byte[2] keys: @@ -803,7 +684,7 @@ \subsubsection*{Preservation map} \hline SM & byte[5] & substitution matrix & substitution matrix\tabularnewline \hline -TD & array & tag ids dictionary & a list of lists of tag ids, see tag encoding +TD & array\texttt{<}byte\texttt{<>} & tag ids dictionary & a list of lists of tag ids, see tag encoding section\tabularnewline \hline QO & bool & qual orientation & true if quality values are in the same orientation as sequence. 
If false quality values are recorded in the orientation as produced by the sequencing instrument, which may also still match sequence orientation.\tabularnewline @@ -812,9 +693,175 @@ \subsubsection*{Preservation map} The boolean values are optional, defaulting to true when absent, although it is recommended to explicitly set them. SM and TD are mandatory. -\subsubsection*{Data series encoding map} +\subsubsection{Encoding structure} + +The next map utilises another custom data type, \textit{encoding\texttt{<}type\texttt{>}}. +This is used to describe the data encapsulation format for a specific data series; how and where the values are stored. This could be +the definition of a constant, a series of data transformations, or the +block content id for a data block. Some encodings may be nested, +such as \texttt{BYTE\_ARRAY\_LEN} which defines one sub-encoding for the +length and another for the bytes. + +Encoding notation is defined as the keyword `encoding' followed by its data type +in angular brackets, for example `encoding\texttt{<}byte\texttt{>}' stands for +an encoding that operates on a data series of data type `byte'. + +The \textit{encoding} format consists of an encoding type and type specific meta-data stored as an array (dimension followed by the array elements). 
+ +\begin{tabular}{|l|>{\raggedright}p{120pt}|>{\raggedright}p{260pt}|} +\hline +\textbf{Data type} & \textbf{Name} & \textbf{Value} +\tabularnewline +\hline +uint7 & encoding\_type & encoding type\tabularnewline +\hline +array\texttt{<}byte\texttt{>} & encoding\_meta & type specific meta-data for this encoding, serialised as a byte stream\tabularnewline +\hline +\end{tabular} + +The list of recognised encoding\_type values is: + +\begin{tabular}{|l|l|>{\raggedright}p{155pt}|>{\raggedright}p{160pt}|} +\hline +\textbf{Encoding Type} & \textbf{ID} & \textbf{Parameters} & \textbf{Comment}\tabularnewline +\hline +NULL & 0 & none & series not preserved\tabularnewline +\hline +BYTE\_ARRAY\_LEN & 4 & encoding\texttt{<}int\texttt{>} array length, encoding\texttt{<}byte\texttt{>} +bytes & coding of byte arrays with array length\tabularnewline +\hline +BYTE\_ARRAY\_STOP & 5 & byte stop, int block id\linebreak{} +content id & coding of byte arrays with a stop value \tabularnewline +\hline +BYTE & 40 & uint block content id & the block content identifier used to associate +data blocks with data series\tabularnewline +\hline +VARINT\_UNSIGNED & 41 & sint offset, uint block content id & the block content identifier used to associate data blocks with data series\tabularnewline +\hline +VARINT\_SIGNED & 42 & sint offset, uint block content id & the block content identifier used to associate data blocks with data series\tabularnewline +\hline +CONST\_BYTE & 43 & byte value & the constant byte value\tabularnewline +\hline +CONST\_INT & 44 & sint value & the constant signed integer value\tabularnewline +\hline +\end{tabular} + +See section~\ref{sec:encodings} for more detailed descriptions of all +the above coding algorithms and their parameters. Note the encoding +ID values here have been chosen to not clash with CRAM 3.0 and earlier +with the exception of the BYTE\_ARRAY\_LEN and BYTE\_ARRAY\_STOP +encodings which have not changed. 
+For example, the byte stream for an encoding\texttt{<}int\texttt{>}
+ +\begin{center} +\begin{tikzpicture}[thick] + +\usetikzlibrary{shapes, shadows, positioning, arrows, decorations.markings, arrows.meta} + +\pgfdeclarelayer{background} +\pgfsetlayers{background,main} + +\tikzstyle{dsbox} = [blockbox, text width=6em, minimum width=9em, minimum height=3em, rounded corners, drop shadow] +\tikzstyle{blockbox}=[draw, fill=white, text width=4.0em, text centered, minimum height=1.0em, drop shadow] +\tikzstyle{encodedblock} = [blockbox, text width=6em, minimum width=8em, minimum height=3em, drop shadow] +\tikzstyle{encodings} = [dsbox, minimum width=6em, thick] + +\tikzstyle{texto} = [above, text width=8em, text centered] + +\tikzstyle{vecArrow} = [thick, decoration={markings,mark=at position + 1 with {\arrow[semithick]{Triangle[open, length=3.1mm, width=5mm]}}}, + double distance=5pt, shorten >= 8.5pt, + preaction = {decorate}, + postaction = {draw,line width=5pt, white,shorten >= 8pt}] +\tikzstyle{innerWhite} = [semithick, white,line width=5pt, shorten >= 8pt] + +\newcommand{\cramRecord}[6]{% +\begin{pgfonlayer}{background} + \path (#1.west |- #1.north) + (-0.5, 0.5) node (a1) {}; + \path (#1.east |- #6.south) + (+0.5,-0.25) node (a2) {}; + \path[fill=brown!10,rounded corners, draw=black!50, dashed] + (a1) rectangle (a2); + \path (a1.east |- a1.south) + (1.0,-0.3) node[texto] + {\scriptsize\textbf{Data Series}}; +\end{pgfonlayer}} + +\newcommand{\dsbox}[2]{node (p#1) [dsbox] +{\scriptsize{#2}}} +\newcommand{\encodedblock}[2]{node (p#1) [encodedblock] + {\scriptsize{#2}}} +\newcommand{\encodedblocklarge}[2]{node (p#1) [encodedblocklarge] + {\scriptsize{#2}}} + +\path +(-2.5,-1.5) \dsbox{1}{\begin{tabular}{c} Bit Flags \\ BF \end{tabular}}; +\path (p1.south)+(0.0,-1.0) \dsbox{2}{\begin{tabular}{c} Alignment Start\\ AP\quad\quad \end{tabular}}; +\path (p2.south)+(0.0,-1.0) \dsbox{3}{\begin{tabular}{c} Read Length \\ RL \end{tabular}}; +\path (p3.south)+(0.0,-1.0) \dsbox{4}{\begin{tabular}{c} Bases \\ ... 
\end{tabular}}; +\path (p4.south)+(0.0,-1.0) \dsbox{5}{\begin{tabular}{c} Quality Scores \\ QS \end{tabular}}; +\path (p5.south)+(0.0,-1.0) \dsbox{6}{\begin{tabular}{c} Tags \\ ... \end{tabular}}; + +\cramRecord{p1}{p2}{p3}{p4}{p5}{p6} + +\newcommand{\blockStreams}[4]{% +\begin{pgfonlayer}{background} + \path (#1.west |- #1.north)+(-0.5, 0.5) node (a3) {}; + \path (#4.east |- #4.south)+(+0.5, -0.25) node (a4) {}; + \path[fill=yellow!10, rounded corners, draw=black!50, dashed] + (a3) rectangle (a4); + \path (a3.east |- a3.south)+(1.0,-0.3) node[texto]{\scriptsize\textbf{Byte Streams}}; +\end{pgfonlayer}} + +\path (a1.south) + (12.0, -2.3) \encodedblock {7} {DataBlock}; +\path (p7.south) + (0.0, -1.0) \encodedblock {8} {Data Block}; +\path (p8.south) + (0.0, -1.0) \encodedblock {9} {Data Block}; +\path (p9.south) + (0.0, -1.0) \encodedblock {10}{Data Block...}; -Each data series has an encoding. These encoding are stored in a map with byte[2] +\blockStreams {p7} {p8} {p9} {p10} + +\node (encodings) [encodings, right = 2 of p3] {\scriptsize Encodings}; +\node (enc_e1) [right=0.7 of encodings] {}; +\node (enc_e2) [right=1.05 of enc_e1] {}; +\node (enc_n) [above=1 of encodings.center] {}; +\draw[vecArrow] (p3.east) + (0.55, 0) to (encodings.west); +\draw[vecArrow] (encodings.east) to (enc_e2.west); +\draw[vecArrow] (enc_e1.east) |- (enc_n.north) to (encodings.north); +\draw[innerWhite] (encodings.east) to (enc_e2.west); + +\end{tikzpicture} + +Figure 5: The relationship between Data Series, Possibly nested Encodings, and Data Blocks. + +\end{center} + +The picture shows how a CRAM record (on the left) is distributed to +multiple Data Blocks via a series of encodings. Note the mapping from +Data Series to Blocks may be many to many, with the possibility of +multiple Data Series writing to the same Block and with a single +encoding such as BYTE\_ARRAY\_LEN potentially writing to two Blocks. + + +Each data series has an encoding. 
These encodings are stored in a map with byte[2] keys and are decoded in approximately this order\footnote{The precise order is defined in section~\ref{sec:record}.}: \begin{threeparttable}[t] @@ -826,7 +873,7 @@ \subsubsection*{Data series encoding map} \hline CF & encoding\texttt{<}uint\texttt{>} & CRAM bit flags & see specific section\tabularnewline \hline -RI & encoding\texttt{<}uint\texttt{>} & reference id & record reference id from +RI & encoding\texttt{<}int\texttt{>} & reference id & record reference id from the SAM file header\tabularnewline \hline RL & encoding\texttt{<}uint\texttt{>} & read lengths & read lengths\tabularnewline @@ -844,7 +891,7 @@ \subsubsection*{Data series encoding map} \hline MF & encoding\texttt{<}uint\texttt{>} & next mate bit flags & see specific section\tabularnewline \hline -NS & encoding\texttt{<}uint\texttt{>} & next fragment reference sequence id & reference +NS & encoding\texttt{<}int\texttt{>} & next fragment reference sequence id & reference sequence ids for the next fragment \tabularnewline \hline NP & encoding\texttt{<}uint\texttt{>} & next mate alignment start & alignment positions @@ -901,7 +948,7 @@ \subsubsection*{Data series encoding map} \end{tablenotes} \end{threeparttable} -\subsubsection*{Tag encoding map} +\subsubsection{Tag encoding map} \label{subsubsec:tags} The tag dictionary (TD) describes the unique combinations of tag id / type that occur on each alignment record. @@ -948,7 +995,7 @@ \subsubsection*{Tag values} type being captured in the tag key rather in the value. Hence consuming 1 byte for types `C' and `c', 2 bytes for types `S' and `s', 4 bytes for types `I', `i' and `f', and a variable number of bytes for types `H', `Z' and `B'. -\subsection{\textbf{Slice header block}} +\subsection{Slice header block} The slice header block is never compressed (block method=raw). 
For reference mapped reads the slice header also defines the reference sequence context of the data @@ -981,7 +1028,7 @@ \subsection{\textbf{Slice header block}} \hline \textbf{Data type} & \textbf{Name} & \textbf{Value}\tabularnewline \hline -int7\textcolor{gray}{FIXME} & reference sequence id & reference sequence identifier or\linebreak{} +int7 & reference sequence id & reference sequence identifier or\linebreak{} -1 for unmapped reads\linebreak{} -2 for multiple reference sequences.\linebreak{} This value must match that of its enclosing container.\tabularnewline @@ -1002,8 +1049,8 @@ \subsection{\textbf{Slice header block}} \hline uint7[ ] & block content ids & block content ids of the blocks in the slice\tabularnewline \hline -int7\textcolor{gray}{FIXME} & embedded reference bases block content id & block content id for the embedded -reference sequence bases or -1 \textcolor{gray}{FIXME: why not 0?} for none\tabularnewline +uint7 & embedded reference bases block content id & block content id for the embedded +reference sequence bases or 0 for none\tabularnewline \hline byte[16] & reference md5 & MD5 checksum of the reference bases within the slice boundaries. If this slice has reference sequence id of -1 (unmapped) or -2 (multi-ref) @@ -1049,121 +1096,7 @@ \subsection{\textbf{Slice header block}} % \end{tabular} -\subsection{\textbf{Core data block}} - -{\color{gray} -FIXME: candidate for culling, in order to simplify the format. -} -A core data block is a bit stream (most significant bit first) consisting of data from one -or more CRAM records. Please note that one byte could hold more then one CRAM record -as a minimal CRAM record could be just a few bits long. The core data block has -the following fields: - -\begin{tabular}{|l|>{\raggedright}p{120pt}|>{\raggedright}p{260pt}|} -\hline -\textbf{Data type} & \textbf{Name} & \textbf{Value} -\tabularnewline -\hline -bit[ ] & CRAM record 1 & The first CRAM record\tabularnewline -\hline -... & ... 
& ...\tabularnewline -\hline -bit[ ] & CRAM record N & The Nth CRAM record \tabularnewline -\hline -\end{tabular} - -\subsection{\textbf{External data blocks}} - -The relationship between the core data block and external data blocks is shown in the following -picture: - -\begin{center} -\begin{tikzpicture} - -\usetikzlibrary{shapes, shadows, positioning, arrows, decorations.markings} - -\pgfdeclarelayer{background} -\pgfsetlayers{background,main} - -\tikzstyle{dsbox} = [blockbox, text width=6em, minimum width=6em, minimum height=3em, rounded corners, drop shadow] -\tikzstyle{blockbox}=[draw, fill=white, text width=4.0em, text centered, minimum height=1.0em, drop shadow] -\tikzstyle{encodedblocklarge} = [blockbox, text width=6em, minimum width=8em, minimum height=4em, drop shadow] -\tikzstyle{encodedblock} = [blockbox, text width=6em, minimum width=8em, minimum height=3em, drop shadow] - -\tikzstyle{texto} = [above, text width=8em, text centered] - -\newcommand{\cramRecord}[6]{% -\begin{pgfonlayer}{background} - \path (#1.west |- #1.north) + (-0.5, 0.5) node (a1) {}; - \path (#1.east |- #6.south) + (+0.5,-0.25) node (a2) {}; - \path[fill=brown!10,rounded corners, draw=black!50, dashed] - (a1) rectangle (a2); - \path (a1.east |- a1.south) + (1.0,-0.3) node[texto] - {\scriptsize\textbf{CRAM Record}}; -\end{pgfonlayer}} - -\newcommand{\dsbox}[2]{node (p#1) [dsbox] -{\scriptsize{#2}}} -\newcommand{\encodedblock}[2]{node (p#1) [encodedblock] - {\scriptsize{#2}}} -\newcommand{\encodedblocklarge}[2]{node (p#1) [encodedblocklarge] - {\scriptsize{#2}}} - -\path +(-2.5,-1.5) \dsbox{1}{Bit Flags}; -\path (p1.south)+(0.0,-1.0) \dsbox{2}{Alignment Start}; -\path (p2.south)+(0.0,-1.0) \dsbox{3}{Read Length}; -\path (p3.south)+(0.0,-1.0) \dsbox{4}{Bases}; -\path (p4.south)+(0.0,-1.0) \dsbox{5}{Quality Scores}; -\path (p5.south)+(0.0,-1.0) \dsbox{6}{Tags...}; - -\cramRecord{p1}{p2}{p3}{p4}{p5}{p6} - -\newcommand{\blockStreams}[4]{% -\begin{pgfonlayer}{background} - \path (#1.west 
|- #1.north)+(-0.5, .7) node (a1) {}; - \path (#1.east |- #1.south)+(+0.5, -0.25) node (a2) {}; - \path (#2.west |- #2.north)+(-0.5, 0.5) node (a3) {}; - \path (#4.east |- #4.south)+(+0.5, -0.25) node (a4) {}; - \path[fill=yellow!10, rounded corners, draw=black!50, dashed] - (a1) rectangle (a2); - \path (a1.east |- a1.south)+(0.8,-0.3) node[texto] - {\scriptsize\textbf{Bit Stream}}; - \path[fill=yellow!10, rounded corners, draw=black!50, dashed] - (a3) rectangle (a4); - \path (a3.east |- a3.south)+(1.0,-0.3) node[texto]{\scriptsize\textbf{Byte Streams}}; -\end{pgfonlayer}} - -\path (a1.south) + (12.0, -2.3) \encodedblocklarge {7} {Core Data Block}; -\path (p7.south) + (0.0, -2.0) \encodedblock {8} {External Data Block}; -\path (p8.south) + (0.0, -1.0) \encodedblock {9} {External Data Block}; -\path (p9.south) + (0.0, -1.0) \encodedblock {10}{External Data Block...}; - -\blockStreams {p7} {p8} {p9} {p10} - -\node[minimum size=4cm, right = of p2, single arrow, draw, single arrow head indent=0ex, black] {Bit encodings (Huffman, ...) }; - -\node[minimum size=4cm, right = of p5, single arrow, draw, single arrow head indent=0ex, black] {Byte encodings (external, ...) }; - -\end{tikzpicture} - -Figure 5: The relationship between core and external encodings, and core and external data blocks. - -\end{center} - -{\color{gray} -FIXME: we may also wish to enforce one data series to one external -block (in which case the block IDs may as well also be fixed)? This -is a restriction, but also a simplification and matches current -practice (check Java)? -} -The picture shows how a CRAM record (on the left) is distributed between the core -data block and one or more external data blocks, via core or external encodings. The specific -encodings presented are only examples for purposes of illustration. 
The main point is -to distinguish between core bit encodings whose output is always stored in a core data -block, and external byte encodings whose output is always stored in external data -blocks. - -\section{\textbf{End of file container}} +\subsection{End of file container} A special container is used to mark the end of a file or stream. It is required in version 3 or later. The idea is to provide an easy and a quick way to detect that a CRAM file or stream is complete. @@ -1171,15 +1104,13 @@ \section{\textbf{End of file container}} It is recommended that implementations of CRAM validate EOF by checking these values rather than direct comparison of byte values, as these checks will be valid for all versions of CRAM -\section{\textbf{Record structure}} +\section{Record structure} \label{sec:record} CRAM record is based on the SAM record but has additional features allowing for -more efficient data storage. In contrast to BAM record CRAM record uses bits as -well as bytes for data storage. This way, for example, various coding techniques -which output variable length binary codes can be used directly in CRAM. On the -other hand, data series that do not require binary coding can be stored separately -in external blocks with some other compression applied to them independently. +more efficient data storage. In contrast to BAM record CRAM records +are separated by type of data into a series of data blocks. These +blocks can then use content type aware compression techniques. As CRAM data series may be interleaved within the same blocks\footnote{Interleaving can sometimes provide better compression, however it also adds dependency between types of data meaning it is not possible to selectively decode one data series if it co-locates with another data series in the same block.} understanding the order in which CRAM data series must be decoded is vital. 
@@ -1187,7 +1118,7 @@ \section{\textbf{Record structure}} \algnewcommand\algorithmicto{\text{ \textbf{to} }} -\subsection{\textbf{CRAM record}} +\subsection{CRAM record} Both mapped and unmapped reads start with the following fields. Please note that the data series type refers to the logical data type and the data series name corresponds @@ -1213,7 +1144,7 @@ \subsection{\textbf{CRAM record}} \hline \end{tabular} -\subsubsection*{\textbf{BAM bit flags (BF data series)}} +\subsubsection*{BAM bit flags (BF data series)} The following flags are duplicated from the SAM and BAM specification, with identical meaning. Note however some of these flags can be derived during decode, so may be omitted in the CRAM file and the bits computed based on both reads of a pair-end library residing within the same slice. @@ -1259,7 +1190,7 @@ \subsubsection*{\textbf{BAM bit flags (BF data series)}} \end{tablenotes} \end{threeparttable} -\subsubsection*{\textbf{CRAM bit flags (CF data series)}} +\subsubsection*{CRAM bit flags (CF data series)} The CRAM bit flags (also known as compression bit flags) expressed as an integer represent the CF data series. The following compression flags are defined for each CRAM read record: @@ -1288,7 +1219,7 @@ \subsubsection*{\textbf{CRAM bit flags (CF data series)}} The following pseudocode describes the general process of decoding an entire CRAM record. The sequence data itself is in one of two encoding formats depending on whether the record is aligned (mapped). 
-\subsubsection*{\textbf{Decode pseudocode}} +\subsubsection*{Decode pseudocode} \newlength{\maxwidth} \newcommand{\algalign}[2] % #1 = text to left, #2 = text to right {\makebox[\maxwidth][l]{$#1{}$}${}#2$} @@ -1312,7 +1243,7 @@ \subsubsection*{\textbf{Decode pseudocode}} \EndProcedure \end{algorithmic} -\subsection{\textbf{CRAM positional data}} +\subsection{CRAM positional data} \label{subsec:positions} Following the bit-wise BAM and CRAM flags, CRAM encodes positional related data including reference, alignment positions and length, and read-group. @@ -1385,7 +1316,7 @@ \subsection{Read names (RN data series)} \EndProcedure \end{algorithmic} -\subsection{\textbf{Mate record}} +\subsection{Mate record} \label{subsec:mate} There are two ways in which mate information can be preserved in CRAM: number of records downstream (distance, within this slice) to the next fragment in the template and a special mate record if the next fragment is not in the current slice. @@ -1442,9 +1373,7 @@ \subsubsection*{Next mate bit flags (MF data series)} \end{tabular} -\subsubsection*{\textbf{Decode mate pseudocode}} - -In the following pseudocode we are assuming the current record is $this$ and its mate is $next\_frag$. +\subsubsection*{Decode mate pseudocode} \begin{algorithmic}[1] \Procedure{DecodeMateData}{} @@ -1503,7 +1432,7 @@ \subsection{Auxiliary tags} \If{$tag(type) = ``*''$} \State $tag(name)\gets$ \Call{GenerateTag}{$name$} \Else - \State $tag(name)\gets$ \Call{ReadItem}{$ele$, Byte[]} + \State $tag(name)\gets$ \Call{ReadItem}{$ele$, Byte[]}\Comment{Type specific} \EndIf \EndFor \EndProcedure @@ -1523,16 +1452,16 @@ \subsection{Auxiliary tags} These may optionally be created on-the-fly during reference-based sequence reconstruction and should match the description provided in the SAMtags document. An encoder may decide to store these verbatim when no reference is used or where the automatically constructed values differ to the input data. 
-The tag type \texttt{*} is used in conjunction with MD, NM and RG as a flag to indicate that this tag was present but is not being stored verbatim. +The tag type \texttt{*} is used in conjunction with MD, NM and RG as a flag to indicate that this tag is present but is not being stored verbatim. (If MD is being stored verbatim, we would expect MDZ in the TD dictionary for this tag line.) -The \texttt{*} type is also useful for ensuring the ordering of tags does not change. +The \texttt{*} type is also useful for ensuring the ordering of tags can be preserved. Note it is permitted for decoders to choose to auto-generate MD and NM tags even in the absence of MD* and NM* entries. -\subsection{\textbf{Mapped reads}} +\subsection{Mapped reads} \label{subsec:mapped} -\subsubsection*{\textbf{Read feature records}} +\subsubsection*{Read feature records} \label{subsec:features} Read features are used to store read details that are expressed using read coordinates @@ -1713,7 +1642,7 @@ \subsubsection*{Decode mapped read pseudocode} \EndProcedure \end{algorithmic} -\subsection{\textbf{Unmapped reads}} +\subsection{Unmapped reads} \label{subsec:unmapped} The CRAM record structure for unmapped reads has the following additional fields: @@ -1873,7 +1802,7 @@ \subsection{Resolving inter-record dependencies} \EndProcedure \end{algorithmic} -\section{\textbf{Reference sequences}} +\section{Reference sequences} CRAM format is natively based upon usage of reference sequences even though in some cases they are not required. In contrast to BAM format CRAM format has strict @@ -1907,7 +1836,7 @@ \section{\textbf{Reference sequences}} slices. \end{enumerate} -\section{\textbf{Indexing}} +\section{Indexing} \subsubsection*{General notes} @@ -2002,150 +1931,180 @@ \subsubsection*{BAM index} in seeking specific alignment start because all preceding records in the slice must be read and discarded. 
-\section{\textbf{Encodings}} +\section{Encodings} \label{sec:encodings} % FIXME: we have a mishash of coding, encoding and codec. We should % go through the entire document and be consistent. -\subsection{\textbf{Introduction}} +\subsection{Introduction} -The basic idea for codings is to efficiently represent some values in binary format. +The basic idea for encodings is to efficiently represent byte and integer values. This can be achieved in a number of ways that most frequently involve some knowledge about the nature of the values being encoded, for example, distribution statistics. The methods for choosing the best encoding and determining its parameters are very diverse and are not part of the CRAM format specification, which only describes how the information needed to decode the values should be stored. -Note two of the encodings (Golomb and Golomb-Rice) are listed as deprecated. -These are still formally part of the CRAM specification, but have not been used by the primary implementations and may not be well supported. -Therefore their use is permitted, but not recommended. - -\subsubsection*{Offset} - -Many of the codings listed below encode positive integer numbers. An integer offset -value is used to allow any integer numbers and not just positive ones to be encoded. -It can also be used for monotonically decreasing distributions with the maximum -not equal to zero. For example, given offset is 10 and the value to be encoded -is 1, the actually encoded value would be offset+value=11. Then when decoding, -the offset would be subtracted from the decoded value. +\subsection{BYTE} -\subsection{EXTERNAL: codec ID 1} - -Can encode types \textit{Byte}, \textit{Integer}. - -The EXTERNAL coding is simply storage of data verbatim to an external block with a given ID. -If the type is \textit{Byte} the data is stored as-is, otherwise for \textit{Integer} type the data is stored in uint7 or sint7 depending on the sign of the data series. 
+This encoding simply stores byte values verbatim to a block with a given ID. \subsubsection*{Parameters} -CRAM format defines the following parameters of EXTERNAL coding: +CRAM format defines the following parameters for BYTE encoding: \begin{tabular}{|>{\raggedright}p{100pt}|>{\raggedright}p{100pt}|>{\raggedright}p{230pt}|} \hline \textbf{Data type} & \textbf{Name} & \textbf{Comment} \tabularnewline \hline -uint7 & external id & id of an external block containing the byte stream\tabularnewline +uint7 & id & id of a block containing the byte stream\tabularnewline \hline \end{tabular} -\subsection{Huffman coding: codec ID 3} +\subsection{VARINT\_UNSIGNED} -Can encode types \textit{Byte}, \textit{Integer}. +This encoding stores unsigned integer values using the Variable Length Quantity +format (VLQ)\footnote{https://en.wikipedia.org/wiki/Variable-length\_quantity}. -Huffman coding replaces symbols (values to encode) by binary codewords, with common symbols having shorter codewords such that the total message of binary codewords is shorter than using uniform binary codeword lengths. -The general process consists of the following steps. +It is not permitted to store any negative values using this encoding. +However given several data series have ranges that have the +possibility of small negative values (such as -1 for Alignment +Position), it is possible to skew the values slightly by adding an +offset prior to encoding. This provides a mechanism for guaranteeing +all values remain positive. -\begin{itemize} -\item Obtain symbol code lengths. -\begin{itemize} -\item If encoding:\\ -- Compute symbol frequencies.\\ -- Compute code lengths from frequencies. -\item If decoding:\\ -- Read code lengths from codec parameters. -\end{itemize} +Note this offset is added to the value after that value has been +decoded so to encode -1 we may wish to set offset to -1 and encode a +zero. This mechanism may also be used to skew a range of values that +do not cluster around zero. 
For example if a data series has values +ranging from 1000 to 1200, then setting offset to 1000 and encoding +values between 0 and 200 would provide a smaller data footprint. -\item Compute canonical Huffman codewords from code lengths\footnote{\url{https://en.wikipedia.org/wiki/Canonical_Huffman_code}}. +For data series that require a broader scope of signed values we would +recommend using VARINT\_SIGNED instead. -\item Encode or decode bits as per the symbol to codeword table. -Codewords have the ``prefix property'' that no codeword is a prefix of another codeword, enabling unambiguous decode bit by bit. -\end{itemize} +\subsubsection*{Parameters} -The use of canonical Huffman codes means that we only need to store the code lengths and use the same algorithm in both encoder and decoder to generate the codewords. -This is achieved by ensuring our symbol alphabet has a natural sort order and codewords are assigned in numerical order. +CRAM format defines the following parameters for VARINT\_UNSIGNED encoding: -\textbf{Important note: for alphabets with only one value, the codeword will be zero bits long.} -This makes the Huffman codec an efficient mechanism for specifying constant values. +\begin{tabular}{|>{\raggedright}p{100pt}|>{\raggedright}p{100pt}|>{\raggedright}p{230pt}|} +\hline +\textbf{Data type} & \textbf{Name} & \textbf{Comment} +\tabularnewline +\hline +sint7 & offset & amount to add to decoded values\tabularnewline +\hline +uint7 & id & id of a block containing the byte stream\tabularnewline +\hline +\end{tabular} -\subsubsection*{Canonical code computation} +\subsection{VARINT\_SIGNED} -\begin{enumerate} -\item Sort the alphabet ascending using bit-lengths and then using numerical order -of the values. +This encoding permits storing a broad range of both positive and +negative values while still following the property of low magnitude +values taking up less space than large magnitude ones. 
-\item The first symbol in the list gets assigned a codeword which is the same length -as the symbol's original codeword but all zeros. This will often be a single zero -('0'). +It achieves this by using ZigZag encoding where signed values 0, -1, ++1, -2, +2 and so on are mapped to unsigned values 0, 1, 2, 3, 4. +These unsigned values are then stored using the same VLQ system used +by VARINT\_UNSIGNED. -\item Each subsequent symbol is assigned the next binary number in sequence, ensuring -that following codes are always higher in value. +As with VARINT\_UNSIGNED, an offset is added to the value after it has +been VLQ and ZigZag decoded. Combined these permit efficient storage +of a distribution of values centred around a mean. For example if +the distribution of values in a data series ranged from 1000 to 2000 +with a peak at 1500, we could set offset to 1500 and encode values +-500 to +500. -\item When you reach a longer codeword, then after incrementing, append zeros until -the length of the new codeword is equal to the length of the old codeword. 
-\end{enumerate} +\subsubsection*{Parameters} -\subsubsection*{Examples} +CRAM format defines the following parameters for VARINT\_SIGNED encoding: -\begin{tabular}{|>{\raggedright}p{105pt}|>{\raggedright}p{105pt}|>{\raggedright}p{105pt}|} -\hline -\textbf{Symbol} & \textbf{Code length} & \textbf{Codeword}\tabularnewline +\begin{tabular}{|>{\raggedright}p{100pt}|>{\raggedright}p{100pt}|>{\raggedright}p{230pt}|} \hline -A & 1 & 0\tabularnewline +\textbf{Data type} & \textbf{Name} & \textbf{Comment} +\tabularnewline \hline -B & 3 & 100\tabularnewline -C & 3 & 101\tabularnewline -D & 3 & 110\tabularnewline +sint7 & offset & amount to add to decoded values\tabularnewline \hline -E & 4 & 1110\tabularnewline -F & 4 & 1111\tabularnewline +uint7 & id & id of a block containing the byte stream\tabularnewline \hline \end{tabular} + +\subsection{CONST\_BYTE} + +Sometimes a data series has a constant value throughout the +container. In this case we can record this value in the Container +Compression Header and avoid the need to store the data series. + \subsubsection*{Parameters} +CRAM format defines the following parameters for CONST\_BYTE encoding: + \begin{tabular}{|>{\raggedright}p{100pt}|>{\raggedright}p{100pt}|>{\raggedright}p{230pt}|} \hline \textbf{Data type} & \textbf{Name} & \textbf{Comment} \tabularnewline \hline -array & alphabet & list of all encoded symbols (values)\tabularnewline -\hline -array & bit-lengths & array of bit-lengths for each symbol in the alphabet\tabularnewline +byte & value & the constant value\tabularnewline \hline \end{tabular} -\subsection{Byte array coding} +\subsection{CONST\_INT} -Often there is a need to encode an array of bytes where the length is not predetermined. -For example the read identifiers differ per alignment record, possibly with different lengths, and this length must be stored somewhere. 
-There are two choices available: storing the length explicitly (BYTE\_ARRAY\_LEN) or continuing to read bytes until a termination value is seen (BYTE\_ARRAY\_STOP). +As per CONST\_BYTE, but the value is a signed integer encoded with +VLQ and ZigZag methods. +The overhead of using ZigZag encoding on a constant unsigned value is +an average of 1 bit per container per use of this encoding. -Note in contrast to this, quality values are known to be the same length as the sequence which is an already known quantity, so this does not need to be encoded using the byte array codecs. +\subsubsection*{Parameters} -\subsubsection*{BYTE\_ARRAY\_LEN: codec ID 4} +CRAM format defines the following parameters for CONST\_INT encoding: + +\begin{tabular}{|>{\raggedright}p{100pt}|>{\raggedright}p{100pt}|>{\raggedright}p{230pt}|} +\hline +\textbf{Data type} & \textbf{Name} & \textbf{Comment} +\tabularnewline +\hline +sint & value & the constant value\tabularnewline +\hline +\end{tabular} + +\subsection{BYTE\_ARRAY\_LEN (ID 4)} Can encode types \textit{Byte[]}. -Byte arrays are captured length-first, meaning that the length of every array element is written using an additional encoding. -For example this could be a HUFFMAN encoding or another EXTERNAL block. -The length is decoded first followed by the data, followed by the next length and data, and so on. +Often there is a need to encode an array of bytes where the length is +not predetermined. For example the read identifiers differ per +alignment record, possibly with different lengths, and this length +must be stored somewhere. Note in contrast to this, quality values +are known to be the same length as the sequence which is an already +known quantity, so the CRAM record encodes this using an implicit +array by asking for multiple values to be decoded rather than using an +explicit byte array encoding. 
+ +With BYTE\_ARRAY\_LEN the length is explicitly encoded via its own +sub-encoding prior to the array of bytes itself, also via its own +sub-encoding. + +Note these sub-encodings may store the data in the same block, in +which case the length comes first. Alternatively we may decide that +the lengths have their own unique distribution which differs +substantially to the data itself and so should be stored in a +different block to permit better compression. Having decoded the +length, the decoder must then call the value sub-encoding that many +times to retrieve the bytes. + +Given the recursion used here of an encoding using two sub-encodings, +the byte stream for BYTE\_ARRAY\_LEN is the concatenation of both +sub-encodings. -This encoding can therefore be considered as a nested encoding, with each pair of nested encodings containing their own set of parameters. -The byte stream for parameters of the BYTE\_ARRAY\_LEN encoding is therefore the concatenation of the length and value encoding parameters as described in section~\ref{subsec:writing-bytes}. 
+\subsubsection*{Parameters} -The parameter for BYTE\_ARRAY\_LEN are listed below: +The parameters for BYTE\_ARRAY\_LEN are listed below: \begin{tabular}{|>{\raggedright}p{100pt}|>{\raggedright}p{100pt}|>{\raggedright}p{230pt}|} \hline @@ -2153,42 +2112,57 @@ \subsubsection*{BYTE\_ARRAY\_LEN: codec ID 4} \tabularnewline \hline encoding\texttt{<}int\texttt{>} & lengths encoding & an encoding describing how -the arrays lengths are captured\tabularnewline +the arrays lengths are stored\tabularnewline \hline encoding\texttt{<}byte\texttt{>} & values encoding & an encoding describing how -the values are captured\tabularnewline +the values (bytes) are stored\tabularnewline \hline \end{tabular} -For example, the bytes specifying a BYTE\_ARRAY\_LEN encoding, including the codec and parameters, for a 16-bit X0 auxiliary tag (``X0C'') may use HUFFMAN encoding to specify the length (always 2 bytes) and an EXTERNAL encoding to store the value to an external block with ID 200. +\subsubsection*{Example} + +The bytes for an X0:i SAM auxiliary field consisting of 16-bit items +may be written as an X0S tag line. Despite the size being known, all +tag items are stored as byte arrays, so BYTE\_ARRAY\_LEN is ideal when +coupled with a CONST\_INT encoding to hold the fixed size lengths and +BYTE for the values. 
\begin{tabular}{lll}
\hline
\textbf{Bytes} & & \textbf{Meaning}\\
\hline
-\texttt{0x04} & & BYTE\_ARRAY\_LEN codec ID \\
-\texttt{0x0a} & & 10 remaining bytes of BYTE\_ARRAY\_LEN parameters \\
+\texttt{0x04} & & BYTE\_ARRAY\_LEN encoding ID \\
+\texttt{0x07} & & 7 remaining bytes of BYTE\_ARRAY\_LEN parameters \\
\\
-\texttt{0x03} & & HUFFMAN codec ID, for aux tag lengths \\
-\texttt{0x04} & & 4 more bytes of HUFFMAN parameters \\
-\texttt{0x01} & & Alphabet array size = 1 \\
-\texttt{0x02} & & alphabet symbol; (length = 2) \\
-\texttt{0x01} & & Codeword array size = 1 \\
-\texttt{0x00} & & Code length = 0 (zero bits needed as alphabet is size 1) \\
+\texttt{0x44} & & CONST\_INT encoding ID, for the aux tag lengths \\
+\texttt{0x01} & & remaining length of CONST\_INT encoding \\
+\texttt{0x04} & & length 2 as Zig-Zag format \\
\\
-\texttt{0x01} & & EXTERNAL codec ID, for aux tag values \\
-\texttt{0x02} & & 2 more bytes of EXTERNAL parameters \\
+\texttt{0x01} & & BYTE encoding ID, for the aux tag values \\
+\texttt{0x02} & & 2 more bytes of BYTE parameters \\
\texttt{0x81 0x48} & & uint7 encoding for block ID 200 \\
\hline
\end{tabular}

-\subsubsection*{BYTE\_ARRAY\_STOP: codec ID 5}
+\subsection{BYTE\_ARRAY\_STOP (ID 5)}

Can encode types \textit{Byte[]}.

-Byte arrays are captured as a sequence of bytes terminated by a special stop byte.
-The data returned does not include the stop byte itself.
-In contrast to BYTE\_ARRAY\_LEN the value is always encoded with EXTERNAL so the parameter is an external id instead of another encoding.
+Instead of encoding an explicit length (as per BYTE\_ARRAY\_LEN) this
+encoding stores bytes ending in a termination value.  The data
+returned should not include this termination byte.  Hence this is
+comparable to the C language string encoding, which uses a nul
+character for termination, while BYTE\_ARRAY\_LEN is more similar to
+the Pascal language string encoding.
+ +Given this encoding is capturing the length within the same data +stream as the bytes, unlike BYTE\_ARRAY\_LEN this does not require +use of sub-encodings. This makes it considerably simpler, but it has +redundancy when the stored arrays have constant lengths. + +The choice of termination symbol is up to the encoder, but logical +choices may include the nul byte or the tab character as both are +illegal within SAM records. \begin{tabular}{|>{\raggedright}p{100pt}|>{\raggedright}p{100pt}|>{\raggedright}p{230pt}|} \hline @@ -2197,7 +2171,7 @@ \subsubsection*{BYTE\_ARRAY\_STOP: codec ID 5} \hline byte & stop byte & a special byte treated as a delimiter\tabularnewline \hline -uint8 & external id & id of an external block containing the byte stream\tabularnewline +uint7 & block id & id of a block containing the byte stream\tabularnewline \hline \end{tabular} @@ -2512,26 +2486,21 @@ \subsubsection*{BYTE\_ARRAY\_STOP: codec ID 5} % Golomb-Rice coding is a special case of Golomb coding when the M parameter is a power of 2. % The reason for this coding is that the division operations in Golomb coding can be replaced with bit shift operators as well as avoiding the extra $r < 2^{b}-M$ check. -\section{\textbf{External compression methods}} +\section{Block compression methods} + +Each block will hold different types of data and may have very +different characteristics for compression. Hence CRAM can utilise +several different compression methods. Each method has an associated numeric code which is defined in Section~\ref{sec:block-struct}. -External encoding operates on bytes only. Therefore any data series must be translated into bytes before sending data into an external block. The following methods are defined. Exact definitions of these methods are in their respective internet links or the ancillary \textit{CRAMcodecs} document found along side this specification. -Integer values are written as ITF8, which then can be translated into an array -of bytes. 
-
-Strings, like read name, are translated into bytes according to UTF8 rules. In
-most cases these should coincide with ASCII, making the translation trivial.
-
-Each method has an associated numeric code which is defined in Section~\ref{sec:block-struct}.
-
-\subsection{\textbf{Gzip}}
+\subsection{Gzip}

The Gzip specification is defined in RFC 1952. Gzip in turn is an
encapsulation on the Deflate algorithm defined in RFC 1951.

-\subsection{\textbf{Bzip2}}
+\subsection{Bzip2}

First available in CRAM v2.0.

@@ -2541,14 +2510,14 @@ \subsection{\textbf{Bzip2}}
An informal format specification exists:\\
\url{https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf}

-\subsection{\textbf{LZMA}}
+\subsection{LZMA}

First available in CRAM v3.0.

LZMA is the Lempel-Ziv Markov chain algorithm.  CRAM uses the xz Stream
format to encapsulate this algorithm, as defined in
\url{https://tukaani.org/xz/xz-file-format.txt}.

-\subsection{\textbf{rANS4x8 codec}}
+\subsection{rANS4x8 codec}

First available in CRAM v3.0.

@@ -2562,7 +2531,7 @@ \subsection{\textbf{rANS4x8 codec}}

Details of this algorithm have been moved to the \textit{CRAMcodecs} document.

-\subsection{\textbf{rANS4x16 codec}}
+\subsection{rANS4x16 codec}

First available in CRAM v3.1.

@@ -2571,7 +2540,7 @@ \subsection{\textbf{rANS4x16 codec}}

Details of this algorithm are listed in the \textit{CRAMcodecs} document.

-\subsection{\textbf{adaptive arithemtic coding}}
+\subsection{adaptive arithmetic coding}

First available in CRAM v3.1.

@@ -2581,7 +2550,7 @@ \subsection{\textbf{adaptive arithemtic coding}}

Details of this algorithm are listed in the \textit{CRAMcodecs} document.

-\subsection{\textbf{fqzcomp codec}}
+\subsection{fqzcomp codec}

First available in CRAM v3.1.

@@ -2589,7 +2558,7 @@ \subsection{\textbf{fqzcomp codec}}

Details of this algorithm are listed in the \textit{CRAMcodecs} document.

-\subsection{\textbf{name tokeniser}}
+\subsection{name tokeniser}

First available in CRAM v3.1.
@@ -2597,7 +2566,16 @@ \subsection{\textbf{name tokeniser}} Details of this algorithm are listed in the \textit{CRAMcodecs} document. -\section*{Appendix 1: Choosing the container size} +\appendix +\renewcommand{\thesection}{\arabic{section}} + +\newcommand{\appsection}[1]{% + \addtocounter{section}{1} + \section*{Appendix~\thesection \quad #1} + \addcontentsline{toc}{section}{Appendix~\thesection \quad #1} +} + +\appsection{Choosing the container size} The CRAM format does not constrain the size of the containers. However, the following should be considered when deciding the container size: @@ -2617,7 +2595,8 @@ \section*{Appendix 1: Choosing the container size} slice of 10,000 short reads (fewer for long reads), but some users find 1,000 is more appropriate if they need a lot of random access. -\section*{Appendix 2: CRAM 4.0 changes and rationale} +\appsection{CRAM 4.0 changes and rationale} +\label{sec:cram4changes} CRAM 4.0 has a number of changes. For those familier with the version 3.0 format, this is a list of changes along with a rationale for @@ -2691,18 +2670,6 @@ \section*{Appendix 2: CRAM 4.0 changes and rationale} This roughly halves the size of uncompressed RN data series, reducing the compressed size. -{\color{gray} -\item[(Planned) Removal of CORE block and bit-encodings]\ \newline - The bit based mechanisms are no predominantly unused in htslib and - htsjdk for CRAM 3.0 as separating into external blocks offers - greater flexibility for partial decoding. - - The sole use of HUFFMAN in htslib is to specify constant values, - which by definition have zero length codes (and consume no - additional space). Hence HUFFMAN is replaced by a new CONSTANT - encoding schema. -} - {\color{gray} \item[(For consideration) Removal of slices]\ \newline CRAM 3.0 offers the ability to have multiple slices per container. 
@@ -2713,6 +2680,16 @@ \section*{Appendix 2: CRAM 4.0 changes and rationale} Removal of multi-slice containers, and by extension slices themselves, offers a simplified interface. + + An alternative to this is to permit another set of blocks prior to + the first slice, with duplicate block IDs to those used in the + slices. These blocks pertain to the container and are utilised in + unison with the per slice blocks. For rANS they could hold the + frequency tables, permitting this to be shared across slices. For + gzip or zstd they could hold a predefined dictionary, permitting + more efficient compression of small blocks. This may grant finer + grained random access capability without suffering so much data + expansion. } \item[TODO: add PACK, RLE and DELTA encodings]\ \newline @@ -2723,6 +2700,13 @@ \section*{Appendix 2: CRAM 4.0 changes and rationale} store blank names and cull the whole RN in DecodeMateData vs in-line shenanigans. +\item[TODO: make all aux tags first class encoding objects]\ \newline + Right now aux tags are basically byte arrays encoded as per BAM. + This isn't so efficient. For example an XX:S data series with + values ranging 0 to 500 but with many more small values are a few + large ones would be better encoded as VLQ than an array of 16-bit + quantities. + \end{description} \end{document} From df32119f5abbeb744a2085ae246117eb2c80c527 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 19 Mar 2020 17:34:38 +0000 Subject: [PATCH 3/7] More CRAM 4 shuffling - BYTE is now the same ID as the old EXTERNAL, given it's identical. But we refer to it as BYTE for clarity sake. - Removed sint vs uint in data series types as it's no longer necessary to bake this into the specification. - Moved the Data Series format text to later on and merged with the other tables, so the concepts and formats are introduced at the same point. 
--- CRAMv4.tex | 205 ++++++++++++++++++++++++----------------------------- 1 file changed, 91 insertions(+), 114 deletions(-) diff --git a/CRAMv4.tex b/CRAMv4.tex index 0d3e944c0..a72def64d 100644 --- a/CRAMv4.tex +++ b/CRAMv4.tex @@ -141,12 +141,12 @@ \subsection*{Structures} % Slice: sint, uint, uint[], byte[16], byte[] \begin{tabular}{lll} -\textbf{Field type} & \textbf{Encoding} & \textbf{Description}\\ -\textit{bool} & BYTE & A boolean, 0 (false) or 1 (true), stored as a byte. \\ -\textit{byte} & BYTE & An unsigned single byte (8 bits) \\ -\textit{uint} & VARINT\_UNSIGNED & A variable length integer $x >= 0$\\ -\textit{sint} & VARINT\_SIGNED & A variable length integer (may be negative)\\ -\textit{array\texttt{<}type\texttt{>}} & ARRAY & An array of items of \textit{type} including the dimension\\ +\textbf{Field type} & \textbf{Format} & \textbf{Description}\\ +\textit{bool} & \textbf{byte} & A boolean, 0 (false) or 1 (true), stored as a byte. \\ +\textit{byte} & \textbf{byte} & An unsigned single byte (8 bits) \\ +\textit{uint} & \textbf{VLQ\_unsigned} & A variable length integer $x >= 0$\\ +\textit{sint} & \textbf{VLQ\_signed} & A variable length integer (may be negative)\\ +\textit{array\texttt{<}type\texttt{>}} & \textbf{array} & An array of items of \textit{type} including the dimension\\ \\ \textit{type[]} & \textit{as appropriate} & Zero or more items with unspecified length\\ \textit{type[4]} & \textit{as appropriate} & A constant number of items, dimension not stored\\ @@ -156,10 +156,10 @@ \subsection*{Structures} The on-disk format for each of these encodings is listed below: \begin{description} -\item[BYTE]\ \newline +\item[\textbf{byte}]\ \newline A single byte -\item[VARINT\_UNSIGNED]\ \newline +\item[\textbf{VLQ\_unsigned}]\ \newline Integer values are encoded using Variable Length Quantity\footnote{\url{https://en.wikipedia.org/wiki/Variable-length_quantity}}. 
@@ -184,7 +184,7 @@ \subsection*{Structures}
 \EndFunction
 \end{algorithmic}

-\item[VARINT\_SIGNED]\ \newline
+\item[\textbf{VLQ\_signed}]\ \newline
Signed integers are initially transformed to unsigned values using a zig-zag
transformation, and then encoded as per unsigned values.
This maps signed values 0, -1, +1, -2, +2 to unsigned values 0, 1, 2, 3, 4
respectively, and vice versa during decode.
@@ -206,47 +206,14 @@ \subsection*{Structures}
 \EndFunction
 \end{algorithmic}

-\item[ARRAY]\ \newline
+\item[\textbf{array}]\ \newline
An array of items with the array dimension explicitly stored.
-The array dimen is first, in VARINT\_UNSIGNED encoding, followed by the array elements stored as per their \textit{type}.
+The array dimension is first, using VLQ\_unsigned format, followed by the array elements stored as per their \textit{type}.

\end{description}

Other custom data types exist, but will be introduced in the relevant sections.

-\subsection*{Data series}
-
-Each data series in CRAM has a designated type - boolean, byte, integer or array thereof - but unlike the structures which explicitly state sign and encoding, the encoder has choice of how these types are stored.
-This type encoding meta-data is held in the compression header structure, once per data series.
-Hence the decoder has baked-in knowledge of the fundamental type of each data series (e.g. \textit{byte} vs \textit{int}) but the description of how that data is stored, whether it is constant or variable and whether it may be negative, is part of the file itself.
-
-Integers are stored as a variable length quantity, with two distinct encodings for unsigned and unsigned values (a departure from CRAM 3.0 and earlier).
-The latter may still store positive numbers, but permits a mixture of positives and negatives in the same block.
- -A full description of the encoding methods can be found later.(FIXME add ref) -Where \textit{N/A} is listed in the Encoding column below this data type only occurs within a structure and so has no encoding associated with it. - -\begin{tabular}{lll} -\textbf{Data Type} & \textbf{Encoding} & \textbf{Description}\\ -\hline -\\ -\textit{byte} & BYTE & An unsigned single byte (8 bits) \\ - & CONST\_BYTE & A fixed byte value\\ -\\ -\textit{byte[]}& BYTE\_ARRAY\_LEN & Zero or more items with unspecified length\\ -\textit{byte[]}& BYTE\_ARRAY\_STOP& Zero or more items with unspecified length\\ -\\ -\textit{int} & VARINT\_UNSIGNED & A variable length integer $x >= 0$\\ - & VARINT\_SIGNED & A variable length integer (may be negative)\\ - & CONST\_INT & A fixed integer value (signed)\\ -\end{tabular} - -\vskip 10pt - -For example, the reference ID (\textbf{RI}) data series has a value data type of \textit{encoding}. -If all alignments in tihs slice are mapped ($RI >= 0$) then the encoding may be VARINT\_UNSIGNED. -If some data is unmapped ($RI = -1$) then the encoding will be VARINT\_SIGNED. - \section{File Layout} The basic structure of a CRAM file is a file header (magic number) @@ -430,9 +397,9 @@ \subsection{File definition} \hline byte[4] & format magic number & CRAM (0x43 0x52 0x41 0x4d)\tabularnewline \hline -unsigned byte & major format number & 3 (0x3)\tabularnewline +byte & major format number & 3 (0x3)\tabularnewline \hline -unsigned byte & minor format number & 1 (0x1)\tabularnewline +byte & minor format number & 1 (0x1)\tabularnewline \hline byte[20] & file id & CRAM file identifier (e.g. file name or SHA1 checksum)\tabularnewline \hline @@ -695,16 +662,18 @@ \subsubsection{Preservation map} \subsubsection{Encoding structure} -The next map utilises another custom data type, \textit{encoding\texttt{<}type\texttt{>}}. +The data series encoding map utilises another custom data type, \textit{encoding\texttt{<}type\texttt{>}}. 
This is used to describe the data encapsulation format for a specific data series; how and where the values are stored.
This could be the definition of a constant, a series of data transformations, or the block content id for a data block.
Some encodings may be nested, such as \texttt{BYTE\_ARRAY\_LEN} which defines one sub-encoding for the length and another for the bytes.

-Encoding notation is defined as the keyword `encoding' followed by its data type
-in angular brackets, for example `encoding\texttt{<}byte\texttt{>}' stands for
-an encoding that operates on a data series of data type `byte'.
+Encoding notation is defined as the keyword `encoding' followed by its data type in angular brackets, for example `encoding\texttt{<}byte\texttt{>}' stands for an encoding that operates on a data series of data type `byte'.
+Note there are distinct encodings dealing with signed and unsigned
+data in addition to offsets applied to each, so for integer data the
+data series use `encoding\texttt{<}int\texttt{>}' and let the specific
+encoding and meta-data handle the sign as appropriate.

The \textit{encoding} format consists of an encoding type and type specific meta-data
stored as an array (dimension followed by the array elements).
@@ -713,49 +682,57 @@ \subsubsection{Encoding structure} \textbf{Data type} & \textbf{Name} & \textbf{Value} \tabularnewline \hline -uint7 & encoding\_type & encoding type\tabularnewline +uint7 & encoding\_ID & encoding type identifier\tabularnewline \hline array\texttt{<}byte\texttt{>} & encoding\_meta & type specific meta-data for this encoding, serialised as a byte stream\tabularnewline \hline \end{tabular} -The list of recognised encoding\_type values is: +\vskip 10pt -\begin{tabular}{|l|l|>{\raggedright}p{155pt}|>{\raggedright}p{160pt}|} -\hline -\textbf{Encoding Type} & \textbf{ID} & \textbf{Parameters} & \textbf{Comment}\tabularnewline -\hline -NULL & 0 & none & series not preserved\tabularnewline -\hline -BYTE\_ARRAY\_LEN & 4 & encoding\texttt{<}int\texttt{>} array length, encoding\texttt{<}byte\texttt{>} -bytes & coding of byte arrays with array length\tabularnewline -\hline -BYTE\_ARRAY\_STOP & 5 & byte stop, int block id\linebreak{} -content id & coding of byte arrays with a stop value \tabularnewline -\hline -BYTE & 40 & uint block content id & the block content identifier used to associate -data blocks with data series\tabularnewline -\hline -VARINT\_UNSIGNED & 41 & sint offset, uint block content id & the block content identifier used to associate data blocks with data series\tabularnewline -\hline -VARINT\_SIGNED & 42 & sint offset, uint block content id & the block content identifier used to associate data blocks with data series\tabularnewline -\hline -CONST\_BYTE & 43 & byte value & the constant byte value\tabularnewline -\hline -CONST\_INT & 44 & sint value & the constant signed integer value\tabularnewline +Note unlike the fields in structures, data series types here do not +distinguish signed integer (sint) vs unsigned integer (uint) as this +distinction is handled by the encoding itself. 
This is different to +CRAM 3.0 and earlier where the sign and size of the data series types +was written into the specification, rather than stored as meta-data in +the CRAM file. + +For example, the alignment position (\textbf{AP}) data series has a value data type of \textit{encoding}. +Typically on a position sorted file this is delta encoded and all +values are positive, so a suitable encoding is VARINT\_UNSIGNED. If +the slice contains data aligned against multiple reference sequences +then this may contain some negative deltas in which case +VARINT\_SIGNED could be appropriate. + +A list of data types along with their possible encodings is listed +below. + +\begin{tabular}{llll} +\textbf{ID} & \textbf{Encoding} & \textbf{Data Type} & \textbf{Description}\\ \hline +\\ +0 & NULL & \textit{N/A} & Data series not present\\ +\\ +1 & BYTE (EXTERNAL) & byte & An unsigned single byte (8 bits) \\ +\\ +4 & BYTE\_ARRAY\_LEN & byte[] & Zero or more items with unspecified length\\ +5 & BYTE\_ARRAY\_STOP& byte[] & Zero or more items with unspecified length\\ +\\ +41 & VARINT\_UNSIGNED & int & A variable length integer (VLQ $x >= 0$)\\ +42 & VARINT\_SIGNED & int & A variable length integer (VLQ, may be negative)\\ +\\ +43 & CONST\_BYTE & byte & A fixed byte value\\ +44 & CONST\_INT & int & A fixed integer value (VLQ, signed)\\ \end{tabular} +\vskip 10pt + See section~\ref{sec:encodings} for more detailed descriptions of all the above coding algorithms and their parameters. Note the encoding ID values here have been chosen to not clash with CRAM 3.0 and earlier with the exception of the BYTE\_ARRAY\_LEN and BYTE\_ARRAY\_STOP -encodings which have not changed. - -For example the byte stream for an encoding\texttt{<}int\texttt{>} -Data Series with values ranging from 1000 to 2000 may be encoded as -VARINT\_UNSIGNED with offset 1000, as 0x41 (the encoding ID) 0x8f 0x51 -(1000 as sint), 0x20 (block ID 32). 
+encodings which have not changed and BYTE which is identical and +synonymous with EXTERNAL in the CRAM 3.0 specification. \subsubsection{Data series encoding map} @@ -869,51 +846,51 @@ \subsubsection{Data series encoding map} \hline \textbf{Key} & \textbf{Value data type} & \textbf{Name} & \textbf{Value}\tabularnewline \hline -BF & encoding\texttt{<}uint\texttt{>} & BAM bit flags & see separate section\tabularnewline +BF & encoding\texttt{<}int\texttt{>} & BAM bit flags & see separate section\tabularnewline \hline -CF & encoding\texttt{<}uint\texttt{>} & CRAM bit flags & see specific section\tabularnewline +CF & encoding\texttt{<}int\texttt{>} & CRAM bit flags & see specific section\tabularnewline \hline RI & encoding\texttt{<}int\texttt{>} & reference id & record reference id from the SAM file header\tabularnewline \hline -RL & encoding\texttt{<}uint\texttt{>} & read lengths & read lengths\tabularnewline +RL & encoding\texttt{<}int\texttt{>} & read lengths & read lengths\tabularnewline \hline -AP & encoding\texttt{<}sint\texttt{>} & in-seq positions & if \textbf{AP-Delta} = true: 0-based alignment start +AP & encoding\texttt{<}int\texttt{>} & in-seq positions & if \textbf{AP-Delta} = true: 0-based alignment start delta from the AP value in the previous record. Note this delta may be negative, for example when switching references in a multi-reference slice. When the record is the first in the slice, the previous position used is the slice alignment-start field (hence the first delta should be zero for single-reference slices, or the AP value itself for multi-reference slices). \linebreak{} if \textbf{AP-Delta} = false: encodes the alignment start position directly\tabularnewline \hline -RG & encoding\texttt{<}sint\texttt{>} & read groups & read groups. Special value +RG & encoding\texttt{<}int\texttt{>} & read groups & read groups. 
Special value `-1' stands for no group.\tabularnewline \hline RN\tnote{a} & encoding\texttt{<}byte[ ]\texttt{>} & read names & read names\tabularnewline \hline -MF & encoding\texttt{<}uint\texttt{>} & next mate bit flags & see specific section\tabularnewline +MF & encoding\texttt{<}int\texttt{>} & next mate bit flags & see specific section\tabularnewline \hline NS & encoding\texttt{<}int\texttt{>} & next fragment reference sequence id & reference sequence ids for the next fragment \tabularnewline \hline -NP & encoding\texttt{<}uint\texttt{>} & next mate alignment start & alignment positions +NP & encoding\texttt{<}int\texttt{>} & next mate alignment start & alignment positions for the next fragment\tabularnewline \hline -TS & encoding\texttt{<}uint\texttt{>} & template size & template sizes\tabularnewline +TS & encoding\texttt{<}int\texttt{>} & template size & template sizes\tabularnewline \hline -NF & encoding\texttt{<}uint\texttt{>} & distance to next fragment & number of records +NF & encoding\texttt{<}int\texttt{>} & distance to next fragment & number of records to the next fragment\tnote{b}\tabularnewline \hline -TL\tnote{c} & encoding\texttt{<}uint\texttt{>} & tag ids & list of tag ids, see tag encoding +TL\tnote{c} & encoding\texttt{<}int\texttt{>} & tag ids & list of tag ids, see tag encoding section\tabularnewline \hline -FN & encoding\texttt{<}uint\texttt{>} & number of read features & number of read +FN & encoding\texttt{<}int\texttt{>} & number of read features & number of read features in each record\tabularnewline \hline FC & encoding\texttt{<}byte\texttt{>} & read features codes & see separate section\tabularnewline \hline -FP & encoding\texttt{<}uint\texttt{>} & in-read positions & positions of the read +FP & encoding\texttt{<}int\texttt{>} & in-read positions & positions of the read features\tabularnewline \hline -DL & encoding\texttt{<}uint\texttt{>} & deletion lengths & base-pair deletion lengths\tabularnewline +DL & 
encoding\texttt{<}int\texttt{>} & deletion lengths & base-pair deletion lengths\tabularnewline \hline BB & encoding\texttt{<}byte[ ]\texttt{>} & stretches of bases & bases\tabularnewline \hline @@ -924,16 +901,16 @@ \subsubsection{Data series encoding map} \hline IN & encoding\texttt{<}byte[ ]\texttt{>} & insertion & inserted bases\tabularnewline \hline -RS & encoding\texttt{<}uint\texttt{>} & reference skip length & number of skipped +RS & encoding\texttt{<}int\texttt{>} & reference skip length & number of skipped bases for the `N' read feature\tabularnewline \hline -PD & encoding\texttt{<}uint\texttt{>} & padding & number of padded bases\tabularnewline +PD & encoding\texttt{<}int\texttt{>} & padding & number of padded bases\tabularnewline \hline -HC & encoding\texttt{<}uint\texttt{>} & hard clip & number of hard clipped bases\tabularnewline +HC & encoding\texttt{<}int\texttt{>} & hard clip & number of hard clipped bases\tabularnewline \hline SC & encoding\texttt{<}byte[ ]\texttt{>} & soft clip & soft clipped bases\tabularnewline \hline -MQ & encoding\texttt{<}uint\texttt{>} & mapping qualities & mapping quality scores\tabularnewline +MQ & encoding\texttt{<}int\texttt{>} & mapping qualities & mapping quality scores\tabularnewline \hline BA & encoding\texttt{<}byte\texttt{>} & bases & bases\tabularnewline \hline @@ -1257,13 +1234,13 @@ \subsection{CRAM positional data} \hline \textbf{Data series type} & \textbf{Data series name} & \textbf{Field} & \textbf{Description}\tabularnewline \hline -uint & RI & ref id & reference sequence id (only present in multiref slices)\tabularnewline +int & RI & ref id & reference sequence id (only present in multiref slices)\tabularnewline \hline -uint & RL & read length & the length of the read\tabularnewline +int & RL & read length & the length of the read\tabularnewline \hline -sint & AP & alignment start & the alignment start position\tabularnewline +int & AP & alignment start & the alignment start position\tabularnewline \hline 
-sint & RG & read group & the read group identifier expressed as the N\textsuperscript{th} record in the header, starting from 0 with -1 for no group\tabularnewline +int & RG & read group & the read group identifier expressed as the N\textsuperscript{th} record in the header, starting from 0 with -1 for no group\tabularnewline \hline \end{tabular} @@ -1328,7 +1305,7 @@ \subsection{Mate record} \hline \textbf{Data series type} & \textbf{Data series name} & \textbf{Description}\tabularnewline \hline -uint & NF & the number of records to the next fragment\tabularnewline +int & NF & the number of records to the next fragment\tabularnewline \hline \end{tabular} @@ -1343,15 +1320,15 @@ \subsection{Mate record} \hline \textbf{Data series type} & \textbf{Data series name} & \textbf{Description}\tabularnewline \hline -uint & MF & next mate bit flags, see table below\tabularnewline +int & MF & next mate bit flags, see table below\tabularnewline \hline byte[ ] & RN & the read name (if and only if not known already)\tabularnewline \hline -uint & NS & mate reference sequence identifier \tabularnewline +int & NS & mate reference sequence identifier \tabularnewline \hline -uint & NP & mate alignment start position \tabularnewline +int & NP & mate alignment start position \tabularnewline \hline -uint & TS & the size of the template (insert size)\tabularnewline +int & TS & the size of the template (insert size)\tabularnewline \hline \end{tabular} @@ -1416,7 +1393,7 @@ \subsection{Auxiliary tags} \hline \textbf{Data series type} & \textbf{Data series name} & \textbf{Field} & \textbf{Description}\tabularnewline \hline -uint & TL & tag line & an index into the tag dictionary (TD)\tabularnewline +int & TL & tag line & an index into the tag dictionary (TD)\tabularnewline \hline \textit{various} & \textit{various} & tag name/type & 3 character key (2 tag identifier and 1 tag type), as specified by the tag dictionary\tabularnewline \hline @@ -1474,15 +1451,15 @@ \subsubsection*{Read 
feature records} \hline \textbf{Data series type} & \textbf{Data series name} & \textbf{Field} & \textbf{Description}\tabularnewline \hline -uint & FN & number of read features & the number of read features\tabularnewline +int & FN & number of read features & the number of read features\tabularnewline \hline -uint & FP & in-read-position\tnote{a} & position of the read feature\tabularnewline +int & FP & in-read-position\tnote{a} & position of the read feature\tabularnewline \hline byte & FC & read feature code\tnote{a} & See feature codes below\tabularnewline \hline * & * & read feature data\tnote{a} & See feature codes below\tabularnewline \hline -uint & MQ & mapping qualities & mapping quality score\tabularnewline +int & MQ & mapping qualities & mapping quality score\tabularnewline \hline byte[read length] & QS & quality scores & the base qualities, if preserved\tabularnewline \hline @@ -1520,19 +1497,19 @@ \subsubsection*{Read feature codes} \hline Insertion & I (0x49) & byte[ ] & IN & inserted bases, SAM operator I\tabularnewline \hline -Deletion & D (0x44) & uint & DL & number of deleted bases, SAM operator D\tabularnewline +Deletion & D (0x44) & int & DL & number of deleted bases, SAM operator D\tabularnewline \hline Insert base & i (0x69) & byte & BA & single inserted base, SAM operator I\tabularnewline \hline Quality score & Q (0x51) & byte & QS & single quality score\tabularnewline \hline -Reference skip & N (0x4E) & uint & RS & number of skipped bases, SAM operator N\tabularnewline +Reference skip & N (0x4E) & int & RS & number of skipped bases, SAM operator N\tabularnewline \hline Soft clip & S (0x53) & byte[ ] & SC & soft clipped bases, SAM operator S\tabularnewline \hline -Padding & P (0x50) & uint & PD & number of padded bases, SAM operator P\tabularnewline +Padding & P (0x50) & int & PD & number of padded bases, SAM operator P\tabularnewline \hline -Hard clip & H (0x48) & uint & HC & number of hard clipped bases, SAM operator H\tabularnewline +Hard 
clip & H (0x48) & int & HC & number of hard clipped bases, SAM operator H\tabularnewline \hline \end{tabular} From 86989271ecb82b0afe0df6653d40ddba75c74255 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 20 Mar 2020 09:00:35 +0000 Subject: [PATCH 4/7] Fixed the version date in CRAMv4.tex --- CRAMv4.tex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CRAMv4.tex b/CRAMv4.tex index a72def64d..cb33a367d 100644 --- a/CRAMv4.tex +++ b/CRAMv4.tex @@ -72,7 +72,7 @@ -\input{CRAMv3.ver} +\input{CRAMv4.ver} \title{CRAM format specification (version 4.0)} \author{samtools-devel@lists.sourceforge.net} \date{\headdate} From 15a7828c4652c188347fd938def000d00c5e7efa Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 2 Apr 2020 12:44:10 +0100 Subject: [PATCH 5/7] Minor table layout change --- CRAMv4.tex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CRAMv4.tex b/CRAMv4.tex index cb33a367d..e27543ef5 100644 --- a/CRAMv4.tex +++ b/CRAMv4.tex @@ -1127,7 +1127,7 @@ \subsubsection*{BAM bit flags (BF data series)} Note however some of these flags can be derived during decode, so may be omitted in the CRAM file and the bits computed based on both reads of a pair-end library residing within the same slice. 
\begin{threeparttable}[t] -\begin{tabular}{|>{\raggedright}p{144pt}|>{\raggedright}p{144pt}|>{\raggedright}p{144pt}|} +\begin{tabular}{|l|l|l|} \hline \textbf{Bit flag} & \textbf{Comment} & \textbf{Description}\tabularnewline \hline From e08b84f6a04aeef4d537656cd4dbd1d206cfc011 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 22 Jun 2020 16:29:19 +0100 Subject: [PATCH 6/7] Apply MF/BF table corrections from #511 --- CRAMv4.tex | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CRAMv4.tex b/CRAMv4.tex index e27543ef5..2ebfe59ff 100644 --- a/CRAMv4.tex +++ b/CRAMv4.tex @@ -1142,7 +1142,7 @@ \subsubsection*{BAM bit flags (BF data series)} 0x10 & & SEQ being reverse complemented\tabularnewline \hline 0x20 & calculated\tnote{b}\ \ or stored in the mate's info & SEQ of the next segment in the -template being reversed\tabularnewline +template being reverse complemented\tabularnewline \hline 0x40 & & the first segment in the template\tnote{c}\tabularnewline \hline @@ -1345,7 +1345,7 @@ \subsubsection*{Next mate bit flags (MF data series)} 0x1 & mate negative strand bit & the bit is set if the mate is on the negative strand\tabularnewline \hline -0x2 & mate mapped bit & the bit is set if the mate is mapped\tabularnewline +0x2 & mate unmapped bit & the bit is set if the mate is unmapped\tabularnewline \hline \end{tabular} From d0845d4c49299dd599b747e1a59ff6c70726867e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 29 Jan 2024 17:09:26 +0000 Subject: [PATCH 7/7] Document FP is delta against the last FP or zero. See #747 for CRAM 3.x issue. 
---
 CRAMv4.tex | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/CRAMv4.tex b/CRAMv4.tex
index 2ebfe59ff..4b4ee2844 100644
--- a/CRAMv4.tex
+++ b/CRAMv4.tex
@@ -888,7 +888,7 @@ \subsubsection{Data series encoding map}
FC & encoding\texttt{<}byte\texttt{>} & read features codes & see separate section\tabularnewline
\hline
FP & encoding\texttt{<}int\texttt{>} & in-read positions & positions of the read
-features\tabularnewline
+features; a positive delta to the last position (starting with zero)\tabularnewline
\hline
DL & encoding\texttt{<}int\texttt{>} & deletion lengths & base-pair deletion lengths\tabularnewline
\hline
@@ -1444,6 +1444,9 @@ \subsubsection*{Read feature records}
Read features are used to store read details that are expressed using read coordinates (e.g. base differences respective to the reference sequence).
The read feature records start with the number of read features followed by the read features themselves.
+Each read feature has the position encoded as the distance from the
+last feature position, or the absolute position (i.e. delta vs zero)
+for the first feature.
Finally the single mapping quality and per-base quality scores are stored.
\begin{threeparttable}[t] @@ -1568,6 +1571,7 @@ \subsubsection*{Decode mapped read pseudocode} \begin{algorithmic}[1] \Procedure{DecodeMappedRead}{} \State $feature\_number\gets$ \Call{ReadItem}{FN, Integer} + \State $last\_feature\_position\gets 0$ \For{$i\gets 1 \algorithmicto feature\_number$} \State \Call{DecodeFeature}{} \EndFor @@ -1588,7 +1592,8 @@ \subsubsection*{Decode mapped read pseudocode} \Procedure{DecodeFeature}{} \settowidth{\maxwidth}{feature\_position\ } \State \algalign{feature\_code}{\gets} \Call{ReadItem}{FC, Integer} - \State \algalign{feature\_position}{\gets} \Call{ReadItem}{FP, Integer} + \State \algalign{feature\_position}{\gets} \Call{ReadItem}{FP, Integer} $+\ last\_feature\_position$ + \State $last\_feature\_position\gets feature\_position$ \settowidth{\maxwidth}{substitution\_code\ } \If{$feature\_code = $`B'} \State \algalign{base}{\gets} \Call{ReadItem}{BA, Byte}