diff --git a/SAMv1.tex b/SAMv1.tex index 87c276238..6f0c4eea1 100644 --- a/SAMv1.tex +++ b/SAMv1.tex @@ -947,6 +947,14 @@ \subsection{The BAM format} the default when the corresponding information is not available; an underlined word in uppercase denotes a field in the SAM format. +Note that the field types given below determine whether the value may be +negative and the number of bytes used on disk, but do not necessarily +dictate the appropriate maximum value. Care should be taken with +{\tt uint32\_t} fields to avoid exceeding 31 bits (approximately 2 billion), +as this can cause problems in some implementation languages or conflict with +related data type constraints (e.g., {\tt tlen} needs to be 1 bit larger +than the maximum {\tt l\_ref}). + \begin{table}[ht] \centering {\small @@ -954,15 +962,15 @@ \subsection{The BAM format} \cline{1-6} \multicolumn{3}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Description} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\\cline{1-6} \multicolumn{3}{|l|}{\sf magic} & BAM magic string & {\tt char[4]} & {\tt BAM\char92 1}\\\cline{1-6} - \multicolumn{3}{|l|}{\sf l\_text} & Length of the header text, including any {\tt NUL} padding & {\tt int32\_t} & \\\cline{1-6} + \multicolumn{3}{|l|}{\sf l\_text} & Length of the header text, including any {\tt NUL} padding & {\tt uint32\_t} & \\\cline{1-6} \multicolumn{3}{|l|}{\sf text} & Plain header text in SAM; not necessarily {\tt NUL}-terminated & {\tt char[{\sf l\_text}]} & \\\cline{1-6} - \multicolumn{3}{|l|}{\sf n\_ref} & \# reference sequences & {\tt int32\_t} & \\\cline{1-6} + \multicolumn{3}{|l|}{\sf n\_ref} & \# reference sequences & {\tt uint32\_t} & \\\cline{1-6} \multicolumn{6}{|c|}{\textcolor{gray}{\it List of reference information (n=n\_ref)}} \\\cline{2-6} - & \multicolumn{2}{l|}{\sf l\_name} & Length of the reference name plus 1 (including {\tt NUL}) & {\tt int32\_t} & \\\cline{2-6} + & \multicolumn{2}{l|}{\sf l\_name} & Length of the reference name plus 1 (including {\tt
NUL}) & {\tt uint32\_t} & \\\cline{2-6} & \multicolumn{2}{l|}{\sf name} & Reference sequence name; {\tt NUL}-terminated & {\tt char[{\sf l\_name}]} & \\\cline{2-6} - & \multicolumn{2}{l|}{\sf l\_ref} & Length of the reference sequence & {\tt int32\_t} & \\\cline{1-6} + & \multicolumn{2}{l|}{\sf l\_ref} & Length of the reference sequence & {\tt uint32\_t} & \\\cline{1-6} \multicolumn{6}{|c|}{\textcolor{gray}{\it List of alignments (until the end of the file)}} \\\cline{2-6} - & \multicolumn{2}{l|}{\sf block\_size} & Total length of the alignment record, excluding this field & {\tt int32\_t} & \\\cline{2-6} + & \multicolumn{2}{l|}{\sf block\_size} & Total length of the alignment record, excluding this field & {\tt uint32\_t} & \\\cline{2-6} & \multicolumn{2}{l|}{\sf refID} & Reference sequence ID, $-1\leq{\sf refID}<{\sf n\_ref}$; -1 for a read without a mapping position & {\tt int32\_t} & [-1] \\\cline{2-6} & \multicolumn{2}{l|}{\sf pos} & 0-based leftmost coordinate ($=\underline{\sf POS}-1$)& {\tt int32\_t} & [-1]\\\cline{2-6} & \multicolumn{2}{l|}{\sf l\_read\_name} & Length of {\sf read\_name} below ($={\sf length}(\underline{\sf QNAME})+1$) & {\tt uint8\_t} & \\\cline{2-6} @@ -970,7 +978,7 @@ \subsection{The BAM format} & \multicolumn{2}{l|}{\sf bin} & BAI index bin, see Section~\ref{sec:bin-field} & {\tt uint16\_t} & \\\cline{2-6} & \multicolumn{2}{l|}{\sf n\_cigar\_op} & Number of operations in \underline{\sf CIGAR}, see Section~\ref{sec:ncigar} & {\tt uint16\_t} & \\\cline{2-6} & \multicolumn{2}{l|}{\sf flag} & Bitwise flags (= \underline{\sf FLAG})\footnotemark\ & {\tt uint16\_t} & \\\cline{2-6} - & \multicolumn{2}{l|}{\sf l\_seq} & Length of \underline{\sf SEQ} & {\tt int32\_t} & \\\cline{2-6} + & \multicolumn{2}{l|}{\sf l\_seq} & Length of \underline{\sf SEQ} & {\tt uint32\_t} & \\\cline{2-6} & \multicolumn{2}{l|}{\sf next\_refID} & Ref-ID of the next segment ($-1\le{\sf next\_refID}<{\sf n\_ref}$) & {\tt int32\_t} & [-1] \\\cline{2-6} & 
\multicolumn{2}{l|}{\sf next\_pos} & 0-based leftmost pos of the next segment ($=\underline{\sf PNEXT}-1$) & {\tt int32\_t} & [-1] \\\cline{2-6} & \multicolumn{2}{l|}{\sf tlen} & Template length ($=\underline{\sf TLEN}$) & {\tt int32\_t} & [0] \\\cline{2-6} @@ -1047,6 +1055,7 @@ \subsubsection{Auxiliary data encoding}\label{sec:aux-type-codes} implementation choice, but is typically the smallest that suffices.} Similarly floating point `{\tt f}' fields are represented as IEEE 754-2008 binary32 values. + Thus BAM numeric fields have a total length of~4, 5, or~7~bytes: \begin{center}\small\byteboxsetup\begin{tabular}{l@{\hspace{1in}}l} \tagfield{c}{\bytebox{1}{i8}} \quad (i.e., {\tt int8\_t}) @@ -1075,7 +1084,7 @@ \subsubsection{Auxiliary data encoding}\label{sec:aux-type-codes} \newcommand*{\arraytagfield}[3]{\tagfield{B}{\bytebox{1}{\tt #1}\bytebox{4}{\em count}\byteboxvector{#2}{#3}}} The representation of a `{\tt B}' array field starts with a sub-type character -similar to the numeric field types above and an {\tt int32\_t} \emph{count} +similar to the numeric field types above and a {\tt uint32\_t} \emph{count} giving the number of elements in the array. 
The array elements follow, encoded as binary integers or IEEE floats sized according to the sub-type: @@ -1203,22 +1212,27 @@ \subsection{The BAI index format for BAM files} \cline{1-7} \multicolumn{4}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Description} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\\cline{1-7} \multicolumn{4}{|l|}{\sf magic} & Magic string & {\tt char[4]} & {\tt BAI\char92 1}\\\cline{1-7} - \multicolumn{4}{|l|}{\sf n\_ref} & \# reference sequences & {\tt int32\_t} & \\\cline{1-7} + \multicolumn{4}{|l|}{\sf n\_ref} & \# reference sequences & {\tt uint32\_t} & \\\cline{1-7} \multicolumn{7}{|c|}{\textcolor{gray}{\it List of indices (n=n\_ref)}} \\\cline{2-7} - & \multicolumn{3}{l|}{\sf n\_bin} & \# distinct bins (for the binning index) & {\tt int32\_t} & \\\cline{2-7} + & \multicolumn{3}{l|}{\sf n\_bin} & \# distinct bins (for the binning index) & {\tt uint32\_t} & \\\cline{2-7} & \multicolumn{6}{c|}{\textcolor{gray}{\it List of distinct bins (n=n\_bin)}} \\\cline{3-7} & & \multicolumn{2}{l|}{\sf bin} & Distinct bin & {\tt uint32\_t} & \\\cline{3-7} - & & \multicolumn{2}{l|}{\sf n\_chunk} & \# chunks & {\tt int32\_t} & \\\cline{3-7} + & & \multicolumn{2}{l|}{\sf n\_chunk} & \# chunks & {\tt uint32\_t} & \\\cline{3-7} & & \multicolumn{5}{c|}{\textcolor{gray}{\it List of chunks (n=n\_chunk)}} \\\cline{4-7} & & & {\sf chunk\_beg} & (Virtual) file offset of the start of the chunk & {\tt uint64\_t} & \\\cline{4-7} & & & {\sf chunk\_end} & (Virtual) file offset of the end of the chunk & {\tt uint64\_t} & \\\cline{2-7} - & \multicolumn{3}{l|}{\sf n\_intv} & \# 16kbp intervals (for the linear index) & {\tt int32\_t} & \\\cline{2-7} + & \multicolumn{3}{l|}{\sf n\_intv} & \# 16kbp intervals (for the linear index) & {\tt uint32\_t} & \\\cline{2-7} & \multicolumn{6}{c|}{\textcolor{gray}{\it List of intervals (n=n\_intv)}} \\\cline{3-7} & & \multicolumn{2}{l|}{\sf ioffset} & (Virtual) file offset of the first alignment in the interval & 
{\tt uint64\_t} & \\\cline{1-7} \multicolumn{4}{|l|}{{\sf n\_no\_coor} (optional)} & Number of unplaced unmapped reads ({\sf RNAME} *) & {\tt uint64\_t} & \\\cline{1-7} \end{tabular}} \end{table} +As with the BAM format, the {\tt uint32\_t} fields indicate unsigned +values consuming 4 bytes and do not imply the full range of values is +appropriate. Practical implementations may limit these to 31 bits or +fewer. + The index file may optionally contain additional metadata providing a summary of the number of mapped and unmapped read-segments per reference sequence, and of any unplaced unmapped read-segments.\footnote{By \emph{placed unmapped @@ -1240,7 +1254,7 @@ \subsection{The BAI index format for BAM files} \begin{tabular}{|l|l|l|r|} \hline {\sf bin} & Magic bin number & {\tt uint32\_t} & 37450 \\\hline - {\sf n\_chunk} & \# chunks & {\tt int32\_t} & 2 \\\hline + {\sf n\_chunk} & \# chunks & {\tt uint32\_t} & 2 \\\hline {\sf unmapped\_beg} & (Virtual) file offset of the start of placed unmapped reads & {\tt uint64\_t} & \\\hline {\sf unmapped\_end} & (Virtual) file offset of the end of placed unmapped reads & {\tt uint64\_t} & \\\hline {\sf n\_mapped} & Number of mapped read-segments for this reference & {\tt uint64\_t} & \\\hline