From ff839f7ac24e6bd78afce13f68150af3af997413 Mon Sep 17 00:00:00 2001 From: d-cameron Date: Wed, 10 Jan 2018 23:18:42 +1100 Subject: [PATCH 1/6] Added barebones SAM strict document with only SA tag restrictions defined. --- Makefile | 2 + SAMstrict.tex | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 155 insertions(+) create mode 100644 SAMstrict.tex diff --git a/Makefile b/Makefile index a5a06bdae..c37144cfd 100644 --- a/Makefile +++ b/Makefile @@ -7,6 +7,7 @@ PDFS = BCFv1_qref.pdf \ CSIv1.pdf \ SAMv1.pdf \ SAMtags.pdf \ + SAMstrict.pdf \ tabix.pdf \ VCFv4.1.pdf \ VCFv4.2.pdf \ @@ -21,6 +22,7 @@ new/CRAMv2.1.pdf: CRAMv2.1.tex new/CRAMv2.1.ver new/CRAMv3.pdf: CRAMv3.tex new/CRAMv3.ver new/SAMv1.pdf: SAMv1.tex new/SAMv1.ver new/SAMtags.pdf: SAMtags.tex new/SAMtags.ver +new/SAMstrict.pdf: SAMstrict.tex new/SAMstrict.ver new/VCFv4.1.pdf: VCFv4.1.tex new/VCFv4.1.ver new/VCFv4.2.pdf: VCFv4.2.tex new/VCFv4.2.ver new/VCFv4.3.pdf: VCFv4.3.tex new/VCFv4.3.ver diff --git a/SAMstrict.tex b/SAMstrict.tex new file mode 100644 index 000000000..5bb31e608 --- /dev/null +++ b/SAMstrict.tex @@ -0,0 +1,153 @@ +\documentclass[10pt]{article} +\usepackage[margin=1in]{geometry} +\usepackage{longtable} +\usepackage[pdfborder={0 0 0},hyperfootnotes=false]{hyperref} +\usepackage[title]{appendix} + +\newcommand{\mailtourl}[1]{\href{mailto:#1}{\tt #1}} +\newcommand{\tagvalue}[1]{\tt #1} +\newcommand{\tagregex}[1]{\tt #1} + +\begin{document} + +\input{SAMstrict.ver} +\title{Sequence Alignment/Map Strict Specification} +\author{The SAM/BAM Format Specification Working Group} +\date{\headdate} +\maketitle +\begin{quote}\small +The master version of this document can be found at +\url{https://github.com/samtools/hts-specs}.\\ +This printing is version~\commitdesc\ from that repository, +last modified on the date shown above. 
+\end{quote} +\vspace*{1em} + +\noindent +This document is a companion to the {\sl Sequence Alignment/Map Format +Specification} that defines the SAM file format.\footnote{See +\href{http://samtools.github.io/hts-specs/SAMv1.pdf}{\tt SAMv1.pdf} at \url{https://github.com/samtools/hts-specs}.} +The SAM file format defines the syntax required for a file to be +a valid SAM file. It does not require such files to be semantically +valid and internally consistent. +This document describes a set of additional semantic restrictions +for which the subset of syntactically valid SAM files that comply +with these restrictions can be described as \textit{SAM strict +compliant}. + +\renewcommand{\abstractname}{Introduction} +\begin{abstract} + +The SAM specifications have been instrumental in standardising +the file formats used for sequence alignment. A large ecosystem of +bioinformatics tools is now capable of reading and/or writing +SAM files. Unfortunately, many tools that read SAM files are tightly +coupled to a particular upstream tool +and fail to correctly execute on valid SAM files written by other +tools. In part, this is due to the lack of semantic restrictions +inherent in the SAM file format. A syntactically valid SAM file +can be both internally inconsistent and semantically nonsensical. + +The purpose of this document is to provide a baseline of semantic +validity for which tools should comply with when outputing SAM +files, and tools which input SAM files can safely assume when +they require input files to be \textit{SAM strict compliant}. + +\end{abstract} + +\section{General} + +\subsection{Mate alignments} + +\section{SAM Tags} + +\subsection{Standard Tags} + +\paragraph{} + +No record can include any reserved tags not defined in the +{\sl Sequence Alignment/Map Optional Fields Specification}. +Non-standard tags must start X, Y, Z or a lowercase letter as per the SAM specifications. 
+ +\paragraph{} + +The \textit{type} of all \textit{standard tags} must match the type +defined in the {\sl Sequence Alignment/Map Optional Fields Specification}. + +\subsection{SA} + +For the purpose of this section, a \textit{SA record set} is a set of SAM records +from a single \textit{read} which collectively represent a single \textit{chimeric alignment}. + +\paragraph{} + +All records referenced in the SA tag of a given record must have a SA tag defined. + +\paragraph{} + +All records referenced in the SA tag of a given record must include the given +record in their SA tag. + +\paragraph{} + +All records in a \textit{SA record set} with a FI tag defined must have the same FI tag value. + +\paragraph{} + +All records referenced in the SA tag must exist with matching \textit{rname, pos, strand, CIGAR}. + +\paragraph{} + +The SA \textit{mapq} of all references to a given record in a \textit{SA record set} must +match the record mapping quality. + +\paragraph{} + +The SA \textit{NM} of all references to a given record in a \textit{SA record set} must +match the record \textit{NM} tag value. + +\paragraph{} + +All records except 1 in a \textit{SA record set} must have the supplementary flag set. + +\paragraph{} + +All records in a \textit{SA record set} must have the same 0x100 (secondary alignment) FLAG bit. + +\paragraph{} + +All records in a \textit{SA record set} must have FLAG bit 0x4 (segment unmapped) not set. + +\paragraph{} + +All records in a \textit{SA record set} must have the same RNEXT. + +\paragraph{} + +All records in a \textit{SA record set} must have the same PNEXT. + +\paragraph{} + +All records in a \textit{SA record set} must have CIGARs with matching read length. That is, +chimeric alignment records must include either soft clipping or hard clipping CIGAR operations +for read bases which were not aligned. 
+ +\paragraph{} + +All records in a \textit{SA record set} with SEQ not equal to * must have a SEQ consistent +with all other records in the \textit{SA record set}. That is, chimeric alignments that define +a segment sequence must be consistent with all other records defining a segment sequence. For +example, 10th base in a read cannot be be A in one chimeric alignment, but T in another. + +\paragraph{} + +All records in a \textit{SA record set} with QUAL not equal to * must have a SEQ consistent +with all other records in the \textit{SA record set}. + +\paragraph{} + +All records in a \textit{SA record set} must align at least one read base that does not +overlap with any other alignments in the \textit{SA record set}. +That is, a chimeric alignment cannot contain superfluous alignment records. + +\end{document} From 91cfe96be25ccc1520ef17edae8d3169f396b6ac Mon Sep 17 00:00:00 2001 From: d-cameron Date: Wed, 10 Jan 2018 23:37:44 +1100 Subject: [PATCH 2/6] Started importing section 2 of the SAM specs into SAM strict. --- SAMstrict.tex | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/SAMstrict.tex b/SAMstrict.tex index 5bb31e608..96ba76c6a 100644 --- a/SAMstrict.tex +++ b/SAMstrict.tex @@ -55,10 +55,38 @@ \end{abstract} +\section{Headers} + \section{General} +\subsection{CIGAR} + +\paragraph{} + +All CIGAR operators must have a non-zero positive length + +\paragraph{} + +All adjacent CIGAR operators must be different. + \subsection{Mate alignments} +\paragraph{} + +Unmapped reads must have the RNAME and POS identical to that of the first mapped read +from the originating template. For paired-end or mate-pair sequencing, this equivalent +to setting the RNAME/POS of unmapped reads to the RNAME/POS of the mate. + +\paragraph{} + +If all segments in a template are unmapped, RNAME must be set as `*' and POS as 0. 
+ +\subsection{Reference bounds} + +\paragraph{} + +Read alignments must not extend past the start or end of the aligned RNAME. + \section{SAM Tags} \subsection{Standard Tags} @@ -108,7 +136,11 @@ \subsection{SA} \paragraph{} -All records except 1 in a \textit{SA record set} must have the supplementary flag set. +All records except 1 in a \textit{SA record set} must have the 0x800 (supplementary alignment) FLAG bit set. + +\paragraph{} + +The first SA record in all supplementary alignment records must be the canonical non-supplementary alignment. \paragraph{} From 179677501d66afa6fde3874ed94381c0d442d710 Mon Sep 17 00:00:00 2001 From: d-cameron Date: Fri, 12 Jan 2018 02:49:45 +1100 Subject: [PATCH 3/6] Moved alignment restrictions out of the SA tag as they are more general. --- SAMstrict.tex | 141 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 110 insertions(+), 31 deletions(-) diff --git a/SAMstrict.tex b/SAMstrict.tex index 96ba76c6a..190a6a62b 100644 --- a/SAMstrict.tex +++ b/SAMstrict.tex @@ -11,8 +11,8 @@ \begin{document} \input{SAMstrict.ver} -\title{Sequence Alignment/Map Strict Specification} -\author{The SAM/BAM Format Specification Working Group} +\title{SAM Strict Specification} +\author{Daniel L Cameron} \date{\headdate} \maketitle \begin{quote}\small @@ -57,28 +57,110 @@ \section{Headers} +\subsection{HD} + +\paragraph{} + +The @HD line must be present, with either the SO tag or the GO tag (but not both) specified. + +\paragraph{} + +As the SAM specifications does not explicitly define the queryname sort order, SO=queryname +should be avoided in favour of GO=queryname. + +\subsection{SQ} + +\paragraph{} + +@SQ SN fields must be unique. Multiple headers with the same SN must not be present. + +\paragraph{} + +Every SAM record with an non-* RNAME must have a corresponding @SQ header with matching SN. 
+ \section{General} +\subsection{Ordering} + +\paragraph{} + +If a SO or GO @HD header tag is defined, the order of records must be consistent with this ordering. + \subsection{CIGAR} \paragraph{} +All CIGAR strings must have at least one CIGAR operator. + +\paragraph{} + All CIGAR operators must have a non-zero positive length \paragraph{} All adjacent CIGAR operators must be different. +\paragraph{} + +TODO: should we disallow I/D operators at the ends of reads? There was some ambiguity as to how deletions interacted with POS but I think the spec has been clarified in favour of the BWA interpretation since that discussion. + +\subsection{Flags} + +TODO: +- # reads in template is correct +- ... + +\subsection{Alignments} + +\paragraph{} + +Each read must have exactly one non-supplementary non-secondary record. + +\paragraph{} + +No supplementary or secondary alignments may exist for reads with an +unmapped primary (non-supplementary non-secondary record) alignment. + +\paragraph{} + +All read alignments should have CIGARs consistent with the + +\paragraph{} + +All read alignments must have CIGARs with matching read length. That is, +the sum of lengths of the M/I/S/=/X operations must be equal for all mapped read alignment. +This means that chimeric alignments must include either soft clipping or +hard clipping CIGAR operations for read bases which were not aligned. + +\paragraph{} + +All read alignments with SEQ not equal to * must have a SEQ consistent +with all other read alignments. That is, chimeric and secondary alignments that have non-* SEQ +must be consistent with all other records defining a segment sequence. For +example, 10th base in a read cannot be be A in one alignment, but T in another. + +\paragraph{} + +All read alignments with QUAL not equal to * must have a SEQ consistent +with all other read alignments. 
+ \subsection{Mate alignments} \paragraph{} -Unmapped reads must have the RNAME and POS identical to that of the first mapped read -from the originating template. For paired-end or mate-pair sequencing, this equivalent +Unmapped reads must have RNAME and POS identical to that of the primary +non-supplementary alignment of first mapped read from the originating template. +For paired-end or mate-pair sequencing, this equivalent to setting the RNAME/POS of unmapped reads to the RNAME/POS of the mate. \paragraph{} +For templates with multiple reads, RNEXT and PNEXT must match the +alignment of the non-supplementary primary alignment of the next read. +As per the SAM specifications, for the last read, the next read is the first read in the template. + +\paragraph{} + If all segments in a template are unmapped, RNAME must be set as `*' and POS as 0. \subsection{Reference bounds} @@ -87,6 +169,17 @@ \subsection{Reference bounds} Read alignments must not extend past the start or end of the aligned RNAME. +\subsection{Mapping Qualities} + +\paragraph{} + +All mapping quality scores, including those defined in tags must be within the range [0, 255]. +A value 255 indicates that the mapping quality is not available and must only be used if the +mapping quality field is required. For example, a mapping quality field value is required for +MAPQ field and the mapq portion of the SA tag, but as the AM is optional, a mapping quality +field value is not required and the AM tag should be omitted entirely if a mapping quality is +not available. + \section{SAM Tags} \subsection{Standard Tags} @@ -102,6 +195,11 @@ \subsection{Standard Tags} The \textit{type} of all \textit{standard tags} must match the type defined in the {\sl Sequence Alignment/Map Optional Fields Specification}. +\paragraph{} + +All tag values must be consistent with the format +defined in the {\sl Sequence Alignment/Map Optional Fields Specification}. 
+ \subsection{SA} For the purpose of this section, a \textit{SA record set} is a set of SAM records @@ -109,6 +207,11 @@ \subsection{SA} \paragraph{} +All SA tag values must satisfy the SA tag regular expression +defined in the {\sl Sequence Alignment/Map Optional Fields Specification}. + +\paragraph{} + All records referenced in the SA tag of a given record must have a SA tag defined. \paragraph{} @@ -120,6 +223,8 @@ \subsection{SA} All records in a \textit{SA record set} with a FI tag defined must have the same FI tag value. +TODO: which other tags? + \paragraph{} All records referenced in the SA tag must exist with matching \textit{rname, pos, strand, CIGAR}. @@ -144,7 +249,7 @@ \subsection{SA} \paragraph{} -All records in a \textit{SA record set} must have the same 0x100 (secondary alignment) FLAG bit. +All records in a \textit{SA record set} must have the same 0x100 (secondary alignment) FLAG value. \paragraph{} @@ -152,32 +257,6 @@ \subsection{SA} \paragraph{} -All records in a \textit{SA record set} must have the same RNEXT. - -\paragraph{} - -All records in a \textit{SA record set} must have the same PNEXT. - -\paragraph{} - -All records in a \textit{SA record set} must have CIGARs with matching read length. That is, -chimeric alignment records must include either soft clipping or hard clipping CIGAR operations -for read bases which were not aligned. - -\paragraph{} - -All records in a \textit{SA record set} with SEQ not equal to * must have a SEQ consistent -with all other records in the \textit{SA record set}. That is, chimeric alignments that define -a segment sequence must be consistent with all other records defining a segment sequence. For -example, 10th base in a read cannot be be A in one chimeric alignment, but T in another. - -\paragraph{} - -All records in a \textit{SA record set} with QUAL not equal to * must have a SEQ consistent -with all other records in the \textit{SA record set}. 
- -\paragraph{} - All records in a \textit{SA record set} must align at least one read base that does not overlap with any other alignments in the \textit{SA record set}. That is, a chimeric alignment cannot contain superfluous alignment records. From 8f6843f77039cfdee70201f3ae8d8ab0154eb087 Mon Sep 17 00:00:00 2001 From: d-cameron Date: Sat, 20 Jan 2018 21:52:20 +1100 Subject: [PATCH 4/6] Added commands to reduce verbosity and repetition. Completed Header rules. --- SAMstrict.tex | 189 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 156 insertions(+), 33 deletions(-) diff --git a/SAMstrict.tex b/SAMstrict.tex index 190a6a62b..2875d1f0a 100644 --- a/SAMstrict.tex +++ b/SAMstrict.tex @@ -4,9 +4,60 @@ \usepackage[pdfborder={0 0 0},hyperfootnotes=false]{hyperref} \usepackage[title]{appendix} -\newcommand{\mailtourl}[1]{\href{mailto:#1}{\tt #1}} -\newcommand{\tagvalue}[1]{\tt #1} -\newcommand{\tagregex}[1]{\tt #1} +\newcommand{\rulename}[1]{\tt #1} +\newcommand{\rulecategory}[1]{\tt #1} +\newcommand{\samrule}{\tt SAM} +% #1: error message +% #2: rule description +% #3: categories +\makeatletter +\newcommand{\samstrictrule}[2]{% the categories argument (#3) may be omitted at the call site + \@ifnextchar\bgroup{\@samstrictrule{#1}{#2}}{\@samstrictrule{#1}{#2}{}}} +\newcommand{\@samstrictrule}[3]{% worker: always receives all three arguments + \paragraph{} #3 + {\tt #1} % error message formatting + #2 +}\makeatother +% #1: header +% #2: categories +\newcommand{\headerrequired}[2]{ + \samstrictrule{Missing #1 header}{A #1 header must be present.}{#2} +} +\newcommand{\headerunique}[2]{ + \samstrictrule{Only one #1 header may be present}{Multiple #1 headers must not be present.}{#2} +} +% #1: header +% #2: tag +% #3: categories +\newcommand{\headertagrequired}[3]{ + \samstrictrule{Missing #1 header #2 tag}{The #1 header #2 tag must be present.}{#3} +} +% #1: header +% #2: tag +% #3: categories +\newcommand{\headertagunique}[3]{ + \samstrictrule{Duplicate #1 header #2 tags.}{Each #1 header #2 tag must be unique.}{#3} +} +% #1: header +% #2: tag +% #3: regex +% #4: categories +\newcommand{\headertagregex}[4]{ + \samstrictrule{Malformed #1 header #2 tag}{The #1 header #2 tag must conform to 
the regex {\tt #3}}{#4} +} +% #1: header +% #2: tag +% #3: possible tag values +% #4: categories +\newcommand{\headertagvalues}[4]{ + \samstrictrule{Malformed #1 header #2 tag}{The #1 header #2 tag must contain one of #3}{#4} +} +% #1: header +% #2: tag +% #3: categories +\newcommand{\headertagmatchsamspecs}[3]{ + \samstrictrule{Malformed #1 header #2 tag}{The #1 header #2 tag must match the format defined in the SAM specifications.}{#3} +} \begin{document} @@ -56,19 +107,53 @@ \end{abstract} \section{Headers} +\samstrictrule{Undefined reserved header present}{Upper-case header record type codes are not defined in the SAM specifications must not be used.} +\samstrictrule{Undefined header tag present}{Upper-case header tags not defined in the SAM specifications must not be used.} +\samstrictrule{Malformed header}{Header lines must start with conform to either the {\tt + /\char94@[A-Z][A-Z](\char92t[A-Za-z][A-Za-z0-9]:[ + -\char126]+)+\$/} or {\tt /\char94@CO\char92t.*/} or {\tt /\char94@CO\char92t.*/} regex.}{\samrule} \subsection{HD} +\headerrequired{HD}{} +\headerunique{HD}{\samrule} +\headertagrequired{HD}{VN}{\samrule} +\samstrictrule{File does not start with HD header.}{The first header defined must be the HD header.}{\samrule} +\headertagregex{HD}{VN}{/\char94[0-9]+\char92.[0-9]+\$/}{\samrule} +\samstrictrule{Unknown SAM version}{The HD header VN tag version number must match a published version of the SAM specifications.} +\headertagvalues{HD}{SO}{{\tt unknown}, {\tt unsorted}, {\tt queryname} and {\tt coordinate}}{\samrule} +\headertagvalues{HD}{GO}{{\tt none}, {\tt query}, {\tt reference}}{\samrule} +\samstrictrule{Inconsistent HD header SO and GO tags}{The record orderings defined in the HD header SO and GO tags must be consistent} -\paragraph{} - -The @HD line must be present, with either the SO tag or the GO tag (but not both) specified. 
- -\paragraph{} +\subsection{SQ} +\headertagrequired{SQ}{SN}{\samrule} +\headertagregex{SQ}{SN}{[!-)+-\char60\char62-\char126][!-\char126]*}{\samrule} +\headertagunique{SQ}{SN}{\samrule} +\headertagrequired{SQ}{LN}{\samrule} +\samstrictrule{Malformed SQ header LN tag}{The SQ header LN tag value must be an integer.}{\samrule} +\samstrictrule{Unsupported reference sequence length}{The SQ header LN tag value must be greater than zero and less than 2147483648}{\samrule} +\headertagrequired{SQ}{M5}{\samrule} +\headertagmatchsamspecs{SQ}{AH}{\samrule} +\samstrictrule{Alternate locus references unknown reference sequence name}{Sequence names in the SQ header AH tag must match a SQ header SN reference sequence name.}{\samrule} +\headertagmatchsamspecs{SQ}{AN}{\samrule} +\samstrictrule{Duplicate alternative reference sequence names.}{Alternative reference sequence names defined in SQ header AN tags must be unique. A single tag cannot define the same alternative reference sequence name multiple times and multiple SQ headers cannot define the same alternative reference sequence name.}{\samrule} +\samstrictrule{Invalid alternative reference sequence names.}{Sequence names in the SQ header AN tag must not match any SQ header SN reference sequence names. 
}{\samrule} +\headertagregex{SQ}{M5}{[a-f0-9]\{32\}}{\samrule} + +\subsection{RG} +\headertagrequired{RG}{ID}{\samrule} +\headertagunique{RG}{ID}{\samrule} +\samstrictrule{RG header DT tag is not ISO8601}{RG header DT tag must contain a valid date in ISO8601 format}{\samrule} +\headertagregex{RG}{FO}{/\char92*|[ACMGRSVTWYHKDBN]+/}{\samrule} +\samstrictrule{Malformed RG header PI tag}{The RG header PI tag value must be a floating point value.}{\samrule} +\headertagvalues{RG}{PL}{{\tt CAPILLARY}, {\tt LS454}, {\tt ILLUMINA}, {\tt SOLID}, {\tt HELICOS}, {\tt IONTORRENT}, {\tt ONT}, and {\tt PACBIO}}{\samrule} +\samstrictrule{Invalid RG program group}{The RG header PG tag must contain one of the program groups specified in an ID tag of a PG header.} + +\subsection{PG} +\headertagrequired{PG}{ID}{\samrule} +\headertagunique{PG}{ID}{\samrule} +\samstrictrule{Invalid PG header PP tag}{The PG header PP tag must contain one of the program groups specified in an ID tag of a PG header.} + +-As the SAM specifications does not explicitly define the queryname sort order, SO=queryname -should be avoided in favour of GO=queryname. 
-\subsection{SQ} \paragraph{} @@ -80,50 +165,85 @@ \subsection{SQ} \section{General} +\subsection{File Format} + +\samstrictrule{File is not UTF-8}{The file must use UTF-8 encoding.}{\samrule} +\samstrictrule{Inconsistent line terminators}{All lines must be separated with the same new line character(s).} +\samstrictrule{Malformed floating point value}{All floating point values must conform to the regex {\tt [-+]?[0-9]*\char92.?[0-9]+([eE][-+]?[0-9]+)?}} +\samstrictrule{Malformed integer value}{All integer values must conform to the regex {\tt [-+]?[0-9]+}} + \subsection{Ordering} -\paragraph{} +\samstrictrule{Record ordering does not match HD header SO tag}{The order of records must be consistent with the HD header SO tag} +\samstrictrule{Record ordering does not match HD header GO tag}{The order of records must be consistent with the HD header GO tag} -If a SO or GO @HD header tag is defined, the order of records must be consistent with this ordering. +\section{Records} -\subsection{CIGAR} +\subsection{QNAME} +\samstrictrule{Empty QNAME}{QNAME fields must be at least one character in length}{\samrule} +\samstrictrule{QNAME too long}{QNAME fields must be less than 255 characters in length}{\samrule} +\samstrictrule{Invalid character in QNAME}{QNAME fields must conform to the regex {TT \verb:[!-?A-~]}}{\samrule} -\paragraph{} +\subsection{FLAG} -All CIGAR strings must have at least one CIGAR operator. 
+\samstrictrule{Incorrect FLAG 0x1}{All templates with multiple segments must have FLAG 0x1 set} +\samstrictrule{Incorrect FLAG 0x1}{All templates with a single segment must not have FLAG 0x1 set} +\samstrictrule{Inconsistent FLAG 0x1}{All records with the same QNAME must have the same FLAG 0x1 value} +\samstrictrule{Inconsistent FLAG 0x2}{All primary records with the same QNAME must have the same FLAG 0x2 value} +\samstrictrule{Missing primary alignment record}{No supplementary or secondary alignments may exist for reads with an unmapped with primary alignment.}{\samrule} +\samstrictrule{Inconsistent FLAGs 0x4 0x8}{The 0x8 FLAG for primary record for each segment must match the 0x4 FLAG of the primary record for the next segment in the template} -\paragraph{} +TODO: 0x10 and 0x20 -All CIGAR operators must have a non-zero positive length +\samstrictrule{Inconsistent FLAGs 0x1 0x40}{FLAG 0x40 should only be set if FLAG 0x1 is set} +\samstrictrule{Duplicate first segment primary records}{Of the primary records with the same QNAME, at most one can have FLAG 0x40 set} +\samstrictrule{Missing first segment primary record}{Of the records with the same QNAME with 0x1 FLAG, at least one record must have FLAG 0x40 set.} +\samstrictrule{Inconsistent FLAGs 0x1 0x80}{FLAG 0x80 should only be set if FLAG 0x1 is set} +\samstrictrule{Duplicate last segment primary records}{Of the primary records with the same QNAME, at most one can have FLAG 0x80 set} +\samstrictrule{Missing last segment primary record}{Of the records with the same QNAME with 0x1 FLAG, at least one record must have FLAG 0x80 set.} +\samstrictrule{Multiple primary alignment records}{Each segment must have exactly one record with FLAG 0x100 and 0x800 not set.}{\samrule} -\paragraph{} +TODO -All adjacent CIGAR operators must be different. +\subsection{RNAME} -\paragraph{} +TODO -TODO: should we disallow I/D operators at the ends of reads? 
There was some ambiguity as to how deletions interacted with POS but I think the spec has been clarified in favour of the BWA interpretation since that discussion. +\subsection{POS} +\samstrictrule{Record placed outside of reference sequence}{If FLAG 0x4 is not set, POS must be between 0 and the length of the RNAME reference sequence inclusive. The length of the RNAME reference sequence can be found in the SQ header LN tag value for the SQ header with a SN tag matching the RNAME.} +\samstrictrule{Invalid POS}{POS cannot be 0 if FLAG 0x4 is set.} +\samstrictrule{Invalid POS}{POS cannot be negative.}{\samrule} +\samstrictrule{Invalid POS}{POS cannot be greater than 2147483647.}{\samrule} +\samstrictrule{POS specified without RNAME}{If RNAME is *, POS must be 0.} -\subsection{Flags} +\subsection{MAPQ} -TODO: -- # reads in template is correct -- ... +TODO -\subsection{Alignments} -\paragraph{} +\subsection{CIGAR} +\samstrictrule{Invalid CIGAR}{All CIGAR strings must conform to the regex {\tt \char92*|([0-9]+[MIDNSHPX=])+}}{\samrule} +\samstrictrule{Empty CIGAR}{All CIGAR strings must have at least one CIGAR operator} +\samstrictrule{Zero length CIGAR operator}{All CIGAR operators must have a non-zero positive length} +\samstrictrule{CIGAR contains operator repeat}{All adjacent CIGAR operators must be different.} +\samstrictrule{CIGAR does not contain any mapped bases}{All CIGARs must include a CIGAR operator that consumes a reference base.}{\samrule} +\samstrictrule{Incorrect CIGAR length}{Unless SEQ is *, the number of query bases consumed by a record CIGAR must match the number of bases in SEQ}{\samrule} +\samstrictrule{Read alignment } -Each read must have exactly one non-supplementary non-secondary record. +\paragraph{} +TODO: should we disallow I/D operators at the ends of reads? There was some ambiguity as to how deletions interacted with POS but I think the spec has been updated in favour of the BWA interpretation since that discussion. 
\paragraph{} -No supplementary or secondary alignments may exist for reads with an -unmapped primary (non-supplementary non-secondary record) alignment. +Should we allow alignments with zero mapped bases? Seven bridges has a graph-based aligner that will +emit CIGARs such as 100I for alignments that align to a known insertion that is not included in the reference. +Useufl for local assembly but technical voilates the SAM specifications -\paragraph{} -All read alignments should have CIGARs consistent with the +\subsection{TODO: categorize} + +- secondary alignment mate info must match primary +- supplementary alignment mate info must match primary \paragraph{} @@ -205,6 +325,9 @@ \subsection{SA} For the purpose of this section, a \textit{SA record set} is a set of SAM records from a single \textit{read} which collectively represent a single \textit{chimeric alignment}. +\samstrictrule{Missing SA tag}{All records with FLAG 0x800 set must have a SA tag defined.}{\samrule} +\samstrictrule{Missing non-supplementary chimeric alignment record}{Each chimeric alignment must have a record with FLAG 0x800 not set.} + \paragraph{} All SA tag values must satisfy the SA tag regular expression From e3312178296d38ff6fac38cab5d375c5e9fcc81a Mon Sep 17 00:00:00 2001 From: d-cameron Date: Wed, 31 Jan 2018 21:43:37 +1100 Subject: [PATCH 5/6] Added RNEXT and PNEXT rules --- SAMstrict.tex | 137 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 106 insertions(+), 31 deletions(-) diff --git a/SAMstrict.tex b/SAMstrict.tex index 2875d1f0a..32ba40ec4 100644 --- a/SAMstrict.tex +++ b/SAMstrict.tex @@ -7,6 +7,8 @@ \newcommand{\rulename}[1]{\tt #1} \newcommand{\rulecategory}[1]{\tt #1} \newcommand{\samrule}{\tt SAM} +\newcommand{\v15}{\tt v1.5} +\newcommand{\vcf43}{\tt VCFv4.3} % #1: error message % #2: rule description % #3: categories @@ -109,7 +111,8 @@ \section{Headers} \samstrictrule{Undefined reserved header present}{Upper-case header record type codes are not defined in 
the SAM specifications must not be used.} \samstrictrule{Undefined header tag present}{Upper-case header tags not defined in the SAM specifications must not be used.} -\samstrictrule{Malformed header}{Header lines must start with conform to either the {\tt +\samstrictrule{Tag present as both lowercase and uppercase}{A file should not contain the same tag in both upper-case and lowercase format. See the SAM specifications header tags best practice footnote.} +\samstrictrule{Malformed header}{Header lines with conform to either the {\tt /\char94@[A-Z][A-Z](\char92t[A-Za-z][A-Za-z0-9]:[ -\char126]+)+\$/} or {\tt /\char94@CO\char92t.*/} or {\tt /\char94@CO\char92t.*/} regex.}{\samrule} @@ -153,16 +156,6 @@ \subsection{PG} \headertagunique{PG}{ID}{\samrule} \samstrictrule{Invalid PG header PP tag}{The PG header PP tag must contain one of the program groups specified in an ID tag of a PG header.} - - -\paragraph{} - -@SQ SN fields must be unique. Multiple headers with the same SN must not be present. - -\paragraph{} - -Every SAM record with an non-* RNAME must have a corresponding @SQ header with matching SN. 
- \section{General} \subsection{File Format} @@ -176,6 +169,7 @@ \subsection{Ordering} \samstrictrule{Record ordering does not match HD header SO tag}{The order of records must be consistent with the HD header SO tag} \samstrictrule{Record ordering does not match HD header GO tag}{The order of records must be consistent with the HD header GO tag} +\samstrictrule{Orphaned unmapped read}{If a read is unmapped, RNAME and POS must either be * and 0, or the RNAME and POS of another read from the same template.} \section{Records} @@ -185,29 +179,33 @@ \subsection{QNAME} \samstrictrule{Invalid character in QNAME}{QNAME fields must conform to the regex {TT \verb:[!-?A-~]}}{\samrule} \subsection{FLAG} - \samstrictrule{Incorrect FLAG 0x1}{All templates with multiple segments must have FLAG 0x1 set} \samstrictrule{Incorrect FLAG 0x1}{All templates with a single segment must not have FLAG 0x1 set} \samstrictrule{Inconsistent FLAG 0x1}{All records with the same QNAME must have the same FLAG 0x1 value} \samstrictrule{Inconsistent FLAG 0x2}{All primary records with the same QNAME must have the same FLAG 0x2 value} \samstrictrule{Missing primary alignment record}{No supplementary or secondary alignments may exist for reads with an unmapped with primary alignment.}{\samrule} +\samstrictrule{Inconsistent FLAGs 0x1 0x2}{The 0x2 FLAG must not be set if the 0x1 FLAG is not.} +\samstrictrule{Inconsistent FLAGs 0x1 0x8}{The 0x8 FLAG must not be set if the 0x1 FLAG is not.} +\samstrictrule{Inconsistent FLAGs 0x1 0x20}{The 0x20 FLAG must not be set if the 0x1 FLAG is not.} +\samstrictrule{Inconsistent FLAGs 0x1 0x40}{The 0x40 FLAG must not be set if the 0x1 FLAG is not.} +\samstrictrule{Inconsistent FLAGs 0x1 0x80}{The 0x80 FLAG must not be set if the 0x1 FLAG is not.} +\samstrictrule{Inconsistent FLAGs 0x2 0x4}{The 0x2 FLAG must not be set if 0x4 is set in any primary alignments in the template} \samstrictrule{Inconsistent FLAGs 0x4 0x8}{The 0x8 FLAG for primary record for each segment 
must match the 0x4 FLAG of the primary record for the next segment in the template} - -TODO: 0x10 and 0x20 - -\samstrictrule{Inconsistent FLAGs 0x1 0x40}{FLAG 0x40 should only be set if FLAG 0x1 is set} +\samstrictrule{Inconsistent FLAGs 0x10 0x20}{The 0x20 FLAG must match the 0x10 FLAG for the primary alignment of the next segment in the template} +\samstrictrule{FLAG 0x10 set on unmapped read}{The 0x10 FLAG must not be set if the 0x4 FLAG is set.} \samstrictrule{Duplicate first segment primary records}{Of the primary records with the same QNAME, at most one can have FLAG 0x40 set} \samstrictrule{Missing first segment primary record}{Of the records with the same QNAME with 0x1 FLAG, at least one record must have FLAG 0x40 set.} -\samstrictrule{Inconsistent FLAGs 0x1 0x80}{FLAG 0x80 should only be set if FLAG 0x1 is set} \samstrictrule{Duplicate last segment primary records}{Of the primary records with the same QNAME, at most one can have FLAG 0x80 set} \samstrictrule{Missing last segment primary record}{Of the records with the same QNAME with 0x1 FLAG, at least one record must have FLAG 0x80 set.} -\samstrictrule{Multiple primary alignment records}{Each segment must have exactly one record with FLAG 0x100 and 0x800 not set.}{\samrule} - -TODO +\samstrictrule{Multiple primary alignment records}{Each segment must have at most one record with FLAG 0x100 and 0x800 not set.}{\samrule} +\samstrictrule{Missing primary alignment record}{Each segment must have at least one record with FLAG 0x100 and 0x800 not set.}{\samrule} +\samstrictrule{Unknown FLAG bit set}{FLAG bits higher than 0x800 must not be set}{\samrule} \subsection{RNAME} - -TODO +\samstrictrule{Malformed RNAME}{RNAME must conform to the regex {\tt \char92*|[!-()+-\char60\char62-\char126][!-\char126]*}}{\samrule} +\samstrictrule{RNAME not present in reference}{RNAME must be equal to the value of one of the SQ SN values defined in the header.} +\samstrictrule{RNAME contains character not supported by 
VCFv4.3}{RNAME must not contain any of the following characters: {\tt \char60\char62\char91\char93\char58\char42}}{\vcf43} +\samstrictrule{RNAME not supported by VCFv4.3}{RNAME must not be one of {\tt DEL}, {\tt INS}, {\tt DUP}, {\tt INV}, {\tt CNV}, or {\tt BND}.}{\vcf43} \subsection{POS} \samstrictrule{Record placed outside of reference sequence}{If FLAG 0x4 is not set, POS must be between 0 and the length of the RNAME reference sequence inclusive. The length of the RNAME reference sequence can be found in the SQ header LN tag value for the SQ header with a SN tag matching the RNAME.} @@ -218,26 +216,97 @@ \subsection{POS} \subsection{MAPQ} -TODO - - \subsection{CIGAR} \samstrictrule{Invalid CIGAR}{All CIGAR strings must conform to the regex {\tt \char92*|([0-9]+[MIDNSHPX=])+}}{\samrule} \samstrictrule{Empty CIGAR}{All CIGAR strings must have at least one CIGAR operator} \samstrictrule{Zero length CIGAR operator}{All CIGAR operators must have a non-zero positive length} \samstrictrule{CIGAR contains operator repeat}{All adjacent CIGAR operators must be different.} \samstrictrule{CIGAR does not contain any mapped bases}{All CIGARs must include a CIGAR operator that consumes a reference base.}{\samrule} -\samstrictrule{Incorrect CIGAR length}{Unless SEQ is *, the number of query bases consumed by a record CIGAR must match the number of bases in SEQ}{\samrule} -\samstrictrule{Read alignment } + {\tt Should we allow alignments with zero mapped bases? Seven bridges has a graph-based aligner that will +emit CIGARs such as 100I for alignments that align to a known insertion that is not included in the reference.
Useful for local assembly but technically violates the SAM specifications} +\samstrictrule{Incorrect CIGAR length}{Sum of lengths of the M/I/S/=/X operations must equal the length of SEQ when both CIGAR and SEQ are available.}{\samrule} +\samstrictrule{Invalid CIGAR hard clip}{H must only be present as the first and/or last operation.}{\samrule} +\samstrictrule{Invalid CIGAR soft clip}{S must only have H operations between them and the ends of the CIGAR string.}{\samrule} +\samstrictrule{CIGAR overhangs reference sequence}{POS plus the number of reference bases consumed by the CIGAR must not exceed the length of the RNAME reference sequence.} +\samstrictrule{Inconsistent CIGAR read lengths}{All mapped alignments for a given segment must have matching read lengths. That is, the sum of lengths of the M/I/S/=/X/H operations must be equal.} + +\samstrictrule{Unusual indel positioning}{TODO: should we disallow I/D operators at the ends of reads? There was some ambiguity as to how deletions interacted with POS but I think the spec has been updated in favour of the BWA interpretation since that discussion.} + +\subsection{RNEXT} +\samstrictrule{Invalid RNEXT}{If the template contains one segment RNEXT must be *} +\samstrictrule{RNEXT not present in reference}{RNEXT must be equal to the value of one of the SQ SN values defined in the header.} +\samstrictrule{Invalid RNEXT}{If the primary alignment of the next read in the template is mapped, RNEXT must not be {\tt *}}{\samrule} +\samstrictrule{RNEXT not using =}{If the primary alignment of the next read in the template is mapped to the same reference sequence, RNEXT must be {\tt =}}{\samrule} +\samstrictrule{Incorrect RNEXT}{If this read is unmapped or the primary alignment of the next read in the template is mapped to a different reference sequence, RNEXT must match the RNAME of the next read.} + +\subsection{PNEXT} +\samstrictrule{Invalid PNEXT}{If the template contains one segment PNEXT must be 0}
+\samstrictrule{Incorrect PNEXT}{If the primary alignment of the next read in the template is unmapped, PNEXT must be 0} +\samstrictrule{Incorrect PNEXT}{If the primary alignment of the next read in the template is mapped, PNEXT must match the POS of that record.} + +\subsection{TLEN} +TODO: can we actually do anything for this? + +\subsection{SEQ} +\samstrictrule{Inconsistent SEQ read lengths}{All alignments of a given segment must have consistent SEQ lengths. That is, for all non-* SEQ, SEQ + length of CIGAR hard clip must be equal. } +\samstrictrule{Inconsistent SEQ sequences}{All alignments of a given segment must have consistent base calls. A base cannot be called an A in one record, but a T in another. Note that to determine the read base, both the 0x10 FLAG, and any hard clipping CIGAR operators need to be taken into account.} + +TODO: I am up to here + +\subsection{QUAL} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \paragraph{} -TODO: should we disallow I/D operators at the ends of reads? There was some ambiguity as to how deletions interacted with POS but I think the spec has been updated in favour of the BWA interpretation since that discussion. + \paragraph{} -Should we allow alignments with zero mapped bases? Seven bridges has a graph-based aligner that will -emit CIGARs such as 100I for alignments that align to a known insertion that is not included in the reference. -Useufl for local assembly but technical voilates the SAM specifications + \subsection{TODO: categorize} @@ -384,4 +453,10 @@ \subsection{SA} overlap with any other alignments in the \textit{SA record set}. That is, a chimeric alignment cannot contain superfluous alignment records. + + +- SEQ must match CIGAR +- Read lengths must be consistent + + \end{document} From dbca793792cb8895512bee1de2402ea9a5747191 Mon Sep 17 00:00:00 2001 From: d-cameron Date: Thu, 8 Feb 2018 21:01:57 +1100 Subject: [PATCH 6/6] SAM spec rules completed. 
SAM tags rules still outstanding. --- SAMstrict.tex | 204 +++++++++++++++++--------------------------------- 1 file changed, 70 insertions(+), 134 deletions(-) diff --git a/SAMstrict.tex b/SAMstrict.tex index 32ba40ec4..b3a18c82a 100644 --- a/SAMstrict.tex +++ b/SAMstrict.tex @@ -8,6 +8,7 @@ \newcommand{\rulecategory}[1]{\tt #1} \newcommand{\samrule}{\tt SAM} \newcommand{\v15}{\tt v1.5} +\newcommand{\v15bestpractice}{\tt SAMv1.5 best practice} \newcommand{\vcf43}{\tt VCFv4.3} % #1: error message % #2: rule description @@ -108,6 +109,13 @@ \end{abstract} +\section{Headers} + +\paragraph{} + +The first segment is considered to be the ``next'' segment of the final segment in a template as per the SAM specifications. + + \section{Headers} \samstrictrule{Undefined reserved header present}{Upper-case header record type codes not defined in the SAM specifications must not be used.} \samstrictrule{Undefined header tag present}{Upper-case header tags not defined in the SAM specifications must not be used.} @@ -117,10 +125,10 @@ \section{Headers} -\char126]+)+\$/} or {\tt /\char94@CO\char92t.*/} or {\tt /\char94@CO\char92t.*/} regex.}{\samrule} \subsection{HD} -\headerrequired{HD} +\headerrequired{HD}{\v15bestpractice} \headerunique{HD} \headertagrequired{HD}{VN}{\samrule} -\samstrictrule{File does not start with HD header.}{The first header defined must be the HD header.}{\samrule} +\samstrictrule{File does not start with HD header.}{The first header defined must be the HD header.}{\v15bestpractice} \headertagregex{HD}{VN}{/\char94[0-9]+\char92.[0-9]+\$/}{\samrule} \samstrictrule{Unknown SAM version}{The HD header VN tag version number must match a published version of the SAM specifications.} \headertagvalues{HD}{SO}{{\tt unknown}, {\tt unsorted}, {\tt queryname} and {\tt coordinate}}{\samrule} @@ -128,6 +136,7 @@ \subsection{HD} \samstrictrule{Inconsistent HD header SO and GO tags}{The record orderings defined in the HD header SO and GO tags must be consistent}
\subsection{SQ} +\samstrictrule{Missing SQ header}{The SQ header must be present if any reads have been mapped.}{\v15bestpractice} \headertagrequired{SQ}{SN}{\samrule} \headertagregex{SQ}{SN}{[!-)+-\char60\char62-\char126][!-\char126]*}{\samrule} \headertagunique{SQ}{SN} @@ -200,37 +209,45 @@ \subsection{FLAG} \samstrictrule{Multiple primary alignment records}{Each segment must have at most one record with FLAG 0x100 and 0x800 not set.}{\samrule} \samstrictrule{Missing primary alignment record}{Each segment must have at least one record with FLAG 0x100 and 0x800 not set.}{\samrule} \samstrictrule{Unknown FLAG bit set}{FLAG bits higher than 0x800 must not be set}{\samrule} +\samstrictrule{Unmapped reads should not have FLAG 0x10 set}{Unmapped reads should be stored in the orientation in which they came off the sequencing machine and have 0x10 unset.}{\v15bestpractice} \subsection{RNAME} \samstrictrule{Malformed RNAME}{RNAME must conform to the regex {\tt \char92*|[!-()+-\char60\char62-\char126][!-\char126]*}}{\samrule} \samstrictrule{RNAME not present in reference}{RNAME must be equal to the value of one of the SQ SN values defined in the header.} \samstrictrule{RNAME contains character not supported by VCFv4.3}{RNAME must not contain any of the following characters: {\tt \char60\char62\char91\char93\char58\char42}}{\vcf43} \samstrictrule{RNAME not supported by VCFv4.3}{RNAME must not be one of {\tt DEL}, {\tt INS}, {\tt DUP}, {\tt INV}, {\tt CNV}, or {\tt BND}.}{\vcf43} +\samstrictrule{RNAME does not match mate}{For an unmapped paired-end or mate-pair read whose mate is mapped, the unmapped read should have RNAME identical to its mate}{\v15bestpractice} +\samstrictrule{RNAME specified for unmapped template}{If all segments in a template are unmapped, their RNAME should be set as *}{\v15bestpractice} + \subsection{POS} -\samstrictrule{Record placed outside of reference sequence}{If FLAG 0x4 is not set, POS must be between 0 and the length of the RNAME reference
sequence inclusive. The length of the RNAME reference sequence can be found in the SQ header LN tag value for the SQ header with a SN tag matching the RNAME.} +\samstrictrule{Record placed outside of reference sequence}{If FLAG 0x4 is not set, POS must be between 0 and the length of the RNAME reference sequence inclusive. The length of the RNAME reference sequence can be found in the SQ header LN tag value for the SQ header with a SN tag matching the RNAME.}{\v15bestpractice} \samstrictrule{Invalid POS}{POS cannot be 0 if FLAG 0x4 is set.} \samstrictrule{Invalid POS}{POS cannot be negative.}{\samrule} \samstrictrule{Invalid POS}{POS cannot be greater than 2147483647.}{\samrule} \samstrictrule{POS specified without RNAME}{If RNAME is *, POS must be 0.} +\samstrictrule{POS does not match mate.}{For an unmapped paired-end or mate-pair read whose mate is mapped, the unmapped read should have POS identical to its mate}{\v15bestpractice} +\samstrictrule{POS specified for unmapped template}{If all segments in a template are unmapped, their POS should be set as 0.}{\v15bestpractice} \subsection{MAPQ} +\samstrictrule{Invalid MAPQ}{MAPQ must be between 0 and 255 inclusive.}{\samrule} +\samstrictrule{Missing MAPQ}{Aligned reads should not have 255 MAPQ.}{\v15bestpractice} \subsection{CIGAR} \samstrictrule{Invalid CIGAR}{All CIGAR strings must conform to the regex {\tt \char92*|([0-9]+[MIDNSHPX=])+}}{\samrule} \samstrictrule{Empty CIGAR}{All CIGAR strings must have at least one CIGAR operator} \samstrictrule{Zero length CIGAR operator}{All CIGAR operators must have a non-zero positive length} -\samstrictrule{CIGAR contains operator repeat}{All adjacent CIGAR operators must be different.} +\samstrictrule{CIGAR contains operator repeat}{All adjacent CIGAR operators must be different.}{\v15bestpractice} \samstrictrule{CIGAR does not contain any mapped bases}{All CIGARs must include a CIGAR operator that consumes a reference base.}{\samrule} {\tt Should we allow alignments with
zero mapped bases? Seven bridges has a graph-based aligner that will emit CIGARs such as 100I for alignments that align to a known insertion that is not included in the reference. Useful for local assembly but technically violates the SAM specifications} \samstrictrule{Incorrect CIGAR length}{Sum of lengths of the M/I/S/=/X operations must equal the length of SEQ when both CIGAR and SEQ are available.}{\samrule} \samstrictrule{Invalid CIGAR hard clip}{H must only be present as the first and/or last operation.}{\samrule} \samstrictrule{Invalid CIGAR soft clip}{S must only have H operations between them and the ends of the CIGAR string.}{\samrule} -\samstrictrule{CIGAR overhangs reference sequence}{POS plus the number of reference bases consumed by the CIGAR must not exceed the length of the RNAME reference sequence.} +\samstrictrule{CIGAR overhangs reference sequence}{POS plus the number of reference bases consumed by the CIGAR must not exceed the length of the RNAME reference sequence.}{\v15bestpractice} \samstrictrule{Inconsistent CIGAR read lengths}{All mapped alignments for a given segment must have matching read lengths. That is, the sum of lengths of the M/I/S/=/X/H operations must be equal.} -\samstrictrule{Unusual indel positioning}{TODO: should we disallow I/D operators at the ends of reads?
There was some ambiguity as to how deletions interacted with POS but I think the spec has been updated in favour of the BWA interpretation since that discussion.} \subsection{RNEXT} \samstrictrule{Invalid RNEXT}{If the template contains one segment RNEXT must be *} @@ -245,18 +262,55 @@ \subsection{PNEXT} \samstrictrule{Incorrect PNEXT}{If the primary alignment of the next read in the template is mapped, PNEXT must match the POS of that record.} \subsection{TLEN} -TODO: can we actually do anything for this? +\samstrictrule{TLEN out of range}{TLEN cannot be greater than 2147483647.}{\samrule} +\samstrictrule{TLEN out of range}{TLEN cannot be less than -2147483648.}{\samrule} +\samstrictrule{Invalid TLEN}{TLEN must be 0 if flag 0x1 is not set.}{\samrule} \subsection{SEQ} \samstrictrule{Inconsistent SEQ read lengths}{All alignments of a given segment must have consistent SEQ lengths. That is, for all non-* SEQ, SEQ + length of CIGAR hard clip must be equal. } -\samstrictrule{Inconsistent SEQ sequences}{All alignments of a given segment must have consistent base calls. A base cannot be called an A in one record, but a T in another. Note that to determine the read base, both the 0x10 FLAG, and any hard clipping CIGAR operators need to be taken into account.} - -TODO: I am up to here +\samstrictrule{Inconsistent SEQ sequences}{All alignments of a given segment with non-* SEQ must have consistent base calls. A base cannot be called an A in one record, but a T in another. 
Note that to determine the read base, both the 0x10 FLAG, and any hard clipping CIGAR operators need to be taken into account.} +\samstrictrule{SEQ of secondary alignments specified.}{SEQ of secondary alignments (0x100 FLAG set) should be set to * to reduce the file size.}{\v15bestpractice} +\samstrictrule{Invalid sequence base}{Unless SEQ is "*", SEQ read bases must be one of the following characters: acmgrsvtwyhkdbnACMGRSVTWYHKDBN} +\samstrictrule{SEQ does not match reference when CIGAR indicates match.}{Unless SEQ is "*", read bases with CIGAR operator = must match the reference base. Bases are considered to match if the overlap between the possible read and reference bases (based on their IUPAC codes) is non-zero.} +\samstrictrule{SEQ matches reference when CIGAR indicates mismatch.}{Unless SEQ is "*", read bases with CIGAR operator X must not match the reference base. Bases are considered to match only if, when ignoring case, the reference and read bases are the same character and the character is one of the following characters: acgtACGT.} \subsection{QUAL} +\samstrictrule{QUAL specified without SEQ}{QUAL must be * if SEQ is *}{\samrule} +\samstrictrule{SEQ QUAL length mismatch.}{The length of a non-* QUAL must match the length of SEQ.}{\samrule} +\samstrictrule{Invalid QUAL}{The ASCII value of all QUAL bases must be at least 33.} +\samstrictrule{QUAL of secondary alignments specified.}{QUAL of secondary alignments (0x100 FLAG set) should be set to * to reduce the file size.}{\v15bestpractice} +\samstrictrule{TODO: QUAL edge case}{What should we do when a read is length 1 and the QUAL encodes to "*" ?}{\samrule} +\samstrictrule{Inconsistent QUAL scores}{All alignments with non-* QUAL of a given segment must have consistent base quality scores.
Note that to determine the base quality, both the 0x10 FLAG, and any hard clipping CIGAR operators need to be taken into account.} +\section{SAM Tags} +\subsection{Tag format} +\samstrictrule{Malformed tag}{Tags must be a two character string conforming to the following regex: {\tt /[A-Za-z][A-Za-z0-9]/}}{\samrule} +\samstrictrule{Invalid tag type}{Tag type must be one of AifZHB.}{\samrule} +\samstrictrule{Malformed A tag}{A tags must conform to the regex {\tt [!-\char126]}}{\samrule} +\samstrictrule{Malformed i tag}{i tags must conform to the regex {\tt [-+]?[0-9]+}}{\samrule} +\samstrictrule{Malformed f tag}{f tags must conform to the regex {\tt [-+]?[0-9]*\char92.?[0-9]+([eE][-+]?[0-9]+)?}}{\samrule} +\samstrictrule{Malformed Z tag}{Z tags must conform to the regex {\tt [\,\,\,!-\char126]*}}{\samrule} +\samstrictrule{Malformed H tag}{H tags must conform to the regex {\tt ([0-9A-F][0-9A-F])*}}{\samrule} +\samstrictrule{Malformed B tag}{B tags must conform to the regex {\tt [cCsSiIf](,[-+]?[0-9]*\char92.?[0-9]+([eE][-+]?[0-9]+)?)+}}{\samrule} +\samstrictrule{Non-integer value in integer array}{Type B tags starting with one of "cCsSiI" must contain integer values.} +\samstrictrule{Tag array value out of bounds}{Type B tags must not contain values that are greater than or less than the maximum or minimum value representable by the specified prefix.} +\samstrictrule{Tag integer out of bounds for BAM representation}{Type i tag values must be within the range ~$[-2^{31},2^{32})$}{\bam} +\samstrictrule{Unknown reserved tag}{ + No record can include any reserved tags not defined in the + {\sl Sequence Alignment/Map Optional Fields Specification}. + Non-standard tags must start with X, Y, Z or a lowercase letter as per the SAM specifications.
+} +\samstrictrule{Incorrect tag type}{The \textit{type} of all \textit{standard tags} must match the type +defined in the {\sl Sequence Alignment/Map Optional Fields Specification}.} +\samstrictrule{Duplicate tag}{}{\samrule} +\samstrictrule{Invalid tag MAPQ}{All mapping quality scores, including those defined in tags must be within the range [0, 255]. +A value 255 indicates that the mapping quality is not available and must only be used if the +mapping quality field is required. For example, a mapping quality field value is required for the mapq portion of the SA tag, but as the AM tag is optional, a mapping quality +field value is not required and the AM tag should be omitted entirely if a mapping quality is +not available.} +TODO: Add validators for all standard tags (basic SAM validation done). @@ -264,126 +318,14 @@ \subsection{QUAL} +\subsection{RG} +3 When an RG tag appears anywhere in the alignment section, there should be a single corresponding +@RG line with matching ID tag in the header.{\v15bestpractice} +\subsection{PG} +4 When a PG tag appears anywhere in the alignment section, there should be a single corresponding +@PG line with matching ID tag in the header.{\v15bestpractice} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\paragraph{} - - -\paragraph{} - - - - -\subsection{TODO: categorize} - -- secondary alignment mate info must match primary -- supplementary alignment mate info must match primary - -\paragraph{} - -All read alignments must have CIGARs with matching read length. That is, -the sum of lengths of the M/I/S/=/X operations must be equal for all mapped read alignment. -This means that chimeric alignments must include either soft clipping or -hard clipping CIGAR operations for read bases which were not aligned. - -\paragraph{} - -All read alignments with SEQ not equal to * must have a SEQ consistent -with all other read alignments.
That is, chimeric and secondary alignments that have non-* SEQ -must be consistent with all other records defining a segment sequence. For -example, 10th base in a read cannot be be A in one alignment, but T in another. - -\paragraph{} - -All read alignments with QUAL not equal to * must have a SEQ consistent -with all other read alignments. - -\subsection{Mate alignments} - -\paragraph{} - -Unmapped reads must have RNAME and POS identical to that of the primary -non-supplementary alignment of first mapped read from the originating template. -For paired-end or mate-pair sequencing, this equivalent -to setting the RNAME/POS of unmapped reads to the RNAME/POS of the mate. - -\paragraph{} - -For templates with multiple reads, RNEXT and PNEXT must match the -alignment of the non-supplementary primary alignment of the next read. -As per the SAM specifications, for the last read, the next read is the first read in the template. - -\paragraph{} - -If all segments in a template are unmapped, RNAME must be set as `*' and POS as 0. - -\subsection{Reference bounds} - -\paragraph{} - -Read alignments must not extend past the start or end of the aligned RNAME. - -\subsection{Mapping Qualities} - -\paragraph{} - -All mapping quality scores, including those defined in tags must be within the range [0, 255]. -A value 255 indicates that the mapping quality is not available and must only be used if the -mapping quality field is required. For example, a mapping quality field value is required for -MAPQ field and the mapq portion of the SA tag, but as the AM is optional, a mapping quality -field value is not required and the AM tag should be omitted entirely if a mapping quality is -not available. - -\section{SAM Tags} - -\subsection{Standard Tags} - -\paragraph{} - -No record can include any reserved tags not defined in the -{\sl Sequence Alignment/Map Optional Fields Specification}. -Non-standard tags must start X, Y, Z or a lowercase letter as per the SAM specifications. 
- -\paragraph{} - -The \textit{type} of all \textit{standard tags} must match the type -defined in the {\sl Sequence Alignment/Map Optional Fields Specification}. - \paragraph{} All tag values must be consistent with the format @@ -453,10 +395,4 @@ \subsection{SA} overlap with any other alignments in the \textit{SA record set}. That is, a chimeric alignment cannot contain superfluous alignment records. - - -- SEQ must match CIGAR -- Read lengths must be consistent - - \end{document}