% BCFv1_qref.tex
\documentclass[10pt,pdftex]{article}
\usepackage{color}
\definecolor{gray}{rgb}{0.7,0.7,0.7}
\setlength{\topmargin}{0.0cm}
\setlength{\textheight}{21.5cm}
\setlength{\oddsidemargin}{0cm}
\setlength{\textwidth}{16.5cm}
\setlength{\columnsep}{0.6cm}
\begin{document}
\noindent
[This is a quick reference for {\bf BCF1}, as historically implemented
in Samtools. The BCF1 format is obsolete; BCF2 is widely implemented and
recommended for use instead of this legacy format.]
\vspace{1\baselineskip}
\begin{center}
\begin{tabular}{|l|l|l|l|l|}
\hline
\multicolumn{2}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Description} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\\hline\hline
\multicolumn{2}{|l|}{\sf magic} & Magic string & {\tt char[4]} & {\tt BCF\char92 4} \\\hline
\multicolumn{2}{|l|}{\sf l\_seqnm} & Length of concatenated sequence names & {\tt int32\_t} & \\\hline
\multicolumn{2}{|l|}{\sf seqnm} & Concatenated sequence names, {\tt NULL} padded & {\tt char[{\sf l\_seqnm}]} & \\\hline
\multicolumn{2}{|l|}{\sf l\_smpl} & Length of concatenated sample names & {\tt int32\_t} & \\\hline
\multicolumn{2}{|l|}{\sf smpl} & Concatenated sample names & {\tt char[{\sf l\_smpl}]} & \\\hline
\multicolumn{2}{|l|}{\sf l\_meta} & Length of the meta text (double-hash lines)& {\tt int32\_t} & \\\hline
\multicolumn{2}{|l|}{\sf meta} & Meta text, {\tt NULL} terminated & {\tt char[{\sf l\_meta}]} & \\\hline
\multicolumn{5}{|c|}{\textcolor{gray}{\it List of records until the end of the file}}\\\cline{2-5}
& {\sf seq\_id} & Reference sequence ID & {\tt int32\_t} & \\\cline{2-5}
& {\sf pos} & Position & {\tt int32\_t} & \\\cline{2-5}
& {\sf qual} & Variant quality & {\tt float} & \\\cline{2-5}
& {\sf l\_str} & Length of {\sf str} & {\tt int32\_t} & \\\cline{2-5}
& {\sf str} & {\tt ID+REF+ALT+FILTER+INFO+FORMAT}, {\tt NULL} padded & {\tt char[{\sf l\_str}]} &\\\cline{2-5}
& \multicolumn{4}{c|}{Blocks of data; \#blocks and formats defined by {\tt FORMAT} (table below)}\\
\hline
\end{tabular}
\end{center}
\begin{center}
\begin{tabular}{clp{9cm}}
\hline
\multicolumn{1}{l}{\bf Field} & \multicolumn{1}{l}{\bf Type} & \multicolumn{1}{l}{\bf Description} \\\hline
{\tt DP} & {\tt uint16\_t[n]} & Read depth \\
{\tt GL} & {\tt float[n*G]} & Log10 likelihood of data; $G=\frac{A(A+1)}{2}$, $A=\#\{\mathrm{alleles}\}$\\
{\tt GT} & {\tt uint8\_t[n]} & {\tt missing\char60\char60 7 | phased\char60\char60 6 | allele1\char60\char60 3 | allele2} \\
{\tt \_GT} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic GT; the first int equals the max ploidy $P$. If the highest bit is set,
the allele is not present (e.g.\ due to different ploidy between samples).} \\
{\tt GQ} & {\tt uint8\_t[n]} & {Genotype quality}\\
{\tt HQ} & {\tt uint8\_t[n*2]} & {Haplotype quality}\\
{\tt \_HQ} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic HQ}\\
{\tt IBD} & {\tt uint32\_t[n*2]} & {IBD}\\
{\tt \_IBD} & {\tt uint8\_t+uint32\_t[n*P]} & {Generic IBD}\\
{\tt PL} & {\tt uint8\_t[n*G]} & {Phred-scaled likelihood of data}\\
{\tt PS} & {\tt uint32\_t[n]} & {Phase set}\\
%{\tt SP} & {\tt uint8\_t[n]} & {Strand bias P-value (bcftools only)}\\
\emph{Integer} & {\tt int32\_t[n*X]} & {Fixed-size custom Integer; $X$ defined in the header}\\
\emph{Numeric} & {\tt double[n*X]} & {Fixed-size custom Numeric}\\
\emph{String} & {\tt uint32\_t+char*} & {\tt NULL}-padded concatenated strings (the int equals the length) \\
\hline
\end{tabular}
\end{center}
\begin{itemize}
\item A BCF file is in the {\tt BGZF} format.
\item All multi-byte numbers are little-endian.
\item In a string, a missing value `.' is an empty C string ``{\tt
\char92 0}'' (not ``{\tt .\char92 0}'').
\item For {\tt GL} and {\tt PL}, likelihoods of genotypes appear in the
order of alleles in {\tt REF} and then {\tt ALT}. For example, if {\tt
REF=C}, {\tt ALT=T,A}, likelihoods appear in the order of {\tt
CC,CT,TT,CA,TA,AA} (NB: the ordering is different from the one in the original
BCF proposal).
\item Predefined {\tt FORMAT} fields can be missing from VCF headers, but custom {\tt FORMAT} fields
are required to be explicitly defined in the headers.
\item A {\tt FORMAT} field whose name starts with `{\tt \_}' is specific to BCF.
It gives an alternative binary representation of the corresponding VCF field, in case
the default representation is unable to keep the genotype information,
for example, when the ploidy is not 2 or there are more than 8 alleles.
\end{itemize}
\end{document}