diff --git a/.appveyor.yml b/.appveyor.yml
index f2cd428b4..9720dc2dd 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -26,7 +26,7 @@ install:
- set MSYSTEM=MINGW64
- set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH%
- set MINGWPREFIX=x86_64-w64-mingw32
- - "sh -lc \"pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 mingw-w64-x86_64-xz mingw-w64-x86_64-curl\""
+ - "sh -lc \"pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-autotools mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 mingw-w64-x86_64-xz mingw-w64-x86_64-curl\""
# The user may have e.g. jkbonfield/bcftools branch FOO and an associated
# jkbonfield/htslib branch FOO. If so use that related htslib, obtained by
diff --git a/.cirrus.yml b/.cirrus.yml
index a3e46bbe8..e9c44d427 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -109,11 +109,11 @@ ubuntu_task:
<< : *TEST
-# CentOS
-centos_task:
- name: centos-gcc
+# Rocky Linux
+rockylinux_task:
+ name: rockylinux-gcc
container:
- image: centos:latest
+ image: rockylinux:latest
cpu: 2
memory: 1G
diff --git a/INSTALL b/INSTALL
index 2bd036a62..c4af072a8 100644
--- a/INSTALL
+++ b/INSTALL
@@ -218,16 +218,22 @@ Note: libcurl4-openssl-dev can be used as an alternative to libcurl4-gnutls-dev.
RedHat / CentOS
---------------
+Note: To install gsl-devel, it may be necessary to enable the "crb" repository.
+dnf --enablerepo=crb install gsl-devel
+
sudo yum install autoconf automake make gcc perl-Data-Dumper zlib-devel bzip2 bzip2-devel xz-devel curl-devel openssl-devel gsl-devel perl-ExtUtils-Embed
+Note: On some versions, Perl FindBin will need to be installed to make the tests work.
+sudo yum install perl-FindBin
+
Alpine Linux
------------
Note: To install gsl-dev, it may be necessary to enable the "community"
repository in /etc/apk/repositories.
-sudo apk update # Ensure the package list is up to date
-sudo apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev libressl-dev gsl-dev perl-dev
+doas apk update # Ensure the package list is up to date
+doas apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev libressl-dev gsl-dev perl-dev
OpenSUSE
--------
@@ -240,4 +246,23 @@ MacOS, assuming Xcode is installed:
xz
gsl (optional)
+Windows MSYS2/MINGW64
+---------------------
+
+The configure script must be used as without it the compilation will
+likely fail.
+
+Follow MSYS2 installation instructions at
+https://www.msys2.org/wiki/MSYS2-installation/
+
+Then relaunch the MSYS2 shell using the "MSYS2 MinGW x64" executable.
+Once in that environment (check $MSYSTEM equals "MINGW64") install the
+compilers using pacman -S and the following package list:
+
+base-devel mingw-w64-x86_64-toolchain
+mingw-w64-x86_64-libdeflate mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2
+mingw-w64-x86_64-xz mingw-w64-x86_64-curl mingw-w64-x86_64-autotools
+mingw-w64-x86_64-tools-git
+
+(The last is only needed for building libraries compatible with MSVC.)
diff --git a/Makefile b/Makefile
index b8430a95c..c221be2a4 100644
--- a/Makefile
+++ b/Makefile
@@ -38,7 +38,7 @@ OBJS = main.o vcfindex.o tabix.o \
vcfstats.o vcfisec.o vcfmerge.o vcfquery.o vcffilter.o filter.o vcfsom.o \
vcfnorm.o vcfgtcheck.o vcfview.o vcfannotate.o vcfroh.o vcfconcat.o \
vcfcall.o mcall.o vcmp.o gvcf.o reheader.o convert.o vcfconvert.o tsv2vcf.o \
- vcfcnv.o HMM.o consensus.o ploidy.o bin.o hclust.o version.o \
+ vcfcnv.o vcfhead.o HMM.o consensus.o ploidy.o bin.o hclust.o version.o \
regidx.o smpl_ilist.o csq.o vcfbuf.o \
mpileup.o bam2bcf.o bam2bcf_indel.o bam_sample.o \
vcfsort.o cols.o extsort.o dist.o abuf.o \
@@ -104,7 +104,7 @@ endif
include config.mk
-PACKAGE_VERSION = 1.14
+PACKAGE_VERSION = 1.15
# If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
# description of the working tree: either a release tag with the same value
@@ -217,7 +217,7 @@ bcftools: $(OBJS) $(HTSLIB)
plugins: $(PLUGINS)
-bcftools_h = bcftools.h $(htslib_hts_defs_h) $(htslib_vcf_h)
+bcftools_h = bcftools.h $(htslib_hts_defs_h) $(htslib_vcf_h) $(htslib_synced_bcf_reader_h)
call_h = call.h $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) vcmp.h
variantkey_h = variantkey.h hex.h
convert_h = convert.h $(htslib_vcf_h)
@@ -240,15 +240,16 @@ vcfplugin.o: vcfplugin.c config.h $(htslib_vcf_h) $(htslib_synced_bcf_reader_h)
vcfcall.o: vcfcall.c $(htslib_vcf_h) $(htslib_kfunc_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(bcftools_h) $(call_h) $(prob1_h) $(ploidy_h) $(gvcf_h) regidx.h $(vcfbuf_h)
vcfconcat.o: vcfconcat.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_thread_pool_h) $(bcftools_h)
vcfconvert.o: vcfconvert.c $(htslib_faidx_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kseq_h) $(bcftools_h) $(filter_h) $(convert_h) $(tsv2vcf_h)
-vcffilter.o: vcffilter.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) rbuf.h
+vcffilter.o: vcffilter.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) rbuf.h regidx.h
vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(htslib_hts_os_h) $(bcftools_h) extsort.h
vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h) $(htslib_kstring_h) $(htslib_bgzf_h) $(bcftools_h)
vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(filter_h)
vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) regidx.h $(bcftools_h) vcmp.h $(htslib_khash_h)
vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h
-vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h)
+vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h) $(smpl_ilist_h)
vcfroh.o: vcfroh.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(bcftools_h) HMM.h $(smpl_ilist_h) $(filter_h)
vcfcnv.o: vcfcnv.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(bcftools_h) HMM.h rbuf.h
+vcfhead.o: vcfhead.c $(htslib_kstring_h) $(htslib_vcf_h) $(bcftools_h)
vcfsom.o: vcfsom.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h)
vcfsort.o: vcfsort.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_hts_os_h) kheap.h $(bcftools_h)
vcfstats.o: vcfstats.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) bin.h dist.h
diff --git a/NEWS b/NEWS
index 7ea362c3f..173ea986c 100644
--- a/NEWS
+++ b/NEWS
@@ -1,10 +1,106 @@
+## Release 1.15 (21st February 2022)
+
+
+* New `bcftools head` subcommand for conveniently displaying the headers
+ of a VCF or BCF file. Without any options, this is equivalent to
+ `bcftools view --header-only --no-version` but more succinct and memorable.
+
+* The `-T, --targets-file` option had the following bug originating in HTSlib code:
+ when an uncompressed file with multiple columns CHR,POS,REF was provided, the
+ REF would be interpreted as 0 gigabases (#1598)
+
+Changes affecting specific commands:
+
+* bcftools annotate
+
+ - In addition to `--rename-annots`, which requires a file with name mappings,
+ it is now possible to do the same on the command line `-c NEW_TAG:=OLD_TAG`
+
+ - Add new option --min-overlap which allows specifying the minimum required
+ overlap of intersecting regions
+
+ - Allow to transfer ALT from VCF with or without replacement using
+ bcftools annotate -a annots.vcf.gz -c ALT file.vcf.gz
+ bcftools annotate -a annots.vcf.gz -c +ALT file.vcf.gz
+
+* bcftools convert
+
+ - Revamp of `--gensample`, `--hapsample` and `--haplegendsample` family of options
+ which includes the following changes:
+
+ - New `--3N6` option to output/input the new version of the .gen file format,
+ see https://www.cog-genomics.org/plink/2.0/formats#gen
+
+ - Deprecate the `--chrom` option in favor of `--3N6`. A simple `cut` command
+ can be used to convert from the new 3*M+6 column format to the format printed
+ with `--chrom` (`cut -d' ' -f1,3-`).
+
+ - The CHROM:POS_REF_ALT IDs which are used to detect strand swaps are required
+ and must appear either in the "SNP ID" column or the "rsID" column. The column
+ is autodetected for `--gensample2vcf`, can be the first or the second for
+ `--hapsample2vcf` (depending on whether the `--vcf-ids` option is given), must be
+ the first for `--haplegendsample2vcf`.
+
+* bcftools csq
+
+ - Allow GFF files with phase column unset
+
+* bcftools filter
+
+ - New `--mask`, `--mask-file` and `--mask-overlap` options to soft filter
+ variants in regions (#1635)
+
+* bcftools +fixref
+
+ - The `-m id` option now works also for non-dbSNP ids, i.e. not just `rsINT`
+
+ - New `-m flip-all` mode for flipping all sites, including ambiguous A/T and C/G sites
+
+* bcftools isec
+
+ - Prevent segfault on sites filtered with -i/-e in all files (#1632)
+
+* bcftools mpileup
+
+ - More flexible read filtering using the options
+ --ls, --skip-all-set .. skip reads with all of the FLAG bits set
+ --ns, --skip-any-set .. skip reads with any of the FLAG bits set
+ --lu, --skip-all-unset .. skip reads with all of the FLAG bits unset
+ --nu, --skip-any-unset .. skip reads with any of the FLAG bits unset
+
+ The existing synonymous options will continue to function but their use
+ is discouraged
+ --rf, --incl-flags STR|INT Required flags: skip reads with mask bits unset
+ --ff, --excl-flags STR|INT Filter flags: skip reads with mask bits set
+
+* bcftools query
+
+ - Make the `--samples` and `--samples-file` options work also in the `--list-samples`
+ mode. Add a new `--force-samples` option which allows to proceed even when some of
+ the requested samples are not present in the VCF (#1631)
+
+* bcftools +setGT
+
+ - Fix a bug in `-t q -e EXPR` logic applied on FORMAT fields, sites with all
+ samples failing the expression EXPR were incorrectly skipped. This problem
+ affected only the use of `-e` logic, not the `-i` expressions (#1607)
+
+* bcftools sort
+
+ - make use of the TMPDIR environment variable when defined
+
+* bcftools +trio-dnm2
+
+ - The --use-NAIVE mode now also adds the de novo allele in FORMAT/VA
+
+
## Release 1.14 (22nd October 2021)
Changes affecting the whole of bcftools, or multiple commands:
* New `--regions-overlap` and `--targets-overlap` options which address
- a long-standing design problem with subsetting VCF files by region.
+ a long-standing design problem with subsetting VCF files by region.
BCFtools recognize two sets of options, one for streaming (`-t/-T`) and
one for index-gumping (`-r/-R`). They behave differently, the first
includes only records with POS coordinate within the regions, the other
@@ -32,11 +128,11 @@ Changes affecting specific commands:
by using `-c INFO/END`.
- add a new '.' modifier to control wheter missing values should be carried
- over from a tab-delimited file or not. For example:
+ over from a tab-delimited file or not. For example:
-c TAG .. adds TAG if the source value is not missing. If TAG
exists in the target file, it will be overwritten
-
+
-c .TAG .. adds TAG even if the source value is missing. This
can overwrite non-missing values with a missing value
and can create empty VCF fields (`TAG=.`)
@@ -165,7 +261,7 @@ Changes affecting specific commands:
* bcftools +fill-tags:
- Generalization and better support for custom functions that allow
- adding new INFO tags based on arbitrary `-i, --include` type of
+ adding new INFO tags based on arbitrary `-i, --include` type of
expressions. For example, to calculate a missing INFO/DP annotation
from FORMAT/AD, it is possible to use:
@@ -229,7 +325,7 @@ Changes affecting specific commands:
- Atomization of AD and QS tags now correctly updates occurrences of duplicate
alleles within different haplotypes
-
+
- Fix a bug in atomization of Number=A,R tags
* bcftools reheader:
@@ -241,7 +337,7 @@ Changes affecting specific commands:
- A wider range of genotypes can be set by the plugin by allowing
specifying custom genotypes. For example, to force a heterozygous
genotype it is now possible to use expressions like:
-
+
c:'m|M'
c:0/1
c:0
@@ -253,7 +349,7 @@ Changes affecting specific commands:
- Better handling of ambiguous keys such as INFO/AF and CSQ/AD. The
`-p, --annot-prefix` option is now applied before doing anything else
which allows its use with `-f, --format` and `-c, --columns` options.
-
+
- Some consequence field names may not constitute a valid tag name, such
as "pos(1-based)". Newly field names are trimmed to exclude brackets.
@@ -383,7 +479,7 @@ Changes affecting specific commands:
* bcftools csq:
- - Fix a bug wich caused incorrect FORMAT/BCSQ formatting at sites with too
+ - Fix a bug which caused incorrect FORMAT/BCSQ formatting at sites with too
many per-sample consequences
- Fix a bug which incorrectly handled the --ncsq parameter and could clash
diff --git a/bam_sample.c b/bam_sample.c
index a6da9432f..d8c10b8b3 100644
--- a/bam_sample.c
+++ b/bam_sample.c
@@ -1,7 +1,7 @@
/* bam_sample.c -- group data by sample.
Copyright (C) 2010, 2011 Broad Institute.
- Copyright (C) 2013, 2016-2018 Genome Research Ltd.
+ Copyright (C) 2013, 2016-2022 Genome Research Ltd.
Author: Heng Li , Petr Danecek
@@ -281,7 +281,7 @@ int bam_smpl_add_samples(bam_smpl_t *bsmpl, char *list, int is_file)
int i, nsamples = 0;
char **samples = hts_readlist(list, is_file, &nsamples);
- if ( !nsamples ) return 0;
+ if ( !samples || !nsamples ) return 0;
kstring_t ori = {0,0,0};
kstring_t ren = {0,0,0};
@@ -328,7 +328,7 @@ int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file)
int i, nrows = 0;
char **rows = hts_readlist(list, is_file, &nrows);
- if ( !nrows ) return 0;
+ if ( !rows || !nrows ) return 0;
kstring_t fld1 = {0,0,0};
kstring_t fld2 = {0,0,0};
diff --git a/bcftools.h b/bcftools.h
index b188e9805..a915802a8 100644
--- a/bcftools.h
+++ b/bcftools.h
@@ -1,6 +1,6 @@
/* bcftools.h -- utility function declarations.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2022 Genome Research Ltd.
Author: Petr Danecek
@@ -28,6 +28,7 @@ THE SOFTWARE. */
#include
#include
#include
+#include
#include
#define FT_TAB_TEXT 0 // custom tab-delimited text file
@@ -50,9 +51,11 @@ void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT
void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd);
const char *hts_bcf_wmode(int file_type);
-const char *hts_bcf_wmode2(int file_type, char *fname);
-void set_wmode(char dst[8], int file_type, char *fname, int compression_level); // clevel: 0-9 with or zb type, -1 unset
+const char *hts_bcf_wmode2(int file_type, const char *fname);
+void set_wmode(char dst[8], int file_type, const char *fname, int compression_level); // clevel: 0-9 with or zb type, -1 unset
char *init_tmp_prefix(const char *prefix);
+int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq);
+int parse_overlap_option(const char *arg);
void *smalloc(size_t size); // safe malloc
diff --git a/bin.c b/bin.c
index a4817cf45..645012ec8 100644
--- a/bin.c
+++ b/bin.c
@@ -1,6 +1,6 @@
/* The MIT License
- Copyright (c) 2016 Genome Research Ltd.
+ Copyright (c) 2016-2022 Genome Research Ltd.
Author: Petr Danecek
@@ -43,6 +43,7 @@ bin_t *bin_init(const char *list_def, float min, float max)
int is_file = strchr(list_def,',') ? 0 : 1;
int i, nlist;
char **list = hts_readlist(list_def, is_file, &nlist);
+ if ( !list ) error("Error: failed to read %s\n",list_def);
bin->nbins = nlist;
bin->bins = (float*) malloc(sizeof(float)*nlist);
for (i=0; irlen > args->fa_buf.l - idx )
{
rec->rlen = args->fa_buf.l - idx;
- alen = strlen(alt_allele);
- if ( alen > rec->rlen )
+ if ( alt_allele[0]!='<' )
{
- alt_allele[rec->rlen] = 0;
- fprintf(stderr,"Warning: trimming variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+ alen = strlen(alt_allele);
+ if ( alen > rec->rlen )
+ {
+ fprintf(stderr,"Warning: trimming variant \"%s\" starting at %s:%"PRId64"\n", alt_allele,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+ alt_allele[rec->rlen] = 0;
+ }
}
}
if ( idx>=args->fa_buf.l )
@@ -749,7 +752,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
// TODO: symbolic deletions probably need more work above with PICK_SHORT|PICK_LONG
if ( strcasecmp(alt_allele,"") && strcasecmp(alt_allele,"<*>") && strcasecmp(alt_allele,"") )
- error("Symbolic alleles other than , <*> or are currently not supported, e.g. %s at %s:%"PRId64".\n"
+ error("Symbolic alleles other than , <*> or are currently not supported, e.g. \"%s\" at %s:%"PRId64".\n"
"Please use filtering expressions to exclude such sites, for example by running with: -e 'ALT~\"<.*>\"'\n",
alt_allele,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
if ( !strcasecmp(alt_allele,"") )
diff --git a/csq.c b/csq.c
index 6217987e5..de0d7a9bb 100644
--- a/csq.c
+++ b/csq.c
@@ -331,6 +331,7 @@ const char *csq_strings[] =
#define GF_UTR5 ((1<<(GF_coding_bit+1))+4)
// GF_MAX = (1<<30)-1, see hap_node_t
+#define CDS_PHASE_UNKN 3 // phase value stored when the GFF phase column is unset ('.'); the 2-bit phase fields hold 0,1,2 or this
typedef struct _tscript_t tscript_t;
typedef struct
{
@@ -340,7 +341,7 @@ typedef struct
// update hap_node_t.sbeg in hap_init, could be calculated on the fly)
uint32_t len; // exon length
uint32_t icds:30, // exon index within the transcript
- phase:2; // offset of the CDS
+ phase:2; // offset of the CDS: 0,1,2 or 3 for unknown
}
gf_cds_t;
typedef struct
@@ -517,7 +518,7 @@ typedef struct
uint32_t end;
uint32_t trid;
uint32_t strand:1; // STRAND_REV,STRAND_FWD
- uint32_t phase:2; // 0, 1 or 2
+ uint32_t phase:2; // 0, 1, 2, or 3 for unknown
uint32_t iseq:29;
}
ftr_t;
@@ -1051,7 +1052,7 @@ int gff_parse(args_t *args, char *line, ftr_t *ftr)
if ( *ss == '0' ) ftr->phase = 0;
else if ( *ss == '1' ) ftr->phase = 1;
else if ( *ss == '2' ) ftr->phase = 2;
- else if ( *ss == '.' ) ftr->phase = 0; // exons do not have phase
+ else if ( *ss == '.' ) ftr->phase = CDS_PHASE_UNKN; // exons and even CDS in some GFFs do not have phase
else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; }
ss += 2;
@@ -1132,6 +1133,7 @@ void tscript_init_cds(args_t *args)
// Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds)
khint_t k;
+ int warn_phase_unkn = 0;
for (k=0; kid2tr); k++)
{
if ( !kh_exist(aux->id2tr, k) ) continue;
@@ -1151,28 +1153,38 @@ void tscript_init_cds(args_t *args)
int i, len = 0;
if ( tr->strand==STRAND_FWD )
{
- if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
- tr->cds[0]->beg += tr->cds[0]->phase;
- tr->cds[0]->len -= tr->cds[0]->phase;
- tr->cds[0]->phase = 0;
+ if ( tr->cds[0]->phase != CDS_PHASE_UNKN )
+ {
+ if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
+ tr->cds[0]->beg += tr->cds[0]->phase;
+ tr->cds[0]->len -= tr->cds[0]->phase;
+ tr->cds[0]->phase = 0;
+ }
// sanity check phase; the phase number in gff tells us how many bases to skip in this
// feature to reach the first base of the next codon
int tscript_ok = 1;
for (i=0; incds; i++)
{
+ if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
+ {
+ warn_phase_unkn = 1;
+ len += tr->cds[i]->len;
+ continue;
+ }
int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
- if ( phase!=len%3)
+ if ( phase!=len%3 )
{
if ( args->force )
{
if ( args->verbosity > 0 )
- fprintf(stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+ fprintf(stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
+ args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
tscript_ok = 0;
break;
}
error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
- args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+ args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
}
len += tr->cds[i]->len;
}
@@ -1180,33 +1192,43 @@ void tscript_init_cds(args_t *args)
}
else
{
- // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
- // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141
- // todo: the same for the fwd strand
- i = tr->ncds - 1;
- int phase = tr->cds[i]->phase;
- if ( phase ) tr->trim |= TRIM_5PRIME;
- while ( i>=0 && phase > tr->cds[i]->len )
+ if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN )
{
- phase -= tr->cds[i]->len;
+ // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
+ // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141
+ // todo: the same for the fwd strand
+ i = tr->ncds - 1;
+ int phase = tr->cds[i]->phase;
+ if ( phase ) tr->trim |= TRIM_5PRIME;
+ while ( i>=0 && phase > tr->cds[i]->len )
+ {
+ phase -= tr->cds[i]->len;
+ tr->cds[i]->phase = 0;
+ tr->cds[i]->len = 0;
+ i--;
+ }
+ tr->cds[i]->len -= tr->cds[i]->phase;
tr->cds[i]->phase = 0;
- tr->cds[i]->len = 0;
- i--;
}
- tr->cds[i]->len -= tr->cds[i]->phase;
- tr->cds[i]->phase = 0;
// sanity check phase
int tscript_ok = 1;
for (i=tr->ncds-1; i>=0; i--)
{
+ if ( tr->cds[i]->phase == CDS_PHASE_UNKN )
+ {
+ warn_phase_unkn = 1;
+ len += tr->cds[i]->len;
+ continue;
+ }
int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
if ( phase!=len%3)
{
if ( args->force )
{
if ( args->verbosity > 0 )
- fprintf(stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+ fprintf(stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n",
+ args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
tscript_ok = 0;
break;
}
@@ -1282,6 +1304,8 @@ void tscript_init_cds(args_t *args)
regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]);
}
}
+ if ( warn_phase_unkn && args->verbosity > 0 )
+ fprintf(stderr,"Warning: encountered CDS with phase column unset, could not verify reading frame\n");
}
void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); }
@@ -4316,16 +4340,12 @@ int main_csq(int argc, char *argv[])
case 't': targets_list = optarg; break;
case 'T': targets_list = optarg; targets_is_file = 1; break;
case 4 :
- if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
- else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
- else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
- else error("Could not parse: --regions-overlap %s\n",optarg);
+ regions_overlap = parse_overlap_option(optarg);
+ if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg);
break;
case 5 :
- if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
- else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
- else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
- else error("Could not parse: --targets-overlap %s\n",optarg);
+ targets_overlap = parse_overlap_option(optarg);
+ if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
break;
case 'h':
case '?': error("%s",usage());
diff --git a/doc/bcftools.1 b/doc/bcftools.1
index 058f39ad3..a90ef17d9 100644
--- a/doc/bcftools.1
+++ b/doc/bcftools.1
@@ -2,12 +2,12 @@
.\" Title: bcftools
.\" Author: [see the "AUTHOR(S)" section]
.\" Generator: Asciidoctor 2.0.16.dev
-.\" Date: 2021-10-22
+.\" Date: 2022-02-21
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
-.TH "BCFTOOLS" "1" "2021-10-22" "\ \&" "\ \&"
+.TH "BCFTOOLS" "1" "2022-02-21" "\ \&" "\ \&"
.ie \n(.g .ds Aq \(aq
.el .ds Aq '
.ss \n[.ss] 0
@@ -51,7 +51,7 @@ standard input (stdin) and outputs to the standard output (stdout). Several
commands can thus be combined with Unix pipes.
.SS "VERSION"
.sp
-This manual page was last updated \fB2021\-10\-22\fP and refers to bcftools git version \fB1.14\fP.
+This manual page was last updated \fB2022\-02\-21\fP and refers to bcftools git version \fB1.15\fP.
.SS "BCF1"
.sp
The BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP
@@ -187,6 +187,17 @@ list of available options, run \fBbcftools\fP \fICOMMAND\fP without arguments.
. sp -1
. IP \(bu 2.3
.\}
+\fBhead\fP .. view VCF/BCF file headers
+.RE
+.sp
+.RS 4
+.ie n \{\
+\h'-04'\(bu\h'+03'\c
+.\}
+.el \{\
+. sp -1
+. IP \(bu 2.3
+.\}
\fBindex\fP .. index VCF/BCF
.RE
.sp
@@ -463,21 +474,23 @@ This option requires indexed VCF/BCF files. Note that \fB\-R\fP cannot be used
in combination with \fB\-r\fP.
.RE
.sp
-\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+\fB\-\-regions\-overlap\fP \fIpos\fP|\fIrecord\fP|\fIvariant\fP|\fI0\fP|\fI1\fP|\fI2\fP
.RS 4
This option controls how overlapping records are determined:
-set to \fB0\fP if the VCF record has to have POS inside a region
+set to \fBpos\fP or \fB0\fP if the VCF record has to have POS inside a region
(this corresponds to the default behavior of \fB\-t/\-T\fP);
-set to \fB1\fP if also overlapping records with POS outside a region
-should be included (this is the default behavior of \fB\-r/\-R\fP); or set
-to \fB2\fP to include only true overlapping variation (compare
+set to \fBrecord\fP or \fB1\fP if also overlapping records with POS outside a region
+should be included (this is the default behavior of \fB\-r/\-R\fP, and includes indels
+with POS at the end of a region, which are technically outside the region); or set
+to \fBvariant\fP or \fB2\fP to include only true overlapping variation (compare
the full VCF representation "\f(CRTA>T\-\fP" vs the true sequence variation "\f(CRA>\-\fP").
.RE
.sp
-\fB\-s, \-\-samples\fP [^]\fILIST\fP
+\fB\-s, \-\-samples\fP [^]\fILIST\fP
.RS 4
Comma\-separated list of samples to include or exclude if prefixed
-with "^".
+with "^". (Note that when multiple samples are to be excluded,
+the "^" prefix is still present only once, e.g. "^SAMPLE1,SAMPLE2".)
The sample order is updated to reflect that given on the command line.
Note that in general tags such as INFO/AC, INFO/AN, etc are not updated
to correspond to the subset samples. \fBbcftools view\fP is the
@@ -495,9 +508,9 @@ that command. For example:
.fi
.if n .RE
.sp
-\fB\-S, \-\-samples\-file\fP \fIFILE\fP
+\fB\-S, \-\-samples\-file\fP [^]\fIFILE\fP
.RS 4
-File of sample names to include or exclude if prefixed with "^".
+File of sample names to include or exclude if prefixed with "^".
One sample per line. See also the note above for the \fB\-s, \-\-samples\fP
option.
The sample order is updated to reflect that given in the input file.
@@ -541,20 +554,21 @@ The program ignores the first column and the last indicates sex (1=male, 2=femal
.fi
.if n .RE
.sp
-\fB\-t, \-\-targets\fP [^]\fIchr\fP|\fIchr:pos\fP|\fIchr:from\-to\fP|\fIchr:from\-\fP[,...]
+\fB\-t, \-\-targets\fP [^]\fIchr\fP|\fIchr:pos\fP|\fIchr:from\-to\fP|\fIchr:from\-\fP[,...]
.RS 4
Similar as \fB\-r, \-\-regions\fP, but the next position is accessed by streaming the
whole VCF/BCF rather than using the tbi/csi index. Both \fB\-r\fP and \fB\-t\fP options
can be applied simultaneously: \fB\-r\fP uses the index to jump to a region
and \fB\-t\fP discards positions which are not in the targets. Unlike \fB\-r\fP, targets
-can be prefixed with "^" to request logical complement. For example, "^X,Y,MT"
+can be prefixed with "^" to request logical complement. For example, "^X,Y,MT"
indicates that sequences X, Y and MT should be skipped.
Yet another difference between the \fB\-t/\-T\fP and \fB\-r/\-R\fP is that \fB\-r/\-R\fP checks for
proper overlaps and considers both POS and the end position of an indel, while \fB\-t/\-T\fP
-considers the POS coordinate only. Note that \fB\-t\fP cannot be used in combination with \fB\-T\fP.
+considers the POS coordinate only (by default; see also \fB\-\-regions\-overlap\fP and \fB\-\-targets\-overlap\fP).
+Note that \fB\-t\fP cannot be used in combination with \fB\-T\fP.
.RE
.sp
-\fB\-T, \-\-targets\-file\fP [^]\fIFILE\fP
+\fB\-T, \-\-targets\-file\fP [^]\fIFILE\fP
.RS 4
Same \fB\-t, \-\-targets\fP, but reads regions from a file. Note that \fB\-T\fP
cannot be used in combination with \fB\-t\fP.
@@ -573,7 +587,7 @@ Such a file can be easily created from a VCF using:
.fi
.if n .RE
.sp
-\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+\fB\-\-targets\-overlap\fP \fIpos\fP|\fIrecord\fP|\fIvariant\fP|\fI0\fP|\fI1\fP|\fI2\fP
.RS 4
Same as \fB\-\-regions\-overlap\fP but for \fB\-t/\-T\fP.
.RE
@@ -635,13 +649,6 @@ See also \fB\-c, \-\-columns\fP and \fB\-h, \-\-header\-lines\fP.
.fi
.if n .RE
.sp
-\fB\-\-collapse\fP \fIsnps\fP|\fIindels\fP|\fIboth\fP|\fIall\fP|\fIsome\fP|\fInone\fP
-.RS 4
-Controls how to match records from the annotation file to the target VCF.
-Effective only when \fB\-a\fP is a VCF or BCF.
-See \fBCommon Options\fP for more.
-.RE
-.sp
\fB\-c, \-\-columns\fP \fIlist\fP
.RS 4
Comma\-separated list of columns or tags to carry over from the annotation file
@@ -663,7 +670,7 @@ The imported VCF annotations can be renamed as "DST_TAG:=SRC_TAG" or "FMT/DST_TA
\~
.br
To carry over all INFO annotations, use "INFO". To add all INFO annotations except
-"TAG", use "^INFO/TAG". By default, existing values are replaced.
+"TAG", use "^INFO/TAG". By default, existing values are replaced.
\~
.br
\~
@@ -776,6 +783,15 @@ This is an experimental feature.
annotate sites which are present ("+") or absent ("\-") in the \fB\-a\fP file with a new INFO/TAG flag
.RE
.sp
+\fB\-\-min\-overlap\fP \fIANN\fP:\fIVCF\fP
+.RS 4
+minimum overlap required as a fraction of the variant in the annotation \fB\-a\fP file (\fIANN\fP), in the
+target VCF file (\fI:VCF\fP), or both for reciprocal overlap (\fIANN:VCF\fP).
+By default overlaps of arbitrary length are sufficient.
+The option can be used only with the tab\-delimited annotation \fB\-a\fP file and with \fIBEG\fP and \fIEND\fP
+columns present.
+.RE
+.sp
\fB\-\-no\-version\fP
.RS 4
see \fBCommon Options\fP
@@ -791,6 +807,14 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-pair\-logic\fP \fIsnps\fP|\fIindels\fP|\fIboth\fP|\fIall\fP|\fIsome\fP|\fIexact\fP
+.RS 4
+Controls how to match records from the annotation file to the target VCF.
+Effective only when \fB\-a\fP is a VCF or BCF. The option replaces the former
+uninuitive \fB\-\-collapse\fP.
+See \fBCommon Options\fP for more.
+.RE
+.sp
\fB\-r, \-\-regions\fP \fIchr\fP|\fIchr:pos\fP|\fIchr:from\-to\fP|\fIchr:from\-\fP[,...]
.RS 4
see \fBCommon Options\fP
@@ -801,6 +825,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-\-rename\-annots\fP \fIfile\fP
.RS 4
rename annotations according to the map in \fIfile\fP, with
@@ -816,7 +845,7 @@ rename chromosomes according to the map in \fIfile\fP, with
line.
.RE
.sp
-\fB\-s, \-\-samples\fP [^]\fILIST\fP
+\fB\-s, \-\-samples\fP [^]\fILIST\fP
.RS 4
subset of samples to annotate, see also \fBCommon Options\fP
.RE
@@ -848,7 +877,7 @@ List of annotations to remove. Use "FILTER" to remove all filters or
"FILTER/SomeFilter" to remove a specific filter. Similarly, "INFO" can
be used to remove all INFO tags and "FORMAT" to remove all FORMAT tags
except GT. To remove all INFO tags except "FOO" and "BAR", use
-"^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER).
+"^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER).
"INFO" can be abbreviated to "INF" and "FORMAT" to "FMT".
.RE
.sp
@@ -954,6 +983,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-s, \-\-samples\fP \fILIST\fP
.RS 4
see \fBCommon Options\fP
@@ -974,6 +1008,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-\-threads\fP \fIINT\fP
.RS 4
see \fBCommon Options\fP
@@ -990,7 +1029,7 @@ appear in any of the genotypes
.RS 4
comma\-separated list of FORMAT fields to output for each sample. Currently
GQ and GP fields are supported. For convenience, the fields can be given
-as lower case letters. Prefixed with "^" indicates a request for tag
+as lower case letters. Prefixed with "^" indicates a request for tag
removal of auxiliary tags useful only for calling.
.RE
.sp
@@ -1165,6 +1204,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-s, \-\-query\-sample\fP \fIstring\fP
.RS 4
query sample name
@@ -1179,6 +1223,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
+.sp
+\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
.SS "HMM Options:"
.sp
\fB\-a, \-\-aberrant\fP \fIfloat\fP[,\fIfloat\fP]
@@ -1343,6 +1392,11 @@ see \fBCommon Options\fP. Requires \fB\-a, \-\-allow\-overlaps\fP.
see \fBCommon Options\fP. Requires \fB\-a, \-\-allow\-overlaps\fP.
.RE
.sp
+\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-\-threads\fP \fIINT\fP
.RS 4
see \fBCommon Options\fP
@@ -1357,6 +1411,11 @@ Note that the program does not act as a primitive variant caller and ignores all
depth information, such as INFO/AD or FORMAT/AD. For that, consider using the
\fBsetGT\fP plugin.
.sp
+\fB\-a, \-\-absent\fP \fICHAR\fP
+.RS 4
+replace positions absent from VCF with CHAR
+.RE
+.sp
\fB\-c, \-\-chain\fP \fIFILE\fP
.RS 4
write a chain file for liftover
@@ -1519,6 +1578,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-s, \-\-samples\fP \fILIST\fP
.RS 4
see \fBCommon Options\fP
@@ -1538,6 +1602,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
+.sp
+\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
.SS "VCF output options:"
.sp
\fB\-\-no\-version\fP
@@ -1563,10 +1632,17 @@ see \fBCommon Options\fP
.sp
\fB\-G, \-\-gensample2vcf\fP \fIprefix\fP or \fIgen\-file\fP,\fIsample\-file\fP
.RS 4
-convert IMPUTE2 output to VCF. The second column must be of the form
-"CHROM:POS_REF_ALT" to detect possible strand swaps; IMPUTE2 leaves the
-first one empty ("\-\-") when sites from reference panel are filled in. See
-also \fB\-g\fP below.
+convert IMPUTE2 output to VCF. One of the ID columns ("SNP ID" or "rsID" in
+.URL "https://www.cog\-genomics.org/plink/2.0/formats#gen" "" ")"
+must be of the form
+"CHROM:POS_REF_ALT" to detect possible strand swaps.
+\~
+.br
+When the \fB\-\-vcf\-ids\fP option is given, the other column (autodetected) is used
+to fill the ID column of the VCF.
+\~
+.br
+See also \fB\-g\fP and \fB\-\-3N6\fP options.
.RE
.sp
\fB\-g, \-\-gensample\fP \fIprefix\fP or \fIgen\-file\fP,\fIsample\-file\fP
@@ -1575,14 +1651,31 @@ convert from VCF to gen/sample format used by IMPUTE2 and SHAPEIT.
The columns of .gen file format are ID1,ID2,POS,A,B followed by three
genotype probabilities P(AA), P(AB), P(BB) for each sample. In order to
prevent strand swaps, the program uses IDs of the form "CHROM:POS_REF_ALT".
-For example:
+When the \fB\-\-vcf\-ids\fP option is given, the second column is set to match the ID
+column of the VCF.
+\~
+.br
+See also \fB\-G\fP and \fB\-\-3N6\fP options.
+\~
+.br
+The .gen and .sample file formats are:
.RE
.sp
.if n .RS 4
.nf
.fam C
- .gen
- \-\-\-\-
+ .gen (with \-\-3N6 \-\-vcf\-ids)
+ \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-
+ chr1 1:111485207_G_A rsID1 111485207 G A 0 1 0 0 1 0
+ chr1 1:111494194_C_T rsID2 111494194 C T 0 1 0 0 0 1
+
+ .gen (with \-\-vcf\-ids)
+ \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-
+ 1:111485207_G_A rsID1 111485207 G A 0 1 0 0 1 0
+ 1:111494194_C_T rsID2 111494194 C T 0 1 0 0 0 1
+
+ .gen (the default)
+ \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-
1:111485207_G_A 1:111485207_G_A 111485207 G A 0 1 0 0 1 0
1:111494194_C_T 1:111494194_C_T 111494194 C T 0 1 0 0 0 1
@@ -1596,14 +1689,16 @@ For example:
.fi
.if n .RE
.sp
-\fB\-\-tag\fP \fISTRING\fP
+\fB\-\-3N6\fP
.RS 4
-tag to take values for .gen file: GT,PL,GL,GP
+Expect/Create files in the 3*N+6 column format. This is the new .gen file format with the first
+column containing the chromosome name, see \c
+.URL "https://www.cog\-genomics.org/plink/2.0/formats#gen" "" ""
.RE
.sp
-\fB\-\-chrom\fP
+\fB\-\-tag\fP \fISTRING\fP
.RS 4
-output chromosome in the first column instead of CHROM:POS_REF_ALT
+tag to take values for .gen file: GT,PL,GL,GP
.RE
.sp
\fB\-\-sex\fP \fIFILE\fP
@@ -1645,19 +1740,23 @@ reference sequence in fasta format. Must be indexed with samtools faidx
.RS 4
convert from hap/sample format to VCF. The columns of .hap file are
similar to .gen file above, but there are only two haplotype columns per
-sample. Note that the first column of the .hap file is expected to be in
-the form "CHR:POS_REF_ALT(_END)?", with the _END being optional for
-defining the INFO/END tag when ALT is a symbolic allele, for example:
+sample. Note that the first or the second column of the .hap file is expected to be in
+the form "CHR:POS_REF_ALT[_END]", with the _END being optional for
+defining the INFO/END tag when ALT is a symbolic allele. For example:
.RE
.sp
.if n .RS 4
.nf
.fam C
- .hap
- \-\-\-\-
+ .hap (with \-\-vcf\-ids)
+ \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-
1:111485207_G_A rsID1 111485207 G A 0 1 0 0
- 1:111494194_C_T rsID2 111494194 C T 0 1 0 0
1:111495231_A__111495784 rsID3 111495231 A 0 0 1 0
+
+ .hap (the default)
+ \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-
+ 1 1:111485207_G_A 111485207 G A 0 1 0 0
+ 1 1:111495231_A__111495784 111495231 A 0 0 1 0
.fam
.fi
.if n .RE
@@ -1694,22 +1793,26 @@ output sex column in the sample file. The FILE format is
.sp
\fB\-\-vcf\-ids\fP
.RS 4
-output VCF IDs instead of "CHROM:POS_REF_ALT" IDs
+the second column of the .hap file holds the VCF ids, the first
+column is of the form "CHR:POS_REF_ALT[_END]". Without the option,
+the format follows \c
+.URL "https://www.cog\-genomics.org/plink/2.0/formats#haps" ""
+with ids (the second column) of the form "CHR:POS_REF_ALT[_END]"
.RE
.SS "HAP/LEGEND/SAMPLE conversion:"
.sp
\fB\-H, \-\-haplegendsample2vcf\fP \fIprefix\fP or \fIhap\-file\fP,\fIlegend\-file\fP,\fIsample\-file\fP
.RS 4
-convert from hap/legend/sample format used by IMPUTE2 to VCF, see
-also \fB\-h, \-\-hapslegendsample\fP below.
+convert from hap/legend/sample format used by IMPUTE2 to VCF.
+See also \fB\-h, \-\-haplegendsample\fP below.
.RE
.sp
\fB\-h, \-\-haplegendsample\fP \fIprefix\fP or \fIhap\-file\fP,\fIlegend\-file\fP,\fIsample\-file\fP
.RS 4
convert from VCF to hap/legend/sample format used by IMPUTE2 and SHAPEIT.
The columns of .legend file ID,POS,REF,ALT. In order to prevent strand
-swaps, the program uses IDs of the form "CHROM:POS_REF_ALT". The .sample
-file is quite basic at the moment with columns for population, group and
+swaps, the program uses IDs of the form "CHROM:POS_REF_ALT".
+The .sample file is quite basic at the moment with columns for population, group and
sex expected to be edited by the user. For example:
.RE
.sp
@@ -1760,7 +1863,8 @@ output sex column in the sample file. The FILE format is
.sp
\fB\-\-vcf\-ids\fP
.RS 4
-output VCF IDs instead of "CHROM:POS_REF_ALT" IDs
+output VCF IDs instead of "CHROM:POS_REF_ALT". Note that this option can
+be used with \fB\-\-haplegendsample\fP but not with \fB\-\-haplegendsample2vcf\fP.
.RE
.SS "TSV conversion:"
.sp
@@ -1993,6 +2097,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-s, \-\-samples\fP \fILIST\fP
.RS 4
samples to include or "\-" to apply all variants and ignore samples
@@ -2013,6 +2122,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fBExamples:\fP
.sp
.if n .RS 4
@@ -2123,6 +2237,22 @@ include only sites for which \fIEXPRESSION\fP is true. For valid expressions see
\fBEXPRESSIONS\fP.
.RE
.sp
+\fB\-\-mask\fP [^]\fIREGION\fP
+.RS 4
+Soft filter regions, prepend "^" to negate. Requires \fB\-s, \-\-soft\-filter\fP.
+.RE
+.sp
+\fB\-M, \-\-mask\-file\fP [^]\fIFILE\fP
+.RS 4
+Soft filter regions listed in a file, "^" to negate. Requires \fB\-s, \-\-soft\-filter\fP.
+.RE
+.sp
+\fB\-\-mask\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+Same as \fB\-\-regions\-overlap\fP but for \fB\-\-mask/\-\-mask\-file\fP.
+See \fBCommon Options\fP. [1]
+.RE
+.sp
\fB\-m, \-\-mode\fP [\fI+x\fP]
.RS 4
define behaviour at sites with existing FILTER annotations. The default
@@ -2158,6 +2288,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-s, \-\-soft\-filter\fP \fISTRING\fP|\fI+\fP
.RS 4
annotate FILTER column with \fISTRING\fP or, with \fI+\fP, a unique filter name generated
@@ -2179,6 +2314,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-\-threads\fP \fIINT\fP
.RS 4
see \fBCommon Options\fP
@@ -2262,6 +2402,11 @@ Restrict to comma\-separated list of regions, see \fBCommon Options\fP
Restrict to regions listed in a file, see \fBCommon Options\fP
.RE
.sp
+\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-s, \-\-samples\fP [\fIqry\fP|\fIgt\fP]:\*(AqLIST\*(Aq:
List of query samples or \fB\-g\fP samples. If neither \fB\-s\fP nor \fB\-S\fP are given, all possible sample
pair combinations are compared
@@ -2280,6 +2425,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-u, \-\-use\fP \fITAG1\fP[,\fITAG2\fP]
.RS 4
specifies which tag to use in the query file (\fITAG1\fP) and the \fB\-g\fP (\fITAG2\fP) file.
@@ -2303,12 +2453,35 @@ available.
.fam
.fi
.if n .RE
+.SS "bcftools head [\fIOPTIONS\fP] [\fIFILE\fP]"
+.sp
+By default, prints all headers from the specified input file to standard output
+in VCF format. The input file may be in VCF or BCF format; if no \fIFILE\fP is
+specified, standard input will be read. With appropriate options, only some
+of the headers and/or additionally some of the variant records will be printed.
+.sp
+The \fBbcftools head\fP command outputs VCF headers almost exactly as they appear
+in the input file: it may add a \f(CR##FILTER=\fP header if not already
+present, but it never adds version or command line information itself.
+.SS "Options:"
+.sp
+\fB\-h, \-\-header\fP \fIINT\fP
+.RS 4
+Display only the first \fIINT\fP header lines.
+By default, all header lines are displayed.
+.RE
+.sp
+\fB\-n, \-\-records\fP \fIINT\fP
+.RS 4
+Also display the first \fIINT\fP variant records.
+By default, no variant records are displayed.
+.RE
.SS "bcftools index [\fIOPTIONS\fP] \fIin.bcf\fP|\fIin.vcf.gz\fP"
.sp
Creates index for bgzip compressed VCF/BCF files for random access. CSI
(coordinate\-sorted index) is created by default. The CSI format
-supports indexing of chromosomes up to length 2^31. TBI (tabix index)
-index files, which support chromosome lengths up to 2^29, can be
+supports indexing of chromosomes up to length 2^31. TBI (tabix index)
+index files, which support chromosome lengths up to 2^29, can be
created by using the \fI\-t/\-\-tbi\fP option or using the \fItabix\fP program
packaged with htslib. When loading an index file, bcftools will try
the CSI first and then the TBI.
@@ -2326,7 +2499,7 @@ overwrite index if it already exists
.sp
\fB\-m, \-\-min\-shift \fIINT\fP\fP
.RS 4
-set minimal interval size for CSI indices to 2^INT; default: 14
+set minimal interval size for CSI indices to 2^INT; default: 14
.RE
.sp
\fB\-o, \-\-output \fIFILE\fP\fP
@@ -2428,6 +2601,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-t, \-\-targets\fP \fIchr\fP|\fIchr:pos\fP|\fIchr:from\-to\fP|\fIchr:from\-\fP[,...]
.RS 4
see \fBCommon Options\fP
@@ -2438,6 +2616,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-w, \-\-write\fP \fILIST\fP
.RS 4
list of input files to output given as 1\-based indices. With \fB\-p\fP and no
@@ -2633,6 +2816,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-\-threads\fP \fIINT\fP
.RS 4
see \fBCommon Options\fP
@@ -2743,7 +2931,7 @@ Do not require the \fB\-\-fasta\-ref\fP option.
.sp
\fB\-G, \-\-read\-groups\fP \fIFILE\fP
.RS 4
-list of read groups to include or exclude if prefixed with "^".
+list of read groups to include or exclude if prefixed with "^".
One read group per line. This file can also be used to assign new sample
names to read groups by giving the new sample name as a second
white\-space\-separated field, like this: "read_group_id new_sample_name".
@@ -2780,7 +2968,7 @@ Minimum mapping quality for an alignment to be used [0]
Minimum base quality for a base to be considered [13]
.RE
.sp
-* \-\-max\-BQ* \fIINT\fP
+\fB\-\-max\-BQ\fP \fIINT\fP
.RS 4
Caps the base quality to a maximum value [60]. This can be
particularly useful on technologies that produce overly optimistic
@@ -2801,19 +2989,36 @@ As for \fB\-r, \-\-regions\fP, but regions read from FILE;
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-\-ignore\-RG\fP
.RS 4
Ignore RG tags. Treat all reads in one alignment file as one sample.
.RE
.sp
-\fB\-\-rf, \-\-incl\-flags\fP \fISTR\fP|\fIINT\fP
+\fB\-\-ls, \-\-skip\-all\-set\fP \fISTR\fP|\fIINT\fP
+.RS 4
+Skip reads with all of the FLAG bits set [null]
+.RE
+.sp
+\fB\-\-ns, \-\-skip\-any\-set\fP \fISTR\fP|\fIINT\fP
.RS 4
-Required flags: skip reads with mask bits unset [null]
+Skip reads with any of the FLAG bits set. This option replaces and
+is synonymous to the deprecated \fB\-\-ff, \-\-excl\-flags\fP [UNMAP,SECONDARY,QCFAIL,DUP]
.RE
.sp
-\fB\-\-ff, \-\-excl\-flags\fP \fISTR\fP|\fIINT\fP
+\fB\-\-lu, \-\-skip\-all\-unset\fP \fISTR\fP|\fIINT\fP
.RS 4
-Filter flags: skip reads with mask bits set [UNMAP,SECONDARY,QCFAIL,DUP]
+Skip reads with all of the FLAG bits unset. This option replaces and
+is synonymous to the deprecated \fB\-\-rf, \-\-incl\-flags\fP [null]
+.RE
+.sp
+\fB\-\-nu, \-\-skip\-any\-unset\fP \fISTR\fP|\fIINT\fP
+.RS 4
+Skip reads with any of the FLAG bits unset [null]
.RE
.sp
\fB\-s, \-\-samples\fP \fILIST\fP
@@ -2823,7 +3028,7 @@ list of sample names. See \fBCommon Options\fP
.sp
\fB\-S, \-\-samples\-file\fP \fIFILE\fP
.RS 4
-file of sample names to include or exclude if prefixed with "^".
+file of sample names to include or exclude if prefixed with "^".
One sample per line. This file can also be used to rename samples by giving
the new sample name as a second white\-space\-separated column, like this:
"old_name new_name". If a sample name contains spaces, the spaces can be
@@ -2841,6 +3046,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-x, \-\-ignore\-overlaps\fP
.RS 4
Disable read\-pair overlap detection.
@@ -3218,6 +3428,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-s, \-\-strict\-filter\fP
.RS 4
when merging (\fI\-m+\fP), merged site is PASS only if all sites being merged PASS
@@ -3233,6 +3448,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-\-threads\fP \fIINT\fP
.RS 4
see \fBCommon Options\fP
@@ -3276,6 +3496,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-t, \-\-targets\fP \fIchr\fP|\fIchr:pos\fP|\fIchr:from\-to\fP|\fIchr:from\-\fP[,...]
.RS 4
see \fBCommon Options\fP
@@ -3285,6 +3510,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
+.sp
+\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
.SS "VCF output options:"
.sp
\fB\-\-no\-version\fP
@@ -3867,6 +4097,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-s, \-\-sample\fP \fIstring\fP
.RS 4
sample name
@@ -3882,6 +4117,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-v, \-\-verbose\fP
.RS 4
verbose debugging output which gives hints about the thresholds and decisions made
@@ -3936,6 +4176,11 @@ exclude sites for which \fIEXPRESSION\fP is true. For valid expressions see
\fBEXPRESSIONS\fP.
.RE
.sp
+\fB\-\-force\-samples\fP
+.RS 4
+continue even when some samples requested via \fB\-s/\-S\fP do not exist
+.RE
+.sp
\fB\-f, \-\-format\fP \fIFORMAT\fP
.RS 4
learn by example, see below
@@ -3972,6 +4217,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-s, \-\-samples\fP \fILIST\fP
.RS 4
see \fBCommon Options\fP
@@ -3992,6 +4242,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-u, \-\-allow\-undef\-tags\fP
.RS 4
do not throw an error if there are undefined tags in the format string,
@@ -4303,6 +4558,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-s, \-\-samples\fP \fILIST\fP
.RS 4
see \fBCommon Options\fP
@@ -4322,6 +4582,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
+.sp
+\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
.SS "HMM Options:"
.sp
\fB\-a, \-\-hw\-to\-az\fP \fIFLOAT\fP
@@ -4461,6 +4726,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-s, \-\-samples\fP \fILIST\fP
.RS 4
see \fBCommon Options\fP
@@ -4481,6 +4751,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-u, \-\-user\-tstv\fP \fI\fP
.RS 4
collect Ts/Tv stats for any tag using the given binning [0:1:100]
@@ -4503,7 +4778,7 @@ drop individual genotype information (after subsetting if \fB\-s\fP option is se
.sp
\fB\-h, \-\-header\-only\fP
.RS 4
-output the VCF header only
+output the VCF header only (see also \fBbcftools head\fP)
.RE
.sp
\fB\-H, \-\-no\-header\fP
@@ -4546,6 +4821,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-t, \-\-targets\fP \fIchr\fP|\fIchr:pos\fP|\fIchr:from\-to\fP|\fIchr:from\-\fP[,...]
.RS 4
see \fBCommon Options\fP
@@ -4556,6 +4836,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP
+.RS 4
+see \fBCommon Options\fP
+.RE
+.sp
\fB\-\-threads\fP \fIINT\fP
.RS 4
see \fBCommon Options\fP
@@ -4637,11 +4922,11 @@ exclude sites for which \fIEXPRESSION\fP is true. For valid expressions see
see \fBCommon Options\fP
.RE
.sp
-\fB\-g, \-\-genotype\fP [^][\fIhom\fP|\fIhet\fP|\fImiss\fP]
+\fB\-g, \-\-genotype\fP [^][\fIhom\fP|\fIhet\fP|\fImiss\fP]
.RS 4
include only sites with one or more homozygous (\fIhom\fP), heterozygous
-(\fIhet\fP) or missing (\fImiss\fP) genotypes. When prefixed with \fI^\fP, the logic
-is reversed; thus \fI^het\fP excludes sites with heterozygous genotypes.
+(\fIhet\fP) or missing (\fImiss\fP) genotypes. When prefixed with \fI^\fP, the logic
+is reversed; thus \fI^het\fP excludes sites with heterozygous genotypes.
.RE
.sp
\fB\-i, \-\-include\fP \fIEXPRESSION\fP
diff --git a/doc/bcftools.html b/doc/bcftools.html
index 5bfc82986..febeaa9c9 100644
--- a/doc/bcftools.html
+++ b/doc/bcftools.html
@@ -50,7 +50,7 @@ DESCRIPTION
VERSION
-
This manual page was last updated 2021-10-22 and refers to bcftools git version 1.14.
+
This manual page was last updated 2022-02-21 and refers to bcftools git version 1.15.
@@ -118,6 +118,9 @@
LIST OF COMMANDS
gtcheck .. check sample concordance, detect sample swaps and contamination
+head .. view VCF/BCF file headers
+
+
index .. index VCF/BCF
@@ -297,20 +300,22 @@ Common Options
This option requires indexed VCF/BCF files. Note that -R cannot be used
in combination with -r.
---regions-overlap 0|1|2
+--regions-overlap pos|record|variant|0|1|2
This option controls how overlapping records are determined:
-set to 0 if the VCF record has to have POS inside a region
+set to pos or 0 if the VCF record has to have POS inside a region
(this corresponds to the default behavior of -t/-T);
-set to 1 if also overlapping records with POS outside a region
-should be included (this is the default behavior of -r/-R); or set
-to 2 to include only true overlapping variation (compare
+set to record or 1 if also overlapping records with POS outside a region
+should be included (this is the default behavior of -r/-R, and includes indels
+with POS at the end of a region, which are technically outside the region); or set
+to variant or 2 to include only true overlapping variation (compare
the full VCF representation "TA>T-
" vs the true sequence variation "A>-
").
--s, --samples [^]LIST
+-s, --samples [^]LIST
Comma-separated list of samples to include or exclude if prefixed
-with "^".
+with "^". (Note that when multiple samples are to be excluded,
+the "^" prefix is still present only once, e.g. "^SAMPLE1,SAMPLE2".)
The sample order is updated to reflect that given on the command line.
Note that in general tags such as INFO/AC, INFO/AN, etc are not updated
to correspond to the subset samples. bcftools view is the
@@ -328,9 +333,9 @@
Common Options
-- -S, --samples-file FILE
+- -S, --samples-file [^]FILE
-
-
File of sample names to include or exclude if prefixed with "^".
+
File of sample names to include or exclude if prefixed with "^".
One sample per line. See also the note above for the -s, --samples
option.
The sample order is updated to reflect that given in the input file.
@@ -370,19 +375,20 @@
Common Options
-- -t, --targets [^]chr|chr:pos|chr:from-to|chr:from-[,…]
+- -t, --targets [^]chr|chr:pos|chr:from-to|chr:from-[,…]
-
Similar as -r, --regions, but the next position is accessed by streaming the
whole VCF/BCF rather than using the tbi/csi index. Both -r and -t options
can be applied simultaneously: -r uses the index to jump to a region
and -t discards positions which are not in the targets. Unlike -r, targets
-can be prefixed with "^" to request logical complement. For example, "^X,Y,MT"
+can be prefixed with "^" to request logical complement. For example, "^X,Y,MT"
indicates that sequences X, Y and MT should be skipped.
Yet another difference between the -t/-T and -r/-R is that -r/-R checks for
proper overlaps and considers both POS and the end position of an indel, while -t/-T
-considers the POS coordinate only. Note that -t cannot be used in combination with -T.
+considers the POS coordinate only (by default; see also --regions-overlap and --targets-overlap).
+Note that -t cannot be used in combination with -T.
-- -T, --targets-file [^]FILE
+- -T, --targets-file [^]FILE
-
Same -t, --targets, but reads regions from a file. Note that -T
cannot be used in combination with -t.
@@ -402,7 +408,7 @@ Common Options
-- --targets-overlap 0|1|2
+- --targets-overlap pos|record|variant|0|1|2
-
Same as --regions-overlap but for -t/-T.
@@ -463,12 +469,6 @@ bcftools annotate [OPTIONS] FILE
-- --collapse snps|indels|both|all|some|none
--
-
Controls how to match records from the annotation file to the target VCF.
-Effective only when -a is a VCF or BCF.
-See Common Options for more.
-
- -c, --columns list
-
Comma-separated list of columns or tags to carry over from the annotation file
@@ -486,7 +486,7 @@
bcftools annotate [OPTIONS] FILE
To carry over all INFO annotations, use "INFO". To add all INFO annotations except
-"TAG", use "^INFO/TAG". By default, existing values are replaced.
+"TAG", use "^INFO/TAG". By default, existing values are replaced.
By default, existing tags are overwritten unless the source value is a missing value (i.e. ".").
@@ -541,7 +541,7 @@ bcftools annotate [OPTIONS] FILE
-- -I, --set-id [+]FORMAT
+- -I, --set-id [+]FORMAT
-
assign ID on the fly. The format is the same as in the query
command (see below). By default all existing IDs are replaced. If the
@@ -585,6 +585,14 @@
bcftools annotate [OPTIONS] FILE
-
annotate sites which are present ("+") or absent ("-") in the -a file with a new INFO/TAG flag
+ - --min-overlap ANN:VCF
+-
+
minimum overlap required as a fraction of the variant in the annotation -a file (ANN), in the
+target VCF file (:VCF), or both for reciprocal overlap (ANN:VCF).
+By default overlaps of arbitrary length are sufficient.
+The option can be used only with the tab-delimited annotation -a file and with BEG and END
+columns present.
+
- --no-version
-
see Common Options
@@ -597,6 +605,13 @@ bcftools annotate [OPTIONS] FILE
-
see Common Options
+ - --pair-logic snps|indels|both|all|some|exact
+-
+
Controls how to match records from the annotation file to the target VCF.
+Effective only when -a is a VCF or BCF. The option replaces the former
+unintuitive --collapse.
+See Common Options for more.
+
- -r, --regions chr|chr:pos|chr:from-to|chr:from-[,…]
-
see Common Options
@@ -605,6 +620,10 @@ bcftools annotate [OPTIONS] FILE
-
see Common Options
+ - --regions-overlap 0|1|2
+-
+
see Common Options
+
- --rename-annots file
-
rename annotations according to the map in file, with
@@ -618,7 +637,7 @@
bcftools annotate [OPTIONS] FILE
"old_name new_name\n" pairs separated by whitespaces, each on a separate
line.
-- -s, --samples [^]LIST
+- -s, --samples [^]LIST
-
subset of samples to annotate, see also Common Options
@@ -646,7 +665,7 @@ bcftools annotate [OPTIONS] FILE
"FILTER/SomeFilter" to remove a specific filter. Similarly, "INFO" can
be used to remove all INFO tags and "FORMAT" to remove all FORMAT tags
except GT. To remove all INFO tags except "FOO" and "BAR", use
-"^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER).
+"^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER).
"INFO" can be abbreviated to "INF" and "FORMAT" to "FMT".
@@ -750,6 +769,10 @@
see Common Options
+
--regions-overlap 0|1|2
+
+see Common Options
+
-s, --samples LIST
see Common Options
@@ -766,6 +789,10 @@
see Common Options
+--targets-overlap 0|1|2
+
+see Common Options
+
--threads INT
see Common Options
@@ -786,7 +813,7 @@
comma-separated list of FORMAT fields to output for each sample. Currently
GQ and GP fields are supported. For convenience, the fields can be given
-as lower case letters. Prefixed with "^" indicates a request for tag
+as lower case letters. Prefixed with "^" indicates a request for tag
removal of auxiliary tags useful only for calling.
-F, --prior-freqs AN,AC
@@ -956,6 +983,10 @@
General Options:
see Common Options
+
--regions-overlap 0|1|2
+
+see Common Options
+
-s, --query-sample string
query sample name
@@ -968,6 +999,10 @@ General Options:
see Common Options
+--targets-overlap 0|1|2
+
+see Common Options
+
@@ -1118,6 +1153,10 @@ bcftools concat [OPTIONS] FILE1 FILE2
see Common Options. Requires -a, --allow-overlaps.
+--regions-overlap 0|1|2
+
+see Common Options
+
--threads INT
see Common Options
@@ -1138,6 +1177,10 @@ bcftools consensus [OPTIONS] FILE
@@ -1328,10 +1379,14 @@ GEN/SAMPLE conversion:
- -G, --gensample2vcf prefix or gen-file,sample-file
-
-
convert IMPUTE2 output to VCF. The second column must be of the form
-"CHROM:POS_REF_ALT" to detect possible strand swaps; IMPUTE2 leaves the
-first one empty ("--") when sites from reference panel are filled in. See
-also -g below.
+convert IMPUTE2 output to VCF. One of the ID columns ("SNP ID" or "rsID" in
+https://www.cog-genomics.org/plink/2.0/formats#gen) must be of the form
+"CHROM:POS_REF_ALT" to detect possible strand swaps.
+
+When the --vcf-ids option is given, the other column (autodetected) is used
+to fill the ID column of the VCF.
+
+See also -g and --3N6 options.
- -g, --gensample prefix or gen-file,sample-file
-
@@ -1339,14 +1394,29 @@
GEN/SAMPLE conversion:
The columns of .gen file format are ID1,ID2,POS,A,B followed by three
genotype probabilities P(AA), P(AB), P(BB) for each sample. In order to
prevent strand swaps, the program uses IDs of the form "CHROM:POS_REF_ALT".
-For example:
+When the --vcf-ids option is given, the second column is set to match the ID
+column of the VCF.
+
+See also -G and --3N6 options.
+
+The .gen and .sample file formats are:
-
.gen
- ----
+ .gen (with --3N6 --vcf-ids)
+ ---------------------------
+ chr1 1:111485207_G_A rsID1 111485207 G A 0 1 0 0 1 0
+ chr1 1:111494194_C_T rsID2 111494194 C T 0 1 0 0 0 1
+
+ .gen (with --vcf-ids)
+ ---------------------------
+ 1:111485207_G_A rsID1 111485207 G A 0 1 0 0 1 0
+ 1:111494194_C_T rsID2 111494194 C T 0 1 0 0 0 1
+
+ .gen (the default)
+ ------------------------------
1:111485207_G_A 1:111485207_G_A 111485207 G A 0 1 0 0 1 0
1:111494194_C_T 1:111494194_C_T 111494194 C T 0 1 0 0 0 1
@@ -1360,13 +1430,14 @@ GEN/SAMPLE conversion:
-- --tag STRING
+- --3N6
-
-
tag to take values for .gen file: GT,PL,GL,GP
+Expect/Create files in the 3*N+6 column format. This is the new .gen file format with the first
+column containing the chromosome name, see https://www.cog-genomics.org/plink/2.0/formats#gen
-- --chrom
+- --tag STRING
-
-
output chromosome in the first column instead of CHROM:POS_REF_ALT
+tag to take values for .gen file: GT,PL,GL,GP
- --sex FILE
-
@@ -1416,19 +1487,23 @@
HAP/SAMPLE conversion:
-
convert from hap/sample format to VCF. The columns of .hap file are
similar to .gen file above, but there are only two haplotype columns per
-sample. Note that the first column of the .hap file is expected to be in
-the form "CHR:POS_REF_ALT(_END)?", with the _END being optional for
-defining the INFO/END tag when ALT is a symbolic allele, for example:
+sample. Note that the first or the second column of the .hap file is expected to be in
+the form "CHR:POS_REF_ALT[_END]", with the _END being optional for
+defining the INFO/END tag when ALT is a symbolic allele. For example:
-
.hap
- ----
+ .hap (with --vcf-ids)
+ ---------------------
1:111485207_G_A rsID1 111485207 G A 0 1 0 0
- 1:111494194_C_T rsID2 111494194 C T 0 1 0 0
- 1:111495231_A_<DEL>_111495784 rsID3 111495231 A <DEL> 0 0 1 0
+ 1:111495231_A_<DEL>_111495784 rsID3 111495231 A <DEL> 0 0 1 0
+
+ .hap (the default)
+ ------------------
+ 1 1:111485207_G_A 111485207 G A 0 1 0 0
+ 1 1:111495231_A_<DEL>_111495784 111495231 A <DEL> 0 0 1 0
@@ -1463,7 +1538,10 @@
HAP/SAMPLE conversion:
- --vcf-ids
-
-
output VCF IDs instead of "CHROM:POS_REF_ALT" IDs
+the second column of the .hap file holds the VCF ids, the first
+column is of the form "CHR:POS_REF_ALT[_END]". Without the option,
+the format follows https://www.cog-genomics.org/plink/2.0/formats#haps
+with ids (the second column) of the form "CHR:POS_REF_ALT[_END]"
@@ -1474,15 +1552,15 @@
HAP/LEGEND/SAMPLE conversion:
- -H, --haplegendsample2vcf prefix or hap-file,legend-file,sample-file
-
-
convert from hap/legend/sample format used by IMPUTE2 to VCF, see
-also -h, --hapslegendsample below.
+convert from hap/legend/sample format used by IMPUTE2 to VCF.
+See also -h, --hapslegendsample below.
- -h, --haplegendsample prefix or hap-file,legend-file,sample-file
-
convert from VCF to hap/legend/sample format used by IMPUTE2 and SHAPEIT.
The columns of .legend file ID,POS,REF,ALT. In order to prevent strand
-swaps, the program uses IDs of the form "CHROM:POS_REF_ALT". The .sample
-file is quite basic at the moment with columns for population, group and
+swaps, the program uses IDs of the form "CHROM:POS_REF_ALT".
+The .sample file is quite basic at the moment with columns for population, group and
sex expected to be edited by the user. For example:
@@ -1532,7 +1610,8 @@
HAP/LEGEND/SAMPLE conversion:
- --vcf-ids
-
-
output VCF IDs instead of "CHROM:POS_REF_ALT" IDs
+output VCF IDs instead of "CHROM:POS_REF_ALT". Note that this option can
+be used with --haplegendsample but not with --haplegendsample2vcf.
@@ -1759,6 +1838,10 @@ bcftools csq [OPTIONS] FILE
see Common Options
+--regions-overlap 0|1|2
+
+see Common Options
+
-s, --samples LIST
samples to include or "-" to apply all variants and ignore samples
@@ -1775,6 +1858,10 @@ bcftools csq [OPTIONS] FILE
see Common Options
+--targets-overlap 0|1|2
+
+see Common Options
+
@@ -1886,6 +1973,19 @@
bcftools filter [OPTIONS] FILE
include only sites for which EXPRESSION is true. For valid expressions see
EXPRESSIONS.
+
--mask [^]REGION
+
+Soft filter regions, prepend "^" to negate. Requires -s, --soft-filter.
+
+
-M, --mask-file [^]FILE
+
+Soft filter regions listed in a file, prepend "^" to negate. Requires -s, --soft-filter.
+
+
--mask-overlap 0|1|2
+
+Same as --regions-overlap but for --mask/--mask-file.
+See Common Options. [1]
+
-m, --mode [+x]
define behaviour at sites with existing FILTER annotations. The default
@@ -1915,6 +2015,10 @@
bcftools filter [OPTIONS] FILE
see Common Options
+--regions-overlap 0|1|2
+
+see Common Options
+
-s, --soft-filter STRING|+
annotate FILTER column with STRING or, with +, a unique filter name generated
@@ -1932,6 +2036,10 @@
bcftools filter [OPTIONS] FILE
see Common Options
+--targets-overlap 0|1|2
+
+see Common Options
+
--threads INT
see Common Options
@@ -2010,6 +2118,10 @@ bcftools gtcheck [OPTIONS] [-g ge
Restrict to regions listed in a file, see Common Options
+--regions-overlap 0|1|2
+
+see Common Options
+
@@ -2032,6 +2144,10 @@
bcftools gtcheck [OPTIONS] [-g ge
see Common Options
+--targets-overlap 0|1|2
+
+see Common Options
+
-u, --use TAG1[,TAG2]
specifies which tag to use in the query file (TAG1) and the -g (TAG2) file.
@@ -2057,12 +2173,43 @@
bcftools gtcheck [OPTIONS] [-g ge
+
bcftools head [OPTIONS] [FILE]
+
+
By default, prints all headers from the specified input file to standard output
+in VCF format. The input file may be in VCF or BCF format; if no FILE is
+specified, standard input will be read. With appropriate options, only some
+of the headers and/or additionally some of the variant records will be printed.
+
+
+
The bcftools head command outputs VCF headers almost exactly as they appear
+in the input file: it may add a ##FILTER=<ID=PASS>
header if not already
+present, but it never adds version or command line information itself.
+
+
+
Options:
+
+
+- -h, --header INT
+-
+
Display only the first INT header lines.
+By default, all header lines are displayed.
+
+- -n, --records INT
+-
+
Also display the first INT variant records.
+By default, no variant records are displayed.
+
+
+
+
+
+
bcftools index [OPTIONS] in.bcf|in.vcf.gz
Creates index for bgzip compressed VCF/BCF files for random access. CSI
(coordinate-sorted index) is created by default. The CSI format
-supports indexing of chromosomes up to length 2^31. TBI (tabix index)
-index files, which support chromosome lengths up to 2^29, can be
+supports indexing of chromosomes up to length 2^31. TBI (tabix index)
+index files, which support chromosome lengths up to 2^29, can be
created by using the -t/--tbi option or using the tabix program
packaged with htslib. When loading an index file, bcftools will try
the CSI first and then the TBI.
@@ -2081,7 +2228,7 @@
Indexing options:
-m, --min-shift INT
-set minimal interval size for CSI indices to 2^INT; default: 14
+set minimal interval size for CSI indices to 2^INT; default: 14
-o, --output FILE
@@ -2180,6 +2327,10 @@ bcftools isec [OPTIONS] A.vcf.gz B.vcf.gz
see Common Options
+--regions-overlap 0|1|2
+
+see Common Options
+
-t, --targets chr|chr:pos|chr:from-to|chr:from-[,…]
see Common Options
@@ -2188,6 +2339,10 @@ bcftools isec [OPTIONS] A.vcf.gz B.vcf.gz
see Common Options
+--targets-overlap 0|1|2
+
+see Common Options
+
-w, --write LIST
list of input files to output given as 1-based indices. With -p and no
@@ -2367,6 +2522,10 @@
bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz<
see Common Options
+--regions-overlap 0|1|2
+
+see Common Options
+
--threads INT
see Common Options
@@ -2475,9 +2634,9 @@
Do not require the --fasta-ref option.
--G, --read-groups FILE
+-G, --read-groups FILE
-list of read groups to include or exclude if prefixed with "^".
+
list of read groups to include or exclude if prefixed with "^".
One read group per line. This file can also be used to assign new sample
names to read groups by giving the new sample name as a second
white-space-separated field, like this: "read_group_id new_sample_name".
@@ -2512,7 +2671,7 @@
Minimum base quality for a base to be considered [13]
-* --max-BQ* INT
+--max-BQ INT
Caps the base quality to a maximum value [60]. This can be
particularly useful on technologies that produce overly optimistic
@@ -2530,25 +2689,39 @@
As for -r, --regions, but regions read from FILE;
see Common Options
+--regions-overlap 0|1|2
+
+see Common Options
+
--ignore-RG
Ignore RG tags. Treat all reads in one alignment file as one sample.
---rf, --incl-flags STR|INT
+--ls, --skip-all-set
+
+Skip reads with all of the FLAG bits set [null]
+
+--ns, --skip-any-set
+
+Skip reads with any of the FLAG bits set. This option replaces and
+is synonymous with the deprecated --ff, --excl-flags [UNMAP,SECONDARY,QCFAIL,DUP]
+
+--lu, --skip-all-unset
-Required flags: skip reads with mask bits unset [null]
+Skip reads with all of the FLAG bits unset. This option replaces and
+is synonymous with the deprecated --rf, --incl-flags [null]
---ff, --excl-flags STR|INT
+--nu, --skip-any-unset
-Filter flags: skip reads with mask bits set [UNMAP,SECONDARY,QCFAIL,DUP]
+Skip reads with any of the FLAG bits unset [null]
--s, --samples LIST
+-s, --samples LIST
list of sample names. See Common Options
--S, --samples-file FILE
+-S, --samples-file FILE
-file of sample names to include or exclude if prefixed with "^".
+
file of sample names to include or exclude if prefixed with "^".
One sample per line. This file can also be used to rename samples by giving
the new sample name as a second white-space-separated column, like this:
"old_name new_name". If a sample name contains spaces, the spaces can be
@@ -2563,6 +2736,10 @@
see Common Options
+--targets-overlap 0|1|2
+
+see Common Options
+
-x, --ignore-overlaps
Disable read-pair overlap detection.
@@ -2914,6 +3091,10 @@ bcftools norm [OPTIONS] file.vcf.gz
see Common Options
+--regions-overlap 0|1|2
+
+see Common Options
+
-s, --strict-filter
when merging (-m+), merged site is PASS only if all sites being merged PASS
@@ -2926,6 +3107,10 @@ bcftools norm [OPTIONS] file.vcf.gz
see Common Options
+--targets-overlap 0|1|2
+
+see Common Options
+
--threads INT
see Common Options
@@ -2971,6 +3156,10 @@
see Common Options
+--regions-overlap 0|1|2
+
+see Common Options
+
-t, --targets chr|chr:pos|chr:from-to|chr:from-[,…]
see Common Options
@@ -2979,6 +3168,10 @@
see Common Options
+--targets-overlap 0|1|2
+
+see Common Options
+
@@ -3368,6 +3561,10 @@ General options:
see Common Options
+--regions-overlap 0|1|2
+
+see Common Options
+
-s, --sample string
sample name
@@ -3380,6 +3577,10 @@ General options:
see Common Options
+--targets-overlap 0|1|2
+
+see Common Options
+
-v, --verbose
verbose debugging output which gives hints about the thresholds and decisions made
@@ -3440,6 +3641,10 @@
bcftools query [OPTIONS] file.vcf.gz [file.
exclude sites for which EXPRESSION is true. For valid expressions see
EXPRESSIONS.
+--force-samples
+
+continue even when some samples requested via -s/-S do not exist
+
-f, --format FORMAT
learn by example, see below
@@ -3469,6 +3674,10 @@ bcftools query [OPTIONS] file.vcf.gz [file.
see Common Options
+--regions-overlap 0|1|2
+
+see Common Options
+
-s, --samples LIST
see Common Options
@@ -3485,6 +3694,10 @@ bcftools query [OPTIONS] file.vcf.gz [file.
see Common Options
+--targets-overlap 0|1|2
+
+see Common Options
+
-u, --allow-undef-tags
do not throw an error if there are undefined tags in the format string,
@@ -3762,6 +3975,10 @@
General Options:
see Common Options
+--regions-overlap 0|1|2
+
+see Common Options
+
-s, --samples LIST
see Common Options
@@ -3778,6 +3995,10 @@ General Options:
see Common Options
+--targets-overlap 0|1|2
+
+see Common Options
+
@@ -3916,6 +4137,10 @@ bcftools stats [OPTIONS] A.vcf.gz [B.vcf.gz
see Common Options
+--regions-overlap 0|1|2
+
+see Common Options
+
-s, --samples LIST
see Common Options
@@ -3932,6 +4157,10 @@ bcftools stats [OPTIONS] A.vcf.gz [B.vcf.gz
see Common Options
+--targets-overlap 0|1|2
+
+see Common Options
+
-u, --user-tstv <TAG[:min:max:n]>
collect Ts/Tv stats for any tag using the given binning [0:1:100]
@@ -3959,7 +4188,7 @@ Output options
-h, --header-only
-output the VCF header only
+output the VCF header only (see also bcftools head)
-H, --no-header
@@ -3999,6 +4228,10 @@ Output options
see Common Options
+--regions-overlap 0|1|2
+
+see Common Options
+
-t, --targets chr|chr:pos|chr:from-to|chr:from-[,…]
see Common Options
@@ -4007,6 +4240,10 @@ Output options
see Common Options
+--targets-overlap 0|1|2
+
+see Common Options
+
--threads INT
see Common Options
@@ -4092,11 +4329,11 @@ Filter options:
see Common Options
--g, --genotype [^][hom|het|miss]
+-g, --genotype [^][hom|het|miss]
include only sites with one or more homozygous (hom), heterozygous
-(het) or missing (miss) genotypes. When prefixed with ^, the logic
-is reversed; thus ^het excludes sites with heterozygous genotypes.
+(het) or missing (miss) genotypes. When prefixed with ^, the logic
+is reversed; thus ^het excludes sites with heterozygous genotypes.
-i, --include EXPRESSION
@@ -4751,7 +4988,7 @@ COPYING