diff --git a/Makefile b/Makefile
index 7013cd594..1005c084a 100644
--- a/Makefile
+++ b/Makefile
@@ -104,7 +104,7 @@ endif
include config.mk
-PACKAGE_VERSION = 1.18
+PACKAGE_VERSION = 1.19
# If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
# description of the working tree: either a release tag with the same value
@@ -233,8 +233,11 @@ abuf_h = abuf.h $(htslib_vcf_h)
dbuf_h = dbuf.h $(htslib_vcf_h)
bam2bcf_h = bam2bcf.h $(htslib_hts_h) $(htslib_vcf_h)
bam_sample_h = bam_sample.h $(htslib_sam_h)
+cigar_state_h = cigar_state.h $(htslib_hts_h) $(htslib_sam_h)
+read_consensus_h = read_consensus.h $(htslib_hts_h) $(htslib_sam_h)
+str_finder_h = str_finder.h utlist.h
-str_finder.o: str_finder.h utlist.h
+str_finder.o: str_finder.c $(str_finder_h) utlist.h
main.o: main.c $(htslib_hts_h) config.h version.h $(bcftools_h)
vcfannotate.o: vcfannotate.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) $(bcftools_h) vcmp.h $(filter_h) $(convert_h) $(smpl_ilist_h) regidx.h $(htslib_khash_h) $(dbuf_h)
vcfplugin.o: vcfplugin.c config.h $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) $(bcftools_h) vcmp.h $(filter_h)
@@ -242,11 +245,11 @@ vcfcall.o: vcfcall.c $(htslib_vcf_h) $(htslib_kfunc_h) $(htslib_synced_bcf_reade
vcfconcat.o: vcfconcat.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_thread_pool_h) $(bcftools_h)
vcfconvert.o: vcfconvert.c $(htslib_faidx_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kseq_h) $(bcftools_h) $(filter_h) $(convert_h) $(tsv2vcf_h)
vcffilter.o: vcffilter.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) rbuf.h regidx.h
-vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(htslib_hts_os_h) $(bcftools_h) extsort.h
+vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(htslib_hts_os_h) $(htslib_bgzf_h) $(bcftools_h) extsort.h $(filter_h)
vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h) $(htslib_kstring_h) $(htslib_bgzf_h) $(bcftools_h)
vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(filter_h)
-vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) regidx.h $(bcftools_h) vcmp.h $(htslib_khash_h)
-vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h gff.h
+vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(htslib_kbitset_h) $(htslib_hts_endian_h) $(bcftools_h) regidx.h vcmp.h $(htslib_khash_h)
+vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h gff.h regidx.h
vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h) $(smpl_ilist_h)
vcfroh.o: vcfroh.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(bcftools_h) HMM.h $(smpl_ilist_h) $(filter_h)
vcfcnv.o: vcfcnv.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(bcftools_h) HMM.h rbuf.h
@@ -254,7 +257,7 @@ vcfhead.o: vcfhead.c $(htslib_kstring_h) $(htslib_vcf_h) $(bcftools_h)
vcfsom.o: vcfsom.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h)
vcfsort.o: vcfsort.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_hts_os_h) kheap.h $(bcftools_h)
vcfstats.o: vcfstats.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) bin.h dist.h
-vcfview.o: vcfview.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(htslib_khash_str2int_h)
+vcfview.o: vcfview.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(htslib_khash_str2int_h) $(htslib_kbitset_h)
reheader.o: reheader.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_kseq_h) $(htslib_thread_pool_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) $(khash_str2str_h)
tabix.o: tabix.c $(htslib_bgzf_h) $(htslib_tbx_h)
ccall.o: ccall.c $(htslib_kfunc_h) $(call_h) kmin.h $(prob1_h)
@@ -275,12 +278,12 @@ bin.o: bin.c $(bcftools_h) bin.h
dist.o: dist.c dist.h
cols.o: cols.c cols.h
regidx.o: regidx.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) regidx.h
-consensus.o: consensus.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) regidx.h $(bcftools_h) rbuf.h $(filter_h)
+consensus.o: consensus.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) regidx.h $(bcftools_h) rbuf.h $(filter_h) $(smpl_ilist_h)
mpileup.o: mpileup.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(htslib_hts_os_h) regidx.h $(bcftools_h) $(bam2bcf_h) $(bam_sample_h) $(gvcf_h)
bam2bcf.o: bam2bcf.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(bam2bcf_h) mw.h
-bam2bcf_indel.o: bam2bcf_indel.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(htslib_ksort_h) str_finder.h
-bam2bcf_iaux.o: bam2bcf_iaux.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(htslib_ksort_h) str_finder.h read_consensus.h cigar_state.h
-read_consensus.o: read_consensus.c read_consensus.h cigar_state.h $(htslib_hts_h) $(htslib_sam_h)
+bam2bcf_indel.o: bam2bcf_indel.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(htslib_ksort_h) $(str_finder_h)
+bam2bcf_iaux.o: bam2bcf_iaux.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bcftools_h) $(bam2bcf_h) $(htslib_ksort_h) $(read_consensus_h) $(cigar_state_h)
+read_consensus.o: read_consensus.c $(read_consensus_h) $(cigar_state_h) $(bcftools_h) kheap.h
bam_sample.o: bam_sample.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(khash_str2str_h) $(bam_sample_h) $(bcftools_h)
version.o: version.h version.c
hclust.o: hclust.c $(htslib_hts_h) $(htslib_kstring_h) $(bcftools_h) hclust.h
@@ -289,8 +292,8 @@ vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcf
abuf.o: abuf.c $(htslib_vcf_h) $(bcftools_h) rbuf.h abuf.h
extsort.o: extsort.c $(bcftools_h) extsort.h kheap.h
smpl_ilist.o: smpl_ilist.c $(bcftools_h) $(smpl_ilist_h)
-gff.o: gff.c gff.h regidx.h
-csq.o: csq.c $(htslib_hts_h) $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_h) $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) regidx.h kheap.h $(smpl_ilist_h) rbuf.h
+gff.o: gff.c $(htslib_hts_h) $(htslib_khash_h) $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(bcftools_h) gff.h regidx.h
+csq.o: csq.c $(htslib_hts_h) $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_h) $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_faidx_h) $(htslib_bgzf_h) $(bcftools_h) $(filter_h) regidx.h kheap.h $(smpl_ilist_h) rbuf.h gff.h
# test programs
@@ -300,7 +303,7 @@ csq.o: csq.c $(htslib_hts_h) $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(hts
#
# If using MSYS, avoid poor shell expansion via:
# MSYS2_ARG_CONV_EXCL="*" make check
-check test-no-plugins: $(PROGRAMS) $(TEST_PROGRAMS) $(BGZIP) $(TABIX)
+check-no-plugins test-no-plugins: $(PROGRAMS) $(TEST_PROGRAMS) $(BGZIP) $(TABIX)
./test/test-rbuf
./test/test-regidx
REF_PATH=: ./test/test.pl --exec bgzip=$(BGZIP) --exec tabix=$(TABIX) --htsdir=$(HTSDIR) $${TEST_OPTS:-}
diff --git a/NEWS b/NEWS
index 62c4699ac..90c43022b 100644
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,127 @@
-## Release 1.18 (25th July 2023)
+## Release 1.19 (12th December 2023)
+
+
+Changes affecting the whole of bcftools, or multiple commands:
+
+* Filtering expressions can be given a file with list of strings to match, this
+ was previously possible only for the ID column. For example
+
+ ID=@file .. selects lines with ID present in the file
+ INFO/TAG=@file.txt .. selects lines where TAG has a string value listed in the file
+ INFO/TAG!=@file.txt .. TAG must not have a string value listed in the file
+
+ Allow to query REF,ALT columns directly, for example
+
+ -e 'REF="N"'
+
+
+Changes affecting specific commands:
+
+* bcftools annotate
+
+ - Fix `bcftools annotate --mark-sites`, VCF sites overlapping regions in a BED file
+ were not annotated (#1989)
+
+ - Add flexibility to FILTER column transfers and allow transfers within the same file,
+ across files, and in combination. For examples see
+ http://samtools.github.io/bcftools/howtos/annotate.html#transfer_filter_to_info
+
+* bcftools call
+
+ - Output MIN_DP rather than MinDP in gVCF mode
+
+ - New `-*, --keep-unseen-allele` option to output the unobserved allele <*>,
+ intended for gVCF.
+
+* bcftools head
+
+ - New `-s, --samples` option to include the #CHROM header line with samples.
+
+* bcftools gtcheck
+
+ - Add output options `-o, --output` and `-O, --output-type`
+
+ - Add filtering options `-i, --include` and `-e, --exclude`
+
+ - Rename the short option `-e, --error-probability` from lower case to upper
+ case `-E, --error-probability`
+
+ - Changes to the output format, replace the DC section with DCv2:
+
+ - adds a new column for the number of matching genotypes
+
+ - The --error-probability is newly interpreted as the probability of erroneous
+ allele rather than genotype. In other words, the calculation of the discordance
+ score now considers the probability of genotyping error to be different
+ for HOM and HET genotypes, i.e. P(0/1|dsg=0) > P(1/1|dsg=0).
+
+ - fixes in HWE score calculation plus output average HWE score rather
+ than absolute HWE score
+
+ - better description of fields
+* bcftools merge
+
+ - Add `-m` modifiers to suppress the output of the unseen allele <*> or <NON_REF>
+ at variant sites (e.g. `-m both,*`) or all sites (e.g. `-m both,**`)
+
+* bcftools mpileup
+
+ - Output MIN_DP rather than MinDP in gVCF mode
+
+* bcftools norm
+
+ - Add the number of joined lines to the summary output, for example
+
+ Lines total/split/joined/realigned/skipped: 6/0/3/0/0
+
+ - Allow combining -m and -a with --old-rec-tag (#2020)
+
+ - Symbolic alleles caused norm to expand REF to the full length of the deletion.
+ This was not intended and problematic for long deletions, the REF allele should list
+ one base only (#2029)
+
+* bcftools query
+
+ - Add new `-N, --disable-automatic-newline` option for pre-1.18 query formatting behavior
+ when newline would not be added when missing
+
+ - Make the automatic addition of the newline character more predictable: when missing,
+ it is always put at the end of the expression. In version 1.18 it could
+ be added at the end of the expression (for per-site expressions) or inside the square
+ brackets (for per-sample expressions). The new behavior is:
+
+ - if the formatting expression contains a newline character, do nothing
+ - if there is no newline character and -N, --disable-automatic-newline is given, do nothing
+ - if there is no newline character and -N is not given, insert newline at the end of the expression
+
+ See #1969 for details
+
+ - Add new `-F, --print-filtered` option to output a default string for samples that would otherwise
+ be filtered by `-i/-e` expressions.
+
+ - Include sample name in the output header with `-H` whenever it makes sense (#1992)
+
+* bcftools +split-vep
+
+ - Fix on the fly filtering involving numeric subfields, e.g. `-i 'MAX_AF<0.001'` (#2039)
+
+ - Interpret default column type names (--columns-types) as entire strings, rather than
+ substrings to avoid unexpected spurious matches (i.e. internally add ^ and $ to all
+ field names)
+
+* bcftools +trio-dnm2
+
+ - Do not flag paternal genotyping errors as de novo mutations. Specifically, when father's
+ chrX genotype is 0/1 and mother's 0/0, 0/1 in the child will not be marked as DNM.
+
+* bcftools view
+
+ - Add new `-A, --trim-unseen-allele` option to remove the unseen allele <*> or <NON_REF>
+ at variant sites (`-A`) or all sites (`-AA`)
+
+
+## Release 1.18 (25th July 2023)
Changes affecting the whole of bcftools, or multiple commands:
diff --git a/abuf.c b/abuf.c
index 7958cf570..ea5e1b373 100644
--- a/abuf.c
+++ b/abuf.c
@@ -411,12 +411,12 @@ static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mo
buf->tmp2 = dst.s;
ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type);
}
- if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag);
+ if ( ret!=0 ) error("An error occurred while updating INFO/%s (errcode=%d)\n",tag,ret);
}
}
static void _split_table_set_history(abuf_t *buf)
{
- int i,j;
+ int i,j,ret;
bcf1_t *rec = buf->split.rec;
buf->tmps.l = 0;
ksprintf(&buf->tmps,"%s|%"PRIhts_pos"|%s|",bcf_seqname(buf->hdr,rec),rec->pos+1,rec->d.allele[0]);
@@ -441,8 +441,8 @@ static void _split_table_set_history(abuf_t *buf)
kputc(',',&buf->tmps);
}
buf->tmps.s[--buf->tmps.l] = 0;
- if ( (bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 )
- error("An error occurred while updating INFO/%s\n",buf->split.info_tag);
+ if ( (ret=bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 )
+ error("An error occurred while updating INFO/%s (errcode=%d)\n",buf->split.info_tag,ret);
}
}
static void _split_table_set_gt(abuf_t *buf)
@@ -668,7 +668,7 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo
#undef BRANCH
ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, 3*(1+star_allele)*nsmpl, type);
}
- if ( ret!=0 ) error("An error occurred while updating FORMAT/%s\n",tag);
+ if ( ret!=0 ) error("An error occurred while updating FORMAT/%s (errcode=%d)\n",tag,ret);
}
}
static inline int _is_acgtn(char *seq)
diff --git a/bcftools.h b/bcftools.h
index bba71e3b6..328f19d7e 100644
--- a/bcftools.h
+++ b/bcftools.h
@@ -141,4 +141,16 @@ static inline int bcf_double_test(double d, uint64_t value)
#define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing)
#define bcf_double_is_missing_or_vector_end(x) (bcf_double_test((x),bcf_double_missing) || bcf_double_test((x),bcf_double_vector_end))
+static inline int get_unseen_allele(bcf1_t *line)    // returns the index of the first unseen-allele symbol among the alleles, or 0 if none is present
+{
+    int i;
+    for (i=1; i<line->n_allele; i++)    // the scan starts at index 1, so a return value of 0 unambiguously means "not found"
+    {
+        if ( !strcmp(line->d.allele[i],"<*>") ) return i;
+        if ( !strcmp(line->d.allele[i],"<NON_REF>") ) return i;
+        if ( !strcmp(line->d.allele[i],"<X>") ) return i;    // NOTE(review): this literal was lost in extraction; <X> restored from upstream bcftools - confirm
+    }
+    return 0;
+}
+
#endif
diff --git a/call.h b/call.h
index 16bf0b68e..090ac019a 100644
--- a/call.h
+++ b/call.h
@@ -33,7 +33,7 @@ THE SOFTWARE. */
#define CALL_VARONLY (1<<1)
#define CALL_CONSTR_TRIO (1<<2)
#define CALL_CONSTR_ALLELES (1<<3)
-//
+#define CALL_KEEP_UNSEEN (1<<4)
#define CALL_FMT_PV4 (1<<5)
#define CALL_FMT_GQ (1<<6)
#define CALL_FMT_GP (1<<7)
@@ -125,8 +125,7 @@ call_t;
void error(const char *format, ...);
/*
- * call() - return -1 value on critical error; -2 to skip the site; or the number of non-reference
- * alleles on success.
+ * call() - return -1 value on critical error; -2 to skip the site; or the number of alleles on success
*/
int mcall(call_t *call, bcf1_t *rec); // multiallic and rare-variant calling model
int ccall(call_t *call, bcf1_t *rec); // the default consensus calling model
diff --git a/convert.c b/convert.c
index 07ff01862..d418dfa27 100644
--- a/convert.c
+++ b/convert.c
@@ -104,9 +104,11 @@ struct _convert_t
char *undef_info_tag;
void *used_tags_hash;
char **used_tags_list;
+ char *print_filtered;
int nused_tags;
int allow_undef_tags;
int force_newline;
+ int header_samples;
uint8_t **subset_samples;
};
@@ -1550,6 +1552,7 @@ void convert_destroy(convert_t *convert)
free(convert->used_tags_list);
}
khash_str2int_destroy(convert->used_tags_hash);
+ free(convert->print_filtered);
free(convert->fmt);
free(convert->undef_info_tag);
free(convert->dat);
@@ -1562,6 +1565,7 @@ void convert_destroy(convert_t *convert)
int convert_header(convert_t *convert, kstring_t *str)
{
int i, icol = 0, l_ori = str->l;
+ bcf_hdr_t *hdr = convert->header;
// Supress the header output if LINE is present
for (i=0; infmt; i++)
@@ -1585,6 +1589,7 @@ int convert_header(convert_t *convert, kstring_t *str)
while ( convert->fmt[j].is_gt_field ) j++;
for (js=0; jsnsamples; js++)
{
+ int ks = convert->samples[js];
for (k=i; kfmt[k].type == T_SEP )
@@ -1600,10 +1605,21 @@ int convert_header(convert_t *convert, kstring_t *str)
}
}
}
+ else if ( convert->header_samples )
+ ksprintf(str, "[%d]%s:%s", ++icol, hdr->samples[ks], convert->fmt[k].key);
else
ksprintf(str, "[%d]%s", ++icol, convert->fmt[k].key);
}
- if ( has_fmt_newline ) break;
+ if ( has_fmt_newline )
+ {
+ if ( !convert->header_samples ) break;
+
+ // this is unfortunate: the formatting expression breaks the per-sample output into separate lines,
+ // therefore including a sample name in the header makes no sense anymore
+ convert->header_samples = 0;
+ str->l = l_ori;
+ return convert_header(convert, str);
+ }
}
i = j-1;
continue;
@@ -1653,7 +1669,17 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
{
// Skip samples when filtering was requested
int ks = convert->samples[js];
- if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[ks] ) continue;
+ if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[ks] )
+ {
+ if ( !convert->print_filtered ) continue;
+
+ for (k=i; kfmt[k].type==T_SEP )
+ convert->fmt[k].handler(convert, line, &convert->fmt[k], ks, str);
+ else
+ kputs(convert->print_filtered, str);
+ continue;
+ }
// Here comes a hack designed for TBCSQ. When running on large files,
// such as 1000GP, there are too many empty fields in the output and
@@ -1709,29 +1735,18 @@ static void force_newline_(convert_t *convert)
}
if ( has_newline ) return;
- // A newline is not present, force it. But where to add it?
- // Consider
- // -f'%CHROM[ %SAMPLE]\n'
- // vs
- // -f'[%CHROM %SAMPLE\n]'
- for (i=0; infmt; i++)
- if ( !convert->fmt[i].is_gt_field && convert->fmt[i].key ) break;
-
- if ( i < convert->nfmt )
- register_tag(convert, "\n", 0, T_SEP); // the first case
- else
- {
- // the second case
- i = convert->nfmt - 1;
- if ( !convert->fmt[i].key )
- {
- convert->fmt[i].key = strdup("\n");
- convert->fmt[i].is_gt_field = 1;
- register_tag(convert, NULL, 0, T_SEP);
- }
- else
- register_tag(convert, "\n", 1, T_SEP);
- }
+ // A newline is not present, force it. But where to add it? Always at the end.
+ //
+ // Briefly, in 1.18, we considered the following automatic behavior, which for
+ // per-site output it would add it at the end of the expression and for per-sample
+ // output it would add it inside the square brackets:
+ // -f'%CHROM[ %SAMPLE]\n'
+ // -f'[%CHROM %SAMPLE\n]'
+ //
+ // However, this is an annoyance for users, as it is not entirely clear what
+ // will happen unless one understands the internals well (#1969)
+
+ register_tag(convert, "\n", 0, T_SEP);
}
int convert_set_option(convert_t *convert, enum convert_option opt, ...)
@@ -1748,6 +1763,12 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...)
case subset_samples:
convert->subset_samples = va_arg(args, uint8_t**);
break;
+ case header_samples:
+ convert->header_samples = va_arg(args, int);
+ break;
+ case print_filtered:
+ convert->print_filtered = strdup(va_arg(args, char*));
+ break;
case force_newline:
convert->force_newline = va_arg(args, int);
if ( convert->force_newline ) force_newline_(convert);
diff --git a/convert.h b/convert.h
index 062607093..188b38124 100644
--- a/convert.h
+++ b/convert.h
@@ -30,9 +30,11 @@ THE SOFTWARE. */
typedef struct _convert_t convert_t;
enum convert_option
{
- allow_undef_tags,
- subset_samples,
- force_newline,
+ allow_undef_tags, // see `bcftools query --allow-undef-tags`, throws an error if tag is not defined otherwise
+ subset_samples, // in bracketed expressions (e.g. [ %GT]) consider only marked samples
+ header_samples, // include sample name in bracketed tags (e.g. SAMPLE1:GT SAMPLE2:GT for [ %GT])
+ force_newline, // automatically insert a newline when not part of the formatting expression
+ print_filtered, // print the provided string instead of discarding samples not included in subset_samples
};
convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *str);
diff --git a/doc/bcftools.1 b/doc/bcftools.1
index c940065fb..83d79e574 100644
--- a/doc/bcftools.1
+++ b/doc/bcftools.1
@@ -2,12 +2,12 @@
.\" Title: bcftools
.\" Author: [see the "AUTHOR(S)" section]
.\" Generator: Asciidoctor 2.0.16.dev
-.\" Date: 2023-07-25
+.\" Date: 2023-12-12
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
-.TH "BCFTOOLS" "1" "2023-07-25" "\ \&" "\ \&"
+.TH "BCFTOOLS" "1" "2023-12-12" "\ \&" "\ \&"
.ie \n(.g .ds Aq \(aq
.el .ds Aq '
.ss \n[.ss] 0
@@ -51,7 +51,7 @@ standard input (stdin) and outputs to the standard output (stdout). Several
commands can thus be combined with Unix pipes.
.SS "VERSION"
.sp
-This manual page was last updated \fB2023\-07\-25\fP and refers to bcftools git version \fB1.18\fP.
+This manual page was last updated \fB2023\-12\-12\fP and refers to bcftools git version \fB1.19\fP.
.SS "BCF1"
.sp
The obsolete BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP
@@ -938,7 +938,7 @@ Automatically index the output file
# that INFO/END is already present in the VCF header.
bcftools annotate \-a annots.tab.gz \-c CHROM,POS,~ID,REF,ALT,INFO/END input.vcf
- # For more examples see http://samtools.github.io/bcftools/howtos/annotate.html
+ # For (many) more examples see http://samtools.github.io/bcftools/howtos/annotate.html
.fam
.fi
.if n .RE
@@ -1054,6 +1054,11 @@ output all alternate alleles present in the alignments even if they do not
appear in any of the genotypes
.RE
.sp
+\fB\-\fP*\fB, \-\-keep\-unseen\-allele\fP
+.RS 4
+keep the unobserved allele <*> or <NON_REF>, useful mainly for gVCF output
+.RE
+.sp
\fB\-f, \-\-format\-fields\fP \fIlist\fP
.RS 4
comma\-separated list of FORMAT fields to output for each sample. Currently
@@ -2495,7 +2500,13 @@ in\-memory sorting and DIR is the temporary directory for external sorting. This
Stop after first record to estimate required time.
.RE
.sp
-\fB\-e, \-\-error\-probability\fP \fIINT\fP
+\fB\-e, \-\-exclude\fP [\fIqry\fP|\fIgt\fP]:\*(AqEXPRESSION\*(Aq
+.RS 4
+Exclude sites from query file (\fIqry:\fP) or genotype file (\fIgt:\fP) for which \fIEXPRESSION\fP is true.
+For valid expressions see \fBEXPRESSIONS\fP.
+.RE
+.sp
+\fB\-E, \-\-error\-probability\fP \fIINT\fP
.RS 4
Interpret genotypes and genotype likelihoods probabilistically. The value of \fIINT\fP
represents genotype quality when GT tag is used (e.g. Q=30 represents one error in 1,000 genotypes and
@@ -2505,13 +2516,20 @@ non\-zero integer can be provided).
.br
\~
.br
-If \fB\-e\fP is set to 0, the discordance score can be interpreted as the number of mismatching genotypes,
+If \fB\-E\fP is set to 0, the discordance score can be interpreted as the number of mismatching genotypes,
but only in the GT\-vs\-GT matching mode. See the \fB\-u, \-\-use\fP option below for additional notes and caveats.
\~
.br
\~
.br
-If performance is an issue, set \fB\-e 0\fP for faster run times but less accurate results.
+If performance is an issue, set \fB\-E 0\fP for faster run times but less accurate results.
+\~
+.br
+\~
+.br
+Note that in previous versions of bcftools (<=1.18), this option used to be lower case \fB\-e\fP. It
+was changed to make room for the filtering option \fB\-e, \-\-exclude\fP, keeping it consistent with other
+commands.
.RE
.sp
\fB\-g, \-\-genotypes\fP \fIFILE\fP
@@ -2524,6 +2542,12 @@ VCF/BCF file with reference genotypes to compare against
Homozygous genotypes only, useful with low coverage data (requires \fB\-g, \-\-genotypes\fP)
.RE
.sp
+\fB\-i, \-\-include\fP [\fIqry\fP|\fIgt\fP]:\*(AqEXPRESSION\*(Aq
+.RS 4
+Include sites from query file (\fIqry:\fP) or genotype file (\fIgt:\fP) for which \fIEXPRESSION\fP is true.
+For valid expressions see \fBEXPRESSIONS\fP.
+.RE
+.sp
\fB\-\-n\-matches\fP \fIINT\fP
.RS 4
Print only top INT matches for each sample, 0 for unlimited. Use negative value
@@ -2537,6 +2561,16 @@ Disable calculation of HWE probability to reduce memory requirements with
comparisons between very large number of sample pairs.
.RE
.sp
+\fB\-o, \-\-output\fP \fIFILE\fP
+.RS 4
+Write to \fIFILE\fP rather than to standard output, where it is written by default.
+.RE
+.sp
+\fB\-O, \-\-output\-type\fP \fIt\fP|\fIz\fP
+.RS 4
+Write a plain (\fIt\fP) or compressed (\fIz\fP) text tab\-delimited output.
+.RE
+.sp
\fB\-p, \-\-pairs\fP \fILIST\fP
.RS 4
A comma\-separated list of sample pairs to compare. When the \fB\-g\fP option is given, the first
@@ -2600,7 +2634,7 @@ By default, the PL tag is used in the query file and, when available, the GT tag
.br
Note that when the requested tag is not available, the program will attempt to use
the other tag. The output includes the number of sites that were matched by the four
-possible mode (for example GT\-vs\-GT or GT\-vs\-PL).
+possible modes (for example GT\-vs\-GT or GT\-vs\-PL).
.RE
.sp
\fBExamples:\fP
@@ -2608,10 +2642,10 @@ possible mode (for example GT\-vs\-GT or GT\-vs\-PL).
.if n .RS 4
.nf
.fam C
- # Check discordance of all samples from B against all sample in A
+ # Check discordance of all samples from B against all samples in A
bcftools gtcheck \-g A.bcf B.bcf
- # Limit comparisons to the fiven list of samples
+ # Limit comparisons to the given list of samples
bcftools gtcheck \-s gt:a1,a2,a3 \-s qry:b1,b2 \-g A.bcf B.bcf
# Compare only two pairs a1,b1 and a1,b2
@@ -2642,6 +2676,14 @@ By default, all header lines are displayed.
Also display the first \fIINT\fP variant records.
By default, no variant records are displayed.
.RE
+.sp
+\fB\-s, \-\-samples\fP \fIINT\fP
+.RS 4
+Display the first \fIINT\fP variant records including the last #CHROM header line with samples.
+Running with \fB\-s 0\fP alone outputs the #CHROM header line only. Note that
+the list of samples, with each sample per line, can be obtained with \f(CRbcftools query\fP using
+the option \fB\-l, \-\-list\-samples\fP.
+.RE
.SS "bcftools index [\fIOPTIONS\fP] \fIin.bcf\fP|\fIin.vcf.gz\fP"
.sp
Creates index for bgzip compressed VCF/BCF files for random access. CSI
@@ -2796,7 +2838,7 @@ see \fBCommon Options\fP
.sp
\fB\-w, \-\-write\fP \fILIST\fP
.RS 4
-list of input files to output given as 1\-based indices. With \fB\-p\fP and no
+comma\-separated list of input files to output given as 1\-based indices. With \fB\-p\fP and no
\fB\-w\fP, all files are written.
.RE
.sp
@@ -2945,9 +2987,11 @@ maximum number of alternate alleles that can be included in the PL tag. The defa
is 0 which disables the feature and outputs values for all alternate alleles.
.RE
.sp
-\fB\-m, \-\-merge\fP \fIsnps\fP|\fIindels\fP|\fIboth\fP|\fIsnp\-ins\-del\fP|\fIall\fP|\fInone\fP|\fIid\fP
+\fB\-m, \-\-merge\fP \fIsnps\fP|\fIindels\fP|\fIboth\fP|\fIsnp\-ins\-del\fP|\fIall\fP|\fInone\fP|\fIid\fP[,\fI*\fP]
.RS 4
-The option controls what types of multiallelic records can be created:
+The option controls what types of multiallelic records can be created. If single asterisk
+\fI*\fP is appended, the unobserved allele \fI<*>\fP or \fI<NON_REF>\fP will be removed at variant sites;
+if two asterisks \fI**\fP are appended, the unobserved allele will be removed at all sites.
.RE
.sp
.if n .RS 4
@@ -2957,6 +3001,8 @@ The option controls what types of multiallelic records can be created:
\-m snps .. allow multiallelic SNP records
\-m indels .. allow multiallelic indel records
\-m both .. both SNP and indel records can be multiallelic
+\-m both,* .. same as above but remove <*> (or <NON_REF>) from variant sites
+\-m both,** .. same as above but remove <*> (or <NON_REF>) at all sites
\-m all .. SNP records can be merged with indel records
\-m snp\-ins\-del .. allow multiallelic SNVs, insertions, deletions, but don\*(Aqt mix them
\-m id .. merge by ID
@@ -4428,6 +4474,13 @@ continue even when some samples requested via \fB\-s/\-S\fP do not exist
learn by example, see below
.RE
.sp
+\fB\-F, \-\-print\-filtered\fP \fISTR\fP
+.RS 4
+by default, samples failing \fB\-i/\-e\fP filtering expressions are suppressed from output
+when FORMAT fields are queried (for example \fI%CHROM %POS [ %GT]\fP). With \fB\-F\fP, such
+fields will still be printed, but \fISTR\fP will be used instead of their actual value.
+.RE
+.sp
\fB\-H, \-\-print\-header\fP
.RS 4
print header
@@ -4444,6 +4497,15 @@ include only sites for which \fIEXPRESSION\fP is true. For valid expressions see
list sample names and exit
.RE
.sp
+\fB\-N, \-\-disable\-automatic\-newline\fP
+.RS 4
+disable automatic addition of a missing newline character at the end of the formatting
+expression. By default, the program checks if the expression contains a newline
+and appends it if not, to prevent formatting the entire output into a single
+line by mistake. Note that versions prior to 1.18 had no automatic check and newline
+had to be included explicitly.
+.RE
+.sp
\fB\-o, \-\-output\fP \fIFILE\fP
.RS 4
see \fBCommon Options\fP
@@ -5123,6 +5185,12 @@ Automatically index the output file
.RE
.SS "Subset options:"
.sp
+\fB\-A, \-\-trim\-unseen\-allele\fP
+.RS 4
+remove the unseen allele \fI<*>\fP or \fI<NON_REF>\fP at variant sites when the option is given once (\-A) or
+at all sites when the option is given twice (\fI\-AA\fP).
+.RE
+.sp
\fB\-a, \-\-trim\-alt\-alleles\fP
.RS 4
remove alleles not seen in the genotype fields from the ALT column. Note that if no alternate allele
@@ -5315,6 +5383,95 @@ important libraries used by bcftools.
.SS "bcftools [\fI\-\-version\-only\fP]"
.sp
Display the full bcftools version number in a machine\-readable format.
+.SH "SCRIPTS"
+.SS "gff2gff"
+.sp
+Attempts to fix a GFF file to be correctly parsed by \fBcsq\fP.
+.sp
+.if n .RS 4
+.nf
+.fam C
+zcat in.gff.gz | gff2gff | gzip \-c > out.gff.gz
+.fam
+.fi
+.if n .RE
+.SS "plot\-vcfstats [\fIOPTIONS\fP] \fIfile.vchk\fP [...]"
+.sp
+Script for processing output of \fBbcftools stats\fP. It can merge
+results from multiple outputs (useful when running the stats for each
+chromosome separately), plots graphs and creates a PDF presentation.
+.sp
+\fB\-m, \-\-merge\fP
+.RS 4
+Merge vcfstats files to STDOUT, skip plotting.
+.RE
+.sp
+\fB\-p, \-\-prefix\fP \fIDIR\fP
+.RS 4
+The output directory. This directory will be created if it does not exist.
+.RE
+.sp
+\fB\-P, \-\-no\-PDF\fP
+.RS 4
+Skip the PDF creation step.
+.RE
+.sp
+\fB\-r, \-\-rasterize\fP
+.RS 4
+Rasterize PDF images for faster rendering. This is the default and the opposite of \fB\-v, \-\-vectors\fP.
+.RE
+.sp
+\fB\-s, \-\-sample\-names\fP
+.RS 4
+Use sample names for xticks rather than numeric IDs.
+.RE
+.sp
+\fB\-t, \-\-title\fP \fISTRING\fP
+.RS 4
+Identify files by these titles in plots. The option can be given multiple
+times, for each ID in the \fBbcftools stats\fP output. If not
+present, the script will use abbreviated source file names for the titles.
+.RE
+.sp
+\fB\-v, \-\-vectors\fP
+.RS 4
+Generate vector graphics for PDF images, the opposite of \fB\-r, \-\-rasterize\fP.
+.RE
+.sp
+\fB\-T, \-\-main\-title\fP \fISTRING\fP
+.RS 4
+Main title for the PDF.
+.RE
+.sp
+\fBExample:\fP
+.sp
+.if n .RS 4
+.nf
+.fam C
+# Generate the stats
+bcftools stats \-s \- > file.vchk
+.fam
+.fi
+.if n .RE
+.sp
+.if n .RS 4
+.nf
+.fam C
+# Plot the stats
+plot\-vcfstats \-p outdir file.vchk
+.fam
+.fi
+.if n .RE
+.sp
+.if n .RS 4
+.nf
+.fam C
+# The final looks can be customized by editing the generated
+# \*(Aqoutdir/plot.py\*(Aq script and re\-running manually
+cd outdir && python plot.py && pdflatex summary.tex
+.fam
+.fi
+.if n .RE
.SH "FILTERING EXPRESSIONS"
.sp
These filtering expressions are accepted by most of the commands.
@@ -5330,8 +5487,7 @@ These filtering expressions are accepted by most of the commands.
. sp -1
. IP \(bu 2.3
.\}
-numerical constants, string constants, file names (this is currently
-supported only to filter by the ID column)
+numerical constants, string constants, file names (indicated by the prefix \fI@\fP)
.sp
.if n .RS 4
.nf
@@ -5609,7 +5765,7 @@ GT="A"
. sp -1
. IP \(bu 2.3
.\}
-TYPE for variant type in REF,ALT columns (indel,snp,mnp,ref,bnd,other,overlap). Use the regex
+TYPE for variant type in REF,ALT columns (indel,snp,mnp,ref,bnd,other,overlap, see \fBTERMINOLOGY\fP). Use the regex
operator "\(rs~" to require at least one allele of the given type or the equal sign "="
to require that all alleles are of the given type. Compare
.sp
@@ -6052,7 +6208,7 @@ AVG(GQ)>50 .. average (arithmetic mean) of genotype qualities bigger
.if n .RS 4
.nf
.fam C
-ID=@file .. selects lines with ID present in the file
+ID=@file .. selects lines with ID present in the file
.fam
.fi
.if n .RE
@@ -6060,7 +6216,15 @@ ID=@file .. selects lines with ID present in the file
.if n .RS 4
.nf
.fam C
-ID!=@~/file .. skip lines with ID present in the ~/file
+ID!=@~/file .. skip lines with ID present in the ~/file
+.fam
+.fi
+.if n .RE
+.sp
+.if n .RS 4
+.nf
+.fam C
+INFO/TAG=@file .. selects lines with INFO/TAG value present in the file
.fam
.fi
.if n .RE
@@ -6097,92 +6261,25 @@ bcftools view \-i \*(Aq%ID!="." & MAF[0]<0.01\*(Aq
.if n .RE
.sp
Please refer to the documentation of your shell for details.
-.SH "SCRIPTS"
-.SS "gff2gff"
+.SH "TERMINOLOGY"
.sp
-Attempts to fix a GFF file to be correctly parsed by \fBcsq\fP.
+The program and the documentation use the following terminology; multiple terms can be used
+interchangeably for the same VCF record type:
.sp
.if n .RS 4
.nf
.fam C
-zcat in.gff.gz | gff2gff | gzip \-c > out.gff.gz
-.fam
-.fi
-.if n .RE
-.SS "plot\-vcfstats [\fIOPTIONS\fP] \fIfile.vchk\fP [...]"
-.sp
-Script for processing output of \fBbcftools stats\fP. It can merge
-results from multiple outputs (useful when running the stats for each
-chromosome separately), plots graphs and creates a PDF presentation.
-.sp
-\fB\-m, \-\-merge\fP
-.RS 4
-Merge vcfstats files to STDOUT, skip plotting.
-.RE
-.sp
-\fB\-p, \-\-prefix\fP \fIDIR\fP
-.RS 4
-The output directory. This directory will be created if it does not exist.
-.RE
-.sp
-\fB\-P, \-\-no\-PDF\fP
-.RS 4
-Skip the PDF creation step.
-.RE
-.sp
-\fB\-r, \-\-rasterize\fP
-.RS 4
-Rasterize PDF images for faster rendering. This is the default and the opposite of \fB\-v, \-\-vectors\fP.
-.RE
-.sp
-\fB\-s, \-\-sample\-names\fP
-.RS 4
-Use sample names for xticks rather than numeric IDs.
-.RE
-.sp
-\fB\-t, \-\-title\fP \fISTRING\fP
-.RS 4
-Identify files by these titles in plots. The option can be given multiple
-times, for each ID in the \fBbcftools stats\fP output. If not
-present, the script will use abbreviated source file names for the titles.
-.RE
-.sp
-\fB\-v, \-\-vectors\fP
-.RS 4
-Generate vector graphics for PDF images, the opposite of \fB\-r, \-\-rasterize\fP.
-.RE
-.sp
-\fB\-T, \-\-main\-title\fP \fISTRING\fP
-.RS 4
-Main title for the PDF.
-.RE
-.sp
-\fBExample:\fP
-.sp
-.if n .RS 4
-.nf
-.fam C
-# Generate the stats
-bcftools stats \-s \- > file.vchk
-.fam
-.fi
-.if n .RE
-.sp
-.if n .RS 4
-.nf
-.fam C
-# Plot the stats
-plot\-vcfstats \-p outdir file.vchk
-.fam
-.fi
-.if n .RE
-.sp
-.if n .RS 4
-.nf
-.fam C
-# The final looks can be customized by editing the generated
-# \*(Aqoutdir/plot.py\*(Aq script and re\-running manually
-cd outdir && python plot.py && pdflatex summary.tex
+REF ALT
+\-\-\-\-\-\-\-\-\-
+C . .. reference allele / non\-variant site / ref\-only site
+C T .. SNP or SNV (single\-nucleotide polymorphism or variant), used interchangeably
+CC TT .. MNP (multi\-nucleotide polymorphism)
+CAAA C .. indel, deletion (regardless of length)
+C CAAA .. indel, insertion (regardless of length)
+C <*> .. gVCF block, the allele <*> is a placeholder for alternate allele possibly missed because of low coverage
+C <NON_REF> .. synonymous to <*>
+C * .. overlapping deletion
+C <INS> .. symbolic allele, known also as \*(Aqother [than above]\*(Aq
.fam
.fi
.if n .RE
diff --git a/doc/bcftools.html b/doc/bcftools.html
index 0b4baab9e..50336b1c8 100644
--- a/doc/bcftools.html
+++ b/doc/bcftools.html
@@ -50,7 +50,7 @@ DESCRIPTION
VERSION
-
This manual page was last updated 2023-07-25 and refers to bcftools git version 1.18.
+
This manual page was last updated 2023-12-12 and refers to bcftools git version 1.19.
@@ -720,7 +720,7 @@
bcftools annotate [OPTIONS] FILE
# that INFO/END is already present in the VCF header.
bcftools annotate -a annots.tab.gz -c CHROM,POS,~ID,REF,ALT,INFO/END input.vcf
- # For more examples see http://samtools.github.io/bcftools/howtos/annotate.html
+ # For (many) more examples see http://samtools.github.io/bcftools/howtos/annotate.html
@@ -830,6 +830,10 @@
output all alternate alleles present in the alignments even if they do not
appear in any of the genotypes
+-*, --keep-unseen-allele
+
+keep the unobserved allele <*> or <NON_REF>, useful mainly for gVCF output
+
-f, --format-fields list
comma-separated list of FORMAT fields to output for each sample. Currently
@@ -2196,7 +2200,12 @@
bcftools gtcheck [OPTIONS] [-g ge
Stop after first record to estimate required time.
--e, --error-probability INT
+-e, --exclude [qry|gt]:'EXPRESSION'
+
+Exclude sites from query file (qry:) or genotype file (gt:) for which EXPRESSION is true.
+For valid expressions see EXPRESSIONS.
+
+-E, --error-probability INT
Interpret genotypes and genotype likelihoods probabilistically. The value of INT
represents genotype quality when GT tag is used (e.g. Q=30 represents one error in 1,000 genotypes and
@@ -2204,11 +2213,16 @@
bcftools gtcheck [OPTIONS] [-g ge
non-zero integer can be provided).
-If -e is set to 0, the discordance score can be interpreted as the number of mismatching genotypes,
+If -E is set to 0, the discordance score can be interpreted as the number of mismatching genotypes,
but only in the GT-vs-GT matching mode. See the -u, --use option below for additional notes and caveats.
-If performance is an issue, set -e 0 for faster run times but less accurate results.
+If performance is an issue, set -E 0 for faster run times but less accurate results.
+
+
+Note that in previous versions of bcftools (<=1.18), this option used to be a lower case -e. It
+was changed to make room for the filtering option -e, --exclude, to stay consistent with other
+commands.
-g, --genotypes FILE
@@ -2218,6 +2232,11 @@ bcftools gtcheck [OPTIONS] [-g ge
Homozygous genotypes only, useful with low coverage data (requires -g, --genotypes)
+-i, --include [qry|gt]:'EXPRESSION'
+
+Include sites from query file (qry:) or genotype file (gt:) for which EXPRESSION is true.
+For valid expressions see EXPRESSIONS.
+
--n-matches INT
Print only top INT matches for each sample, 0 for unlimited. Use negative value
@@ -2229,6 +2248,14 @@
bcftools gtcheck [OPTIONS] [-g ge
Disable calculation of HWE probability to reduce memory requirements with
comparisons between very large number of sample pairs.
+-o, --output FILE
+
+Write to FILE rather than to standard output, where it is written by default.
+
+-O, --output-type t|z
+
+Write a plain (t) or compressed (z) text tab-delimited output.
+
-p, --pairs LIST
A comma-separated list of sample pairs to compare. When the -g option is given, the first
@@ -2288,7 +2315,7 @@
bcftools gtcheck [OPTIONS] [-g ge
Note that when the requested tag is not available, the program will attempt to use
the other tag. The output includes the number of sites that were matched by the four
-possible mode (for example GT-vs-GT or GT-vs-PL).
+possible modes (for example GT-vs-GT or GT-vs-PL).
@@ -2297,10 +2324,10 @@ bcftools gtcheck [OPTIONS] [-g ge
-
# Check discordance of all samples from B against all sample in A
+ # Check discordance of all samples from B against all samples in A
bcftools gtcheck -g A.bcf B.bcf
- # Limit comparisons to the fiven list of samples
+ # Limit comparisons to the given list of samples
bcftools gtcheck -s gt:a1,a2,a3 -s qry:b1,b2 -g A.bcf B.bcf
# Compare only two pairs a1,b1 and a1,b2
@@ -2335,6 +2362,13 @@ Options:
Also display the first INT variant records.
By default, no variant records are displayed.
+
-s, --samples INT
+
+Display the first INT variant records including the last #CHROM header line with samples.
+Running with -s 0 alone outputs the #CHROM header line only. Note that
+the list of samples, with one sample per line, can be obtained with bcftools query
using
+the option -l, --list-samples.
+
@@ -2487,7 +2521,7 @@ bcftools isec [OPTIONS] A.vcf.gz B.vcf.gz
-w, --write LIST
-list of input files to output given as 1-based indices. With -p and no
+
comma-separated list of input files to output given as 1-based indices. With -p and no
-w, all files are written.
--write-index
@@ -2625,9 +2659,11 @@ bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz<
maximum number of alternate alleles that can be included in the PL tag. The default value
is 0 which disables the feature and outputs values for all alternate alleles.
--m, --merge snps|indels|both|snp-ins-del|all|none|id
+-m, --merge snps|indels|both|snp-ins-del|all|none|id[,*]
-The option controls what types of multiallelic records can be created:
+The option controls what types of multiallelic records can be created. If a single asterisk
+* is appended, the unobserved allele <*> or <NON_REF> will be removed at variant sites;
+if two asterisks ** are appended, the unobserved allele will be removed at all sites.
@@ -2637,6 +2673,8 @@ bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz<
-m snps .. allow multiallelic SNP records
-m indels .. allow multiallelic indel records
-m both .. both SNP and indel records can be multiallelic
+-m both,* .. same as above but remove <*> (or <NON_REF>) from variant sites
+-m both,** .. same as above but remove <*> (or <NON_REF>) at all sites
-m all .. SNP records can be merged with indel records
-m snp-ins-del .. allow multiallelic SNVs, insertions, deletions, but don't mix them
-m id .. merge by ID
@@ -3849,6 +3887,12 @@ bcftools query [OPTIONS] file.vcf.gz [file.
learn by example, see below
+-F, --print-filtered STR
+
+by default, samples failing -i/-e filtering expressions are suppressed from output
+when FORMAT fields are queried (for example %CHROM %POS [ %GT]). With -F, such
+fields will still be printed, but STR will be used instead of their actual value.
+
-H, --print-header
print header
@@ -3862,6 +3906,14 @@ bcftools query [OPTIONS] file.vcf.gz [file.
list sample names and exit
+-N, --disable-automatic-newline
+
+disable automatic addition of a missing newline character at the end of the formatting
+expression. By default, the program checks if the expression contains a newline
+and appends it if not, to prevent formatting the entire output into a single
+line by mistake. Note that versions prior to 1.18 had no automatic check and newline
+had to be included explicitly.
+
-o, --output FILE
see Common Options
@@ -4488,6 +4540,11 @@ Output options
Subset options:
+- -A, --trim-unseen-alleles
+-
+
remove the unseen allele <*> or <NON_REF> at variant sites when the option is given once (-A) or
+at all sites when the option is given twice (-AA).
+
- -a, --trim-alt-alleles
-
remove alleles not seen in the genotype fields from the ALT column. Note that if no alternate allele
@@ -4680,6 +4737,98 @@
bcftools [--version-only]
+
SCRIPTS
+
+
+
gff2gff
+
+
Attempts to fix a GFF file to be correctly parsed by csq.
+
+
+
+
+
+
zcat in.gff.gz | gff2gff | gzip -c > out.gff.gz
+
+
+
+
+
+
+
plot-vcfstats [OPTIONS] file.vchk […]
+
+
Script for processing output of bcftools stats. It can merge
+results from multiple outputs (useful when running the stats for each
+chromosome separately), plot graphs and create a PDF presentation.
+
+
+
+- -m, --merge
+-
+
Merge vcfstats files to STDOUT, skip plotting.
+
+- -p, --prefix DIR
+-
+
The output directory. This directory will be created if it does not exist.
+
+- -P, --no-PDF
+-
+
Skip the PDF creation step.
+
+- -r, --rasterize
+-
+
Rasterize PDF images for faster rendering. This is the default and the opposite of -v, --vectors.
+
+- -s, --sample-names
+-
+
Use sample names for xticks rather than numeric IDs.
+
+- -t, --title STRING
+-
+
Identify files by these titles in plots. The option can be given multiple
+times, for each ID in the bcftools stats output. If not
+present, the script will use abbreviated source file names for the titles.
+
+- -v, --vectors
+-
+
Generate vector graphics for PDF images, the opposite of -r, --rasterize.
+
+- -T, --main-title STRING
+-
+
Main title for the PDF.
+
+
+
+
+
+
+
+
+
# Generate the stats
+bcftools stats -s - > file.vchk
+
+
+
+
+
# Plot the stats
+plot-vcfstats -p outdir file.vchk
+
+
+
+
+
# The final looks can be customized by editing the generated
+# 'outdir/plot.py' script and re-running manually
+cd outdir && python plot.py && pdflatex summary.tex
+
+
+
+
+
+
+
+
FILTERING EXPRESSIONS
@@ -4689,8 +4838,7 @@
FILTERING EXPRESSIONS
Valid expressions may contain:
-
-
numerical constants, string constants, file names (this is currently
-supported only to filter by the ID column)
+numerical constants, string constants, file names (indicated by the prefix @)
1, 1.0, 1e-4
@@ -4824,7 +4972,7 @@ FILTERING EXPRESSIONS
-
-
TYPE for variant type in REF,ALT columns (indel,snp,mnp,ref,bnd,other,overlap). Use the regex
+
TYPE for variant type in REF,ALT columns (indel,snp,mnp,ref,bnd,other,overlap, see TERMINOLOGY). Use the regex
operator "\~" to require at least one allele of the given type or the equal sign "="
to require that all alleles are of the given type. Compare
@@ -5072,12 +5220,17 @@
FILTERING EXPRESSIONS
-
ID=@file .. selects lines with ID present in the file
+
ID=@file .. selects lines with ID present in the file
+
+
+
+
+
ID!=@~/file .. skip lines with ID present in the ~/file
-
ID!=@~/file .. skip lines with ID present in the ~/file
+
INFO/TAG=@file .. selects lines with INFO/TAG value present in the file
@@ -5116,91 +5269,27 @@
FILTERING EXPRESSIONS
-
SCRIPTS
+
TERMINOLOGY
-
-
gff2gff
-
-
Attempts to fix a GFF file to be correctly parsed by csq.
-
-
-
-
-
-
zcat in.gff.gz | gff2gff | gzip -c > out.gff.gz
-
-
-
-
-
-
-
plot-vcfstats [OPTIONS] file.vchk […]
-
-
Script for processing output of bcftools stats. It can merge
-results from multiple outputs (useful when running the stats for each
-chromosome separately), plots graphs and creates a PDF presentation.
-
-
-
-- -m, --merge
--
-
Merge vcfstats files to STDOUT, skip plotting.
-
-- -p, --prefix DIR
--
-
The output directory. This directory will be created if it does not exist.
-
-- -P, --no-PDF
--
-
Skip the PDF creation step.
-
-- -r, --rasterize
--
-
Rasterize PDF images for faster rendering. This is the default and the opposite of -v, --vectors.
-
-- -s, --sample-names
--
-
Use sample names for xticks rather than numeric IDs.
-
-- -t, --title STRING
--
-
Identify files by these titles in plots. The option can be given multiple
-times, for each ID in the bcftools stats output. If not
-present, the script will use abbreviated source file names for the titles.
-
-- -v, --vectors
--
-
Generate vector graphics for PDF images, the opposite of -r, --rasterize.
-
-- -T, --main-title STRING
--
-
Main title for the PDF.
-
-
-
-
Example:
+
 The program and the documentation use the following terminology; multiple terms can be used
+interchangeably for the same VCF record type:
-
# Generate the stats
-bcftools stats -s - > file.vchk
-
-
-
-
-
# Plot the stats
-plot-vcfstats -p outdir file.vchk
-
-
-
-
-
# The final looks can be customized by editing the generated
-# 'outdir/plot.py' script and re-running manually
-cd outdir && python plot.py && pdflatex summary.tex
-
+
REF ALT
+---------
+C . .. reference allele / non-variant site / ref-only site
+C T .. SNP or SNV (single-nucleotide polymorphism or variant), used interchangeably
+CC TT .. MNP (multi-nucleotide polymorphism)
+CAAA C .. indel, deletion (regardless of length)
+C CAAA .. indel, insertion (regardless of length)
+C <*> .. gVCF block, the allele <*> is a placeholder for alternate allele possibly missed because of low coverage
+C <NON_REF> .. synonymous to <*>
+C * .. overlapping deletion
+C <INS> .. symbolic allele, known also as 'other [than above]'
@@ -5277,7 +5366,7 @@
COPYING